Hangul-MAK is an attempt to store Hangul syllabic blocks more efficiently by using a finite state machine. MAK is an abbreviation for "móó annerschd kodiert", which is Saarlandish (a German dialect) for "encoded differently".
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
hangul-mak/hangul-mak.cpp

325 lines
10 KiB

/**
* This file is part of Hangul-MAK - Hangul móó annerschd kodiert
*
* Copyright (C) 2021 Moritz Strohm <ncc1988@posteo.de>
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include <cinttypes>
#include <cstdint>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
/**
 * The states of the Hangul-MAK state machine.
 *
 * The decoder in this file only ever enters ONE_BYTE, TWO_BYTES and
 * THREE_BYTES. CONTROL, UNICODE and UTF8 are declared but not handled yet.
 */
enum class State
{
    ONE_BYTE,
    TWO_BYTES,
    THREE_BYTES,
    CONTROL,
    UNICODE,
    UTF8
};

/**
 * Converter between Hangul-MAK encoded bytes and Unicode Hangul syllables.
 *
 * The object reads from an input stream and writes to an output stream. Both
 * streams are only referenced, never owned: they must outlive the HangulMak
 * instance. Decoded syllables are written to the output stream as UTF-8.
 *
 * Only decoding of one-byte and two-byte sequences is implemented. Three-byte
 * sequences are consumed without producing output and encoding is a stub.
 */
class HangulMak
{
protected:
    /** Current state of the decoder state machine. */
    State state = State::ONE_BYTE;

    /** Destination of the decoded text (not owned). */
    std::ostream& output;

    /** Source of the Hangul-MAK encoded bytes (not owned). */
    std::istream& input;

    /**
     * This vector maps hangul-mak consonants to unicode consonant numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> consonant_unicode_map = {
        0, 2, 3, 5, 6, 7, 9, 11, 12, 14, 15, 16, 17, 18
        //ㄱ ㄴ ㄷ ㄹ ㅁ ㅂ ㅅ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ
    };

    /**
     * This vector maps hangul-mak vowels to unicode vowel numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> vowel_unicode_map = {
        0, 2, 4, 6, 8, 12, 13, 17, 18, 20
        //ㅏ ㅑ ㅓ ㅕ ㅗ ㅛ ㅜ ㅠ ㅡ ㅣ
    };

    /**
     * This vector maps hangul-mak tense consonants to unicode consonant numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> tense_consonant_unicode_map = {
        1, 4, 8, 10, 13
        //ㄲ ㄸ ㅃ ㅆ ㅉ
    };

    /**
     * This vector maps hangul-mak complex vowels to unicode vowel numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> complex_vowel_unicode_map = {
        1, 3, 5, 7, 9, 10, 11, 14, 15, 16, 19
        //ㅐ ㅒ ㅔ ㅖ ㅘ ㅙ ㅚ ㅝ ㅞ ㅟ ㅢ
    };

    /**
     * This vector maps hangul-mak consonants to unicode final consonant numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> final_consonant_unicode_map = {
        1, 4, 7, 8, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27
        //ㄱ ㄴ ㄷ ㄹ ㅁ ㅂ ㅅ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ
    };

    /**
     * This vector maps hangul-mak tense consonants to unicode final consonant numbers.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> final_tense_consonant_unicode_map = {
        2, 20
        //ㄲ ㅆ
    };

    /**
     * This vector maps hangul-mak complex consonants to unicode consonant numbers.
     * Note that these are only used as final consonants.
     * @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
     */
    std::vector<uint8_t> complex_consonant_unicode_map = {
        3, 5, 6, 9, 10, 11, 12, 13, 14, 15, 18
        //ㄳ ㄵ ㄶ ㄺ ㄻ ㄼ ㄽ ㄾ ㄿ ㅀ ㅄ
    };

    /**
     * Maps an initial consonant to its unicode initial consonant number.
     *
     * @param character Low nibble: index into the table. Bit 0x10 set: the
     *     index refers to the tense consonant table.
     * @returns The unicode initial consonant number.
     * @throws std::out_of_range& In case the character cannot be resolved.
     */
    uint8_t mapConsonantToUnicode(uint8_t character) const
    {
        const uint8_t char_number = character & 0x0F;
        const uint8_t char_flags = (character & 0xF0) >> 4;
        if (char_flags & 0x01) {
            //It is a tense character.
            return this->tense_consonant_unicode_map.at(char_number);
        }
        return this->consonant_unicode_map.at(char_number);
    }

    /**
     * Maps a vowel to its unicode vowel number.
     *
     * @param character Low nibble: index into the table. Bit 0x20 set: the
     *     index refers to the complex vowel table.
     * @returns The unicode vowel number.
     * @throws std::out_of_range& In case the character cannot be resolved.
     */
    uint8_t mapVowelToUnicode(uint8_t character) const
    {
        const uint8_t char_number = character & 0x0F;
        const uint8_t char_flags = (character & 0xF0) >> 4;
        if (char_flags & 0x02) {
            //It is a complex vowel.
            return this->complex_vowel_unicode_map.at(char_number);
        }
        return this->vowel_unicode_map.at(char_number);
    }

    /**
     * Maps a final consonant to its unicode final consonant number.
     *
     * @param character Low nibble: index into the table. Bit 0x10 set: tense
     *     final consonant table. Otherwise bit 0x20 set: complex consonant
     *     table.
     * @returns The unicode final consonant number.
     * @throws std::out_of_range& In case the character cannot be resolved.
     */
    uint8_t mapFinalConsonantToUnicode(uint8_t character) const
    {
        const uint8_t char_number = character & 0x0F;
        const uint8_t char_flags = (character & 0xF0) >> 4;
        if (char_flags & 0x01) {
            //It is a tense consonant.
            return this->final_tense_consonant_unicode_map.at(char_number);
        } else if (char_flags & 0x02) {
            //It is a complex consonant.
            return this->complex_consonant_unicode_map.at(char_number);
        }
        return this->final_consonant_unicode_map.at(char_number);
    }

    /**
     * Composes a unicode Hangul syllable from its letters.
     *
     * @param characters Two entries (initial consonant, vowel) or three
     *     entries (initial consonant, vowel, final consonant), each in the
     *     format expected by the map*ToUnicode methods.
     * @returns The unicode code point of the syllable or 0 if the letters
     *     cannot be resolved or the number of letters is not supported.
     */
    uint32_t charToUnicode(const std::vector<uint8_t>& characters) const
    {
        const std::size_t letter_count = characters.size();
        if (letter_count != 2 && letter_count != 3) {
            //TODO: four-letter syllables.
            //Every unsupported length yields the "invalid" result instead
            //of running off the end of the function.
            return 0;
        }
        try {
            const uint32_t unicode_consonant = this->mapConsonantToUnicode(characters[0]);
            const uint32_t unicode_vowel = this->mapVowelToUnicode(characters[1]);
            //Final consonant number 0 means "no final consonant".
            const uint32_t unicode_final = (letter_count == 3)
                ? this->mapFinalConsonantToUnicode(characters[2])
                : 0;
            //44032 (U+AC00) is the first precomposed Hangul syllable,
            //588 = 21 vowels * 28 final consonant slots.
            return unicode_consonant * 588 + unicode_vowel * 28 + unicode_final + 44032;
        } catch (const std::out_of_range&) {
            //Invalid character sequence.
            return 0;
        }
    }

    std::vector<uint8_t> unicodeTochar(std::string unicode_char)
    {
        //TODO
        return {};
    }

    /**
     * Resets the nibble buffer and the state machine so that the next byte
     * is interpreted as the first byte of a character.
     */
    void cleanupAfterCharacter()
    {
        this->nibbles = {0, 0, 0, 0, 0, 0};
        this->state = State::ONE_BYTE;
    }

    /**
     * Reads the next byte from the input stream.
     *
     * @param byte Receives the byte that has been read.
     * @returns True if a byte has been read, false on end of input or
     *     on a read error. In the latter case byte is left untouched.
     */
    bool readByte(uint8_t& byte)
    {
        const std::istream::int_type raw = this->input.get();
        if (raw == std::istream::traits_type::eof()) {
            return false;
        }
        byte = static_cast<uint8_t>(raw);
        return true;
    }

    /**
     * Writes a unicode code point to the output stream, encoded as UTF-8.
     * Code point 0 (the "invalid sequence" result of charToUnicode) is
     * written as a single NUL byte.
     */
    void writeCodePoint(uint32_t code_point)
    {
        if (code_point < 0x80) {
            this->output.put(static_cast<char>(code_point));
        } else if (code_point < 0x800) {
            this->output.put(static_cast<char>(0xC0 | (code_point >> 6)));
            this->output.put(static_cast<char>(0x80 | (code_point & 0x3F)));
        } else if (code_point < 0x10000) {
            this->output.put(static_cast<char>(0xE0 | (code_point >> 12)));
            this->output.put(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
            this->output.put(static_cast<char>(0x80 | (code_point & 0x3F)));
        } else {
            this->output.put(static_cast<char>(0xF0 | ((code_point >> 18) & 0x07)));
            this->output.put(static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)));
            this->output.put(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
            this->output.put(static_cast<char>(0x80 | (code_point & 0x3F)));
        }
    }

    /**
     * The nibbles of the character that is currently being decoded:
     * two nibbles per input byte, high nibble first.
     */
    std::vector<uint8_t> nibbles = {0, 0, 0, 0, 0, 0};

public:
    /**
     * @param input The stream to read from. Must outlive this object.
     * @param output The stream to write to. Must outlive this object.
     */
    HangulMak(std::istream& input, std::ostream& output)
        : output(output),
          input(input)
    {
        //nothing else
    }

    /**
     * Not implemented yet.
     */
    void encode()
    {
        //TODO
    }

    /**
     * Reads Hangul-MAK bytes from the input stream until it is exhausted
     * (or the output stream fails) and writes the decoded syllables to the
     * output stream as UTF-8. An incomplete multi-byte sequence at the end
     * of the input is dropped.
     */
    void decode()
    {
        uint8_t byte = 0;
        while (this->output && this->readByte(byte)) {
            const uint8_t high_nibble = static_cast<uint8_t>((byte & 0xF0) >> 4);
            const uint8_t low_nibble = static_cast<uint8_t>(byte & 0x0F);

            switch (this->state) {
            case State::ONE_BYTE:
                this->nibbles[0] = high_nibble;
                this->nibbles[1] = low_nibble;
                if (this->nibbles[0] < 12) {
                    //A character with a basic vowel and consonant.
                    //It can be directly converted to a unicode character.
                    //Note that the order must be reversed, since the storing
                    //order is vowel - consonant.
                    this->writeCodePoint(
                        this->charToUnicode({this->nibbles[1], this->nibbles[0]})
                    );
                    this->cleanupAfterCharacter();
                } else {
                    //Switch to two byte mode.
                    this->state = State::TWO_BYTES;
                }
                break;

            case State::TWO_BYTES:
                this->nibbles[2] = high_nibble;
                this->nibbles[3] = low_nibble;
                //Check the two least significant bits of the first nibble
                //to determine if it is a two-byte or three-byte character.
                if ((this->nibbles[0] & 0x02) != 0) {
                    //It is a three byte character.
                    this->state = State::THREE_BYTES;
                    break;
                }
                //It is a two-byte character.
                if ((this->nibbles[0] & 0x01) == 0) {
                    //The remaining three nibbles contain basic characters
                    //in the order consonant - vowel - consonant:
                    this->writeCodePoint(
                        this->charToUnicode(
                            {
                                this->nibbles[1],
                                this->nibbles[2],
                                this->nibbles[3]
                            }
                        )
                    );
                } else {
                    //The second nibble contains a tense consonant, the
                    //third a basic vowel and the fourth either a tense or
                    //a normal final consonant, depending on the most
                    //significant bit of the second nibble.
                    //The parentheses matter: "x & 0x08 > 0" would be parsed
                    //as "x & (0x08 > 0)" and test the wrong bit.
                    const bool final_is_tense = (this->nibbles[1] & 0x08) != 0;
                    if (final_is_tense) {
                        this->nibbles[3] |= 0x10;
                    }
                    //NOTE(review): the flag bit is masked out so that it is
                    //not mistaken for a part of the consonant index (the
                    //tense consonant table has five entries) - confirm
                    //against the format specification.
                    const uint8_t tense_initial =
                        static_cast<uint8_t>(0x10 | (this->nibbles[1] & 0x07));
                    this->writeCodePoint(
                        this->charToUnicode(
                            {
                                tense_initial,
                                this->nibbles[2],
                                this->nibbles[3]
                            }
                        )
                    );
                }
                this->cleanupAfterCharacter();
                break;

            case State::THREE_BYTES:
                this->nibbles[4] = high_nibble;
                this->nibbles[5] = low_nibble;
                if ((this->nibbles[0] & 0x01) == 0) {
                    //A three-letter syllable follows.
                    //The first nibble contains three flags for each letter,
                    //starting with the MSB:
                    //0 = basic, 1 = complex
                    //If the LSB of the first nibble is set to one, the last
                    //consonant is a tense one.
                    //TODO
                } else {
                    //A four-letter syllable follows.
                    //The first nibble contains four flags for each letter:
                    //0 = basic letter, 1 = complex letter
                    //TODO
                }
                //Three-byte characters are not decoded yet. The state
                //machine is reset nevertheless so that the bytes after
                //the sequence are not swallowed as well.
                this->cleanupAfterCharacter();
                break;

            default:
                //CONTROL, UNICODE and UTF8 are not implemented. The byte is
                //dropped and the state machine starts over.
                this->cleanupAfterCharacter();
                break;
            }
        }
    }
};
int main(int argc, char** argv)
{
auto decoder = std::make_unique<HangulMak>(std::cin, std::cout);
}