172 lines
4.5 KiB
Python
172 lines
4.5 KiB
Python
from .FontGen import CUSTOM_CHARS, INVERSE_CUSTOM_CHARS
|
||
|
||
# Characters to be remapped prior to source-encoding transformation
|
||
# This transformation is applied to the translation prior to being converted to the final encoding,
|
||
# and maps UTF8 to UTF8. It replaces unavailable symbols in the translation with a close
|
||
# representation in the source encoding.
|
||
# sources
|
||
# https://en.wikipedia.org/wiki/Czech_orthography
|
||
# https://en.wikipedia.org/wiki/German_orthography
|
||
# https://en.wikipedia.org/wiki/French_orthography
|
||
# https://en.wikipedia.org/wiki/Spanish_orthography
|
||
# https://en.wikipedia.org/wiki/Italian_orthography
|
||
# https://en.wikipedia.org/wiki/Polish_alphabet
|
||
# https://en.wikipedia.org/wiki/Dutch_orthography
|
||
# https://en.wikipedia.org/wiki/Romanian_alphabet
|
||
# https://en.wikipedia.org/wiki/Hungarian_alphabet
|
||
# https://en.wikipedia.org/wiki/Gaj%27s_Latin_alphabet
|
||
# https://en.wikipedia.org/wiki/Slovak_orthography
|
||
# https://en.wikipedia.org/wiki/Swedish_alphabet
|
||
# https://en.wikipedia.org/wiki/Norwegian_orthography
|
||
|
||
# Substitution table applied by trans_replace(): every occurrence of a key is
# replaced by its value (an empty value drops the character entirely).
# Trailing comments list the languages in which the character occurs.
# Commented-out entries are disabled substitutions kept for reference.
# NOTE(review): presumably the disabled characters are directly available in
# the target character set and therefore need no remapping - confirm against
# FontGen.CUSTOM_CHARS.
TRANS_CHARS = {
    # 'á': 'a', #cz,fr,es,hu,sk
    # 'Á': 'A', #cz,fr,hu,sk
    # 'à': 'a', #fr,it
    'À': 'à', #fr,it
    # 'â': 'a', #fr,ro
    'Â': 'â', #ro
    # 'Ä': 'ä', #de,sv,no,sk
    # 'å': 'a', #sv,no
    # 'Å': 'A', #sv,no
    # 'æ': 'ä', #sv,no
    # 'ą': 'a', #pl
    # 'Ą': 'A', #pl
    # 'ă': 'a', #ro - a-breve
    # 'Ă': 'A', #ro - A-breve
    'ǎ': 'ă', #ro - a-caron
    'Ǎ': 'Ă', #ro - A-caron
    # 'ć': 'c', #pl,hr
    'Ć': 'ć', #pl,hr
    'ç': 'c', #fr,nl
    'Ç': 'C', #fr,nl
    # 'č': 'c', #cz,hr,sk
    # 'Č': 'č', #cz,hr,sk
    # 'ď': 'd', #cz,sk
    'Ď': 'ď', #cz,sk
    'đ': 'd', #hr
    'Đ': 'D', #hr
    # 'é': 'e', #cz,fr,es,it,nl,hu,sk
    # 'É': 'E', #cz,fr,it,hu,sk
    # 'è': 'e', #fr,it,nl
    'È': 'è', #fr,it
    # 'ê': 'e', #fr,nl
    'Ê': 'ê', #fr
    # 'ě': 'e', #cz
    'Ě': 'ě', #cz
    # 'ë': 'e', #fr
    # 'ę': 'e', #pl
    # 'Ę': 'ę', #pl
    # 'í': 'i', #cz,es,it,sk
    # 'Í': 'í', #cz,it,sk
    'ì': 'i',
    'Ì': 'I',
    # 'î': 'i', #fr,ro
    # 'Î': 'I', #ro
    # 'ĺ': 'l', #sk
    'Ĺ': 'ĺ', #sk
    # 'ł': 'l', #pl
    # 'Ł': 'L', #pl
    # 'ľ': 'l', #sk
    # 'Ľ': 'L', #sk
    # 'ń': 'n', #pl
    'Ń': 'ń', #pl
    # 'ň': 'n', #cz,sk
    'Ň': 'ň', #cz,sk
    'ñ': 'n', #es,nl
    # 'ó': 'o', #cz,es,pl,hu,sk
    # 'Ó': 'ó', #cz,pl,hu,sk
    # 'ò': 'o', #it
    'Ò': 'ò', #it
    # 'ô': 'o', #fr,nl,sk
    'Ô': 'ô', #sk
    'œ': 'o', #fr
    # The uppercase ligature was previously a second 'œ' key, which a dict
    # literal silently collapses, leaving 'Œ' without any substitution.
    'Œ': 'O', #fr
    # 'ø': 'ö', #sv,no
    # 'Ø': 'ø', #sv,no
    # 'Ö': 'ö', #de,sv,no,hu
    # 'ő': 'o', #hu
    'Ő': 'ő', #hu
    'ŕ': 'r', #sk
    'Ŕ': 'R', #sk
    # 'ř': 'r', #cz
    # 'Ř': 'ř', #cz
    # 'ś': 's', #pl
    # 'Ś': 'ś', #pl
    # 'š': 's', #cz,hr,sk
    # 'Š': 'š', #cz,hr,sk
    # 'ș': 's', #ro - s-comma
    # 'Ș': 'ș', #ro - S-comma
    'ş': 'ș', #ro - s-cedilla
    'Ş': 'Ș', #ro - S-cedilla
    # 'ß': 'ss',#de
    'ẞ': 'ß',#de
    # 'ť': 't', #cz,sk
    'Ť': 'ť', #cz,sk
    # 'ț': 't', #ro - t-comma
    'Ț': 'ț', #ro - T-comma
    'ţ': 'ț', #ro - t-cedilla
    'Ţ': 'Ț', #ro - T-cedilla
    # 'ú': 'u', #cz,es,hu,sk
    'Ú': 'ú', #cz,hu,sk
    'ù': 'u', #it
    'Ù': 'U', #it
    'û': 'u', #fr
    'Û': 'U', #fr
    # 'Ü': 'ü', #de,hu
    # 'ů': 'u', #cz
    'Ů': 'ů', #cz
    # 'ű': 'u', #hu
    'Ű': 'ű', #hu
    # 'ý': 'y', #cz,sk
    # 'Ý': 'ý', #cz,sk
    'ÿ': 'y', #fr
    # NOTE(review): uppercase maps to lowercase 'y' here, unlike the other
    # plain-ASCII fallbacks (e.g. 'Ù' -> 'U') - confirm this is intended.
    'Ÿ': 'y', #fr
    # 'ź': 'z', #pl
    'Ź': 'ź', #pl
    # 'ž': 'z', #cz,hr,sk
    # 'Ž': 'ž', #cz,hr,sk
    # 'ż': 'z', #pl
    'Ż': 'ż', #pl
    '¿': '', #es
    '¡': '', #es
    '’': '\'',
}
|
||
|
||
|
||
def _character_check(buf, valid_chars):
|
||
for c in buf:
|
||
if (not c.isascii() or not c.isprintable()) and c not in valid_chars:
|
||
return c
|
||
return None
|
||
|
||
def source_check(buf):
    """Return the first character of *buf* not allowed in source text, or None.

    Allowed characters are printable ASCII, the newline, and every value of
    CUSTOM_CHARS.
    """
    allowed = {'\n', *CUSTOM_CHARS.values()}
    return _character_check(buf, allowed)
|
||
|
||
def translation_check(buf):
    """Return the first character of *buf* not allowed in a translation, or None.

    Allowed characters are printable ASCII, the newline, and every key of
    CUSTOM_CHARS.
    """
    allowed = {'\n', *CUSTOM_CHARS.keys()}
    return _character_check(buf, allowed)
|
||
|
||
def trans_replace(buf):
    """Return *buf* with every TRANS_CHARS key replaced by its mapped value.

    Substitutions are applied one after another in the table's insertion
    order, each on the result of the previous one.
    """
    result = buf
    for needle in TRANS_CHARS:
        result = result.replace(needle, TRANS_CHARS[needle])
    return result
|
||
|
||
def source_to_unicode(buf):
    """Apply trans_replace() to *buf*, then map each character via CUSTOM_CHARS.

    Characters that are not keys of CUSTOM_CHARS are passed through unchanged.
    """
    remapped = trans_replace(buf)
    return ''.join(CUSTOM_CHARS.get(ch, ch) for ch in remapped)
|
||
|
||
def unicode_to_source(buf):
    """Apply trans_replace() to *buf*, then map each character via INVERSE_CUSTOM_CHARS.

    Characters that are not keys of INVERSE_CUSTOM_CHARS are passed through
    unchanged.
    """
    remapped = trans_replace(buf)
    return ''.join(INVERSE_CUSTOM_CHARS.get(ch, ch) for ch in remapped)
|
||
|