From 655bf1db92ec5ec647d96f35855d8e81ee970344 Mon Sep 17 00:00:00 2001
From: 3d-gussner <3d.gussner@gmail.com>
Date: Mon, 17 Oct 2022 10:11:33 +0200
Subject: [PATCH] Update diacritics

---
 lang/lib/charset.py | 118 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 110 insertions(+), 8 deletions(-)

diff --git a/lang/lib/charset.py b/lang/lib/charset.py
index 96c61a9c1..4308f289c 100644
--- a/lang/lib/charset.py
+++ b/lang/lib/charset.py
@@ -14,15 +14,117 @@ CUSTOM_CHARS = {
 # This transformation is applied to the translation prior to being converted to the final encoding,
 # and maps UTF8 to UTF8. It replaces unavailable symbols in the translation to a close
 # representation in the source encoding.
+# Sources for the per-language character lists (the trailing comment on each entry names the languages):
+# https://en.wikipedia.org/wiki/Czech_orthography
+# https://en.wikipedia.org/wiki/German_orthography
+# https://en.wikipedia.org/wiki/French_orthography
+# https://en.wikipedia.org/wiki/Spanish_orthography
+# https://en.wikipedia.org/wiki/Italian_orthography
+# https://en.wikipedia.org/wiki/Polish_alphabet
+# https://en.wikipedia.org/wiki/Dutch_orthography
+# https://en.wikipedia.org/wiki/Romanian_alphabet
+# https://en.wikipedia.org/wiki/Hungarian_alphabet
+# https://en.wikipedia.org/wiki/Gaj%27s_Latin_alphabet
+# https://en.wikipedia.org/wiki/Slovak_orthography
+# https://en.wikipedia.org/wiki/Swedish_alphabet
+# https://en.wikipedia.org/wiki/Norwegian_orthography
+
 TRANS_CHARS = {
-    'Ä': 'ä',
-    'Å': 'A',
-    'Ö': 'ö',
-    'Ü': 'ü',
-    'å': 'a',
-    'æ': 'ä',
-    'ø': 'ö',
-    'ß': 'ss',
+    'á': 'a', #cz,fr,es,hu,sk
+    'Á': 'A', #cz,fr,hu,sk
+    'à': 'a', #fr,it
+    'À': 'A', #fr,it
+    'â': 'a', #fr,ro
+    'Â': 'A', #ro
+    'Ä': 'ä', #de,sv,no,sk (lowercase target kept from the previous table)
+    'å': 'a', #sv,no
+    'Å': 'A', #sv,no
+    'æ': 'ä', #sv,no
+    'ą': 'a', #pl
+    'Ą': 'A', #pl
+    'ă': 'a', #ro
+    'Ă': 'A', #ro
+    'ć': 'c', #pl,hr
+    'Ć': 'C', #pl,hr
+    'ç': 'c', #fr,nl
+    'č': 'c', #cz,hr,sk
+    'Č': 'C', #cz,hr,sk
+    'ď': 'd', #cz,sk
+    'Ď': 'D', #cz,sk
+    'đ': 'd', #hr
+    'Đ': 'D', #hr
+    'é': 'e', #cz,fr,es,it,nl,hu,sk
+    'É': 'E', #cz,fr,it,hu,sk
+    'è': 'e', #fr,it,nl
+    'È': 'E', #fr,it
+    'ê': 'e', #fr,nl
+    'ě': 'e', #cz
+    'ë': 'e', #fr
+    'Ě': 'E', #cz
+    'ę': 'e', #pl
+    'Ę': 'E', #pl
+    'í': 'i', #cz,es,it,sk
+    'Í': 'I', #cz,it,sk
+    'î': 'i', #fr,ro
+    'Î': 'I', #ro
+    'ĺ': 'l', #sk
+    'Ĺ': 'L', #sk
+    'ł': 'l', #pl
+    'Ł': 'L', #pl
+    'ľ': 'l', #sk
+    'Ľ': 'L', #sk
+    'ń': 'n', #pl
+    'Ń': 'N', #pl
+    'ň': 'n', #cz,sk
+    'Ň': 'N', #cz,sk
+    'ñ': 'n', #es,nl
+    'ó': 'o', #cz,es,pl,hu,sk
+    'Ó': 'O', #cz,pl,hu,sk
+    'ò': 'o', #it
+    'Ò': 'O', #it
+    'ô': 'o', #fr,nl,sk
+    'Ô': 'O', #sk
+    'œ': 'o', #fr
+    'ø': 'ö', #sv,no
+    'Ö': 'ö', #de,sv,no,hu (lowercase target kept from the previous table)
+    'ő': 'o', #hu
+    'Ő': 'O', #hu
+    'ŕ': 'r', #sk
+    'Ŕ': 'R', #sk
+    'ř': 'r', #cz
+    'Ř': 'R', #cz
+    'ś': 's', #pl
+    'Ś': 'S', #pl
+    'š': 's', #cz,hr,sk
+    'Š': 'S', #cz,hr,sk
+    'ș': 's', #ro
+    'Ș': 'S', #ro
+    'ß': 'ss', #de
+    'ť': 't', #cz,sk
+    'Ť': 'T', #cz,sk
+    'ț': 't', #ro
+    'Ț': 'T', #ro
+    'ú': 'u', #cz,es,hu,sk
+    'Ú': 'U', #cz,hu,sk
+    'ù': 'u', #it
+    'Ù': 'U', #it
+    'û': 'u', #fr
+    'Ü': 'ü', #de,hu (lowercase target kept from the previous table)
+    'ů': 'u', #cz
+    'Ů': 'U', #cz
+    'ű': 'u', #hu
+    'Ű': 'U', #hu
+    'ý': 'y', #cz,sk
+    'Ý': 'Y', #cz,sk
+    'ÿ': 'y', #fr
+    'ź': 'z', #pl
+    'Ź': 'Z', #pl
+    'ž': 'z', #cz,hr,sk
+    'Ž': 'Z', #cz,hr,sk
+    'ż': 'z', #pl
+    'Ż': 'Z', #pl
+    '¿': '', #es
+    '¡': '', #es
 }
 
 