Prusa-Firmware/lang/lang-check.py

#!/usr/bin/env python3
#
# Version 1.0.2 - Build 43
#############################################################################
# Change log:
#  7 May  2019, ondratu   , Initial
# 13 June 2019, 3d-gussner, Fix length false positives
# 14 Sep. 2019, 3d-gussner, Prepare adding new language
# 18 Sep. 2020, 3d-gussner, Fix execution of lang-check.py
#  2 Apr. 2021, 3d-gussner, Fix and improve text wrap
# 22 Apr. 2021, DRracer   , add English source to output
# 23 Apr. 2021, wavexx    , improve
# 24 Apr. 2021, wavexx    , improve
# 26 Apr. 2021, wavexx    , add character ruler
# 21 Dec. 2021, 3d-gussner, Prepare more community languages
#                             Swedish
#                             Danish
#                             Slovenian
#                             Hungarian
#                             Luxembourgian
#                             Croatian
#  3 Jan. 2022, 3d-gussner, Prepare Lithuanian
#  7 Jan. 2022, 3d-gussner, Check for Syntax errors and exit with error
#                         , add Build number 'git rev-list --count HEAD lang-check.py'
# 30 Jan. 2022, 3d-gussner, Add arguments. Requested by @AttilaSVK
#                             --information == output all source and translated messages
#                             --import-check == used by `lang-import.sh` to verify
#                                               newly imported `lang_en_??.txt` files
# 14 Mar. 2022, 3d-gussner, Check if translation isn't equal to origin
#############################################################################

"""Check PO files for formatting errors."""
from argparse import ArgumentParser
from sys import stdout, stderr, exit
import codecs
import polib
import textwrap
import re
import os

from lib import charset as cs
from lib.io import load_map
import enum

# Emit ANSI colors when stdout is a terminal whose TERM is not "dumb", or
# whenever the NO_COLOR environment variable is set to "0".
_interactive_terminal = stdout.isatty() and os.getenv("TERM", "dumb") != "dumb"
COLORIZE = _interactive_terminal or os.getenv('NO_COLOR') == "0"

# Largest accepted column count of a display definition; also the width
# assumed when an entry does not define one.
LCD_WIDTH = 20

# Annotation lines ("::error file=...") are emitted only when GH_ANNOTATIONS=1.
GH_ANNOTATIONS = os.getenv('GH_ANNOTATIONS') == "1"
# PO file currently being checked; used as the file= field of annotations.
CURRENT_PO = "Unknown file"
# Number of ERROR annotations emitted so far.
GH_ERR_COUNT = 0

class AN_TYPE(enum.Enum):
    """Severity of a reported finding.

    Every member is declared as ``(prefix, print_fmt)``: ``prefix`` is the
    keyword placed after ``::`` in annotation lines and ``print_fmt`` is the
    tag used in plain console messages.  The member value itself is a
    1-based ordinal assigned in declaration order, not the declared tuple.
    """

    def __new__(cls, *args, **kwds):
        # Number the members in declaration order instead of using the
        # declared tuple as the enum value.
        member = object.__new__(cls)
        member._value_ = 1 + len(cls.__members__)
        return member

    def __init__(self, prefix, print_fmt):
        self.prefix = prefix
        self.print_fmt = print_fmt

    ERROR = "error", "[E]"
    WARNING = "warning", "[W]"
    NOTICE = "notice", "[S]"

def color_maybe(color_attr, text):
    """Return *text* wrapped in the ANSI color *color_attr* if COLORIZE is set.

    When colors are disabled the text is returned unchanged.
    """
    if not COLORIZE:
        return text
    return '\033[0;' + str(color_attr) + 'm' + text + '\033[0m'

def red(text):
    """Return *text* in ANSI color 31 (red) when colors are enabled."""
    return color_maybe(31, text)


def green(text):
    """Return *text* in ANSI color 32 (green) when colors are enabled."""
    return color_maybe(32, text)


def yellow(text):
    """Return *text* in ANSI color 33 (yellow) when colors are enabled."""
    return color_maybe(33, text)


def cyan(text):
    """Return *text* in ANSI color 36 (cyan) when colors are enabled."""
    return color_maybe(36, text)


def print_wrapped(wrapped_text, rows, cols):
    """Print wrapped text as numbered, bar-delimited rows.

    wrapped_text -- list of lines, or a single string (printed as one line)
    rows         -- number of rows available; the numbers of lines beyond it
                    are passed through red()
    cols         -- width every line is padded to
    """
    # isinstance() instead of an exact type comparison: a str subclass must
    # be treated as one line too, not iterated character by character.
    if isinstance(wrapped_text, str):
        wrapped_text = [wrapped_text]
    row_format = ' {} |{:' + str(cols) + 's}|'
    for index, line in enumerate(wrapped_text):
        number = str(index + 1).rjust(3)
        if index >= rows:
            number = red(number)
        print(row_format.format(number, line))

def print_truncated(text, cols):
    """Print *text* inside a *cols*-wide box.

    Text that fits is padded to the box width; otherwise the first *cols*
    characters go inside the box and the remainder follows it via red().
    """
    fits = len(text) <= cols
    boxed = text.ljust(cols) if fits else text[0:cols]
    overflow = '' if fits else red(text[cols:])
    print('   |' + boxed + '|' + overflow)

def print_ruler(spc, cols):
    """Print a column ruler of subscript digits, indented by *spc* spaces.

    The ruler is cut to *cols* characters (the digit pattern is 40 long) and
    passed through cyan().
    """
    digits = '₀₁₂₃₄₅₆₇₈₉' * 4
    indent = ' ' * spc
    print(indent + cyan(digits[:cols]))

def print_source_translation(source, translation, wrapped_source, wrapped_translation, rows, cols):
    """Print the source text and its translation, each below a column ruler.

    Single-row definitions show the unwrapped strings with print_truncated();
    multi-row definitions show the wrapped lines with print_wrapped().
    A blank line is printed at the end.
    """
    single_row = rows == 1
    # the ruler indent matches the left margin of the respective printer
    ruler_indent = 4 if single_row else 6
    sections = (
        (' source text:', source, wrapped_source),
        (' translated text:', translation, wrapped_translation),
    )
    for heading, plain, wrapped in sections:
        print(heading)
        print_ruler(ruler_indent, cols)
        if single_row:
            print_truncated(plain, cols)
        else:
            print_wrapped(wrapped, rows, cols)
    print()

def highlight_trailing_white(text):
    """Replace the final trailing space with a visible '·' marker.

    text -- a string, or a list of lines of which only the last is marked

    Returns a new string, or a shallow copy of the list; the input list is
    not modified.  An empty list is returned as an empty list (the wrapped
    form of a whitespace-only text is empty).
    """
    if isinstance(text, str):
        return re.sub(r' $', '·', text)
    marked = text[:]
    # Nothing to mark in an empty list; indexing [-1] would raise IndexError.
    if marked:
        marked[-1] = highlight_trailing_white(marked[-1])
    return marked

def wrap_text(text, cols):
    """Wrap *text* to *cols* columns and return the list of rows.

    Every newline-separated input line is wrapped on its own with
    textwrap.  Lines that wrap to nothing (empty or whitespace-only)
    contribute no rows.  The trailing whitespace of the whole text, which
    textwrap drops, is appended to the last row of each input line after
    the first one.
    """
    wrapper = textwrap.TextWrapper(width=cols)
    trailing = ' ' * (len(text) - len(text.rstrip()))
    ret = []
    for line in text.split('\n'):
        # wrap each input line in text individually
        tmp = wrapper.wrap(line)
        # add back trailing whitespace; an empty/blank line wraps to no rows,
        # so there is nothing to append to (tmp[-1] would raise IndexError)
        if ret and tmp:
            tmp[-1] += trailing
        ret.extend(tmp)
    return ret

def ign_char_first(c):
    """Tell whether *c* is exempt from the first-character comparison.

    True for alphanumeric characters, '%', '?' and the code points
    0x80 through 0xDF.
    """
    if c.isalnum():
        return True
    if c in ('%', '?'):
        return True
    return c in map(chr, range(0x80, 0xE0))

def ign_char_last(c):
    """Tell whether *c* is exempt from the last-character comparison.

    True for alphanumeric characters, '.', "'" and the code points
    0x80 through 0xDF.
    """
    if c.isalnum():
        return True
    if c in ('.', '\''):
        return True
    return c in map(chr, range(0x80, 0xE0))

# print_anyway reduces code duplication: when gh_annotate() already has all the
# information needed to build the plain ("normal") message, it prints that
# message itself whenever annotations are disabled.

def gh_annotate(an_type, start_line, message, end_line = None, print_anyway = False):
    """Report a finding as an annotation line or as a plain console message.

    an_type      -- AN_TYPE member giving the severity
    start_line   -- line the finding starts on
    message      -- text of the finding
    end_line     -- optional last line of the finding
    print_anyway -- with annotations disabled, print the plain message
                    instead of staying silent

    With GH_ANNOTATIONS enabled a ``::<prefix> file=...,line=...::<message>``
    line is printed and ERROR findings are counted in GH_ERR_COUNT.
    """
    global GH_ERR_COUNT
    has_range = end_line is not None

    if GH_ANNOTATIONS:
        if has_range:
            location = "line={},endLine={}".format(start_line,end_line)
        else:
            location = "line={}".format(start_line)
        print("::{} file={},{}::{}".format(an_type.prefix, CURRENT_PO, location, message))
        if an_type == AN_TYPE.ERROR:
            GH_ERR_COUNT += 1
        return

    if not print_anyway:
        return

    if has_range:
        where = "lines {}-{}".format(start_line, end_line)
    else:
        where = "line {}".format(start_line)
    plain_message = "{} on {}".format(message, where)
    # errors are shown in red, every other severity in yellow
    paint = red if an_type == AN_TYPE.ERROR else yellow
    print(paint("{}: {}".format(an_type.print_fmt, plain_message)))


def check_translation(entry, msgids, is_pot, no_warning, no_suggest, warn_empty, warn_same, information, shorter):
    """Check one PO entry against its display definition.

    The first line of the entry comment is expected to look like
    ``MSG_NAME c=<cols> r=<rows>``.  The source text and, unless *is_pot* is
    set, the translation are validated against that definition; findings are
    reported through gh_annotate() and print().

    entry       -- PO entry providing obsolete, comment, msgid, msgstr, linenum
    msgids      -- collection of source strings that are in use, or None to
                   treat every entry as used
    is_pot      -- only check the source text, skip all translation checks
    no_warning  -- suppress the warnings guarded by this flag
    no_suggest  -- suppress suggestions (punctuation, identical or short
                   translation)
    warn_empty  -- report missing metadata/translations even for unused entries
    warn_same   -- also report single-word translations equal to the source
    information -- print the entry when it passed without a counted finding
    shorter     -- report translations more than one row shorter than defined

    Returns True when no error or warning was counted for the entry, False
    otherwise.  Obsolete entries are skipped and count as passing.
    """

    # do not check obsolete/deleted entries
    if entry.obsolete:
        return True

    # fetch/decode entry for easy access
    meta = entry.comment.split('\n', 1)[0]
    source = entry.msgid
    translation = entry.msgstr
    line = entry.linenum
    known_msgid = msgids is None or source in msgids
    errors = 0

    # Check comment syntax (non-empty and include a MSG id)
    if known_msgid or warn_empty:
        if len(meta) == 0:
            gh_annotate(AN_TYPE.ERROR, line, "Translation missing comment metadata", None, True)
            return False
        if not meta.startswith('MSG'):
            gh_annotate(AN_TYPE.ERROR, line, "Critical Syntax Error: comment doesn't start with MSG", None, True)
            print(red(" comment: " + meta))
            return False

    # Check if columns and rows are defined
    tokens = meta.split(' ')
    cols = None
    rows = None
    for item in tokens[1:]:
        try:
            # anything but exactly one '=' or a non-integer value ends up
            # as ValueError, as does an unknown key
            key, val = item.split('=')
            if key == 'c':
                cols = int(val)
            elif key == 'r':
                rows = int(val)
            else:
                raise ValueError
        except ValueError:
            gh_annotate(AN_TYPE.ERROR, line, "Invalid display definition", None, True)
            print(red(" definition: " + meta))
            return False

    if not cols:
        if not no_warning and known_msgid and not rows:
            errors += 1
            gh_annotate(AN_TYPE.WARNING, line, "No usable display definition", None, True)
        # probably fullscreen, guess from the message length to continue checking
        cols = LCD_WIDTH
    if cols > LCD_WIDTH:
        errors += 1
        gh_annotate(AN_TYPE.WARNING, line, "Invalid column count", None, True)
    if not rows:
        rows = 1
    elif rows > 1 and cols != LCD_WIDTH:
        errors += 1
        gh_annotate(AN_TYPE.WARNING, line, "Multiple rows with odd number of columns", None, True)

    # Check if translation contains unsupported characters
    invalid_char = cs.translation_check(cs.unicode_to_source(translation))
    if invalid_char is not None:
        # '{}' placeholder: a '%s' is never substituted by str.format()
        gh_annotate(AN_TYPE.ERROR, line, "Critical syntax: Unhandled char {} found".format(repr(invalid_char)), None, True)
        print(red(' translation: ' + translation))
        return False

    # Pre-process the translation to translated characters for a correct preview and length check
    translation = cs.trans_replace(translation)

    wrapped_source = wrap_text(source, cols)
    rows_count_source = len(wrapped_source)
    wrapped_translation = wrap_text(translation, cols)
    rows_count_translation = len(wrapped_translation)

    # Incorrect number of rows/cols on the definition
    if rows == 1 and (len(source) > cols or rows_count_source > rows):
        errors += 1
        gh_annotate(AN_TYPE.WARNING, line, "Source text longer than {} cols as defined".format(cols), None, True)
        print_ruler(4, cols)
        print_truncated(source, cols)
        print()
    elif rows_count_source > rows:
        errors += 1
        gh_annotate(AN_TYPE.WARNING, line, "Source text longer than {} rows as defined".format(rows), None, True)
        print_ruler(6, cols)
        print_wrapped(wrapped_source, rows, cols)
        print()

    # All further checks are against the translation
    if is_pot:
        return (errors == 0)

    # Missing translation
    if len(translation) == 0 and (warn_empty or (not no_warning and known_msgid)):
        errors += 1
        if rows == 1:
            gh_annotate(AN_TYPE.WARNING, line, "Empty translation for \"{}\"".format(source), line + rows, True )
        else:
            gh_annotate(AN_TYPE.WARNING, line, "Empty translation", line + rows, True )
            print_ruler(6, cols)
            print_wrapped(wrapped_source, rows, cols)
            print()

    # Check for translation length too long
    if (rows_count_translation > rows) or (rows == 1 and len(translation) > cols):
        errors += 1
        gh_annotate(AN_TYPE.ERROR, line, "Text is longer than definition", line + rows)
        print(red('[E]: Text is longer than definition on line %d: cols=%d rows=%d (rows diff=%d)'
                % (line, cols, rows, rows_count_translation-rows)))
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)

    # Check for translation length shorter
    if shorter and (rows_count_translation < rows-1):
        gh_annotate(AN_TYPE.NOTICE, line, "Text is shorter than definition", line + rows)
        print(yellow('[S]: Text is shorter than definition on line %d: cols=%d rows=%d (rows diff=%d)'
                % (line, cols, rows, rows_count_translation-rows)))
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)

    # Different count of % sequences
    if source.count('%') != translation.count('%') and len(translation) > 0:
        errors += 1
        # the message is not %-formatted, so a single '%' is the literal text
        gh_annotate(AN_TYPE.ERROR, line, "Unequal count of % escapes", None, True)
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)

    # Different first/last character
    if not no_suggest and len(source) > 0 and len(translation) > 0:
        source_stripped = source.rstrip()
        translation_stripped = translation.rstrip()
        start_diff = not (ign_char_first(source[0]) and ign_char_first(translation[0])) and source[0] != translation[0]
        # a whitespace-only text has no last character to compare
        end_diff = False
        source_end = translation_end = ''
        if source_stripped and translation_stripped:
            source_end = source_stripped[-1]
            translation_end = translation_stripped[-1]
            end_diff = not (ign_char_last(source_end) and ign_char_last(translation_end)) and source_end != translation_end
        if start_diff or end_diff:
            if start_diff:
                gh_annotate(AN_TYPE.NOTICE, line, "Differing first punctuation character: ({} => {})".format(source[0],translation[0]), None, True)
            if end_diff:
                # report the characters that were actually compared, i.e.
                # the last ones before any trailing whitespace
                gh_annotate(AN_TYPE.NOTICE, line, "Differing last punctuation character: ({} => {})".format(source_end,translation_end), None, True)
            print_source_translation(source, translation,
                                    wrapped_source, wrapped_translation,
                                    rows, cols)
    if not no_suggest and source == translation and (warn_same or len(source.split(' ', 1)) > 1):
        gh_annotate(AN_TYPE.NOTICE, line, "Translation same as original text", None, True)
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)

    # Short translation
    if not no_suggest and len(source) > 0 and len(translation) > 0:
        if len(translation.rstrip()) < len(source.rstrip()) / 2:
            gh_annotate(AN_TYPE.NOTICE, line, "Short translation", None, True)
            print_source_translation(source, translation,
                                    wrapped_source, wrapped_translation,
                                    rows, cols)

    # Incorrect trailing whitespace in translation
    if not no_warning and len(translation) > 0 and \
     (source.rstrip() == source or (rows == 1 and len(source) == cols)) and \
     translation.rstrip() != translation and \
     (rows > 1 or len(translation) != len(source)):
        errors += 1
        gh_annotate(AN_TYPE.WARNING, line, "Incorrect trailing whitespace for translation", None, True)
        source = highlight_trailing_white(source)
        translation = highlight_trailing_white(translation)
        wrapped_translation = highlight_trailing_white(wrapped_translation)
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)

    # show the information
    if information and errors == 0:
        print(green('[I]: %s' % (meta)))
        print_source_translation(source, translation,
                                wrapped_source, wrapped_translation,
                                rows, cols)
    return (errors == 0)


def main():
    """Parse the command line, check every entry of the PO file and return
    the process exit status."""
    global CURRENT_PO

    parser = ArgumentParser(description=__doc__)
    parser.add_argument("po", help="PO file to check")

    # boolean switches, registered in the order they appear in --help
    switches_before_map = (
        ("--no-warning", "Disable warnings"),
        ("--no-suggest", "Disable suggestions"),
        ("--errors-only", "Only check errors"),
        ("--pot", "Do not check translations"),
        ("--information", "Output all translations"),
    )
    switches_after_map = (
        ("--warn-empty", "Warn about empty definitions and translations even if unused"),
        ("--warn-same", "Warn about one-word translations which are identical to the source"),
        ("--shorter", "Show message if it is shorter than expected."),
    )
    for flag, description in switches_before_map:
        parser.add_argument(flag, action="store_true", help=description)
    parser.add_argument("--map",
        help="Provide a map file to suppress warnings about unused translations")
    for flag, description in switches_after_map:
        parser.add_argument(flag, action="store_true", help=description)

    # load the translations
    args = parser.parse_args()
    if not os.path.isfile(args.po):
        print("{}: file does not exist or is not a regular file".format(args.po), file=stderr)
        return 1

    # --errors-only is shorthand for disabling warnings and suggestions
    if args.errors_only:
        args.no_warning = True
        args.no_suggest = True

    # load the symbol map to suppress empty (but unused) translation warnings
    msgids = None
    if args.map:
        msgids = set()
        for sym in load_map(args.map):
            data = sym['data']
            if type(data) is bytes:
                decoded = codecs.decode(data, 'unicode_escape', 'strict')
                msgids.add(cs.source_to_unicode(decoded))

    # check each translation in turn
    all_passed = True
    for po_entry in polib.pofile(args.po):
        # recorded so that annotations can name the file being checked
        CURRENT_PO = args.po
        all_passed &= check_translation(po_entry, msgids, args.pot, args.no_warning, args.no_suggest,
                                        args.warn_empty, args.warn_same, args.information, args.shorter)

    if GH_ANNOTATIONS:
        # Do not cause a failure if only warnings or notices.
        return GH_ERR_COUNT > 0
    return 0 if all_passed else 1

# Run the checker when executed as a script; the value returned by main()
# is handed to exit() as the process exit status.
if __name__ == "__main__":
    exit(main())