#!/usr/bin/python3
"""Lemma testing for GiellaLT spell-checkers and lexicons."""

import subprocess
import sys
import tempfile
from argparse import ArgumentParser
from subprocess import Popen

from .lexc import scrapelemmas


def main():
    """CLI for speller lemma testing.

    Collects lemmas from the given lexc files, feeds them to an external
    speller runner loaded with a ZHFST archive and computes the percentage
    of lemmas the speller accepts.  Rejected lemmas and the runner's output
    lines following them are written to a temporary log file that is kept
    after the run.

    Exit status:
        0  -- coverage reached the threshold (PASS).
        1  -- coverage below the threshold, the runner is unknown, or the
              runner exited with a non-zero status.
        77 -- the runner timed out or no testable lemmas were found (SKIP).
    """
    argp = ArgumentParser()
    argp.add_argument("lexcfilenames", nargs="+",
                      help="read lemmas from the lexc files")
    argp.add_argument("-z", "--zhfst", type=str, dest="zhfstfilename",
                      help="ZHFST speller for analysing missing lemmas",
                      required=True)
    argp.add_argument("-D", "--runner", type=str, dest="runnerfilename",
                      help="external runner capable of handling zhfst",
                      required=True)
    # argparse expands help texts with %-formatting, so a literal percent
    # sign has to be written as "%%"; a bare "%" makes --help raise
    # ValueError.
    argp.add_argument("-T", "--threshold", type=int,
                      help="required %% proportion of succesful generations",
                      default=99)
    argp.add_argument("-d", "--debug", action="store_true", default=False,
                      help="prints debugging outputs")
    argp.add_argument("-v", "--verbose", action="store_true", default=False,
                      help="prints some outputs")
    argp.add_argument("-Z", "--acceptable-forms", type=open,
                      help="do not count oov if analysis contained in file")
    argp.add_argument("-X", "--exclude", action="append",
                      help="exclude lines matching regex")
    argp.add_argument("-Q", "--oov-limit", type=int, default=100_000,
                      help="stop trying after so many oovs")
    argp.add_argument("-B", "--time-out", type=int, default=60,
                      help="max time used to test lemmas")
    argp.add_argument("-E", "--editor", type=str,
                      help="open failures in EDITOR afterwards")
    options = argp.parse_args()

    # The command line depends on which known runner the path refers to.
    if "divvunspell" in options.runnerfilename:
        spellargs = [options.runnerfilename, "suggest", "--archive",
                     options.zhfstfilename]
    elif "hfst-ospell" in options.runnerfilename:
        spellargs = [options.runnerfilename, "-S", options.zhfstfilename]
    else:
        print(f"fail - unknown runner {options.runnerfilename}")
        sys.exit(1)

    # Forms listed in the acceptable-forms file are never counted.  The file
    # was opened by argparse (type=open), so close it once it has been read;
    # a set keeps the per-lemma membership test cheap.
    skipforms = None
    if options.acceptable_forms:
        with options.acceptable_forms as formsfile:
            skipforms = {form.strip() for form in formsfile}

    lemmas = set()
    for lexcfilename in options.lexcfilenames:
        with open(lexcfilename, encoding="utf-8") as lexcfile:
            lemmas.update(scrapelemmas(lexcfile, options.exclude,
                                       options.debug))

    if options.verbose:
        print(f"collected {len(lemmas)} lemmas, sending...")
    # Sorted so that the log file and the --oov-limit cut-off are
    # reproducible between runs (set iteration order is not).
    lemmabytes = "\n".join(sorted(lemmas)).encode("utf-8")
    try:
        results = subprocess.run(spellargs, input=lemmabytes,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 check=True,
                                 timeout=options.time_out)
    except subprocess.TimeoutExpired:
        print("Warning: lemma checking timed out")
        sys.exit(77)
    except subprocess.CalledProcessError as err:
        # Report the runner failure instead of dying with a traceback.
        print(f"FAIL: {options.runnerfilename} exited with status "
              f"{err.returncode}")
        if err.stderr:
            print(err.stderr.decode("utf-8", errors="replace"))
        sys.exit(1)
    if options.verbose:
        print("processing done.")

    lines = 0   # lemmas that were actually tested
    oovs = 0    # tested lemmas the speller rejected
    lemma = ""
    skipping = True  # ignore any output preceding the first "Input:" line
    # The log is kept on disk (delete=False) so it can be inspected after
    # the run; leaving the with-block closes it, which guarantees that the
    # contents are flushed before its name is reported or an editor opens it.
    with tempfile.NamedTemporaryFile(prefix="gtlemmaspell", suffix=".txt",
                                     delete=False, encoding="UTF-8",
                                     mode="w+") as logfile:
        for line in results.stdout.decode("utf-8").strip().split("\n"):
            if "Input:" in line:
                # The lemma is the token following "Input:"; it may be
                # missing when the runner was fed an empty line.
                fields = line.split()
                lemma = fields[1] if len(fields) > 1 else ""
                if lemma in {"", "#", "#;"}:
                    skipping = True
                elif skipforms and lemma in skipforms:
                    skipping = True
                else:
                    skipping = False
                    lines += 1
            if skipping:
                continue
            if "[INCORRECT]" in line:
                oovs += 1
                if options.verbose:
                    print(f"{lemma} is not accepted")
                print(f"{lemma}", file=logfile)
                print("\tfollowing suggestions:", file=logfile)
            elif "Input:" not in line:
                # Any other runner output is logged under the current lemma.
                print(f"\t{line}", file=logfile)
            if oovs >= options.oov_limit:
                print("too many fails, bailing to save time...")
                break

    if lines == 0:
        print(f"SKIP: could not find lemmas in {options.lexcfilenames}")
        sys.exit(77)
    coverage = (1.0 - (float(oovs) / float(lines))) * 100.0
    if options.verbose:
        print("Lemma statistics:")
        print(f"\t{len(lemmas)} lemmas")
        print(f"\t{coverage} % accepted")
    if coverage < options.threshold:
        print("FAIL: too many lemmas weren't generating!",
              f"{coverage} < {options.threshold}")
        print(f"see {logfile.name} for details ({oovs} ungenerated strings)")
        if options.editor:
            # Started without waiting; the script exits right after.
            Popen([options.editor, logfile.name])
        sys.exit(1)
    else:
        print(f"PASS: {len(lemmas)} lemmas {coverage} % accepted")
        if coverage < 100:
            print(f"see {logfile.name} for remaining lemmas")


if __name__ == "__main__":
    main()