diff --git a/.gitignore b/.gitignore index a75e7c05..adc7bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.cache .idea +*.egg-info/ *.pyc -.cache +dist/ diff --git a/README.md b/README.md index 0b41297f..36465c78 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,24 @@ Python framework for processing Universal Dependencies data [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.6 or higher. -- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, - make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)) - and install UDPipe with `pip3 install --user --upgrade ufal.udpipe`. +- You need Python 3.9 or higher. +- It is recommended to install Udapi in a Python virtual environment. +- If you need the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser (to be used from Udapi) + install it (with `pip install --upgrade ufal.udpipe`). ## Install Udapi for developers -Let's clone the git repo to `~/udapi-python/`, install dependencies -and setup `$PATH` and `$PYTHONPATH` accordingly. +Let's clone the git repo e.g. to `~/udapi-python/` and make an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) ```bash cd git clone https://github.com/udapi/udapi-python.git -pip3 install --user -r udapi-python/requirements.txt -echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc -echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc -echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc -source ~/.bashrc # or open new bash +cd udapi-python +pip install -e . ``` ## Install Udapi for users This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. ``` -pip3 install --user --upgrade udapi +pip install --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. 
If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. diff --git a/bin/udapy b/bin/udapy index 30cb2595..83c7a6f2 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,136 +1,7 @@ #!/usr/bin/env python3 -import os -import gc +"""Thin wrapper for backward compatibility. Calls udapi.cli.main().""" import sys -import atexit -import logging -import argparse +from udapi.cli import main -from udapi.core.run import Run - -# Parse command line arguments. -argparser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - usage="udapy [optional_arguments] scenario", - epilog="See http://udapi.github.io", - description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" - "Examples of usage:\n" - " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" - " udapy -T < sample.conllu | less -R\n" - " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") -argparser.add_argument( - "-q", "--quiet", action="store_true", - help="Warning, info and debug messages are suppressed. 
Only fatal errors are reported.") -argparser.add_argument( - "-v", "--verbose", action="store_true", - help="Warning, info and debug messages are printed to the STDERR.") -argparser.add_argument( - "-s", "--save", action="store_true", - help="Add write.Conllu to the end of the scenario") -argparser.add_argument( - "-T", "--save_text_mode_trees", action="store_true", - help="Add write.TextModeTrees color=1 to the end of the scenario") -argparser.add_argument( - "-H", "--save_html", action="store_true", - help="Add write.TextModeTreesHtml color=1 to the end of the scenario") -argparser.add_argument( - "-A", "--save_all_attributes", action="store_true", - help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") -argparser.add_argument( - "-C", "--save_comments", action="store_true", - help="Add print_comments=1 (to be used after -T and -H)") -argparser.add_argument( - "-M", "--marked_only", action="store_true", - help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") -argparser.add_argument( - "-N", "--no_color", action="store_true", - help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") -argparser.add_argument( - "-X", "--extra", action="append", - help="Add a specified parameter (or a block name) to the end of the scenario\n" - "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") -argparser.add_argument( - "--gc", action="store_true", - help="By default, udapy disables Python garbage collection and at-exit cleanup\n" - "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" - "when processing many files and running out of memory, you can disable this\n" - "optimization (i.e. enable garbage collection) with 'udapy --gc'.") -argparser.add_argument( - 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") - -args = argparser.parse_args() - -# Set the level of logs according to parameters. 
-if args.verbose: - level = logging.DEBUG -elif args.quiet: - level = logging.CRITICAL -else: - level = logging.INFO - -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', - level=level) - -# Global flag to track if an unhandled exception occurred -_unhandled_exception_occurred = False - -def _custom_excepthook(exc_type, exc_value, traceback): - global _unhandled_exception_occurred - _unhandled_exception_occurred = True - - # Call the default excepthook to allow normal error reporting - sys.__excepthook__(exc_type, exc_value, traceback) - -# Override the default excepthook -sys.excepthook = _custom_excepthook - - -# Process and provide the scenario. if __name__ == "__main__": - - # Disabling garbage collections makes the whole processing much faster. - # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup - # (atexit hooks are called in reversed order of their registration, - # so flushing stdio buffers etc. will be still done before the os._exit(0) call). - # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 - # Is it safe to disable GC? - # OS will free the memory allocated by this process after it ends anyway. - # The udapy wrapper is aimed for one-time tasks, not a long-running server, - # so in a typical case a document is loaded and almost no memory is freed before the end. - # Udapi documents have a many cyclic references, so running GC is quite slow. - if not args.gc: - gc.disable() - # When an exception/error has happened, udapy should exit with a non-zero exit code, - # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors). - # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` - # because the Python has already exited the exception-handling block - # (the exception/error has been already reported and sys.exc_info()[0] is None). 
- # We thus keep record whether _unhandled_exception_occurred. - atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) - atexit.register(sys.stderr.flush) - if args.save: - args.scenario = args.scenario + ['write.Conllu'] - if args.save_text_mode_trees: - args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] - if args.save_html: - args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] - if args.save_all_attributes: - args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] - if args.save_comments: - args.scenario = args.scenario + ['print_comments=1'] - if args.marked_only: - args.scenario = args.scenario + ['marked_only=1'] - if args.no_color: - args.scenario = args.scenario + ['color=0'] - if args.extra: - args.scenario += args.extra - - runner = Run(args) - # udapy is often piped to head etc., e.g. - # `seq 1000 | udapy -s read.Sentences | head` - # Let's prevent Python from reporting (with distracting stacktrace) - # "BrokenPipeError: [Errno 32] Broken pipe" - try: - runner.execute() - except BrokenPipeError: - pass + sys.exit(main()) diff --git a/pyproject.toml b/pyproject.toml index 374b58cb..18d5c717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,36 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel" -] +requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" + +[project] +name = "udapi" +version = "0.5.2" +description = "Python framework for processing Universal Dependencies data" +readme = "README.md" +requires-python = ">=3.9" +license = "GPL-3.0-or-later" +authors = [ + {name = "Martin Popel", email = "popel@ufal.mff.cuni.cz"} +] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "colorama", + "termcolor", +] + +[project.urls] +Homepage = "https://github.com/udapi/udapi-python" + +[project.optional-dependencies] +test = ["pytest"] +udpipe = ["ufal.udpipe"] + 
+[project.scripts] +udapy = "udapi.cli:main" + +[tool.setuptools] +packages = {find = {}} +include-package-data = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index fdbae292..00000000 --- a/setup.cfg +++ /dev/null @@ -1,29 +0,0 @@ -[metadata] -name = udapi -version = 0.5.1 -author = Martin Popel -author_email = popel@ufal.mff.cuni.cz -description = Python framework for processing Universal Dependencies data -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/udapi/udapi-python -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) - Operating System :: OS Independent - -[options] -packages = find: -python_requires = >=3.9 -include_package_data = True -scripts = - bin/udapy -install_requires = - colorama - termcolor - -[options.extras_require] -test = - pytest - - diff --git a/setup.py b/setup.py deleted file mode 100644 index 7f1a1763..00000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -from setuptools import setup - -if __name__ == "__main__": - setup() diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index 5368cabc..527159e9 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -1,14 +1,17 @@ from udapi.core.block import Block from collections import Counter +import re class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" def __init__(self, m_len_max=5, e_len_max=5, report_basics=False, report_mentions=True, report_entities=True, - report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', + report_details=True, report_words_per_doc=False, report_entity_range=False, + selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', exclude_singletons=False, exclude_nonsingletons=False, style='human', per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + highlight_docnames=None, **kwargs): 
super().__init__(**kwargs) self.m_len_max = m_len_max @@ -17,6 +20,8 @@ def __init__(self, m_len_max=5, e_len_max=5, self.report_mentions = report_mentions self.report_entities = report_entities self.report_details = report_details + self.report_words_per_doc = report_words_per_doc + self.report_entity_range = report_entity_range self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = exclude_nonsingletons self.style = style @@ -28,6 +33,7 @@ def __init__(self, m_len_max=5, e_len_max=5, raise ValueError(f'Unknown style {style}') self.docname = docname self.docname_len = docname_len + self.highlight_docnames = highlight_docnames self._header_printed = False self._lines_printed = None @@ -40,10 +46,17 @@ def __init__(self, m_len_max=5, e_len_max=5, self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() + self.entity_ranges = [] def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) self.counter['documents'] += 1 + node2docord, current_docord = {}, 0 + if self.report_entity_range: + for node in doc.nodes_and_empty: + node2docord[node] = current_docord + current_docord += 1 + for entity in doc.coref_entities: len_mentions = len(entity.mentions) if len_mentions == 1: @@ -52,6 +65,8 @@ def process_document(self, doc): continue elif len_mentions > 1 and self.exclude_nonsingletons: continue + if self.report_entity_range: + self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head]) self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 @@ -83,11 +98,17 @@ def process_document(self, doc): self.counter['m_nontreelet'] += 1 if heads > 1 else 0 if self.report_basics: + doc_words = 0 for tree in doc.trees: - self.counter['newdocs'] += 1 if tree.newdoc else 0 self.counter['sents'] += 1 self.counter['words'] += 
len(tree.descendants) self.counter['empty'] += len(tree.empty_nodes) + if tree.newdoc: + self.counter['newdocs'] += 1 + if doc_words > self.counter['max_words_per_doc']: + self.counter['max_words_per_doc'] = doc_words + doc_words = 0 + doc_words += len(tree.descendants) def after_process_document(self, doc): if self.per_doc: @@ -100,6 +121,7 @@ def after_process_document(self, doc): self.longest_mention = 0 self.longest_entity = 0 self.m_words = 0 + self.entity_ranges = [] def process_end(self, skip=True, doc=None): if not self._lines_printed: @@ -111,6 +133,10 @@ def process_end(self, skip=True, doc=None): return else: docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + if self.style.startswith('tex'): + if self.highlight_docnames and re.search(self.highlight_docnames, docname): + docname = r"\NEW " + docname + docname = docname.replace('_', r'\_') print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') elif self.style.startswith('tex-'): print(f"{self.counter['documents']:4} documents &") @@ -122,15 +148,22 @@ def process_end(self, skip=True, doc=None): columns =[ ] if self.report_basics: - columns += [('docs', f"{self.counter['newdocs']:7,}"), + columns += [('docs', f"{self.counter['newdocs']:6,}"), ('sents', f"{self.counter['sents']:7,}"), - ('words', f"{self.counter['words']:7,}"), + ('words', f"{self.counter['words']:9,}"), ('empty', f"{self.counter['empty']:7,}"),] + if self.report_words_per_doc: + columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"), + ('words/doc', f"{self.counter['words']/self.counter['newdocs']:7,.0f}"),] if self.report_entities: columns += [('entities', f"{self.entities:7,}"), ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), ('longest_entity', f"{self.longest_entity:6}"), ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] + if self.report_entity_range: + self.entity_ranges.sort() + percentile = 
self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0 + columns += [('entity_range_95percentile', f"{percentile:6,}"),] for i in range(1, self.e_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) @@ -155,7 +188,7 @@ def process_end(self, skip=True, doc=None): columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) if self.style.startswith('tex'): - print(" & ".join(c[1] for c in columns), end=" \\\\\n") + print(" &".join(c[1] for c in columns), end=" \\\\\n") elif self.style == 'human': for c in columns: print(f"{c[0]:>15} = {c[1].strip():>10}") @@ -172,23 +205,34 @@ def print_header(self): if self._lines_printed is None: print(r'\documentclass[multi=mypage]{standalone}') print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}') print(r'\title{Udapi coreference statistics}') print(r'\begin{document}') print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') - lines = [r'\begin{mypage}\begin{tabular}{@{}l ', + lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ', " " * self.docname_len, ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), " " * self.docname_len] if self.report_basics: lines[0] += "rrrr " - lines[1] += r'& \MC{4}{total number of} ' - lines[2] += r'& & & & ' - lines[3] += r'& docs & sents & words & empty n.' + lines[1] += r'& \MC{4}{text size} ' + lines[2] += r'& \MC{4}{total number of} ' + lines[3] += r'& docs & sents & words &empty n.' 
+ if self.report_words_per_doc: + lines[0] += "rr " + lines[1] += r'& & ' + lines[2] += r'&\MC{2}{words/doc}' + lines[3] += r'& max & avg ' if self.report_entities: lines[0] += "rrrr " - lines[1] += r'& \MC{4}{entities} ' - lines[2] += r'& total & per 1k & \MC{2}{length} ' - lines[3] += r'& count & words & max & avg. ' + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.report_entity_range: + lines[0] += "r " + lines[1] += r'& ' + lines[2] += r'& range ' + lines[3] += r'& p95 ' if self.e_len_max: for i in range(1, self.e_len_max + 1): lines[0] += "r" @@ -198,9 +242,9 @@ def print_header(self): lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' if self.report_mentions: lines[0] += "rrrr " - lines[1] += r'& \MC{4}{mentions} ' - lines[2] += r'& total & per 1k & \MC{2}{length} ' - lines[3] += r'& count & words & max & avg. ' + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' if self.m_len_max: for i in range(0, self.m_len_max + 1): lines[0] += "r" @@ -227,12 +271,17 @@ def print_header(self): lines[2] += r'\\' lines[3] += r'\\\midrule' if self.report_basics: + lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}' + lines[2] += r'\cmidrule(lr){2-5}' last_col += 4 - lines[1] += r'\cmidrule(lr){2-5}' + if self.report_words_per_doc: + lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}' + last_col += 2 if self.report_entities: - lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + _cols = 5 if self.report_entity_range else 5 + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}' lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' - last_col += 4 + last_col += _cols if self.e_len_max: last_col += self.e_len_max lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' @@ -251,6 
+300,6 @@ def print_header(self): def print_footer(self, end_doc=True): if not self.style.startswith('tex-'): return - print(r'\bottomrule\end{tabular}\end{mypage}') + print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}') if self.style == 'tex-doc' and end_doc: print(r'\end{document}') diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py index 90ea5d2d..cf5a8f81 100644 --- a/udapi/block/msf/phrase.py +++ b/udapi/block/msf/phrase.py @@ -10,52 +10,63 @@ class Phrase(Block): - def process_node(self, node): + def __init__(self, feature_prefix='CW', **kwargs): """ - Override this in a derived class! + Parameters: + feature_prefix (string) - The prefix of phrase features (e. g. 'CW', 'Phrase'), default is 'CG' """ - logging.fatal('process_node() not implemented.') + super().__init__(**kwargs) + self.feature_prefix = feature_prefix - dictionary = { - 'person': 'PhrasePerson', - 'number': 'PhraseNumber', - 'mood': 'PhraseMood', - 'tense': 'PhraseTense', - 'voice': 'PhraseVoice', - 'aspect':'PhraseAspect', - 'form': 'PhraseForm', - 'reflex': 'PhraseReflex', - 'polarity': 'PhrasePolarity', - 'gender':'PhraseGender', - 'animacy':'PhraseAnimacy', - 'ords':'Phrase', - 'expl':'PhraseExpl', - } - - # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation - # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. 
- negation_fixed = { - # Belarusian - 'ні' : ['раз'], - 'ня' : ['толькі'], + self.dictionary = { + 'person': f'{feature_prefix}Person', + 'number': f'{feature_prefix}Number', + 'mood': f'{feature_prefix}Mood', + 'tense': f'{feature_prefix}Tense', + 'voice': f'{feature_prefix}Voice', + 'aspect':f'{feature_prefix}Aspect', + 'form': f'{feature_prefix}Form', + 'reflex': f'{feature_prefix}Reflex', + 'polarity': f'{feature_prefix}Polarity', + 'gender': f'{feature_prefix}Gender', + 'animacy': f'{feature_prefix}Animacy', + 'ords': feature_prefix, + 'expl': f'{feature_prefix}Expl', + 'analytic': 'Analytic', + } + + # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation + # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. + self.negation_fixed = { + # Belarusian + 'ні' : ['раз'], + 'ня' : ['толькі'], - # Upper Sorbian - 'nic' : ['naposledku'], + # Upper Sorbian + 'nic' : ['naposledku'], - # Polish - 'nie' : ['mało'], + # Polish + 'nie' : ['mało'], - # Pomak - 'néma' : ['kak'], + # Pomak + 'néma' : ['kak'], - # Slovenian - 'ne' : ['le'], + # Slovenian + 'ne' : ['le'], + + # Russian and Old East Slavic + 'не' : ['то', 'токмо'], + 'ни' : ['в', 'раз', 'шатко'], + 'нет' : ['нет'] + } + + def process_node(self, node): + """ + Override this in a derived class! 
+ """ + logging.fatal('process_node() not implemented.') - # Russian and Old East Slavic - 'не' : ['то', 'токмо'], - 'ни' : ['в', 'раз', 'шатко'], - 'нет' : ['нет'] - } + def write_node_info(self, node, tense = None, @@ -70,7 +81,8 @@ def write_node_info(self, node, gender = None, animacy = None, aspect = None, - expl=None): + expl=None, + analytic=None): arguments = locals() del arguments['self'] # delete self and node from arguments, del arguments['node'] # we want only grammatical categories @@ -125,6 +137,15 @@ def get_is_reflex(self,node,refl): if len(refl) == 0: return node.feats['Reflex'] return 'Yes' + + def get_expl_type(self,node, refl): + if node.feats['Voice'] == 'Mid': + return 'Pv' + if not refl: + return '' + if refl[0].deprel == 'expl': + return 'Pv' + return refl[0].deprel.split(':')[1].capitalize() def is_expl_pass(self,refl): if len(refl) == 0: @@ -136,4 +157,12 @@ def get_voice(self,node,refl): if self.is_expl_pass(refl): return 'Pass' return voice + + def get_analytic_bool(self,node): + auxes = [x for x in node.children if x.udeprel == 'aux'] + + if auxes: + return 'Yes' + else: + return 'No' diff --git a/udapi/block/msf/romance/preprocessor.py b/udapi/block/msf/romance/preprocessor.py new file mode 100644 index 00000000..ad7aec1e --- /dev/null +++ b/udapi/block/msf/romance/preprocessor.py @@ -0,0 +1,20 @@ +from udapi.core.block import Block + +class Preprocessor(Block): + + + def process_node(self, node): + + # In Porttinari treebank, the negative adverb não is not marked with feat Polarity=Neg + if node.lemma == 'não' and node.upos == 'ADV': + node.feats['Polarity'] = 'Neg' + + if node.upos == 'ADV' and node.feats['PronType'] == 'Neg': + node.feats['PronType'] = '' + node.feats['Polarity'] = 'Neg' + + # In Romanian RRT treebank, there is no annotation of the voice feature + # Automatically assign passive voice + pass_auxes = [x for x in node.children if x.deprel == 'aux:pass'] + if pass_auxes: + node.feats['Voice'] = 'Pass' \ No newline at 
end of file diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py index 81041e55..ed05fa89 100644 --- a/udapi/block/msf/romance/romance.py +++ b/udapi/block/msf/romance/romance.py @@ -7,12 +7,14 @@ 'potere', 'dovere', 'volere', 'sapere'] # Italian class Aspect(str, Enum): + ANT = 'Ant' IMP = 'Imp' IMPPROG = 'ImpProg' PERF = 'Perf' PERFPROG = 'PerfProg' PROG = 'Prog' PQP = 'Pqp' + PQPPROG = 'PqpProg' class Tense(str, Enum): FUT = 'Fut' @@ -28,16 +30,20 @@ def __init__(self, neg=True, **kwargs): """ Parameters: neg (bool) - If True, process negation and generate the PhrasePolarity=Neg attribute. + feature_prefix (string) - The prefix of phrase features (e. g. 'CG', 'Phrase'), default is 'CG' """ super().__init__(**kwargs) self.neg = neg def process_node(self, node): + if node.misc[self.feature_prefix] != '': + return + cop = [x for x in node.children if x.udeprel == 'cop'] # only expl or expl:pv, no expl:impers or expl:pass - refl = [x for x in node.children if x.lemma == 'se' and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass'] + refl = [x for x in node.children if (x.lemma == 'se' or x.lemma == 'soi') and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass'] if refl: expl='Pv' @@ -97,6 +103,7 @@ def process_node(self, node): gender=node.feats['Gender'], voice=node.feats['Voice'], expl=expl, + analytic=self.get_analytic_bool(node), ords=[node.ord] ) return @@ -115,6 +122,7 @@ def process_node(self, node): phrase_ords = [node.ord] + [r.ord for r in refl] phrase_ords.sort() + self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node) self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node) @@ -145,6 +153,7 @@ def process_node(self, node): phrase_ords.sort() # TODO phrase-level features are currently determined based on the first passive auxiliary, but it can happen that there are more than one passive 
auxiliary + self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node) self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node) # head verb has passive auxiliary and also other auxiliaries @@ -222,6 +231,7 @@ def process_modal_verbs(self, modals, modal_auxes, modal_neg): polarity='Neg' else: phrase_ords = [modals[0].ord] + self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0]) self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0]) else: @@ -236,6 +246,94 @@ def process_modal_verbs(self, modals, modal_auxes, modal_neg): self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0]) + def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node): + aspect = '' + tense = node.feats['Tense'] + + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + xcomps = [x for x in node.children if x.udeprel == 'xcomp'] + if node.lemma in ['ir', 'aller', 'estar', 'ter'] and node.upos == 'VERB' and xcomps: + node.misc['PeriAux'] = 'Yes' + + voice = node.feats['Voice'] + auxes = [x for x in xcomps[0].children if x.udeprel == 'aux'] + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # European Portuguese: estar + a + Inf + if node.lemma == 'estar': + + if node.feats['Tense'] == 'Pres': + tense=Tense.PRES.value + aspect =Aspect.PROG.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + elif node.lemma == 'ter' and len(xcomps) > 1: + tense=Tense.PAST.value + aspect=Aspect.PROG.value + xcomps[0].misc['PeriAux'] = 'Yes' + + elif node.feats['Tense'] == 'Pres': + 
tense=Tense.FUT.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + if auxes_without_pass: + if auxes[0].lemma == 'estar': + aspect += 'Prog' + if auxes[0].lemma == 'haber': + aspect += 'Perf' + + + + adp_a = [x for x in xcomps[-1].children if x.lemma == 'a' and x.udeprel == 'mark'] + cop = [x for x in xcomps[0].children if x.udeprel == 'cop'] + phrase_ords = [node.ord] + [x.ord for x in xcomps] + [x.ord for x in auxes] + [x.ord for x in cop] + if adp_a: + phrase_ords += [x.ord for x in adp_a] + + if aux_pass: + voice='Pass' + + phrase_ords.sort() + + self.write_node_info(xcomps[-1], + tense = tense, + number = node.feats['Number'], + person = node.feats['Person'], + aspect = aspect, + mood = node.feats['Mood'], + form = 'Fin', + voice=voice, + expl = expl, + polarity = polarity, + analytic='Yes', + ords=phrase_ords) + return def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node): """ @@ -249,6 +347,9 @@ def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. 
""" + if node.misc['PeriAux'] != '': + return + # Portuguese # presente -> PhraseTense=Pres, PhraseAspect='' # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' @@ -263,7 +364,8 @@ def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node aspect = '' tense = node.feats['Tense'] - + form = node.feats['VerbForm'] + if node.feats['Mood'] == 'Ind': # Portuguese @@ -286,15 +388,30 @@ def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node # Italian # pass remoto -> PhraseTense=Past, PhraseAspect=Perf - if node.feats['Tense'] == 'Past': + elif node.feats['Tense'] == 'Past': aspect=Aspect.PERF.value # Portuguese # pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp - if node.feats['Tense'] == 'Pqp': + elif node.feats['Tense'] == 'Pqp': tense=Tense.PAST.value aspect=Aspect.PQP.value - + + else: + # viitorul popular/colocvial (intentional future) -> PhraseTense=Fut, PhraseAspect='' + o = [x for x in node.children if x.lemma == 'o' and x.upos == 'PART'] + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + + if o and sa: + tense = Tense.FUT.value + phrase_ords.append(o[0].ord) + phrase_ords.append(sa[0].ord) + + phrase_ords.sort() + + + # Portuguese # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' @@ -334,22 +451,28 @@ def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node aspect='' tense=Tense.PRES.value + adp_en = [x for x in head_node.children if x.upos == 'ADP' and x.lemma == 'en' and x.udeprel == 'mark'] + if node.feats['VerbForm'] == 'Part' and adp_en: + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form = 'Ger' + self.write_node_info(head_node, person=node.feats['Person'], aspect=aspect, number=node.feats['Number'], mood=node.feats['Mood'], - form=node.feats['VerbForm'], + form=form, tense=tense, gender=head_node.feats['Gender'], voice=head_node.feats['Voice'], 
expl=expl, polarity=polarity, + analytic=self.get_analytic_bool(head_node), ords=phrase_ords ) - def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_ords, head_node): """ Annotate periphrastic verb forms with the Phrase* attributes. @@ -363,7 +486,11 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. """ - if len(auxes) == 1: + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + if len(auxes) == 1: # Cnd if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'): @@ -395,6 +522,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or expl=expl, polarity=polarity, voice=head_node.feats['Voice'], + analytic='Yes', ords=phrase_ords) return @@ -418,6 +546,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords) return @@ -442,23 +571,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, - ords=phrase_ords) - return - - # Portuguese - # pretérito mais que perfeito composto (aux haver) -> PhraseTense=Past, PhraseAspect=Perf - if auxes[0].lemma == 'haver' and auxes[0].feats['Tense'] == 'Imp' and node.feats['VerbForm'] == 'Part': - - self.write_node_info(head_node, - tense=Tense.PAST.value, - aspect=Aspect.PERF.value, - number=auxes[0].feats['Number'], - person=auxes[0].feats['Person'], - mood=auxes[0].feats['Mood'], - form='Fin', - voice=head_node.feats['Voice'], - expl=expl, - polarity=polarity, + analytic='Yes', ords=phrase_ords) return @@ -497,6 +610,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or aspect=aspect, expl=expl, 
polarity=polarity, + analytic='Yes', ords=phrase_ords) return @@ -514,24 +628,46 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or # Futuro anteriore -> PhraseTense=Fut, PhraseAspect=Perf aspect=Aspect.PERF.value tense=auxes[0].feats['Tense'] + form='Fin' + mood=auxes[0].feats['Mood'] + + adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark'] + if auxes[0].feats['VerbForm'] == 'Part' and adp_en: + tense=Tense.PAST.value + aspect='' + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form='Ger' + + + # Romanian + # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].lemma == 'avea': + tense = Tense.PAST.value + aspect = Aspect.PERF.value + form = 'Fin' # Spanish # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf # Italian # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf - if auxes[0].feats['Tense'] == 'Pres': + elif auxes[0].feats['Tense'] == 'Pres': # Portuguese # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub - if auxes[0].lemma == 'ter' or auxes[0].feats['Mood'] == 'Sub': + if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub': tense = Tense.PASTPRES.value + + # subjonctive mood not annotated in Romanian data + if auxes[0].lemma == 'fi': + mood='Sub' else: tense=Tense.PAST.value # Portuguese - # pretérito mais que perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub # Spanish @@ -539,26 +675,38 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or # Italian # Trapassato prossimo -> PhraseTense=Past, 
PhraseAspect=Pqp - elif auxes[0].feats['Tense'] in ['Imp', 'Past']: # TODO prej neni v Past, jenom Imp + elif auxes[0].feats['Tense'] == 'Imp': tense=Tense.PAST.value aspect=Aspect.PQP.value + # Spanish + # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant + + # Italian + # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant + + # French + # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect = Aspect.ANT.value + self.write_node_info(head_node, tense=tense, number=auxes[0].feats['Number'], person=auxes[0].feats['Person'], - mood=auxes[0].feats['Mood'], + mood=mood, aspect=aspect, - form='Fin', + form=form, voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords) return - # auxiliary 'ir' followed by infinitive - # TODO solve these verb forms for Spanish (VERB 'ir' + ADP 'a' + infinitive) - if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Inf': + # auxiliary 'ir' or 'vrea' followed by infinitive + if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf': tense=node.feats['Tense'] aspect='' @@ -581,7 +729,12 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf elif auxes[0].feats['Tense'] == 'Past': tense=Tense.PASTFUT.value - aspect=Aspect.PERF.value + aspect=Aspect.PERF.value + + # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].lemma == 'vrea': + tense = Tense.FUT.value + aspect = '' self.write_node_info(head_node, tense=tense, @@ -593,15 +746,97 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords) + return + + # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect='' + if auxes[0].lemma == 'avea' and node.feats['VerbForm'] 
== 'Inf': + tense=Tense.PRES.value + aspect='' + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect='' + # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect='' + if node.feats['VerbForm'] == 'Fin': + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + if sa: + phrase_ords.append(sa[0].ord) + phrase_ords.sort() + + tense=Tense.FUT.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=head_node.feats['Number'], + person=head_node.feats['Person'], + mood=head_node.feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return elif len(auxes) == 2: + # Romanian + # viitor anterior -> PhraseTense=Fut, PhraseAsoect=Perf + if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PERF.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ perfect -> PhraseTense=Past + if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + aspect='', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + # Portuguese # auxiliry 
'ir' followed by auxiliary 'estar' in infinitive and a gerund - - # TODO Spanish - # VERB 'ir' + ADP 'a' + AUX 'estar'.Inf + gerund if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog @@ -633,6 +868,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords) # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle @@ -658,10 +894,9 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords) - - - + # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': @@ -682,7 +917,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub elif auxes[0].feats['Tense'] in ['Imp', 'Past']: tense=Tense.PAST.value - aspect=Aspect.IMPPROG.value + aspect=Aspect.PQPPROG.value # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': @@ -698,6 +933,7 @@ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_or voice=head_node.feats['Voice'], expl=expl, polarity=polarity, + analytic='Yes', ords=phrase_ords, ) return @@ -718,11 +954,12 @@ def process_copulas(self, node, cop, expl, polarity, phrase_ords): """ # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb) + self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node) 
self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node) # adjust PhraseAspect based on the lemma of the copula if cop[0].feats['Tense'] in ['Pres', 'Fut']: if cop[0].lemma == 'ser': - node.misc['PhraseAspect'] = Aspect.PERF.value + node.misc['PeriAspect'] = Aspect.PERF.value elif cop[0].lemma == 'estar': - node.misc['PhraseAspect'] = Aspect.IMP.value + node.misc['PeriAspect'] = Aspect.IMP.value \ No newline at end of file diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py index 89eafd6c..9d15418f 100644 --- a/udapi/block/msf/slavic/conditional.py +++ b/udapi/block/msf/slavic/conditional.py @@ -30,12 +30,12 @@ def process_node(self, node): phrase_ords = [x.ord for x in phrase_nodes] phrase_ords.sort() - - auxVerb = aux_cnd[0] person='3' # TODO there is a problem in russian etc. (same as in past tense) - if auxVerb.feats['Person'] != '': - person=auxVerb.feats['Person'] + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] self.write_node_info(node, @@ -44,12 +44,13 @@ def process_node(self, node): mood='Cnd', form='Fin', aspect=node.feats['Aspect'], - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), polarity=self.get_polarity(phrase_nodes), voice=self.get_voice(node, refl), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) return @@ -68,18 +69,29 @@ def process_node(self, node): phrase_nodes += neg copVerb = cop[0] + + person = '3' + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + for cop_verb in cop: + if cop_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + phrase_ords = [x.ord for x in phrase_nodes] phrase_ords.sort() self.write_node_info(node, aspect=copVerb.feats['Aspect'], - person=copVerb.feats['Person'], + person=person, number=copVerb.feats['Number'], mood='Cnd', 
form='Fin', voice=self.get_voice(copVerb, refl), polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), ords=phrase_ords, gender=copVerb.feats['Gender'], - animacy=copVerb.feats['Animacy'] + animacy=copVerb.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py index 6b725d56..32714630 100644 --- a/udapi/block/msf/slavic/converb.py +++ b/udapi/block/msf/slavic/converb.py @@ -27,11 +27,12 @@ def process_node(self, node): tense=node.feats['Tense'], aspect=node.feats['Aspect'], polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), ords=phrase_ords, gender=node.feats['Gender'], animacy=node.feats['Animacy'], - voice=self.get_voice(node, refl) + voice=self.get_voice(node, refl), + analytic=self.get_analytic_bool(node) ) # passive voice @@ -57,7 +58,8 @@ def process_node(self, node): ords=phrase_ords, gender=auxVerb.feats['Gender'], animacy=auxVerb.feats['Animacy'], - voice='Pass' + voice='Pass', + analytic=self.get_analytic_bool(node) ) # copulas @@ -87,5 +89,6 @@ def process_node(self, node): form='Conv', polarity=self.get_polarity(phrase_nodes), ords=phrase_ords, - voice=self.get_voice(copVerb, refl) + voice=self.get_voice(copVerb, refl), + analytic=self.get_analytic_bool(node) ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py index 02452c36..9cc17717 100644 --- a/udapi/block/msf/slavic/future.py +++ b/udapi/block/msf/slavic/future.py @@ -34,9 +34,10 @@ def process_node(self, node): aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), gender=node.feats['Gender'], animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), 
ords=phrase_ords ) else: @@ -54,9 +55,10 @@ def process_node(self, node): aspect=node.feats['Aspect'], form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), gender=node.feats['Gender'], animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), ords=phrase_ords ) @@ -85,7 +87,8 @@ def process_node(self, node): aspect=node.feats['Aspect'], form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -93,7 +96,7 @@ def process_node(self, node): # future tense of perfect verbs # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data # in some languages ​​(e.g. in Russian) these verbs have the Tense Fut, in others (e.g. in Czech) they have the Tense Pres - """if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] phrase_nodes = [node] + refl @@ -112,10 +115,11 @@ def process_node(self, node): form='Fin', aspect='Perf', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) - return""" + return # future tense of imperfect verbs and passive voice @@ -145,10 +149,11 @@ def process_node(self, node): aspect=node.feats['Aspect'], form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + 
animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) return @@ -166,7 +171,8 @@ def process_node(self, node): aspect=node.feats['Aspect'], form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -195,6 +201,7 @@ def process_node(self, node): form='Fin', voice=self.get_voice(copVerb, refl), polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py index d4fedd50..5a30d05e 100644 --- a/udapi/block/msf/slavic/imperative.py +++ b/udapi/block/msf/slavic/imperative.py @@ -28,7 +28,8 @@ def process_node(self, node): form='Fin', voice='Act', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -54,7 +55,8 @@ def process_node(self, node): polarity=self.get_polarity(phrase_nodes), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) return @@ -80,7 +82,8 @@ def process_node(self, node): mood='Imp', form='Fin', voice=self.get_voice(copVerb, refl), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py index f39a2646..83bc0766 100644 --- a/udapi/block/msf/slavic/infinitive.py +++ b/udapi/block/msf/slavic/infinitive.py @@ -27,7 +27,8 @@ def process_node(self,node): voice=self.get_voice(node,refl), form='Inf', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + 
analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -50,11 +51,12 @@ def process_node(self,node): voice='Pass', form='Inf', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), ords=phrase_ords, gender=node.feats['Gender'], animacy=node.feats['Animacy'], - number=node.feats['Number'] + number=node.feats['Number'], + analytic=self.get_analytic_bool(node) ) return @@ -78,7 +80,8 @@ def process_node(self,node): voice=self.get_voice(cop[0], refl), form='Inf', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) @@ -98,6 +101,7 @@ def process_node(self,node): voice='Act', form='Sup', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py index 423bff45..130d972d 100644 --- a/udapi/block/msf/slavic/past.py +++ b/udapi/block/msf/slavic/past.py @@ -46,15 +46,16 @@ def process_node(self, node): aspect=node.feats['Aspect'], form=node.feats['VerbForm'], polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) # compound past tense - if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': - aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in 
['Pres', '']] aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] @@ -93,10 +94,11 @@ def process_node(self, node): aspect=node.feats['Aspect'], form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) @@ -127,10 +129,11 @@ def process_node(self, node): aspect=node.feats['Aspect'], form=node.feats['VerbForm'], polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) @@ -166,7 +169,8 @@ def process_node(self, node): polarity=self.get_polarity(phrase_nodes), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) else: @@ -199,9 +203,10 @@ def process_node(self, node): mood='Ind', voice=self.get_voice(cop[0], refl), form='Fin', - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), polarity=self.get_polarity(phrase_nodes), ords=phrase_ords, gender=cop[0].feats['Gender'], - animacy=cop[0].feats['Animacy'] + animacy=cop[0].feats['Animacy'], + analytic=self.get_analytic_bool(node) ) diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py index 804a081f..0672812b 100644 --- a/udapi/block/msf/slavic/preprocessor.py +++ b/udapi/block/msf/slavic/preprocessor.py @@ -17,8 +17,8 @@ def process_node(self,node): # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ # we change the UPOS to ADJ when a 
participle expresses case - if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': - node.upos = 'ADJ' + #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + # node.upos = 'ADJ' # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the last Slavic languages ​​feats['Mood'] == 'Cnd' if node.deprel == 'aux:cnd': @@ -54,8 +54,8 @@ def process_node(self,node): node.feats['Number'] = subj[0].feats['Number'] # participles in passive are sometimes annotated as VERB, sometimes as ADJ - if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': - node.upos = 'ADJ' + #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + # node.upos = 'ADJ' # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. Macedonian treebank) # we add the Reflex=Yes feature diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py index 9a743a9e..7521a08d 100644 --- a/udapi/block/msf/slavic/present.py +++ b/udapi/block/msf/slavic/present.py @@ -11,7 +11,7 @@ class Present(udapi.block.msf.phrase.Phrase): def process_node(self,node): # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified - if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin': #and node.feats['Aspect']=='Imp': + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf': aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) @@ -34,7 +34,8 @@ def process_node(self,node): 
voice=self.get_voice(node,refl), form='Fin', polarity=self.get_polarity(phrase_nodes), - reflex=self.get_is_reflex(node,refl), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -65,7 +66,8 @@ def process_node(self,node): polarity=self.get_polarity(phrase_nodes), ords=phrase_ords, gender=node.feats['Gender'], - animacy=node.feats['Animacy'] + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) ) return @@ -91,8 +93,9 @@ def process_node(self,node): number=node.feats['Number'], form='Part', voice=self.get_voice(node, refl), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) return @@ -122,7 +125,8 @@ def process_node(self,node): mood='Ind', form='Fin', voice=self.get_voice(copVerb, refl), - reflex=self.get_is_reflex(node, refl), + expl=self.get_expl_type(node, refl), polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), ords=phrase_ords ) diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py new file mode 100644 index 00000000..4f5fc877 --- /dev/null +++ b/udapi/block/read/addbratann.py @@ -0,0 +1,230 @@ +"""Add Brat coreference annotation from *.ann files. + +So far, tested on French LitBank data only. 
+ +T12 HIST 362 366 qui +T13 HIST 349 362 une aventure +R1431 Coreference Arg1:T12 Arg2:T13 + +""" + +from udapi.core.block import Block +from udapi.core.files import Files +import logging +from bisect import bisect_left +import networkx as nx + +def _m(range_s, range_e, offset): + return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" + +class AddBratAnn(Block): + + def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True, + coref_attr="R", no_type_value='_Unsorted_', + **kwargs): + """Args: + files: file names with the coreference annotations (*.ann) + offset: what number to substract from the chatacter indices in the ann files + detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset + """ + super().__init__(**kwargs) + self.zone = zone + self.files = Files(filenames=files) + self.offset = offset + self.detect_bom = detect_bom + self.keep_mention_id = keep_mention_id + self.coref_attr = coref_attr + self.no_type_value = no_type_value + + def process_document(self, document): + + # Read all the important info from the *.ann file. + mentions, attrs, split_ante, clusters = {}, [], [], [] + ann_filehandle = self.files.next_filehandle() + offset = self.offset + if self.detect_bom: + txt_filename = self.files.filename.replace("ann", "txt") + with open(txt_filename, 'rb') as txt_fh: + raw_bytes = txt_fh.read(3) + if raw_bytes == b'\xef\xbb\xbf': + offset += 1 + + for line in ann_filehandle: + line = line.rstrip('\n') + if not "\t" in line: + logging.warning(f"Unexpected line without tabs: {line}") + elif line.startswith("T"): + # T13 HIST 349 362 une aventure + try: + mention_id, type_and_range, form = line.split("\t") + # Usually range are two numbers, but can be more, e.g. 
type_and_range="Abstract 605 653;654 703" + # Let's take the first and last number only.´ + parts = type_and_range.split() + ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1]) + + # If form ends with spaces, remove them and adjust range_e + stripped_form = form.rstrip(" ") + if form != stripped_form: + num_spaces = len(form) - len(stripped_form) + logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})") + form = stripped_form + range_e = range_e - num_spaces + + + mentions[mention_id] = [ne_type, range_s, range_e, form] + if self.keep_mention_id: + attrs.append(["mention_id", mention_id, mention_id]) + except Exception as e: + logging.warning(f"Unexpected mention line: {line}\n{e}") + elif line.startswith(self.coref_attr): + try: + cor_attr, mention_ids = line.rstrip().split("\t") + parts = mention_ids.split() + assert(parts[0] == "Coreference") + except Exception as e: + logging.warning(f"Unexpected coref line: '{line}'\n{e}") + clusters.append([p.split(":")[1] for p in parts[1:]]) + elif line.startswith("#"): + pass # Let's ignore annotators' comments + else: + logging.warning(f"Unexpected line in {self.files.filename}:\n{line}") + + # Some Brat ann files use link-based representation, e.g. + # R123 Coreference Arg1:T11 Arg2:T13 + # R124 Coreference Arg1:T12 Arg2:T14 + # R125 Coreference Arg1:T13 Arg2:T14 + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). + # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]] + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]] + # Note that if creating entities for link, in their original order, + # R123 and R125 would result in creating two entities and when hitting R125 + # we would need to merge them, i.e. delete one of them and move their mentions to the other. 
+ # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components. + coref_graph = nx.Graph() + for mention_ids in clusters: + coref_graph.add_node(mention_ids[0]) + for mention_id in mention_ids[1:]: + coref_graph.add_node(mention_id) + coref_graph.add_edge(mention_id, mention_ids[0]) + clusters = [list(component) for component in nx.connected_components(coref_graph)] + + # Create entity objects for non-singletons. + entity_map = {} + for mention_ids in clusters: + etype, etype_index = None, 0 + for index, m_id in enumerate(mention_ids): + if mentions[m_id][0] == self.no_type_value: + pass + elif etype is None: + etype, etype_index = mentions[m_id][0], index + elif etype != mentions[m_id][0]: + logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.") + if etype is None: + etype = "other" + entity = document.create_coref_entity(etype=etype) + for m_id in mention_ids: + if m_id in entity_map: + logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}") + else: + entity_map[m_id] = entity + + # Collect TokenRange (as pre-filled by UDPipe) for each token. + tokens, starts, ends = [], [], [] + for tree in document.trees: + for token in tree.token_descendants: + tokens.append(token) + range_s, range_e = token.misc["TokenRange"].split(":") + starts.append(int(range_s)) + ends.append(int(range_e)) + + # Create mention objects. + mention_map = {} + for mention_id, mention_values in mentions.items(): + + # Find Udapi tokens for each mention. 
+ ne_type, range_s, range_e, form = mention_values + index_s = bisect_left(starts, range_s - offset) + if starts[index_s] != range_s - offset and index_s > 0: + index_s -= 1 + index_e = bisect_left(ends, range_e - offset) + mtokens = tokens[index_s : index_e+1] + token_s, token_e = tokens[index_s], tokens[index_e] + + # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries. + # If the start token is a multi-word token (MWT), + # we can still try to find the proper word within the MWT. + ok_s, ok_e = True, True + if starts[index_s] != range_s - offset: + ok_s = False + if token_s.is_mwt(): + mtokens.pop(0) + first_form = form.split()[0] + new_start = ends[index_s] + for w in reversed(token_s.words): + mtokens = [w] + mtokens + new_start -= len(w.form) + if w.form == first_form or new_start < range_s - offset: + ok_s = True + break + + # similarly for the end token + if ends[index_e] != range_e - offset: + ok_e = False + if token_e.is_mwt(): + mtokens.pop() + last_form = form.split()[-1] + new_end = starts[index_e] + for w in token_e.words: + mtokens.append(w) + new_end += len(w.form) + if w.form == last_form or new_end > range_e - offset: + ok_e = True + break + + if not ok_s or not ok_e: + logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})" + f" crosses token boundaries: {token_s.misc} ({token_s.form}) " + f".. {token_e.misc} ({token_e.form})") + + # Project tokens (including MWTs) to words and check forms match. + words, udapi_form = [], "" + for token in mtokens: + words += token.words + udapi_form += token.form + if not token.no_space_after: + udapi_form += " " + udapi_form = udapi_form.rstrip() + if form != udapi_form: + logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'") + + # Make sure all words of the mention are in the same sentence. 
+ root = words[0].root + mwords = [words[0]] + for word in words[1:]: + if word.root is root: + mwords.append(word) + else: + logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.") + + # Create entities for singletons + if mention_id not in entity_map: + entity_map[mention_id] = document.create_coref_entity(etype=ne_type) + + # Create the Udapi mention object + mention = entity_map[mention_id].create_mention(words=mwords) + mention_map[mention_id] = mention + + # Fill-in the additional mention attributes. + for attr_name, mention_id, attr_value in attrs: + if mention_id in mention_map: + mention_map[mention_id].other[attr_name] = attr_value + + # Fill-in split antecedents + for arg1, arg2 in split_ante: + if arg1 in entity_map and arg2 in entity_map: + if entity_map[arg1] in entity_map[arg2].split_ante: + logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})") + else: + entity_map[arg2].split_ante.append(entity_map[arg1]) + else: + logging.warning(f"{arg1} or {arg2} not indexed in entity_map") diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py index 040174be..4d0b7771 100644 --- a/udapi/block/read/addtext.py +++ b/udapi/block/read/addtext.py @@ -32,7 +32,7 @@ def process_document(self, document): self.finished = True return text = ''.join(self.filehandle.readlines()) - i, end, was_newpar = 0, len(text), True + i, end, was_newpar = 0, len(text)-1, True while i <= end and text[i].isspace(): i += 1 diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py index f4b73dc8..2adbd00f 100644 --- a/udapi/block/read/conll2012.py +++ b/udapi/block/read/conll2012.py @@ -18,7 +18,7 @@ class Conll2012(udapi.block.read.conllu.Conllu): """A reader of the Conll2012 files.""" - def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs): + def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', 
**kwargs): """Create the Conll2012 reader object. Args: @@ -29,10 +29,15 @@ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwar word-order number/index (usualy called ID). For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'. For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'. + For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'. + emptyval: a symbol that represents an empty value, especially in the coref column + (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2) + For FantasyCoref, use emptyval='-'. """ super().__init__(**kwargs) self.node_attributes = attributes.split(',') self._docname = 'd' + self.emptyval = emptyval def parse_comment_line(self, line, root): if line.startswith("#end document"): @@ -40,7 +45,7 @@ def parse_comment_line(self, line, root): match = RE_BEGIN.match(line) if match: docname = match.group(1) - # LitBank uses e.g. + # LitBank and FantasyCoref use e.g. # #begin document (1023_bleak_house_brat); part 0 if docname.startswith('(') and docname.endswith(');'): docname = docname[1:-2] @@ -51,6 +56,9 @@ def parse_comment_line(self, line, root): # Corref-PT-SemEval uses e.g. # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml docname = docname.replace('.txt', '').replace('.xml', '') + # FantasyCoref may use parentheses within the document ID e.g. 
+ # #begin document (051_Fundevogel_(Bird-foundling)); part 000 + docname = docname.replace('(', '').replace(')', '') root.newdoc = docname self._global_entity = 'eid-etype-head-other' @@ -72,6 +80,8 @@ def parse_node_line(self, line, root, nodes): for (n_attribute, attribute_name) in enumerate(self.node_attributes): value = fields[n_attribute] if attribute_name == 'docname': + # FantasyCoref may use parentheses within the document ID + value = value.replace('(', '').replace(')', '') if value != self._docname: logging.warning(f"Document name mismatch {value} != {self._docname}") @@ -83,7 +93,7 @@ def parse_node_line(self, line, root, nodes): logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") elif attribute_name == 'coref': - if value and value != '_': + if value and value != self.emptyval: # LitBank always separates chunks by a vertical bar, e.g. (13)|10) # Summ-it++v2 does not, e.g. (13)10) if '|' in value: diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 9151a216..e19cd676 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -27,7 +27,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs strict: raise an exception if errors found (default=False, i.e. a robust mode) empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. 
- fix_cycles: fix cycles by attaching a node in the cycle to the root + fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range """ super().__init__(**kwargs) self.strict = strict @@ -193,12 +193,15 @@ def read_tree_from_lines(self, lines): try: parent = nodes[parents[node_ord]] except IndexError: - raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if self.fix_cycles: + logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}") + parent = root + else: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) if node is parent: if self.fix_cycles: - logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") elif node._children: diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py index 0213bdcb..161b6b6e 100644 --- a/udapi/block/read/text.py +++ b/udapi/block/read/text.py @@ -16,9 +16,17 @@ class Text(BaseReader): so that `udpipe.Base` keeps these characters in `SpacesAfter`. As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. + empty_line: how empty lines are handled. Default 'new_sentence' preserves + the current behaviour (empty lines mark sentence boundaries). Use + 'keep' to read the entire file content into a single sentence (tree), including + empty lines. Use 'newpar' to behave like 'new_sentence' but also set + `root.newpar = True` on each sentence. 
""" - def __init__(self, rstrip='\r\n ', **kwargs): + def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs): + if empty_line not in {'new_sentence', 'keep', 'newpar'}: + raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'") self.rstrip = rstrip + self.empty_line = empty_line super().__init__(**kwargs) @staticmethod @@ -32,6 +40,13 @@ def is_multizone_reader(): def read_tree(self, document=None): if self.filehandle is None: return None + if self.empty_line == 'keep': + content = self.filehandle.read() + if content == '': + return None + root = Root() + root.text = content + return root lines = [] line = None while True: @@ -54,4 +69,6 @@ def read_tree(self, document=None): root = Root() root.text = " ".join(lines) + if self.empty_line == 'newpar': + root.newpar = True return root diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 37c6d94e..a690c95b 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -8,7 +8,9 @@ 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, @@ -17,7 +19,9 @@ 'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, 
'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, @@ -54,6 +58,31 @@ 'main': 1, 'shape': 'subtree', } +# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!) +# A few additional instances were found in older texts, too (e.g. 16th century). +# We must do it separately, as the preposition is vocalized. +MWTS['seč'] = { + 'form': 'se' + ' co', + 'lemma': 's' + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RV--4---------- PQ--4----------', + 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', +} + +# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below). +MWTS['toliť'] = { + 'form': 'to li ť', + 'lemma': 'ten li ť', + 'upos': 'DET SCONJ PART', + 'xpos': '* J,------------- TT-------------', + 'feats': '* _ _', + 'deprel': '* mark discourse', + 'main': 0, + 'shape': 'siblings' +} @@ -130,6 +159,22 @@ def multiword_analysis(self, node): 'main': 0, 'shape': 'subtree', } + # dajžto = dajž + to + if subtokens[1] == 'to': + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ten', + 'upos': '* DET', + 'xpos': '* PDNS4----------', + 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } # Contractions of prepositions and pronouns almost could be processed # regardless of AddMwt instructions by the annotator, but we still # require it to be on the safe side. For example, both 'přědeň' and @@ -138,7 +183,7 @@ def multiword_analysis(self, node): # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim # by default, unless the original token was annotated as Animacy=Inan # or Gender=Neut. - m = re.match(r"^(na|nade|o|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) if m: node.misc['AddMwt'] = '' # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py new file mode 100644 index 00000000..7fcb0e12 --- /dev/null +++ b/udapi/block/ud/cs/fixmorpho.py @@ -0,0 +1,471 @@ +""" +A Czech-specific block to fix lemmas, UPOS and morphological features in UD. +It should increase consistency across the Czech treebanks. It focuses on +individual closed-class verbs (such as the auxiliary "být") or on entire classes +of words (e.g. whether or not nouns should have the Polarity feature). It was +created as part of the Hičkok project (while importing nineteenth-century Czech +data) but it should be applicable on any other Czech treebank. +""" +from udapi.core.block import Block +import logging +import re + +class FixMorpho(Block): + + def process_node(self, node): + # Do not touch words marked as Foreign or Typo. They may not behave the + # way we expect in Czech data. 
+ if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes': + return + #---------------------------------------------------------------------- + # NOUNS, PROPER NOUNS, AND ADJECTIVES + #---------------------------------------------------------------------- + # Nouns do not have polarity but the Prague-style tagsets may mark it. + if node.upos in ['NOUN', 'PROPN']: + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + elif node.feats['Polarity'] == 'Neg': + logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).') + # For some nouns, there is disagreement in whether to tag and lemmatize + # them as proper nouns. We must be careful and not add too many to this + # rule, as many of them could be used as surnames and then they should + # be PROPN. + if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()): + node.lemma = node.lemma.lower() + node.upos = 'NOUN' + # Lemmatization. + if node.upos == 'NOUN' and node.lemma == 'zem': + node.lemma = 'země' + if node.upos == 'ADJ': + # Adjectives should be lemmatized to lowercase even if they are part of + # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized + # to "malý". Exception: Possessive adjectives derived from personal + # names, e.g., "Karlův". + if node.feats['Poss'] != 'Yes': + node.lemma = node.lemma.lower() + # Short forms of adjectives are rare in Modern Czech and uninflected + # (they are used as predicates), so they lack the Case feature. But + # they were inflected for Case in the past, so it is better to add + # Case=Nom for consistency. 
+ if node.feats['Variant'] == 'Short' and node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + #---------------------------------------------------------------------- + # PRONOUNS AND DETERMINERS + #---------------------------------------------------------------------- + # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form. + if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()): + node.feats['Variant'] = 'Short' + # Forms of "my" should be lemmatized as "já". + if node.upos == 'PRON' and node.lemma == 'my': + node.lemma = 'já' + # Forms of "vy" should be lemmatized as "ty". + if node.upos == 'PRON' and node.lemma == 'vy': + node.lemma = 'ty' + # Forms of "oni" should be lemmatized as "on" and cases that allow + # a preposition should have PrepCase. + if node.upos == 'PRON' and node.lemma in ['on', 'oni']: + node.lemma = 'on' + if node.feats['Case'] not in ['Nom', 'Voc']: + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + elif re.match(r'[nň]', node.form.lower()): + node.feats['PrepCase'] = 'Pre' + # In 19th century data, the grammaticalized usages of "se", "si" are + # tagged as PART (rather than a reflexive PRON, which is the standard). + # Even if it already was tagged PRON, some features may have to be added. + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: + node.lemma = 'se' + node.upos = 'PRON' + node.feats['PronType'] = 'Prs' + node.feats['Reflex'] = 'Yes' + if node.form.lower() == 'se': + # Occasionally "se" can be genitive: "z prudkého do se dorážení". + if not node.feats['Case'] == 'Gen': + node.feats['Case'] = 'Acc' + else: + node.feats['Case'] = 'Dat' + node.feats['Variant'] = 'Short' + # As the genitive/accusative form of "on", "jeho" should have PrepCase. 
+ if node.upos == 'PRON' and node.form.lower() == 'jeho': + node.feats['PrepCase'] = 'Npr' + # Possessive pronouns have Person, Gender[psor] and Number[psor]. + # Although it is questionable, plural possessors are lemmatized to singular + # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj". + # Some source corpora lack Person and [psor] features, others do not respect + # the lemmatization rule, so in the end we have to look at the forms; but + # there are potentially many variants, especially in old texts. + if node.upos == 'DET' and node.feats['Poss'] == 'Yes': + if node.form.lower().startswith('m'): + # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('t'): + # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('n'): + # náš našeho našemu našem naším naše naší naši našich našim našimi + node.lemma = 'můj' + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower().startswith('v'): + # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi + node.lemma = 'tvůj' + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower() == 'jeho': + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'jehož', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + 
node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jejíž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jich|jejich', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichž|jejichž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichžto|jejichžto', node.form.lower()): + node.lemma = 'jehožto' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif node.lemma == 'čí': + node.feats['Poss'] = 'Yes' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Reflexive possessive pronoun should not forget the Reflex=Yes feature. + if node.upos == 'DET' and node.lemma == 'svůj': + node.feats['Reflex'] = 'Yes' + # Demonstrative, interrogative, relative, negative, total and indefinite + # pronouns (or determiners, because some of them get the DET tag). + if node.upos in ['PRON', 'DET']: + # Relative pronoun "jenž" should be PRON, not DET + # (it inflects for Gender but it can never be used as congruent attribute). + if re.fullmatch(r'(jenž|jenžto)', node.lemma): + node.upos = 'PRON' + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + else: + node.feats['PrepCase'] = 'Pre' + # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above) + # (it inflects for Gender but it can never be used as congruent attribute). + elif node.form.lower() in ['ješto', 'ježto']: + node.lemma = 'jenžto' + node.upos = 'PRON' + node.feats['PrepCase'] = 'Npr' + # Relative pronoun "an" is PRON (not DET). + elif node.lemma == 'an': + node.upos = 'PRON' + node.feats['PronType'] = 'Rel' + # Pronoun "kdo" is PRON (not DET). 
+ elif node.lemma == 'kdo': + node.lemma = 'kdo' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "kdož" is PRON (not DET). + elif node.lemma == 'kdož': + node.lemma = 'kdož' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "někdo", "kdosi" is PRON (not DET). + elif re.fullmatch(r'(kdosi|někdo)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "nikdo" is PRON (not DET). + elif node.lemma == 'nikdo': + node.lemma = 'nikdo' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "co" is PRON (not DET). + elif node.lemma == 'co': + node.lemma = 'co' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "což" is PRON (not DET). 
+ elif node.lemma in ['což', 'cože']: + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "něco" is PRON (not DET). + elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "nic" is PRON (not DET). + elif node.lemma == 'nic': + node.lemma = 'nic' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "týž" is DET and PronType=Dem. + elif re.fullmatch(r'(tentýž|týž)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + # Pronoun "každý" is DET and PronType=Tot. + elif node.lemma == 'každý': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot. + elif node.form.lower() == 'vše': + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif node.lemma == 'všechen': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()): + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp. 
+ elif node.lemma in ['sám', 'samý']: + node.lemma = 'samý' + node.upos = 'DET' + node.feats['PronType'] = 'Emp' + node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else '' + #---------------------------------------------------------------------- + # PRONOMINAL NUMERALS AND ADVERBS + #---------------------------------------------------------------------- + # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot. + if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba': + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['NumForm'] = 'Word' + node.feats['PronType'] = 'Tot' + # Pronominal cardinal numerals should be DET, not NUM. + if node.upos == 'NUM': + if re.fullmatch(r'(mnoho|málo|několik)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Ind' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho? + elif re.fullmatch(r'(toliko?)', node.lemma): + node.lemma = 'tolik' + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kolik)', node.lemma): + node.upos = 'DET' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + if node.upos in ['ADV', 'NUM']: + if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Ind' + elif re.fullmatch(r'(tolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Dem' + elif re.fullmatch(r'(kolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Pronominal adverbs have PronType but most of them do not have Degree + # and Polarity. 
+ if node.upos == 'ADV': + if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma): + node.feats['PronType'] = 'Dem' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma): + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kdežto)', node.lemma): + node.feats['PronType'] = 'Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma): + node.feats['PronType'] = 'Ind' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma): + node.feats['PronType'] = 'Neg' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + # Total pronominals can be negated ("nevždy"). Then they get Degree, too. + elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma): + node.feats['PronType'] = 'Tot' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # VERBS AND AUXILIARIES + #---------------------------------------------------------------------- + # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless + # of the fact that it can participate in purely existential constructions + # where it no longer acts as a copula. Czech tagsets typically do not + # distinguish AUX from VERB, which means that converted data may have to + # be fixed. 
+ if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']: + node.upos = 'AUX' + if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()): + node.upos = 'ADV' + node.lemma = 'lze' # not 'nelze' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + node.feats['Aspect'] = '' + node.feats['Mood'] = '' + node.feats['Tense'] = '' + node.feats['Person'] = '' + node.feats['Number'] = '' + node.feats['Degree'] = 'Pos' + if node.upos in ['VERB', 'AUX']: + # Most non-passive verb forms have Voice=Act, and infinitives should + # have it, too. Passive infinitives are always periphrastic. + # (This is not done in the PDT tagset, but we should add it.) + if node.feats['VerbForm'] == 'Inf': + node.feats['Voice'] = 'Act' + # Same for imperatives. + elif node.feats['Mood'] == 'Imp': + node.feats['Voice'] = 'Act' + # Some verbs lack the Aspect feature although they are not biaspectual. + if node.feats['Aspect'] == '': + if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma): + node.feats['Aspect'] = 'Perf' + # We must look at word form to distinguish imperfective "stát" from perfective "stát se". + elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Perf' + # Present forms of perfective verbs normally have Tense=Pres despite + # meaning future. 
However, a few imperfective verbs have a separate + # future form (distinct from present form), which gets Tense=Fut + # despite inflecting similarly to present forms. + if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()): + node.feats['Tense'] = 'Fut' + # Passive participles (including the short forms) should be ADJ, not VERB. + # But they keep the verbal features of VerbForm, Voice, Aspect. + if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + # But now we need an adjectival lemma. + ###!!! Bohužel to občas zahodí normalizaci, kterou tam Martinův tým zavedl ručně, např. "rozhřita" mělo lemma "rozehřát", ale já teď místo "rozehřátý" vyrobím "rozhřitý". + ###!!! odepříno - odepříný místo odepřený + ###!!! dovolíno - dovolíný místo dovolený + ###!!! vyslyšána - vyslyšaný místo vyslyšený + ###!!! obmezený místo omezený, oslyšaný místo oslyšený + node.misc['LDeriv'] = node.lemma + node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower()) + node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán + if node.feats['Polarity'] == 'Neg': + node.lemma = re.sub(r'^ne', '', node.lemma) + if node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + node.feats['Variant'] = 'Short' + #---------------------------------------------------------------------- + # ADVERBS + #---------------------------------------------------------------------- + # Words that indicate the speaker's attitude are tagged ADV in UD, + # although the Czech tagsets often treat them as particles. 
+ if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma): + node.upos = 'ADV' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + node.misc['CzechParticle'] = 'Yes' + # Adverb "brzo" should be lemmatized as "brzy". + if node.upos == 'ADV' and node.form.lower() == 'brzo': + node.lemma = 'brzy' + if node.upos == 'ADV' and node.form.lower() == 'teprv': + node.lemma = 'teprve' + # All non-pronominal adverbs (and also some pronominal ones) should + # have Degree and Polarity. At least for now we also exclude adverbial + # numerals, e.g. "jednou" – "nejednou". + if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '': + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + if node.feats['Polarity'] == '': + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # PREPOSITIONS + #---------------------------------------------------------------------- + # Preposition "u" may combine with Case=Loc|Acc in old texts, and then + # it functions as a vocalized counterpart of "v". Nevertheless, we always + # lemmatize it as "u" and thus AdpType is Prep, not Voc. + if node.upos == 'ADP' and node.form.lower() == 'u': + node.lemma = 'u' + node.feats['AdpType'] = 'Prep' + #---------------------------------------------------------------------- + # CONJUNCTIONS + #---------------------------------------------------------------------- + # As a conjunction (and not particle/adverb), "ani" is coordinating and + # not subordinating. 
+ if node.upos == 'SCONJ' and node.lemma == 'ani': + node.upos = 'CCONJ' + if node.upos == 'CCONJ' and node.lemma == 'nebť': + node.lemma = 'neboť' + #---------------------------------------------------------------------- + # PARTICLES (other than those already grabbed above) + #---------------------------------------------------------------------- + # "jako" should be SCONJ but 19th century data have it as PART. + if node.upos == 'PART': + if node.lemma == 'jako': + node.upos = 'SCONJ' + elif node.lemma == 'ti': + node.lemma = 'ť' diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index d353a127..da9f5bda 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -37,6 +37,7 @@ def process_node(self, node): 'Gender': ['Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], 'Foreign': ['Yes'], 'Abbr': ['Yes'] }) @@ -47,6 +48,7 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], 'Foreign': ['Yes'], 'Abbr': ['Yes']}) else: @@ -54,6 +56,7 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], 'Foreign': ['Yes'], 'Abbr': ['Yes']}) # PROPER NOUNS ######################################################### @@ -66,7 +69,7 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], 'Foreign': ['Yes'], 'Abbr': ['Yes']}) else: @@ -74,7 +77,7 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 
'Ins'], - 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], 'Foreign': ['Yes'], 'Abbr': ['Yes']}) # ADJECTIVES ########################################################### @@ -91,7 +94,8 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { @@ -102,29 +106,34 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman 
numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives self.check_required_features(node, ['VerbForm', 'Voice']) if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree @@ -143,7 +152,8 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) @@ -158,7 +168,8 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: if node.feats['Gender'] == 'Masc': # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). @@ -175,7 +186,8 @@ def process_node(self, node): 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) @@ -190,7 +202,8 @@ def process_node(self, node): 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: # regular adjectives, including short forms if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) @@ -203,7 +216,8 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -214,7 +228,8 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], 'Emph': ['Yes'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -281,7 +296,7 @@ def process_node(self, node): 'Animacy': ['Anim'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) - elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|nic|nicož)$', node.lemma): + elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma): # Although these pronouns behave by default as neuter singular, # no Gender and Number is annotated. However, quite unusually, # there is Animacy=Inan without Gender. @@ -385,7 +400,9 @@ def process_node(self, node): }) # Relative possessive determiners 'jehož' and 'jejichž' behave similarly # to the personal possessive determiners but they do not have Person. 
- elif re.match(r'^(jeho|jejich|j[ií]ch)ž(e|to)?$', node.form.lower()): + # Normally determiners do not change j->n after prepositions but we + # have an example in Old Czech (štěpové zlatí, na nichžto větviech...) + elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) self.check_allowed_features(node, { 'PronType': ['Rel'], @@ -489,7 +506,7 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) - elif re.match(r'^(můj|tvůj|svůj)$', node.lemma): + elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma): if node.feats['Reflex'] == 'Yes': self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { 'PronType': ['Prs'], @@ -519,6 +536,11 @@ def process_node(self, node): elif node.lemma == 'žádný': # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc. + self.check_adjective_like(node, ['PronType', 'NumType'], { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Ord', 'Mult'] + }) elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. if node.lemma == 'nejeden': self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) @@ -549,13 +571,14 @@ def process_node(self, node): }) else: if node.feats['NumType'] == 'Sets': - # 'jedny', 'dvoje', 'troje', 'čtvery' + # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery' # Number should perhaps be only Plur because the counted noun will be Plur. # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny', # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy). 
self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Sets'], + 'PronType': ['Tot'], # for 'oboje' 'NumForm': ['Word'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], @@ -599,6 +622,16 @@ def process_node(self, node): 'Number': ['Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + elif re.match(r'^(dvé|obé)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'obé' + 'NumForm': ['Word'], + 'Gender': ['Neut'], + 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) elif node.lemma == 'půl': self.check_required_features(node, ['NumType', 'NumForm']) self.check_allowed_features(node, { @@ -708,7 +741,8 @@ def process_node(self, node): 'Person': ['1', '2', '3'], 'Number': ['Sing', 'Dual', 'Plur'], 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Emph': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB if node.feats['Gender'] == 'Masc': @@ -842,12 +876,14 @@ def process_node(self, node): # SUBORDINATING CONJUNCTIONS ########################################### elif node.upos == 'SCONJ': self.check_allowed_features(node, { - 'Emph': ['Yes'] + 'Emph': ['Yes'], + 'Abbr': ['Yes'] }) # COORDINATING CONJUNCTIONS ############################################ elif node.upos == 'CCONJ': self.check_allowed_features(node, { - 'Emph': ['Yes'] + 'Emph': ['Yes'], + 'Abbr': ['Yes'] }) # PARTICLES ############################################################ elif node.upos == 'PART': diff --git a/udapi/block/ud/fixadvmodbyupos.py 
b/udapi/block/ud/fixadvmodbyupos.py index d6e51e1f..a2e4439c 100644 --- a/udapi/block/ud/fixadvmodbyupos.py +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -44,7 +44,9 @@ def process_node(self, node): if node.upos != 'AUX': node.deprel = 'dep' elif node.udeprel == 'case': - if node.upos == 'DET': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'DET': node.deprel = 'det' elif node.upos == 'PRON': node.deprel = 'nmod' @@ -64,11 +66,15 @@ def process_node(self, node): node.deprel = 'det' elif node.upos == 'INTJ': node.deprel = 'discourse' + elif node.upos == 'NOUN': + node.deprel = 'dep' elif node.udeprel == 'det': if node.upos == 'NOUN': node.deprel = 'nmod' elif node.upos == 'ADJ': node.deprel = 'amod' + elif node.upos == 'NUM': + node.deprel = 'nummod' elif node.upos == 'ADV': node.deprel = 'advmod' elif node.upos == 'AUX': @@ -77,13 +83,21 @@ def process_node(self, node): node.deprel = 'dep' elif node.upos == 'SCONJ': node.deprel = 'mark' + elif node.upos == 'CCONJ': + node.deprel = 'cc' elif node.upos == 'X': node.deprel = 'dep' elif node.udeprel == 'nummod': - if node.upos == 'PRON': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'PRON': node.deprel = 'nmod' elif node.upos == 'DET': node.deprel = 'det' + elif node.upos == 'ADP': + node.deprel = 'case' elif node.udeprel == 'punct': if node.upos != 'PUNCT': node.deprel = 'dep' + elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 'dislocated', 'expl', 'nmod']: + node.deprel = 'nmod' diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py index df7d5a44..485b85f0 100644 --- a/udapi/block/ud/fixmultiobjects.py +++ b/udapi/block/ud/fixmultiobjects.py @@ -11,8 +11,37 @@ class FixMultiObjects(Block): def process_node(self, node): objects = [x for x in node.children if x.udeprel == 'obj'] - # For the moment, we take the dummiest approach possible: The first object 
survives and all others are forced to a different deprel. if len(objects) > 1: - objects = objects[1:] - for o in objects: - o.deprel = 'iobj' + subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']] + # Some heuristics that could work in AnCora: + # If all objects are after the verb, keep the one that is closest to the verb. + if objects[0].ord > node.ord: + objects = objects[1:] + for o in objects: + o.deprel = 'obl:arg' + o.deps[0]['deprel'] = 'obl:arg' + elif objects[-1].ord < node.ord: + objects = objects[:-1] + for o in objects: + o.deprel = 'dislocated' + o.deps[0]['deprel'] = 'dislocated' + # ho experimenta tot + elif objects[-1].lemma in ['tot', 'todo']: + objects[-1].parent = objects[0] + objects[-1].deprel = 'nmod' + objects[-1].deps[0]['parent'] = objects[0] + objects[-1].deps[0]['deprel'] = 'nmod' + # X se llama Y + elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']: + objects[-1].deprel = 'xcomp' + objects[-1].deps[0]['deprel'] = 'xcomp' + elif len(subjects) == 0: + objects[0].deprel = 'nsubj' + objects[0].deps[0]['deprel'] = 'nsubj' + else: + objects[0].deprel = 'dislocated' + objects[0].deps[0]['deprel'] = 'dislocated' + # For the moment, we take the dummiest approach possible: The first object survives and all others are forced to a different deprel. 
+ #objects = objects[1:] + #for o in objects: + # o.deprel = 'iobj' diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 069fc9fb..9d053cb7 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -1,9 +1,15 @@ """Block udpipe.Base for tagging and parsing using UDPipe.""" from udapi.core.block import Block -from udapi.tool.udpipe import UDPipe from udapi.tool.udpipeonline import UDPipeOnline from udapi.core.bundle import Bundle +# Import UDPipe only if available (requires ufal.udpipe) +try: + from udapi.tool.udpipe import UDPipe + UDPIPE_AVAILABLE = True +except ImportError: + UDPIPE_AVAILABLE = False + KNOWN_MODELS = { 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', @@ -143,6 +149,8 @@ def tool(self): if self.online: self._tool = UDPipeOnline(model=self.model) else: + if not UDPIPE_AVAILABLE: + raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1") self._tool = UDPipe(model=self.model) return self._tool diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 885f797f..a8a7ab3d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -23,7 +23,7 @@ class TextModeTrees(BaseWriter): - """An ASCII pretty printer of dependency trees. + r"""An ASCII pretty printer of dependency trees. .. code-block:: bash @@ -110,7 +110,7 @@ class TextModeTrees(BaseWriter): │ ╰─┶ boxer NOUN acl:relcl ╰─╼ . PUNCT punct - Some non-projective trees cannot be printed witout crossing edges. + Some non-projective trees cannot be printed without crossing edges. TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: ─┮ @@ -123,17 +123,17 @@ class TextModeTrees(BaseWriter): (not file or pipe), each node attribute is printed in different color. 
If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted - (by reveresing the background and foreground colors). + (by reversing the background and foreground colors). This block's method `process_tree` can be called on any node (not only root), which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. For use in LaTeX, you can insert the output of this block (without colors) - into \begin{verbatim}...\end{verbatim}, but you need to compile with pdflatex (xelatex not supported) - and you must add the following code into the preambule:: + into ``\begin{verbatim}...\end{verbatim}``, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preamble:: - \\usepackage{pmboxdraw} + \usepackage{pmboxdraw} \DeclareUnicodeCharacter{256D}{\textSFi} %╭ \DeclareUnicodeCharacter{2570}{\textSFii} %╰ @@ -144,41 +144,44 @@ class TextModeTrees(BaseWriter): def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, - mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, + print_mwt=False, mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, layout='classic', **kwargs): """Create new TextModeTrees block object. Args: - print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? - print_sentence: Print plain-text detokenized sentence on a line above each tree? - add_empty_line: Print an empty line after each tree? - indent: Number of characters to indent node depth in the tree for better readability. - minimize_cross: Minimize crossings of edges in non-projective trees? 
- Trees without crossings are subjectively more readable, but usually - in practice also "deeper", that is with higher maximal line length. - color: Print the node attribute with ANSI terminal colors? - Default = 'auto' which means that color output only if the output filehandle - is interactive (console). Each attribute is assigned a color (the mapping is - tested on black background terminals and can be changed only in source code). - If you plan to pipe the output (e.g. to "less -R") and you want the colors, - you need to set explicitly color=1, see the example in Synopsis. - attributes: A comma-separated list of node attributes which should be printed. Possible - values are ord, form, lemma, upos, xpos, feats, deprel, deps, misc. - print_undef_as: What should be printed instead of undefined attribute values (if any)? - print_doc_meta: Print `document.meta` metadata before each document? - print_comments: Print comments (other than sent_id and text)? - print_empty: Print empty nodes? - mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. - If `print_comments and re.search(r'^ %s = ' % mark, root.comment, re.M)` - the comment is highlighted. - Empty string means no highlighting. Default = 'ToDo|ToDoOrigText|Bug|Mark'. - marked_only: print only trees containing one or more marked nodes/comments. Default=False. - hints: use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes - or follows its parent. Default=True. If False, plain ├ is used in both cases. - layout: 'classic' (default) shows word attributes immediately next to each node, - 'compact' never print edges after (right to) words even in non-projectivities, - 'align-words' as 'compact' but all first attributes (forms by default) are aligned, - 'align' as 'align-words' but all attributes are aligned in columns. + print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? 
+ print_text: Print plain-text detokenized sentence on a line above each tree? + add_empty_line: Print an empty line after each tree? + indent: Number of characters to indent node depth in the tree for better readability. + minimize_cross: Minimize crossings of edges in non-projective trees? + Trees without crossings are subjectively more readable, but usually + in practice also "deeper", that is with higher maximal line length. + color: Print the node attribute with ANSI terminal colors? + Default = 'auto' which means that color output only if the output filehandle + is interactive (console). Each attribute is assigned a color (the mapping is + tested on black background terminals and can be changed only in source code). + If you plan to pipe the output (e.g. to "less -R") and you want the colors, + you need to set explicitly color=1, see the example in Synopsis. + attributes: A comma-separated list of node attributes which should be printed. Possible + values are ``ord``, ``form``, ``lemma``, ``upos``, ``xpos``, ``feats``, ``deprel``, ``deps``, ``misc``. + print_undef_as: What should be printed instead of undefined attribute values (if any)? + print_doc_meta: Print ``document.meta`` metadata before each document? + print_comments: Print comments (other than ``sent_id`` and ``text``)? + print_empty: Print empty nodes? Default=True + print_mwt: Print multi-word tokens? Default=False + mark: A regex pattern. If ``re.search(mark + '=', str(node.misc))`` matches, the node is highlighted. + If ``print_comments`` and ``re.search(r'^ %s = ' % mark, root.comment, re.M)`` matches, + the comment is highlighted. Empty string means no highlighting. + Default = ``'(ToDo|ToDoOrigText|Bug|Mark)'``. + marked_only: Print only trees containing one or more marked nodes/comments. Default ``False``. + hints: Use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes + or follows its parent. Default ``True``. If ``False``, plain ├ is used in both cases. 
+ layout: Tree layout style: + + - ``'classic'`` (default): shows word attributes immediately next to each node + - ``'compact'``: never print edges after (right to) words even in non-projectivities + - ``'align-words'``: like ``'compact'`` but all first attributes (forms by default) are aligned + - ``'align'``: like ``'align-words'`` but all attributes are aligned in columns """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -191,6 +194,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_doc_meta = print_doc_meta self.print_comments = print_comments self.print_empty = print_empty + self.print_mwt = print_mwt self.mark = mark self.marked_only = marked_only self.layout = layout @@ -248,18 +252,18 @@ def should_print_tree(self, root, allnodes): def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: - if root.is_root(): + if root.is_root() and not self.print_mwt: allnodes = [root] + root.descendants_and_empty else: - allnodes = root.descendants(add_self=1) + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] allnodes.extend(empty) allnodes.sort() else: - allnodes = root.descendants(add_self=1) + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) if not force_print and not self.should_print_tree(root, allnodes): return - self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} + self._index_of = {allnodes[i].ord_range if allnodes[i].is_mwt() else allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) self.lengths = [0] * len(allnodes) @@ -286,7 +290,7 @@ def process_tree(self, root, force_print=False): if self.layout == 'classic': self.add_node(idx, node) else: - if idx_node.parent is not node: + if idx_node.is_mwt() or idx_node.parent is not node: self._add(idx, 
self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: precedes_parent = idx < self._index_of[node.ord] @@ -304,7 +308,7 @@ def process_tree(self, root, force_print=False): if self.layout == 'classic': for idx, node in enumerate(allnodes): - if node.is_empty(): + if node.is_empty() or node.is_mwt(): self.add_node(idx, node) else: columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] @@ -363,7 +367,7 @@ def _add(self, idx, text): def add_node(self, idx, node): """Render a node with its attributes.""" - if not node.is_root(): + if node.is_mwt() or not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) marked = self.is_marked(node) diff --git a/udapi/cli.py b/udapi/cli.py new file mode 100755 index 00000000..de55f8cb --- /dev/null +++ b/udapi/cli.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os +import gc +import sys +import atexit +import logging +import argparse + +from udapi.core.run import Run + +# Parse command line arguments. +argparser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + usage="udapy [optional_arguments] scenario", + epilog="See http://udapi.github.io", + description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" + "Examples of usage:\n" + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" + " udapy -T < sample.conllu | less -R\n" + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") +argparser.add_argument( + "-q", "--quiet", action="store_true", + help="Warning, info and debug messages are suppressed. 
Only fatal errors are reported.") +argparser.add_argument( + "-v", "--verbose", action="store_true", + help="Warning, info and debug messages are printed to the STDERR.") +argparser.add_argument( + "-s", "--save", action="store_true", + help="Add write.Conllu to the end of the scenario") +argparser.add_argument( + "-T", "--save_text_mode_trees", action="store_true", + help="Add write.TextModeTrees color=1 to the end of the scenario") +argparser.add_argument( + "-H", "--save_html", action="store_true", + help="Add write.TextModeTreesHtml color=1 to the end of the scenario") +argparser.add_argument( + "-A", "--save_all_attributes", action="store_true", + help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") +argparser.add_argument( + "-C", "--save_comments", action="store_true", + help="Add print_comments=1 (to be used after -T and -H)") +argparser.add_argument( + "-M", "--marked_only", action="store_true", + help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") +argparser.add_argument( + "-N", "--no_color", action="store_true", + help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") +argparser.add_argument( + "-X", "--extra", action="append", + help="Add a specified parameter (or a block name) to the end of the scenario\n" + "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") +argparser.add_argument( + "--gc", action="store_true", + help="By default, udapy disables Python garbage collection and at-exit cleanup\n" + "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" + "when processing many files and running out of memory, you can disable this\n" + "optimization (i.e. enable garbage collection) with 'udapy --gc'.") +argparser.add_argument( + 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") + + +# Process and provide the scenario. 
+def main(argv=None): + args = argparser.parse_args(argv) + + # Set the level of logs according to parameters. + if args.verbose: + level = logging.DEBUG + elif args.quiet: + level = logging.CRITICAL + else: + level = logging.INFO + + logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', + level=level) + + # Flag (local to main, shared with the hooks below via closure) tracking whether an unhandled exception occurred + _unhandled_exception_occurred = False + + def _custom_excepthook(exc_type, exc_value, traceback): + nonlocal _unhandled_exception_occurred + _unhandled_exception_occurred = True + + # Call the default excepthook to allow normal error reporting + sys.__excepthook__(exc_type, exc_value, traceback) + + # Override the default excepthook + sys.excepthook = _custom_excepthook + + # Disabling garbage collections makes the whole processing much faster. + # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup + # (atexit hooks are called in reversed order of their registration, + # so flushing stdio buffers etc. will be still done before the os._exit(0) call). + # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 + # Is it safe to disable GC? + # OS will free the memory allocated by this process after it ends anyway. + # The udapy wrapper is aimed for one-time tasks, not a long-running server, + # so in a typical case a document is loaded and almost no memory is freed before the end. + # Udapi documents have many cyclic references, so running GC is quite slow. + if not args.gc: + gc.disable() + # When an exception/error has happened, udapy should exit with a non-zero exit code, + # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors).
+ # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` + # because the Python has already exited the exception-handling block + # (the exception/error has been already reported and sys.exc_info()[0] is None). + # We thus keep record whether _unhandled_exception_occurred. + atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) + atexit.register(sys.stderr.flush) + if args.save: + args.scenario = args.scenario + ['write.Conllu'] + if args.save_text_mode_trees: + args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] + if args.save_html: + args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] + if args.save_all_attributes: + args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] + if args.save_comments: + args.scenario = args.scenario + ['print_comments=1'] + if args.marked_only: + args.scenario = args.scenario + ['marked_only=1'] + if args.no_color: + args.scenario = args.scenario + ['color=0'] + if args.extra: + args.scenario += args.extra + + runner = Run(args) + # udapy is often piped to head etc., e.g. + # `seq 1000 | udapy -s read.Sentences | head` + # Let's prevent Python from reporting (with distracting stacktrace) + # "BrokenPipeError: [Errno 32] Broken pipe" + try: + runner.execute() + except BrokenPipeError: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 087b30a3..00ba935c 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -77,9 +77,79 @@ def is_mwt(): @property def no_space_after(self): - """Boolean property as a shortcut for `node.misc["SpaceAfter"] == "No"`.""" + """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`.""" return self.misc["SpaceAfter"] == "No" + @staticmethod + def is_empty(): + """Is this an Empty node? + + Returns always False because multi-word tokens cannot be empty nodes. 
+ """ + return False + + @staticmethod + def is_leaf(): + """Is this a node/mwt without any children? + + Returns always True because multi-word tokens cannot have children. + """ + return True + + def _get_attr(self, name): # pylint: disable=too-many-return-statements + if name == 'form': + return self.form + if name == 'ord': + return self.ord_range + if name in ('edge', 'children', 'siblings', 'depth'): + return 0 + if name == 'feats_split': + return str(self.feats).split('|') + if name == 'misc_split': + return str(self.misc).split('|') + if name.startswith('feats['): + return self.feats[name[6:-1]] + if name.startswith('misc['): + return self.misc[name[5:-1]] + return '' + + def get_attrs(self, attrs, undefs=None, stringify=True): + """Return multiple attributes or pseudo-attributes, possibly substituting empty ones. + + MWTs do not have children nor parents nor prev/next nodes, + so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing). + Other pseudo-attributes (e.g. dir) return always the string "". + The only relevant pseudo-attributes are + feats_split and misc_split: a list of name=value formatted strings. + The `ord` attribute returns actually `mwt.ord_range`. + + Args: + attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``. + undefs: A value to be used instead of None for empty (undefined) values. 
+ stringify: Apply `str()` on each value (except for None) + """ + values = [] + for name in attrs: + nodes = [self] + if name[1:2] == '_': + nodes, name = [], name[2:] + for node in (n for n in nodes if n is not None): + if name in {'feats_split', 'misc_split'}: + values.extend(node._get_attr(name)) + else: + values.append(node._get_attr(name)) + + if undefs is not None: + values = [x if x is not None else undefs for x in values] + if stringify: + values = [str(x) if x is not None else None for x in values] + return values + + @property + def _ord(self): + self.words.sort() + return self.words[0]._ord + # TODO: node.remove() should check if the node is not part of any MWT # TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported # TODO: Make mwt._words private and provide a setter diff --git a/udapi/core/node.py b/udapi/core/node.py index b5b0ea06..c6a7a26a 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -1151,6 +1151,7 @@ class ListOfNodes(list): nodes = node.children nodes = node.children() nodes = node.children(add_self=True, following_only=True) + nodes = node.descendants(add_self=True, add_mwt=True) """ __slots__ = ('origin',) @@ -1164,16 +1165,28 @@ def __init__(self, iterable, origin): super().__init__(iterable) self.origin = origin - def __call__(self, add_self=False, following_only=False, preceding_only=False): + def __call__(self, add_self=False, following_only=False, preceding_only=False, add_mwt=False): """Returns a subset of nodes contained in this list as specified by the args.""" if add_self: self.append(self.origin) self.sort() + result = self if preceding_only: - return [x for x in self if x._ord <= self.origin._ord] + result = [x for x in result if x._ord <= self.origin._ord] if following_only: - return [x for x in self if x._ord >= self.origin._ord] - return self + result = [x for x in result if x._ord >= self.origin._ord] + if add_mwt: + new = [] + last_mwt_id = -1 + for node in result: + mwt
= node.multiword_token + if mwt: + if node.ord > last_mwt_id: + last_mwt_id = mwt.words[-1].ord + new.append(mwt) + new.append(node) + result = new + return result def find_minimal_common_treelet(*args): diff --git a/udapi/tool/udpipeonline.py b/udapi/tool/udpipeonline.py index f0a835c9..ced96d56 100644 --- a/udapi/tool/udpipeonline.py +++ b/udapi/tool/udpipeonline.py @@ -8,6 +8,7 @@ import os import sys import urllib.error +import urllib.parse import urllib.request from udapi.block.read.conllu import Conllu as ConlluReader @@ -62,6 +63,35 @@ def perform_request(self, params, method="process"): return response["result"] + def perform_request_urlencoded(self, params, method="process"): + """Perform a request using application/x-www-form-urlencoded to preserve LF newlines. + + This avoids CRLF normalization done by the email MIME serializer, ensuring that + the content of the 'data' field retains Unix LF ("\n") exactly as provided. + """ + request_data = urllib.parse.urlencode(params).encode("utf-8") + request_headers = {"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"} + + try: + with urllib.request.urlopen(urllib.request.Request( + url=f"{self.server}/{method}", headers=request_headers, data=request_data + )) as request: + response = json.loads(request.read()) + except urllib.error.HTTPError as e: + print("An exception was raised during UDPipe '{}' REST request.\n" + "The service returned the following error:\n" + " {}".format(method, e.fp.read().decode("utf-8")), file=sys.stderr) + raise + except json.JSONDecodeError as e: + print("Cannot parse the JSON response of UDPipe '{}' REST request.\n" + " {}".format(method, e.msg), file=sys.stderr) + raise + + if "model" not in response or "result" not in response: + raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method)) + + return response["result"] + def tag_parse_tree(self, root, tag=True, parse=True): """Tag (+lemmatize, fill FEATS) and parse a tree (already 
tokenized).""" if not tag and not parse: @@ -76,7 +106,7 @@ def tag_parse_tree(self, root, tag=True, parse=True): params["parser"] = "" attrs.append('deprel') - out_data = self.perform_request(params=params) + out_data = self.perform_request_urlencoded(params=params) conllu_reader = ConlluReader(empty_parent="ignore") conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = conllu_reader.read_tree() @@ -108,7 +138,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r params["parser"] = "" if ranges: params["tokenizer"] = "presegmented;ranges" if resegment else "ranges" - out_data = self.perform_request(params=params) + out_data = self.perform_request_urlencoded(params=params) conllu_reader = ConlluReader(empty_parent="ignore") conllu_reader.files.filehandle = io.StringIO(out_data) trees = conllu_reader.read_trees() @@ -126,7 +156,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r def segment_text(self, text): """Segment the provided text into sentences returned as a Python list.""" params = {"model": self.model, "data": text, "tokenizer":"", "output": "plaintext=normalized_spaces"} - return self.perform_request(params=params).rstrip().split("\n") + return self.perform_request_urlencoded(params=params).rstrip().split("\n") def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=False, ranges=False): """Delete all existing bundles and substitute them with those parsed by UDPipe.""" @@ -152,7 +182,7 @@ def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=F params["input"] = "horizontal" params["data"] = "\n".join(" ".join([n.form for n in root.descendants]) for root in doc.trees) + "\n" - out_data = self.perform_request(params=params) + out_data = self.perform_request_urlencoded(params=params) conllu_reader = ConlluReader(empty_parent="ignore") conllu_reader.files.filehandle = io.StringIO(out_data) trees = conllu_reader.read_trees()