From 329e0f7fc7c81c7c0c138834775d502e3d12e5f3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 27 Jun 2017 12:36:08 +0200 Subject: [PATCH 0001/1201] support for reading and writing doc-level and tree-level json data --- udapi/block/read/conllu.py | 12 ++++++++++-- udapi/block/write/conllu.py | 14 ++++++++++++++ udapi/core/document.py | 1 + udapi/core/root.py | 3 ++- udapi/core/tests/data/babinsky.conllu | 20 ++++++++++++++++++++ udapi/core/tests/external_tests.sh | 3 +++ 6 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 udapi/core/tests/data/babinsky.conllu diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8303d096..8edf1359 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -1,4 +1,5 @@ """"Conllu is a reader block for the CoNLL-U files.""" +import json import logging import re @@ -11,6 +12,7 @@ RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') +RE_JSON = re.compile(r'^# json_(doc_)?([^ =]+)\s*=\s*(.+)') class Conllu(BaseReader): @@ -54,7 +56,7 @@ def __init__(self, strict=False, separator='tab', empty_parent='warn', self.empty_parent = empty_parent @staticmethod - def parse_comment_line(line, root): + def parse_comment_line(line, root, document): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: @@ -75,6 +77,12 @@ def parse_comment_line(line, root): root.newdoc = value return + json_match = RE_JSON.match(line) + if json_match is not None: + container = document if json_match.group(1) == 'doc_' else root + container.json[json_match.group(2)] = json.loads(json_match.group(3)) + return + root.comment += line[1:] + "\n" # pylint: disable=too-many-locals,too-many-branches,too-many-statements @@ -93,7 +101,7 @@ def read_tree(self, document=None): if line == '': break if line[0] == '#': - self.parse_comment_line(line, root) + self.parse_comment_line(line, root, document) else: if self.separator == 'tab': fields = line.split('\t') diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 6c2dc314..5185906b 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -1,4 +1,5 @@ """Conllu class is a a writer of files in the CoNLL-U format.""" +import json from udapi.core.basewriter import BaseWriter @@ -35,6 +36,11 @@ def process_tree(self, tree): # pylint: disable=too-many-branches if self.print_text: print("# text = " + tree.get_sentence()) + if tree.json: + for key, value in sorted(tree.json.items()): + print("# json_%s = %s" + % (key, json.dumps(value, ensure_ascii=False, sort_keys=True))) + comment = tree.comment if comment: comment = comment.rstrip() @@ -73,3 +79,11 @@ def process_tree(self, tree): # pylint: disable=too-many-branches # Empty line separates trees in CoNLL-U (and is required after the last tree as well) print("") + + def before_process_document(self, document): + """Print json_doc_* headers.""" + super().before_process_document(document) + if document.json: + for key, value in sorted(document.json.items()): + print("# json_doc_%s = %s" + % (key, json.dumps(value, ensure_ascii=False, sort_keys=True))) diff --git a/udapi/core/document.py b/udapi/core/document.py index b64ee29c..66164997 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -13,6 +13,7 @@ def __init__(self): self.bundles = [] self._highest_bundle_id = 0 self.meta = {} + 
self.json = {} def __iter__(self): return iter(self.bundles) diff --git a/udapi/core/root.py b/udapi/core/root.py index 56105872..7460e77b 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -12,7 +12,7 @@ class Root(Node): """Class for representing root nodes (technical roots) in UD trees.""" __slots__ = ['_sent_id', '_zone', '_bundle', '_descendants', '_mwts', - 'empty_nodes', 'text', 'comment', 'newpar', 'newdoc'] + 'empty_nodes', 'text', 'comment', 'newpar', 'newdoc', 'json'] # pylint: disable=too-many-arguments def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): @@ -30,6 +30,7 @@ def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): self.text = text self.newpar = newpar self.newdoc = newdoc + self.json = {} # TODO: or None and mask as {} in property reader&writer to save memory? self._sent_id = None self._zone = zone diff --git a/udapi/core/tests/data/babinsky.conllu b/udapi/core/tests/data/babinsky.conllu new file mode 100644 index 00000000..034ac7d0 --- /dev/null +++ b/udapi/core/tests/data/babinsky.conllu @@ -0,0 +1,20 @@ +# json_doc_entities = [{"id": "E1", "labels": ["Jaroslav Vomáčka", "Jarda"], "mentions": ["s1#m1", "s2#m1"]}, {"id": "E2", "labels": ["Babinský"], "mentions": ["s2#m2"]}] +# json_doc_relations = [{"from": "E1", "id": "R1", "to": "E2", "type": "call"}] +# sent_id = 1 +# text = Vyšetřování Jaroslava Vomáčky. +# json_mentions = [{"id": "s1#m1", "label": "Jaroslav Vomáčka", "span": [2, 3]}] +1 Vyšetřování vyšetřování NOUN NNNS1-----A---- Case=Nom|Gender=Neut|Number=Sing|Polarity=Pos 0 root _ _ +2 Jaroslava Jaroslav PROPN NNMS2-----A---- Animacy=Anim|Case=Gen|Gender=Masc|NameType=Giv|Number=Sing|Polarity=Pos 1 nmod _ _ +3 Vomáčky Vomáčka PROPN NNMS2-----A---- Animacy=Anim|Case=Gen|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos 2 flat _ SpaceAfter=No +4 . . PUNCT Z:------------- _ 1 punct _ SpaceAfter=No + +# sent_id = 2 +# text = Jarda telefonoval loupežníkovi Babinskému. +# json_mention_relations = [{"from": "s2#m1", "id": "r1", "to": "s2#m2", "type": "call"}] +# json_mentions = [{"id": "s2#m1", "label": "Jaroslav", "span": [1]}, {"id": "s2#m2", "label": "Babinský", "span": [4]}] +1 Jarda Jarda PROPN NNMS1-----A---- Animacy=Anim|Case=Nom|Gender=Masc|NameType=Giv|Number=Sing|Polarity=Pos 2 nsubj _ _ +2 telefonoval telefonovat VERB VpYS---XR-AA--- Aspect=Imp|Gender=Masc|Number=Sing|Polarity=Pos|Tense=Past|VerbForm=Part|Voice=Act 0 root _ _ +3 loupežníkovi loupežník NOUN NNMS3-----A---- Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing|Polarity=Pos 4 nmod _ _ +4 Babinskému Babinský PROPN NNMS3-----A---- Animacy=Anim|Case=Dat|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos 2 obj _ SpaceAfter=No +5 . . 
PUNCT Z:------------- _ 2 punct _ SpaceAfter=No + diff --git a/udapi/core/tests/external_tests.sh b/udapi/core/tests/external_tests.sh index 55ded49d..85cda295 100755 --- a/udapi/core/tests/external_tests.sh +++ b/udapi/core/tests/external_tests.sh @@ -1,3 +1,6 @@ #!/bin/bash +set -e udapy read.Conllu files=data/UD_Czech_sample.conllu write.Conllu print_sent_id=0 print_text=0 > out.conllu && diff data/UD_Czech_sample.conllu out.conllu && rm out.conllu + +cat data/babinsky.conllu | udapy -s > out.conllu && diff data/babinsky.conllu out.conllu && rm out.conllu \ No newline at end of file From b495764532a26ef13bcfb59c4ac6fac57e958b1c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 27 Jun 2017 19:22:26 +0200 Subject: [PATCH 0002/1201] rename json_doc_ => doc_json_ so it does not share prefix with "json_" --- udapi/block/read/conllu.py | 2 +- udapi/block/write/conllu.py | 4 ++-- udapi/core/tests/data/babinsky.conllu | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8edf1359..59284f89 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,7 +12,7 @@ RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') -RE_JSON = re.compile(r'^# json_(doc_)?([^ =]+)\s*=\s*(.+)') +RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') class Conllu(BaseReader): diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 5185906b..cb69159d 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -81,9 +81,9 @@ def process_tree(self, tree): # pylint: disable=too-many-branches print("") def before_process_document(self, document): - """Print json_doc_* headers.""" + """Print doc_json_* headers.""" super().before_process_document(document) if document.json: for key, value in sorted(document.json.items()): - print("# json_doc_%s = %s" + print("# doc_json_%s = %s" % (key, json.dumps(value, ensure_ascii=False, sort_keys=True))) diff --git a/udapi/core/tests/data/babinsky.conllu b/udapi/core/tests/data/babinsky.conllu index 034ac7d0..32bc53cf 100644 --- a/udapi/core/tests/data/babinsky.conllu +++ b/udapi/core/tests/data/babinsky.conllu @@ -1,5 +1,5 @@ -# json_doc_entities = [{"id": "E1", "labels": ["Jaroslav Vomáčka", "Jarda"], "mentions": ["s1#m1", "s2#m1"]}, {"id": "E2", "labels": ["Babinský"], "mentions": ["s2#m2"]}] -# json_doc_relations = [{"from": "E1", "id": "R1", "to": "E2", "type": "call"}] +# doc_json_entities = [{"id": "E1", "labels": ["Jaroslav Vomáčka", "Jarda"], "mentions": ["s1#m1", "s2#m1"]}, {"id": "E2", "labels": ["Babinský"], "mentions": ["s2#m2"]}] +# doc_json_relations = [{"from": "E1", "id": "R1", "to": "E2", "type": "call"}] # sent_id = 1 # text = Vyšetřování Jaroslava Vomáčky. # json_mentions = [{"id": "s1#m1", "label": "Jaroslav Vomáčka", "span": [2, 3]}] From f78cc6b0b7c41ffb8936c924e4f90b9809756f9d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 23 Oct 2017 18:21:48 +0200 Subject: [PATCH 0003/1201] udapy -h prints examples of usage Fixes #20 --- bin/udapy | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/udapy b/bin/udapy index c756c5cb..9bb58f53 100755 --- a/bin/udapy +++ b/bin/udapy @@ -7,7 +7,14 @@ from udapi.core.run import Run # Parse command line arguments. 
argparser = argparse.ArgumentParser( - description='udapy - Python interface to Udapi - API for Universal Dependencies') + formatter_class=argparse.RawTextHelpFormatter, + usage="udapy [optional_arguments] scenario", + epilog="See http://udapi.github.io", + description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" + "Examples of usage:\n" + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" + " udapy -T < sample.conllu | less -R\n" + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") argparser.add_argument( "-q", "--quiet", action="store_true", help="Warning, info and debug messages are suppressed. Only fatal errors are reported.") From 48e9a6931645e09bea01a49681544c1968d8f8a7 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 23 Oct 2017 19:45:09 +0200 Subject: [PATCH 0004/1201] fix https redirects found with cd docs && make linkcheck --- udapi/block/write/html.py | 2 +- udapi/block/write/sdparse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index ec33b0fd..b06c971a 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -34,7 +34,7 @@ class Html(BaseWriter): This block is based on `Treex::View `_ but takes a different approach. `Treex::View` depends on (older version of) - `Valence` (Perl interface to `Electron `_) + `Valence` (Perl interface to `Electron `_) and comes with a script `view-treex`, which takes a treex file, converts it to json behind the scenes (which is quite slow) and displays the json in a Valence window. diff --git a/udapi/block/write/sdparse.py b/udapi/block/write/sdparse.py index 60b78d6d..13487738 100644 --- a/udapi/block/write/sdparse.py +++ b/udapi/block/write/sdparse.py @@ -29,7 +29,7 @@ class Sdparse(BaseWriter): Notes: The original `Stanford dependencies format - `_ + `_ allows explicit specification of the root dependency, e.g. `root(ROOT-0, makes-8)`. However, this is not allowed by Brat, so this writer does not print it. From 043b3533ebdd1f152989290cfc898a48762c0d59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 23 Oct 2017 19:57:21 +0200 Subject: [PATCH 0005/1201] trying to fix readthedocs build fails https://readthedocs.org/projects/udapi/builds/6164583/ http://docs.readthedocs.io/en/latest/faq.html#i-get-import-errors-on-libraries-that-depend-on-c-modules --- docs/conf.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 3e7864a5..47967fc1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,6 +21,15 @@ import sys sys.path.insert(0, os.path.abspath('..')) +from unittest.mock import MagicMock + +class Mock(MagicMock): + @classmethod + def __getattr__(cls, name): + return MagicMock() + +MOCK_MODULES = ['ufal.udpipe'] +sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) # -- General configuration ------------------------------------------------ @@ -62,7 +71,7 @@ # The short X.Y version. version = '0' # The full version, including alpha/beta/rc tags. -release = '1' +release = '2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
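Taken together, the json patches at the top of this series give both tree-level (`# json_*`) and document-level (`# doc_json_*`) annotations a full round trip through the reader and writer. The following is a minimal sketch of driving that round trip from the Python API rather than through `udapy` — not part of the patches, and it assumes the `Document.load_conllu`/`store_conllu` helpers and the `bundles`/`trees` attributes as they exist in udapi at this point:

```
# Minimal sketch (not part of the patches): round-tripping the new json
# annotations via the Python API, using the babinsky.conllu test file above.
from udapi.core.document import Document

doc = Document()
doc.load_conllu('udapi/core/tests/data/babinsky.conllu')

# Doc-level annotations come from "# doc_json_*" comment lines.
print(doc.json['entities'][0]['id'])        # E1

# Tree-level annotations come from "# json_*" comment lines.
tree = doc.bundles[0].trees[0]
print(tree.json['mentions'][0]['span'])     # [2, 3]

# write.Conllu serializes both kinds of annotation back again.
doc.store_conllu('out.conllu')
```

Because the writer dumps values with `sort_keys=True` and `ensure_ascii=False`, the babinsky.conllu test file round-trips byte-for-byte, which is exactly what the new line in external_tests.sh checks.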
From 31c87219f7976f571c1e6fc8161cb3fd912fb38b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 23 Oct 2017 20:41:46 +0200 Subject: [PATCH 0006/1201] update setup.py * rename the distribution to just "udapi" (the original suffix "-python" on PyPI makes no sense) * bump version to 0.2.1 (semver.org requires X.Y.Z where Z cannot be omitted) * exclude ufal.udpipe from install_requires, but keep it in requirements.txt - https://packaging.python.org/discussions/install-requires-vs-requirements/ says "install_requires [...] should be used to specify what a project minimally needs" while requirements.txt often contains an exhaustive listing. - readthedocs.io has problems with ufal.udpipe because of missing C++11 * add fields required by "setup.py sdist" --- setup.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1ec8e468..bbc49234 100644 --- a/setup.py +++ b/setup.py @@ -9,15 +9,25 @@ raise SystemExit('Udapi requires Python 3.3 or higher.') setup( - name='udapi-python', - version='0.2', + name='udapi', + version='0.2.1', description='Python framework for processing Universal Dependencies data', + long_description=( + 'Udapi is an open-source framework providing API for processing ' + 'Universal Dependencies data. It is available in Python, Perl and Java. ' + 'Udapi is suitable both for full-fledged applications and fast ' + 'prototyping: visualization of dependency trees, format conversions, ' + 'querying, editing and transformations, validity tests, dependency ' + 'parsing, evaluation etc.' + ), author='Martin Popel', author_email='popel@ufal.mff.cuni.cz', url='https://github.com/udapi/udapi-python', packages=find_packages(), scripts=['bin/udapy'], tests_require=['pytest'], - install_requires=['colorama', 'termcolor', 'ufal.udpipe'], + install_requires=['colorama', 'termcolor'], python_requires='>=3.3', + license='GPL 2 or newer', + platforms='any', ) From 1fb1e08bcd9b9d93ee0395ccd4b4b861b98c6d70 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 23 Oct 2017 20:53:08 +0200 Subject: [PATCH 0007/1201] Create LICENSE --- LICENSE | 674 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..9cecc1d4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. 
This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. 
+ + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. 
+ + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. 
+ + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. 
If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. 
Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + {one line to give the program's name and a brief idea of what it does.} + Copyright (C) {year} {name of author} + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + {project} Copyright (C) {year} {fullname} + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. From 27dfb39d1431825d09dd959658da88ccdef04fa8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 24 Oct 2017 11:16:28 +0200 Subject: [PATCH 0008/1201] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4621c918..a082cc00 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ Python framework for processing Universal Dependencies data ## Requirements - You need Python 3.3 or higher. 
-- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)). +- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, + make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)) + and install UDPipe with `pip3 install --user --upgrade ufal.udpipe`. ## Install Udapi for developers Let's clone the git repo to `~/udapi-python/`, install dependencies @@ -23,9 +25,9 @@ source ~/.bashrc # or open new bash ``` ## Install Udapi for users -This is similar to the above, but installs Udapi to the standard (user) Python paths. +This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. ``` -pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git +pip3 install --user --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. From ad69273c5eb065cee60968152ea586396be29ef1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 24 Oct 2017 17:25:22 +0200 Subject: [PATCH 0009/1201] bugfix: newdoc does not need to be followed by a space --- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 59284f89..bcd935f3 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -11,7 +11,7 @@ # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?') RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') From 75f71e4d526f6850fcba0dfcedae0740e9bd3485 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 24 Oct 2017 18:31:49 +0200 Subject: [PATCH 0010/1201] requirements for ReadTheDocs without ufal.udpipe --- docs/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..a994db47 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +colorama +termcolor From 555c9c4b77eaedd3984f284d3d1b59f10f1e1aca Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 24 Oct 2017 18:38:41 +0200 Subject: [PATCH 0011/1201] mock for ufal.udpipe is not needed It did not solve the problem anyway. --- docs/conf.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 47967fc1..45966b57 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,16 +21,6 @@ import sys sys.path.insert(0, os.path.abspath('..')) -from unittest.mock import MagicMock - -class Mock(MagicMock): - @classmethod - def __getattr__(cls, name): - return MagicMock() - -MOCK_MODULES = ['ufal.udpipe'] -sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. 
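The `RE_NEWPARDOC` bugfix above changes only two characters, but the effect is easy to miss: the old pattern demanded a literal space after `newpar`/`newdoc`, so a bare `# newdoc` comment with no id was silently treated as an ordinary comment. A self-contained check (not part of the patches):

```
# Minimal sketch (not part of the patches): why the RE_NEWPARDOC fix matters.
import re

OLD_RE = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?')
NEW_RE = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?')

print(bool(OLD_RE.match('# newdoc')))             # False -- bug: marker missed
print(bool(NEW_RE.match('# newdoc')))             # True  -- fixed
print(NEW_RE.match('# newdoc id = d1').group(2))  # d1    -- id still captured
```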
From c04f5f40373ba54c501fc10a703cfbfde7353fca Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Nov 2017 23:19:04 +0100 Subject: [PATCH 0012/1201] fix an edge case with punctuation-only sentences fix https://github.com/UniversalDependencies/tools/issues/21 --- udapi/block/ud/fixpunct.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index e810c58d..23f4be74 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -3,10 +3,9 @@ Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct, and is always attached projectively, usually to the head of a neighboring subtree to its left or right. -Punctuation normally does not have children. If it does, we will skip it. -It is unclear what to do anyway, and we won't have to check for cycles. +Punctuation normally does not have children. If it does, we will fix it first. -Tries to re-attach punctuation projectively. +This block tries to re-attach punctuation projectively and according to the guidelines. It should help in cases where punctuation is attached randomly, always to the root or always to the neighboring word. However, there are limits to what it can do; for example it cannot always recognize whether a comma is introduced to separate @@ -14,7 +13,7 @@ this block is almost good, the block may actually do more harm than good. Since the punctuation should not have children, we should not create a non-projectivity -if we check the roof edges going to the right. +if we check the root edges going to the right. However, it is still possible that we will attach the punctuation non-projectively by joining a non-projectivity that already exists. For example, the left neighbor (node i-1) may have its parent at i-3, @@ -47,7 +46,7 @@ class FixPunct(Block): - """Make sure punct nodes are attached punctuation is attached projectively.""" + """Make sure punctuation nodes are attached projectively.""" def __init__(self, **kwargs): """Create the ud.FixPunct block instance.""" @@ -78,6 +77,15 @@ def process_tree(self, root): if node.upos == "PUNCT" and not self._punct_type[node.ord]: self._fix_subord_punct(node) + # Finally, check if root is still marked with deprel=root. + # This may not hold if the original root was a paired punctuation, which was rehanged. + for node in root.children: + if node.udeprel != 'root': + node.udeprel = 'root' + for another_node in root.descendants: + if another_node.parent != root and another_node.udeprel == 'root': + another_node.udeprel = 'punct' + def _fix_subord_punct(self, node): # Dot used as the ordinal-number marker (in some languages) or abbreviation marker. # TODO: detect these cases somehow @@ -168,10 +176,17 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): def _fix_pair(self, root, opening_node, closing_node): heads = [] + punct_heads = [] for node in root.descendants[opening_node.ord: closing_node.ord - 1]: if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): - if node.upos != 'PUNCT': + if node.upos == 'PUNCT': + punct_heads.append(node) + else: heads.append(node) + # Punctuation should not have children, but if there is no other head candidate, + # let's break this rule. 
+ if len(heads) == 0: + heads = punct_heads if len(heads) == 1: opening_node.parent = heads[0] closing_node.parent = heads[0] From 5886be4aeb8c0e178d94cbc9aa01bacffba654b9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 13 Nov 2017 13:05:36 +0100 Subject: [PATCH 0013/1201] root.sent_id returns always the same as root.address() `root.address()` seems redundant now, but it is parallel to `node.address()` and `bundle.address()`, so let's keep it. Also fixing a bug with split_docs, where the tree with `newdoc` (which is a leftover from the last document, stored in `_buffer`) could have `bundle.bundle_id = 1` not respecting the `sent_id` stored in the tree. --- udapi/core/basereader.py | 22 ++++++++++++++-------- udapi/core/root.py | 36 ++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 1f732568..25feb7f0 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -96,12 +96,18 @@ def process_document(self, document): # There may be a tree left in the buffer when reading the last doc. if self._buffer: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() - bundle = orig_bundles.pop(0) if orig_bundles else document.create_bundle() - bundle.add_tree(self._buffer) - if self._buffer.newdoc and self._buffer.newdoc is not True: - document.meta["docname"] = self._buffer.newdoc + root = self._buffer self._buffer = None + if orig_bundles: + # TODO list.pop(0) is inefficient, use collections.deque.popleft() + bundle = orig_bundles.pop(0) + else: + bundle = document.create_bundle() + if root._sent_id is not None: + bundle.bundle_id = root._sent_id.split('/', 1)[0] + bundle.add_tree(root) + if root.newdoc and root.newdoc is not True: + document.meta["docname"] = root.newdoc filehandle = self.filehandle if filehandle is None: @@ -123,9 +129,9 @@ def process_document(self, document): trees_loaded += 1 if self.ignore_sent_id: - root.sent_id = None - if root.sent_id is not None: - parts = root.sent_id.split('/', 1) + root._sent_id = None + if root._sent_id is not None: + parts = root._sent_id.split('/', 1) bundle_id = parts[0] if len(parts) == 2: root.zone = parts[1] diff --git a/udapi/core/root.py b/udapi/core/root.py index 7460e77b..ae259718 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -42,6 +42,13 @@ def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): @property def sent_id(self): """ID of this tree, stored in the sent_id comment in CoNLL-U.""" + if self._sent_id is not None: + return self._sent_id + zone = '/' + self.zone if self.zone else '' + if self._bundle is not None: + self._sent_id = self._bundle.address() + zone + else: + self._sent_id = '?' + zone return self._sent_id @sent_id.setter @@ -53,6 +60,17 @@ def sent_id(self, sent_id): self.zone = parts[1] self._sent_id = sent_id + def address(self): + """Full (document-wide) id of the root. + + The general format of root nodes is: + root.bundle.bundle_id + '/' + root.zone, e.g. s123/en_udpipe. + If zone is empty, the slash is excluded as well, e.g. s123. + If bundle is missing (could occur during loading), '?' is used instead. + Root's address is stored in CoNLL-U files as sent_id (in a special comment). 
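+        This simply returns root.sent_id, so the two are guaranteed to be equal.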
+        """
+        return self.sent_id
+
     @property
     def bundle(self):
         """Return the bundle which this tree belongs to."""
@@ -124,24 +142,6 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0):
         """Attempts at changing the word order of root result in Exception."""
         raise Exception('Technical root cannot be shifted as it is always the first node')
 
-    def address(self):
-        """Full (document-wide) id of the root.
-
-        The general format of root nodes is:
-        root.bundle.bundle_id + '/' + root.zone, e.g. s123/en_udpipe.
-        If zone is empty, the slash is excluded as well, e.g. s123.
-        If bundle is missing (could occur during loading), '?' is used instead.
-        Root's address is stored in CoNLL-U files as sent_id (in a special comment).
-        TODO: Make sure root.sent_id returns always the same string as root.address.
-        """
-        zone = '/' + self.zone if self.zone else ''
-        if self._bundle is not None:
-            return self._bundle.address() + zone
-        elif self.sent_id is not None:
-            return self.sent_id + zone
-        else:
-            return '?' + zone
-
     # TODO document whether misc is a string or dict or it can be both
     def create_multiword_token(self, words=None, form=None, misc=None):
         """Create and return a new multi-word token (MWT) in this tree.

From af2fcc5b1d3da6be161b3022acecd134c87e0a2e Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 13 Nov 2017 13:11:54 +0100
Subject: [PATCH 0014/1201] fixing a bug with doc_json and newdoc

doc_json annotations were loaded to the previous document if multiple
documents were stored in one file and loaded with `read.Conllu split_docs=1`.

At the time of parsing a CoNLL-U tree, we don't know whether it will be
stored in a "current" document, or whether a new document will have to be
created (because of the `newdoc` annotation).

- adding a test for this bug
- fixing it by storing the data in a special temp location
  `root.json['__doc__']` and moving it to the correct document later on
---
 udapi/block/read/conllu.py            |  8 ++++++--
 udapi/core/bundle.py                  |  4 ++++
 udapi/core/tests/data/babinsky.conllu | 10 ++++++++++
 udapi/core/tests/external_tests.sh    |  4 ++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index bcd935f3..645b9685 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -79,8 +79,12 @@ def parse_comment_line(line, root, document):
 
     json_match = RE_JSON.match(line)
     if json_match is not None:
-        container = document if json_match.group(1) == 'doc_' else root
-        container.json[json_match.group(2)] = json.loads(json_match.group(3))
+        container = root.json
+        if json_match.group(1) == 'doc_':
+            if '__doc__' not in root.json:
+                root.json['__doc__'] = {}
+            container = root.json['__doc__']
+        container[json_match.group(2)] = json.loads(json_match.group(3))
         return
 
     root.comment += line[1:] + "\n"
diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py
index ffffa565..5a9d6808 100644
--- a/udapi/core/bundle.py
+++ b/udapi/core/bundle.py
@@ -88,6 +88,10 @@ def add_tree(self, root):
         self.check_zone(root.zone)
         root.bundle = self
         self.trees.append(root)
+        doc_json = root.json.get('__doc__')
+        if doc_json:
+            self._document.json.update(doc_json)
+            del root.json['__doc__']
         return root
 
     def remove(self):
diff --git a/udapi/core/tests/data/babinsky.conllu b/udapi/core/tests/data/babinsky.conllu
index 32bc53cf..f23a81e7 100644
--- a/udapi/core/tests/data/babinsky.conllu
+++ b/udapi/core/tests/data/babinsky.conllu
@@ -18,3 +18,13 @@
 4	Babinskému	Babinský	PROPN	NNMS3-----A----
Animacy=Anim|Case=Dat|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos	2	obj	_	SpaceAfter=No
 5	.	.	PUNCT	Z:-------------	_	2	punct	_	SpaceAfter=No
+# doc_json_entities = [{"id": "E3", "labels": ["Rumcajs"], "mentions": ["s3#m1"]}]
+# newdoc
+# sent_id = 3
+# text = Rumcajs je loupežník.
+# json_mentions = [{"id": "s3#m1", "label": "Rumcajs", "span": [1]}]
+1	Rumcajs	Rumcajs	PROPN	NNMS1-----A----	Animacy=Anim|Case=Nom|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos	3	nsubj	_	_
+2	je	být	AUX	VB-S---3P-AA---	Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act	3	cop	_	_
+3	loupežník	loupežník	NOUN	NNMS1-----A----	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing|Polarity=Pos	0	root	_	SpaceAfter=No
+4	.	.	PUNCT	Z:-------------	_	3	punct	_	SpaceAfter=No
+
diff --git a/udapi/core/tests/external_tests.sh b/udapi/core/tests/external_tests.sh
index 85cda295..ac93cadb 100755
--- a/udapi/core/tests/external_tests.sh
+++ b/udapi/core/tests/external_tests.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -e
-udapy read.Conllu files=data/UD_Czech_sample.conllu write.Conllu print_sent_id=0 print_text=0 > out.conllu && diff data/UD_Czech_sample.conllu out.conllu && rm out.conllu
+udapy write.Conllu print_sent_id=0 print_text=0 < data/UD_Czech_sample.conllu > out.conllu && diff data/UD_Czech_sample.conllu out.conllu && rm out.conllu
 
-cat data/babinsky.conllu | udapy -s > out.conllu && diff data/babinsky.conllu out.conllu && rm out.conllu
\ No newline at end of file
+udapy -s read.Conllu files=data/babinsky.conllu split_docs=1 > out.conllu && diff data/babinsky.conllu out.conllu && rm out.conllu

From 0509cfe8cc0fa19d84dc022e54ca0c6af2af1b66 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 13 Nov 2017 13:20:21 +0100
Subject: [PATCH 0015/1201] reverting the idea that read_tree has access to the doc

The document to which a given tree will be stored is not known at this time,
so it was misleading to provide such a parameter. It is decided based on the
tree's `root.newdoc` whether the tree will be added to a new document (which
is not created yet).
---
 udapi/block/read/conllu.py |  6 +++---
 udapi/block/read/vislcg.py |  2 +-
 udapi/core/basereader.py   | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index 645b9685..a2fd0406 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -56,7 +56,7 @@ def __init__(self, strict=False, separator='tab', empty_parent='warn',
         self.empty_parent = empty_parent
 
     @staticmethod
-    def parse_comment_line(line, root, document):
+    def parse_comment_line(line, root):
         """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root."""
         sent_id_match = RE_SENT_ID.match(line)
         if sent_id_match is not None:
@@ -92,7 +92,7 @@
     # pylint: disable=too-many-locals,too-many-branches,too-many-statements
     # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.
- def read_tree(self, document=None): + def read_tree(self): if self.filehandle is None: return None @@ -105,7 +105,7 @@ def read_tree(self, document=None): if line == '': break if line[0] == '#': - self.parse_comment_line(line, root, document) + self.parse_comment_line(line, root) else: if self.separator == 'tab': fields = line.split('\t') diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 9ad272e3..4c5a87ab 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -8,7 +8,7 @@ class Vislcg(BaseReader): # TODO check validity and raise helpful exceptions if not valid # pylint: disable=too-many-branches - def read_tree(self, document=None): + def read_tree(self): if self.filehandle is None: return None diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 25feb7f0..818ce945 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -59,7 +59,7 @@ def next_filehandle(self): """Go to the next file and retrun its filehandle.""" return self.files.next_filehandle() - def read_tree(self, document=None): + def read_tree(self): """Load one (more) tree from self.files and return its root. This method must be overriden in all readers. @@ -68,13 +68,13 @@ def read_tree(self, document=None): """ raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__) - def filtered_read_tree(self, document=None): + def filtered_read_tree(self): """Load and return one more tree matching the `sent_id_filter`. This method uses `read_tree()` internally. This is the method called by `process_document`. """ - tree = self.read_tree(document) + tree = self.read_tree() if self.sent_id_filter is None: return tree while True: @@ -84,7 +84,7 @@ def filtered_read_tree(self, document=None): return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) - tree = self.read_tree(document) + tree = self.read_tree() # pylint: disable=too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, @@ -118,7 +118,7 @@ def process_document(self, document): trees_loaded = 0 while True: - root = self.filtered_read_tree(document) + root = self.filtered_read_tree() if root is None: if trees_loaded == 0 and self.files.has_next_file(): filehandle = self.next_filehandle() From f2c209eeee7a75b2c6eeb6445a375fa6d4e3a8cb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Nov 2017 17:02:49 +0100 Subject: [PATCH 0016/1201] MorphoDiTa wrapper --- udapi/tool/morphodita.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 udapi/tool/morphodita.py diff --git a/udapi/tool/morphodita.py b/udapi/tool/morphodita.py new file mode 100644 index 00000000..656c6acb --- /dev/null +++ b/udapi/tool/morphodita.py @@ -0,0 +1,31 @@ +"""Wrapper for MorphoDiTa (more pythonic than ufal.morphodita).""" +from collections import namedtuple + +from ufal.morphodita import Morpho, TaggedLemmasForms # pylint: disable=no-name-in-module +from udapi.core.resource import require_file + +FormInfo = namedtuple('FormInfo', 'form lemma tag guesser') + + +class MorphoDiTa: + """Wrapper for MorphoDiTa.""" + + def __init__(self, model): + """Create the MorphoDiTa tool object.""" + self.model = model + path = require_file(model) + self.tool = Morpho.load(path) + if not self.tool: + raise IOError("Cannot load model from file '%s'" % path) + + def forms_of_lemma(self, lemma, tag_wildcard='?', guesser=True): + """Return all forms of a given 
lemma matching a given tag wildcard.""" + + use_guesser = 1 if guesser else 0 + lemmas_forms = TaggedLemmasForms() + used_guesser = self.tool.generate(lemma, tag_wildcard, use_guesser, lemmas_forms) + forms = [] + for lemma_forms in lemmas_forms: + for form in lemma_forms.forms: + forms.append(FormInfo(form.form, lemma_forms.lemma, form.tag, used_guesser)) + return forms From 388e4c566e45bf179a245acc4e368835d4a40881 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Nov 2017 17:04:22 +0100 Subject: [PATCH 0017/1201] demo 1984-ization block for Tom Kocmi --- udapi/block/newspeak/__init__.py | 0 udapi/block/newspeak/prevele.py | 66 ++++++++++++++++++++++++++++++++ udapi/block/udpipe/cs.py | 10 +++++ 3 files changed, 76 insertions(+) create mode 100644 udapi/block/newspeak/__init__.py create mode 100644 udapi/block/newspeak/prevele.py create mode 100644 udapi/block/udpipe/cs.py diff --git a/udapi/block/newspeak/__init__.py b/udapi/block/newspeak/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/newspeak/prevele.py b/udapi/block/newspeak/prevele.py new file mode 100644 index 00000000..dd616571 --- /dev/null +++ b/udapi/block/newspeak/prevele.py @@ -0,0 +1,66 @@ +"""newspeak.PreVele block for 1984-like newspeak-ization. + +This is just a demo/draft. + +Usage: + $ echo 'Nejhorší žena je lepší než nejlepší muž.' | \ + udapy -q read.Sentences udpipe.Cs newspeak.PreVele write.Sentences + Převelenedobrá žena je veledobrá než převeledobrý muž. +""" +from udapi.core.block import Block +from udapi.tool.morphodita import MorphoDiTa + +ANTONYMS = { + 'špatný': 'dobrý', + 'pomalý': 'rychlý', + # 'muž': 'žena', this does not work because xpos contains gender, + # we would also need to exploit the parsing and change gender of all congruent adj children. 
+} + + +class PreVele(Block): + """Change all comparatives to vele-x and superlatives to převele-x.""" + + def __init__(self, morphodita_path='models/morphodita/cs/', + morphodita_model='czech-morfflex-131112.dict', + **kwargs): + """Create the PreVele block object.""" + super().__init__(**kwargs) + self.morphodita = MorphoDiTa(model=morphodita_path + morphodita_model) + + def process_tree(self, tree): + + # apply process_node on all nodes + super().process_tree(tree) + + # Capitalize if needed + first_node = tree.descendants[0] + if tree.text[0].isupper() and not first_node.form[0].isupper(): + first_node.form = first_node.form[0].upper() + first_node.form[1:] + + # Recompute the sentence string + tree.text = tree.compute_text() + + def process_node(self, node): + antonym = ANTONYMS.get(node.lemma) + if antonym is not None: + if node.xpos[11] == 'N': + if node.form.lower().startswith('ne'): + node.lemma = antonym + node.xpos = node.xpos[:10] + 'A' + node.xpos[11:] + node.form = node.form[2:] + else: + forms = self.morphodita.forms_of_lemma(antonym, node.xpos) + if forms: + node.lemma = antonym + node.xpos = node.xpos[:10] + 'N' + node.xpos[11:] + node.form = 'ne' + forms[0].form + + degree = node.feats["Degree"] + if degree in ("Sup", "Cmp"): + new_xpos = node.xpos[:9] + '1' + node.xpos[10:] + forms = self.morphodita.forms_of_lemma(node.lemma, new_xpos) + if forms: + new_form = "vele" if degree == "Cmp" else "převele" + new_form += forms[0].form + node.form = new_form diff --git a/udapi/block/udpipe/cs.py b/udapi/block/udpipe/cs.py new file mode 100644 index 00000000..743efcb7 --- /dev/null +++ b/udapi/block/udpipe/cs.py @@ -0,0 +1,10 @@ +"""Block udpipe.Cs for tagging and parsing Czech.""" +from udapi.block.udpipe.base import Base + + +class Cs(Base): + """Tag and parse Czech.""" + + def __init__(self, **kwargs): + """Create the udpipe.Cs block object.""" + super().__init__(model_alias='cs', **kwargs) From 1a7492b58796fb78f31109c88d3e9514984ab731 Mon Sep 17 00:00:00 2001 From: Tom Kocmi Date: Sun, 10 Dec 2017 16:03:07 +0100 Subject: [PATCH 0018/1201] Allowing the import of private modules --- udapi/core/run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/core/run.py b/udapi/core/run.py index f42f3f9d..512213f3 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -82,7 +82,12 @@ def _import_blocks(block_names, block_args): for (block_id, block_name) in enumerate(block_names): # Importing module dynamically. sub_path, class_name = _parse_block_name(block_name) - module = "udapi.block." + sub_path + "." + class_name.lower() + + if block_name.startswith('.'): + # Private modules are recognized by a dot at the beginning + module = block_name.lower()[1:] + else: + module = "udapi.block." + sub_path + "." 
+ class_name.lower()
         try:
             command = "from " + module + " import " + class_name + " as b" + str(block_id)
             logging.debug("Trying to run command: %s", command)

From c9efe469860d27039c6acb49e90d4d0446fb70b2 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 19 Dec 2017 21:28:25 +0100
Subject: [PATCH 0019/1201] before_process_document of writers must be called

otherwise the output goes to stdout instead of the file provided in the
constructor
---
 udapi/core/document.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/udapi/core/document.py b/udapi/core/document.py
index 66164997..ad7bbdbe 100644
--- a/udapi/core/document.py
+++ b/udapi/core/document.py
@@ -29,9 +29,13 @@ def create_bundle(self):
     def load_conllu(self, filename):
         """Load a document from a conllu-formatted file."""
         reader = ConlluReader(files=filename)
+        reader.before_process_document(self)
         reader.process_document(self)
+        reader.after_process_document(self)
 
     def store_conllu(self, filename):
         """Store a document into a conllu-formatted file."""
         writer = ConlluWriter(files=filename)
+        writer.before_process_document(self)
         writer.process_document(self)
+        writer.after_process_document(self)

From c8ca5dd05ca0d6bf4c99148ef7f851c3b7fd76ca Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 21 Dec 2017 03:24:04 +0100
Subject: [PATCH 0020/1201] support for loading/storing documents from/to strings

doc.from_conllu_string(s)
s = doc.to_conllu_string()
---
 udapi/core/basereader.py |  7 ++++---
 udapi/core/basewriter.py | 16 ++++++++++++++--
 udapi/core/block.py      |  5 +++++
 udapi/core/document.py   | 24 ++++++++++++++++--------
 udapi/core/files.py      | 17 ++++++++++++-----
 udapi/core/run.py        |  8 +++-----
 6 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py
index 818ce945..9e3b6488 100644
--- a/udapi/core/basereader.py
+++ b/udapi/core/basereader.py
@@ -12,13 +12,14 @@ class BaseReader(Block):
     """Base class for all reader blocks."""
 
     # pylint: disable=too-many-arguments
-    def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8',
+    def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8',
                  sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs):
         super().__init__(**kwargs)
-        self.files = Files(filenames=files)
+        if filehandle is not None:
+            files = None
+        self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding)
         self.zone = zone
         self.bundles_per_doc = bundles_per_doc
-        self.encoding = encoding
         self._buffer = None
         self.finished = False
         self.sent_id_filter = None
diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py
index 36edd1aa..ed574c95 100644
--- a/udapi/core/basewriter.py
+++ b/udapi/core/basewriter.py
@@ -9,10 +9,14 @@ class BaseWriter(Block):
     """Base class for all reader blocks."""
 
-    def __init__(self, files='-', docname_as_file=False, encoding='utf-8', newline='\n', **kwargs):
+    def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8',
+                 newline='\n', **kwargs):
         super().__init__(**kwargs)
         self.orig_files = files
-        self.files = Files(filenames=files)
+        if filehandle is not None:
+            files = None
+            self.orig_files = ''
+        self.files = Files(filenames=files, filehandle=filehandle)
         self.encoding = encoding
         self.newline = newline
         self.docname_as_file = docname_as_file
@@ -34,6 +38,10 @@ def next_filename(self):
         return self.files.next_filename()
 
     def before_process_document(self, document):
+        if self.orig_files == '':
+            logging.info('Writing to filehandle.')
+
sys.stdout = self.files.filehandle + return if self.orig_files == '-': if self.docname_as_file: docname = document.meta.get('docname', None) @@ -60,3 +68,7 @@ def before_process_document(self, document): else: logging.info('Writing to file %s.', filename) sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) + + def after_process_document(self, document): + if self.orig_files == '': + sys.stdout = sys.__stdout__ diff --git a/udapi/core/block.py b/udapi/core/block.py index 453b1d65..4ddbaf2b 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -31,6 +31,11 @@ def process_bundle(self, bundle): if self._should_process_tree(tree): self.process_tree(tree) + def apply_on_document(self, document): + self.before_process_document(document) + self.process_document(document) + self.after_process_document(document) + def process_document(self, document): """Process a UD document""" for bundle_no, bundle in enumerate(document.bundles, 1): diff --git a/udapi/core/document.py b/udapi/core/document.py index ad7bbdbe..778e5bd9 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -1,7 +1,7 @@ """Document class is a container for UD trees.""" +import io from udapi.core.bundle import Bundle - from udapi.block.read.conllu import Conllu as ConlluReader from udapi.block.write.conllu import Conllu as ConlluWriter @@ -26,16 +26,24 @@ def create_bundle(self): bundle.number = len(self.bundles) return bundle - def load_conllu(self, filename): + def load_conllu(self, filename=None): """Load a document from a conllu-formatted file.""" reader = ConlluReader(files=filename) - reader.before_process_document(self) - reader.process_document(self) - reader.after_process_document(self) + reader.apply_on_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" writer = ConlluWriter(files=filename) - writer.before_process_document(self) - writer.process_document(self) - writer.after_process_document(self) + writer.apply_on_document(self) + + def from_conllu_string(self, string): + """Load a document from a conllu-formatted string.""" + reader = ConlluReader(filehandle=io.StringIO(string)) + reader.apply_on_document(self) + + def to_conllu_string(self): + """Return the document as a conllu-formatted string.""" + fh = io.StringIO() + writer = ConlluWriter(filehandle=fh) + writer.apply_on_document(self) + return fh.getvalue() diff --git a/udapi/core/files.py b/udapi/core/files.py index 29ad60e9..523fe303 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -29,16 +29,21 @@ class Files(object): >>> filehandle = files.next_filehandle() """ - def __init__(self, filenames, encoding='utf-8'): - if isinstance(filenames, list): + def __init__(self, filenames=None, filehandle=None, encoding='utf-8'): + self.filehandle = None + self.file_number = 0 + self.encoding = encoding + if filehandle is not None: + self.filehandle = filehandle + if filenames is not None: + raise ValueError('Cannot specify both "filenames" and "filehandle"') + self.filenames = [''] + elif isinstance(filenames, list): self.filenames = filenames elif isinstance(filenames, str): self.filenames = self.string_to_filenames(filenames) else: raise ValueError('Parameter "filenames" must be a list or str') - self.filehandle = None - self.encoding = encoding - self.file_number = 0 def string_to_filenames(self, string): """Parse a pattern string (e.g. '!dir??/file*.txt') and return a list of matching filenames. 
@@ -105,6 +110,8 @@ def next_filehandle(self): fhandle = None elif filename == '-': fhandle = sys.stdin + elif filename == '': + fhandle = self.filehandle else: filename_extension = filename.split('.')[-1] if filename_extension == 'gz': diff --git a/udapi/core/run.py b/udapi/core/run.py index 512213f3..c730a1a7 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -82,11 +82,11 @@ def _import_blocks(block_names, block_args): for (block_id, block_name) in enumerate(block_names): # Importing module dynamically. sub_path, class_name = _parse_block_name(block_name) - + if block_name.startswith('.'): # Private modules are recognized by a dot at the beginning module = block_name.lower()[1:] - else: + else: module = "udapi.block." + sub_path + "." + class_name.lower() try: command = "from " + module + " import " + class_name + " as b" + str(block_id) @@ -156,9 +156,7 @@ def execute(self): logging.info(" ---- ROUND ----") for block in blocks: logging.info("Executing block " + block.__class__.__name__) - block.before_process_document(document) - block.process_document(document) - block.after_process_document(document) + block.apply_on_document(document) finished = True From 9d8365ecdb02dece78aaea8de942e9dd750d346e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 8 Jan 2018 22:41:37 +0100 Subject: [PATCH 0021/1201] bump version to 0.2.2 --- CHANGES.txt | 11 +++++++++++ setup.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 CHANGES.txt diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 00000000..dbcd9702 --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,11 @@ +Udapi Change Log +---------------- +See https://github.com/udapi/udapi-python/commits/master for details. + +0.2.2 2018-01-08 + - support for loading/storing documents from/to strings + - allow private modules (starting with dot instead of udapi.block) + - MorphoDiTa wrapper udapi/tool/morphodita.py + - root.sent_id returns always the same as root.address() + +0.2.1 2017-10-23 the first PyPI release \ No newline at end of file diff --git a/setup.py b/setup.py index bbc49234..7197d909 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='udapi', - version='0.2.1', + version='0.2.2', description='Python framework for processing Universal Dependencies data', long_description=( 'Udapi is an open-source framework providing API for processing ' From 6067ccb5256bb024f414dc8b664b3cb03269d2de Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 16 Feb 2018 01:33:24 +0100 Subject: [PATCH 0022/1201] allow ud.FixPunct to edit only upos=PUNCT nodes fixes #45 --- udapi/block/ud/fixpunct.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 23f4be74..546add79 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -48,10 +48,11 @@ class FixPunct(Block): """Make sure punctuation nodes are attached projectively.""" - def __init__(self, **kwargs): + def __init__(self, check_paired_punct_upos=False, **kwargs): """Create the ud.FixPunct block instance.""" super().__init__(**kwargs) self._punct_type = None + self.check_paired_punct_upos = check_paired_punct_upos def process_tree(self, root): # First, make sure no PUNCT has children @@ -163,6 +164,9 @@ def _fix_subord_punct(self, node): node.deprel = "punct" def _fix_paired_punct(self, root, opening_node, closing_punct): + if self.check_paired_punct_upos and opening_node.upos != 'PUNCT': + return + nested_level = 0 for node in 
root.descendants[opening_node.ord:]:
             if node.form == closing_punct:

From 7cf82d2c5a34ac9a7e8df8330ab736e090addd17 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 9 Mar 2018 17:55:20 +0100
Subject: [PATCH 0023/1201] optionally copy PUNCT dependencies to enhanced deps

use `ud.FixPunct copy_to_enhanced=1`
Fixes #47
---
 udapi/block/ud/fixpunct.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index 546add79..7d5bb483 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -48,11 +48,12 @@ class FixPunct(Block):
     """Make sure punctuation nodes are attached projectively."""
 
-    def __init__(self, check_paired_punct_upos=False, **kwargs):
+    def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwargs):
         """Create the ud.FixPunct block instance."""
         super().__init__(**kwargs)
         self._punct_type = None
         self.check_paired_punct_upos = check_paired_punct_upos
+        self.copy_to_enhanced = copy_to_enhanced
 
     def process_tree(self, root):
         # First, make sure no PUNCT has children
@@ -87,6 +88,11 @@ def process_tree(self, root):
             if another_node.parent != root and another_node.udeprel == 'root':
                 another_node.udeprel = 'punct'
 
+        if self.copy_to_enhanced:
+            for node in root.descendants:
+                if node.upos == "PUNCT":
+                    node.deps = [{'parent': node.parent, 'deprel': 'punct'}]
+
     def _fix_subord_punct(self, node):
         # Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
         # TODO: detect these cases somehow

From d536e6ff3a3c95f413e48d5cd7512383939064b2 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 21 Mar 2018 20:58:37 +0100
Subject: [PATCH 0024/1201] conll2018 version of the evaluation script with MLAS and BLEX

---
 udapi/block/eval/conll18.py | 320 ++++++++++++++++++++++++++++++++++++
 1 file changed, 320 insertions(+)
 create mode 100644 udapi/block/eval/conll18.py

diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py
new file mode 100644
index 00000000..97bc68aa
--- /dev/null
+++ b/udapi/block/eval/conll18.py
@@ -0,0 +1,320 @@
+r"""Block&script eval.Conll18 for evaluating LAS,UAS,etc as in CoNLL2018 UD shared task.
+
+This is a reimplementation of the CoNLL2018 shared task official evaluation script,
+http://universaldependencies.org/conll18/evaluation.html
+
+The gold trees and predicted (system-output) trees need to be sentence-aligned
+e.g. using `util.ResegmentGold`.
+Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization.
+ +An example usage and output:: + + $ udapy read.Conllu zone=gold files=gold.conllu \ + read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \ + eval.Conll18 + Metric | Precision | Recall | F1 Score | AligndAcc + -----------+-----------+-----------+-----------+----------- + Words | 27.91 | 52.17 | 36.36 | 100.00 + UPOS | 27.91 | 52.17 | 36.36 | 100.00 + XPOS | 27.91 | 52.17 | 36.36 | 100.00 + Feats | 27.91 | 52.17 | 36.36 | 100.00 + Lemma | 27.91 | 52.17 | 36.36 | 100.00 + UAS | 16.28 | 30.43 | 21.21 | 58.33 + LAS | 16.28 | 30.43 | 21.21 | 58.33 + CLAS | 10.34 | 16.67 | 12.77 | 37.50 + + +For evaluating multiple systems and testsets (as in CoNLL2018) +stored in `systems/testset_name/system_name.conllu` you can use:: + + #!/bin/bash + SYSTEMS=`ls systems` + [[ $# -ne 0 ]] && SYSTEMS=$@ + set -x + set -e + for sys in $SYSTEMS; do + mkdir -p results/$sys + for testset in `ls systems/$sys`; do + udapy read.Conllu zone=gold files=gold/$testset \ + read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll18 print_results=0 print_raw=LAS \ + > results/$sys/${testset%.conllu} + done + done + python3 `python3 -c 'import udapi.block.eval.conll18 as x; print(x.__file__)'` -r 100 + +The last line executes this block as a script and computes bootstrap resampling with 100 resamples +(default=1000, it is recommended to keep the default or higher value unless testing the interface). +This prints the ranking and confidence intervals (95% by default) and also p-values for each +pair of systems with neighboring ranks. If the difference in LAS is significant +(according to a paired bootstrap test, by default if p < 0.05), +a line is printed between the two systems. + +The output looks like:: + + 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001 + ------------------------------------------------------------ + 2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001 + ------------------------------------------------------------ + 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001 + ------------------------------------------------------------ + 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001 + ------------------------------------------------------------ + 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001 + ------------------------------------------------------------ + 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001 + ------------------------------------------------------------ + 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002 + ------------------------------------------------------------ + 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001 + ------------------------------------------------------------ + 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003 + ------------------------------------------------------------ + 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448 + 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029 + ------------------------------------------------------------ + 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334 + 13. conll18-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003 + ------------------------------------------------------------ + 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062 + 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283 + 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121 + 17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445 + 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166 + 19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447 + 20. naistCL 67.49 ± 0.15 (67.34 .. 
67.63) +""" +import argparse +import difflib +import logging +import os +import random +import sys +from collections import Counter +from udapi.core.basewriter import BaseWriter + +CONTENT = {'nsubj', 'obj', 'iobj', 'csubj', 'ccomp', 'xcomp', 'obl', 'vocative', 'expl', + 'dislocated', 'advcl', 'advmod', 'discourse', 'nmod', 'appos', 'nummod', 'acl', + 'amod', 'conj', 'fixed', 'flat', 'compound', 'list', 'parataxis', 'orphan', 'goeswith', + 'reparandum', 'root', 'dep'} +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} +UNIV_FEATS = {'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr', 'Gender', 'Animacy', + 'Number', 'Case', 'Definite', 'Degree', 'VerbForm', 'Mood', 'Tense', 'Aspect', + 'Voice', 'Evident', 'Polarity', 'Person', 'Polite'} + +class Conll18(BaseWriter): + """Evaluate LAS, UAS, MLAS and BLEX.""" + + def __init__(self, gold_zone='gold', print_raw=False, print_results=True, **kwargs): + """Args: + gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")? + print_raw - Print raw counts (pred, gold, aligned, correct) for each sentence. + This is useful for bootstrap resampling post-processing to get confidence intervals. + The parameter print_raw specifies a given metric + (UAS, LAS, MLAS, BLEX, UPOS, XPOS, Feats, Lemma) or is 0 (or False) by default. + print_results - Print a table with overall results after all document are processed. + """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.total_count = Counter() + self.print_raw = print_raw + self.print_results = print_results + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + pred_forms = [n.form.lower() for n in pred_nodes] + gold_forms = [n.form.lower() for n in gold_nodes] + matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False) + aligned = [] + for diff in matcher.get_opcodes(): + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit == 'equal': + aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi])) + align_map = {tree: gold_tree} + for p_node, g_node in aligned: + align_map[p_node] = g_node + + count = Counter() + count['pred'] = len(pred_nodes) + count['gold'] = len(gold_nodes) + count['Words'] = len(aligned) + count['pred_cont'] = len([n for n in pred_nodes if n.udeprel in CONTENT]) + count['gold_cont'] = len([n for n in gold_nodes if n.udeprel in CONTENT]) + count['alig_cont'] = len([n for _, n in aligned if n.udeprel in CONTENT]) + + for p_node, g_node in aligned: + for attr in ('UPOS', 'XPOS', 'Feats', 'Lemma'): + if p_node.get_attrs([attr.lower()]) == g_node.get_attrs([attr.lower()]): + count[attr] += 1 + if align_map.get(p_node.parent) == g_node.parent: + count['UAS'] += 1 + if p_node.udeprel == g_node.udeprel: + count['LAS'] += 1 + if g_node.udeprel in CONTENT: + count['CLAS'] += 1 + if g_node.lemma == '_' or g_node.lemma == p_node.lemma: + count['BLEX'] += 1 + if self._morpho_match(p_node, g_node, align_map): + count['MLAS'] += 1 + self.total_count.update(count) + + if self.print_raw: + if self.print_raw in {'CLAS', 'BLEX', 'MLAS'}: + scores = [str(count[s]) for s in ('pred_cont', 'gold_cont', 'alig_cont', + self.print_raw)] + else: + scores = [str(count[s]) for s in ('pred', 'gold', 'Words', self.print_raw)] + print(' '.join(scores)) + + def _morpho_match(self, p_node, g_node, align_map, check_children=True): + if p_node.upos != g_node.upos: + return False 
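+        # Besides matching UPOS, all universal features must match as well: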
+ for feat in UNIV_FEATS: + if p_node.feats[feat] != g_node.feats[feat]: + return False + if check_children: + p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL] + g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] + if len(p_children) != len(g_children): + return False + for p_child, g_child in zip(p_children, g_children): + if align_map.get(p_child) != g_child: + return False + if p_child.udeprel != g_child.udeprel: + return False + if not self._morpho_match(p_child, g_child, None, check_children=False): + return False + return True + + def process_end(self): + if not self.print_results: + return + + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + metrics = ('Words', 'UPOS', 'XPOS', 'Feats', 'Lemma', 'UAS', 'LAS', 'CLAS', 'BLEX', 'MLAS') + print("Metric | Precision | Recall | F1 Score | AligndAcc") + print("-----------+-----------+-----------+-----------+-----------") + for metric in metrics: + if metric in {'CLAS', 'BLEX', 'MLAS'}: + pred, gold = self.total_count['pred_cont'], self.total_count['gold_cont'] + alig = self.total_count['alig_cont'] + else: + pred, gold = self.total_count['pred'], self.total_count['gold'] + alig = self.total_count['Words'] + correct = self.total_count[metric] + precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig) + print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{:10.2f}".format( + metric, 100 * precision, 100 * recall, 100 * fscore, 100 * alignacc)) + + +def prec_rec_f1(correct, pred, gold, alig=0): + precision = correct / pred if pred else 0 + recall = correct / gold if gold else 0 + alignacc = correct / alig if alig else 0 + fscore = 2 * correct / (pred + gold) if pred + gold else 0 + return precision, recall, fscore, alignacc + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dir_results", "-d", default="results", help="directory with results") + parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples") + parser.add_argument("--confidence", "-c", default=95, help="use x-percent confidence interval") + parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets") + parser.add_argument("--systems", "-s", default='all', help="comma-separated systems") + parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time") + args = parser.parse_args() + res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence + alpha = (1 - conf/100) / 2 + index_lo = int(alpha * (resamples - 1)) + index_hi = resamples - 1 - index_lo + index_mid = int(resamples / 2) + if args.systems == 'all': + systems = os.listdir(res_dir) + else: + systems = args.systems.split(',') + if args.tests == 'all': + tests = set() + for system in systems: + tests.update(os.listdir(res_dir + '/' + system)) + tests = sorted(tests) + else: + tests = args.tests.split(',') + if args.randseed: + random.seed(args.randseed) + results = [] + + print('Loading...', file=sys.stderr) + for system in systems: + sys_results = [] + results.append(sys_results) + for i_test, test in enumerate(tests): + filename = '/'.join((res_dir, system, test)) + try: + with open(filename) as res_file: + sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file]) + except FileNotFoundError: + logging.warning(filename + ' not found') + samples = len(sys_results) + + print('Resampling...', file=sys.stderr) + boot_results = [] + for i_resample in range(resamples): + 
print(i_resample + 1, file=sys.stderr, end='\r') + resample_results = [] + boot_results.append(resample_results) + for i_system in range(len(systems)): + pred, gold, words, correct = ([0] * len(tests) for _ in range(4)) + for _ in range(samples): + i_test, pre, gol, wor, corr = random.choice(results[i_system]) + pred[i_test] += pre + gold[i_test] += gol + words[i_test] += wor + correct[i_test] += corr + fscore_sum = 0 + for i_test in range(len(tests)): + _prec, _rec, fscore, _aligacc = prec_rec_f1(correct[i_test], pred[i_test], gold[i_test]) + fscore_sum += fscore + resample_results.append(fscore_sum / len(tests)) + print('\n', file=sys.stderr) + + sys_fscores = [] + for i_system, system in enumerate(systems): + sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)]) + final_results = [] + sys_sys_wins = [[0] * len(systems) for x in range(len(systems))] + for i_system, system in enumerate(systems): + for j_system in range(i_system): + for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]): + if i > j: + sys_sys_wins[i_system][j_system] += 1 + elif i < j: + sys_sys_wins[j_system][i_system] += 1 + fscores = sorted(sys_fscores[i_system]) + final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]]) + + sorted_systems = sorted(final_results, key=lambda x: -x[1]) + for rank, sys_results in enumerate(sorted_systems): + i_system, f1_mid, f1_lo, f1_hi = sys_results + if rank < len(systems) - 1: + j_worse_sys = sorted_systems[rank + 1][0] + p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1) + p_str = " p=%.3f" % p_value + else: + p_value, p_str = 1, "" + print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" % + (rank + 1, systems[i_system], + 100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str)) + if p_value < (1 - conf/100): + print('-' * 60) + + +if __name__ == "__main__": + main() From c8fbca0a38e8ce741f5fda4a9b3778f186ce41a1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 3 Apr 2018 19:19:35 +0200 Subject: [PATCH 0025/1201] add language param for AddCommas, add RemoveCommas --- udapi/block/tutorial/addcommas.py | 15 ++++++++++++++- udapi/block/tutorial/removecommas.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 udapi/block/tutorial/removecommas.py diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py index ccc26a66..de707094 100644 --- a/udapi/block/tutorial/addcommas.py +++ b/udapi/block/tutorial/addcommas.py @@ -1,10 +1,23 @@ """tutorial.AddCommas block template.""" from udapi.core.block import Block +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. class AddCommas(Block): """Heuristically insert nodes for missing commas.""" + def __init__(self, language='en', **kwargs): + """Create the AddCommas block object. 
+ + Args: + `language`: which language-specific rules to use ('en' or 'cs') + """ + super().__init__(**kwargs) + self.language = language + def process_node(self, node): if self.should_add_comma_before(node): comma = node.create_child(form=',', deprel='punct', upos='PUNCT') @@ -15,7 +28,7 @@ def should_add_comma_before(self, node): prev_node = node.prev_node if prev_node is None: return False - if prev_node.lemma == 'however': + if self.language == 'en' and node.lemma == 'however': return True if any(n.deprel == 'appos' for n in prev_node.children): return True diff --git a/udapi/block/tutorial/removecommas.py b/udapi/block/tutorial/removecommas.py new file mode 100644 index 00000000..a07e2bba --- /dev/null +++ b/udapi/block/tutorial/removecommas.py @@ -0,0 +1,13 @@ +"""tutorial.RemoveCommas helper block.""" +from udapi.core.block import Block + + +class RemoveCommas(Block): + """Delete all comma nodes and edit SpaceAfter and text accordingly.""" + + def process_tree(self, root): + for node in root.descendants: + if node.form == ",": + node.remove(children="rehang") + del node.prev_node.misc['SpaceAfter'] + root.text = root.compute_text() From 9a8015c3124467e3ddfad3a5d8ef569d0ab425f1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 4 Apr 2018 11:21:31 +0200 Subject: [PATCH 0026/1201] Thai also should use compound:prt --- udapi/block/ud/google2ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 453bb9c0..3ba20c5c 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -498,7 +498,7 @@ def fix_deprel(self, node): if self.lang == 'fr' and node.parent.form in {'M.', 'Mme', 'Dr'}: node.deprel = 'flat:name' elif node.deprel == 'prt': - if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no'}: + if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no', 'th'}: node.deprel = 'compound:prt' elif self.lang == 'tr': node.deprel = 'advmod:emph' From 984a9ea5c1f98ee5aba2fea4e762bd54b953f213 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 10 Apr 2018 17:55:01 +0200 Subject: [PATCH 0027/1201] fix bug (prev_node instead of node) introduced in my last commit --- udapi/block/tutorial/addcommas.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py index de707094..97677d89 100644 --- a/udapi/block/tutorial/addcommas.py +++ b/udapi/block/tutorial/addcommas.py @@ -10,25 +10,20 @@ class AddCommas(Block): """Heuristically insert nodes for missing commas.""" def __init__(self, language='en', **kwargs): - """Create the AddCommas block object. 
-
-        Args:
-            `language`: which language-specific rules to use ('en' or 'cs')
-        """
         super().__init__(**kwargs)
         self.language = language
 
     def process_node(self, node):
+        # TODO: Your task: implement some heuristics
         if self.should_add_comma_before(node):
             comma = node.create_child(form=',', deprel='punct', upos='PUNCT')
             comma.shift_before_node(node)
 
     def should_add_comma_before(self, node):
-        # TODO: Your task: implement some heuristics
         prev_node = node.prev_node
         if prev_node is None:
             return False
-        if self.language == 'en' and node.lemma == 'however':
+        if self.language == 'en' and prev_node.lemma == 'however':
             return True
         if any(n.deprel == 'appos' for n in prev_node.children):
             return True

From 6d8ef782f0e4fc47b0b4a412eecffad11b16b17c Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sun, 15 Apr 2018 17:45:51 +0200
Subject: [PATCH 0028/1201] fix edge cases in util.ResegmentGold and eval.Conll18

goal: make eval.Conll18's output more similar to the official script
---
 udapi/block/eval/conll18.py       | 66 ++++++++++++++++++------------------
 udapi/block/util/resegmentgold.py | 17 +++++++-
 2 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py
index 97bc68aa..0df41b46 100644
--- a/udapi/block/eval/conll18.py
+++ b/udapi/block/eval/conll18.py
@@ -106,7 +106,8 @@ class Conll18(BaseWriter):
     """Evaluate LAS, UAS, MLAS and BLEX."""
 
-    def __init__(self, gold_zone='gold', print_raw=False, print_results=True, **kwargs):
+    def __init__(self, gold_zone='gold', print_raw=False, print_results=True, print_counts=False,
+                 **kwargs):
         """Args:
         gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")?
         print_raw - Print raw counts (pred, gold, aligned, correct) for each sentence.
             This is useful for bootstrap resampling post-processing to get confidence intervals.
             The parameter print_raw specifies a given metric
             (UAS, LAS, MLAS, BLEX, UPOS, XPOS, Feats, Lemma) or is 0 (or False) by default.
         print_results - Print a table with overall results after all document are processed.
+        print_counts - Print counts of correct/gold/system instead of prec/rec/f1 for all metrics.
""" super().__init__(**kwargs) self.gold_zone = gold_zone self.total_count = Counter() self.print_raw = print_raw self.print_results = print_results + self.print_counts = print_counts + + def _ufeats(self, feats): + return '|'.join(sorted(x for x in feats.split('|') if x.split('=', 1)[0] in UNIV_FEATS)) def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -135,9 +141,10 @@ def process_tree(self, tree): edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi])) - align_map = {tree: gold_tree} + align_map, feats_match = {tree: gold_tree}, {} for p_node, g_node in aligned: align_map[p_node] = g_node + feats_match[p_node] = self._ufeats(str(p_node.feats)) == self._ufeats(str(g_node.feats)) count = Counter() count['pred'] = len(pred_nodes) @@ -148,9 +155,10 @@ def process_tree(self, tree): count['alig_cont'] = len([n for _, n in aligned if n.udeprel in CONTENT]) for p_node, g_node in aligned: - for attr in ('UPOS', 'XPOS', 'Feats', 'Lemma'): - if p_node.get_attrs([attr.lower()]) == g_node.get_attrs([attr.lower()]): - count[attr] += 1 + count['UPOS'] += 1 if p_node.upos == g_node.upos else 0 + count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0 + count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0 + count['UFeats'] += 1 if feats_match[p_node] else 0 if align_map.get(p_node.parent) == g_node.parent: count['UAS'] += 1 if p_node.udeprel == g_node.udeprel: @@ -159,7 +167,7 @@ def process_tree(self, tree): count['CLAS'] += 1 if g_node.lemma == '_' or g_node.lemma == p_node.lemma: count['BLEX'] += 1 - if self._morpho_match(p_node, g_node, align_map): + if self._morpho_match(p_node, g_node, align_map, feats_match): count['MLAS'] += 1 self.total_count.update(count) @@ -171,24 +179,20 @@ def process_tree(self, tree): scores = [str(count[s]) for s in ('pred', 'gold', 'Words', self.print_raw)] print(' '.join(scores)) - def _morpho_match(self, p_node, g_node, align_map, check_children=True): - if p_node.upos != g_node.upos: + def _morpho_match(self, p_node, g_node, align_map, feats_match): + if p_node.upos != g_node.upos or not feats_match[p_node]: + return False + p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL] + g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] + if len(p_children) != len(g_children): return False - for feat in UNIV_FEATS: - if p_node.feats[feat] != g_node.feats[feat]: + for p_child, g_child in zip(p_children, g_children): + if align_map.get(p_child) != g_child: return False - if check_children: - p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL] - g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] - if len(p_children) != len(g_children): + if p_child.udeprel != g_child.udeprel: + return False + if p_child.upos != g_child.upos or not feats_match[p_child]: return False - for p_child, g_child in zip(p_children, g_children): - if align_map.get(p_child) != g_child: - return False - if p_child.udeprel != g_child.udeprel: - return False - if not self._morpho_match(p_child, g_child, None, check_children=False): - return False return True def process_end(self): @@ -198,20 +202,28 @@ def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) - metrics = ('Words', 'UPOS', 'XPOS', 'Feats', 'Lemma', 'UAS', 'LAS', 'CLAS', 'BLEX', 'MLAS') - print("Metric | Precision | Recall | F1 Score | AligndAcc") + metrics = 
('Words', 'UPOS', 'XPOS', 'UFeats', 'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX') + if self.print_counts: + print("Metric | Correct | Gold | Predicted | Aligned") + else: + print("Metric | Precision | Recall | F1 Score | AligndAcc") print("-----------+-----------+-----------+-----------+-----------") for metric in metrics: + correct = self.total_count[metric] if metric in {'CLAS', 'BLEX', 'MLAS'}: pred, gold = self.total_count['pred_cont'], self.total_count['gold_cont'] alig = self.total_count['alig_cont'] else: pred, gold = self.total_count['pred'], self.total_count['gold'] alig = self.total_count['Words'] - correct = self.total_count[metric] - precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig) - print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{:10.2f}".format( - metric, 100 * precision, 100 * recall, 100 * fscore, 100 * alignacc)) + if self.print_counts: + print("{:11}|{:10} |{:10} |{:10} |{:10}".format( + metric, correct, gold, pred, alig)) + else: + precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig) + alignacc = "{:10.2f}".format(100 * alignacc) if metric != 'Words' else "" + print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( + metric, 100 * precision, 100 * recall, 100 * fscore, alignacc)) def prec_rec_f1(correct, pred, gold, alig=0): diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index 39ebe6e9..654ec524 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -76,9 +76,22 @@ def process_document(self, document): words.extend(token.words) else: words.append(token) + next_p_subroot = None + for word in words: + if word.parent == word.root: + next_p_subroot = word + if word.deprel.startswith('wrong-'): + word.deprel = word.deprel[6:] next_p_tree.steal_nodes(words) self.choose_root(p_tree, g_tree) - self.choose_root(next_p_tree, document.bundles[bundle_no + 1].trees[0]) + next_p_subroots = next_p_tree.children + if len(next_p_subroots) > 1: + if next_p_subroot: + for false_subroot in (n for n in next_p_subroots if n != next_p_subroot): + false_subroot.parent = next_p_subroot + false_subroot.deprel = 'wrong-' + false_subroot.deprel + else: + self.choose_root(next_p_tree, document.bundles[bundle_no + 1].trees[0]) pred_trees.append(next_p_tree) bundle.add_tree(p_tree) break @@ -107,7 +120,7 @@ def choose_root(p_tree, g_tree): """Prevent multiple roots, which are forbidden in the evaluation script.""" p_subroots = p_tree.children if len(p_subroots) > 1: - g_subroot_form = g_tree.children[0] + g_subroot_form = g_tree.children[0].form p_subroot = next((n for n in p_subroots if n.form == g_subroot_form), p_subroots[0]) for false_subroot in (n for n in p_subroots if n != p_subroot): false_subroot.parent = p_subroot From 9f4730748705c8c1fbc5de569829fd94f7a0a156 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 16 Apr 2018 02:03:44 +0200 Subject: [PATCH 0029/1201] util.ResegmentGold should not change any deprel it should use `misc['Rehanged']` instead to mark nodes which should not be counted as correct in UAS etc. even if they have the correct parent. 
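
For illustration, the intended contract (a sketch stitched together from the
lines in the diff below):

    # util.ResegmentGold marks nodes whose attachment it had to invent:
    node.misc['Rehanged'] = 'Yes'
    # eval.Conll18 then does not count such nodes as correctly attached:
    if align_map.get(p_node.parent) == g_node.parent and not p_node.misc['Rehanged']:
        count['UAS'] += 1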
--- udapi/block/eval/conll18.py | 4 +-- udapi/block/util/resegmentgold.py | 48 ++++++++++++++++--------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py index 0df41b46..70e6de24 100644 --- a/udapi/block/eval/conll18.py +++ b/udapi/block/eval/conll18.py @@ -159,7 +159,7 @@ def process_tree(self, tree): count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0 count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0 count['UFeats'] += 1 if feats_match[p_node] else 0 - if align_map.get(p_node.parent) == g_node.parent: + if align_map.get(p_node.parent) == g_node.parent and not p_node.misc['Rehanged']: count['UAS'] += 1 if p_node.udeprel == g_node.udeprel: count['LAS'] += 1 @@ -182,7 +182,7 @@ def process_tree(self, tree): def _morpho_match(self, p_node, g_node, align_map, feats_match): if p_node.upos != g_node.upos or not feats_match[p_node]: return False - p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL] + p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL and not c.misc['Rehanged']] g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] if len(p_children) != len(g_children): return False diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index 654ec524..454eae79 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -22,6 +22,10 @@ def process_document(self, document): if not document.bundles: return pred_trees = self.extract_pred_trees(document) + was_subroot = set() + for pred_tree in pred_trees: + for n in pred_tree.children: + was_subroot.add(n) for bundle_no, bundle in enumerate(document.bundles): g_tree = bundle.trees[0] @@ -33,13 +37,16 @@ def process_document(self, document): continue # Make sure that p_tree contains enough nodes. 
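+        # moved_roots collects the subroots of the pred trees merged into p_tree below.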
+ moved_roots = [] while len(p_chars) < len(g_chars): if not pred_trees: raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars)) new_p_tree = pred_trees.pop() p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '') + moved_roots.extend(new_p_tree.children) p_tree.steal_nodes(new_p_tree.descendants) - self.choose_root(p_tree, g_tree) + self.choose_root(p_tree, was_subroot, g_tree) + if not p_chars.startswith(g_chars): raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s' % (g_tree.sent_id, p_chars, g_chars)) @@ -61,7 +68,8 @@ def process_document(self, document): if index + 1 == len(tokens): next_p_tree = Root(zone=p_tree.zone) pred_trees.append(next_p_tree) - next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):]) + next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):], + misc='Rehanged=Yes') bundle.add_tree(p_tree) break else: @@ -76,22 +84,12 @@ def process_document(self, document): words.extend(token.words) else: words.append(token) - next_p_subroot = None for word in words: - if word.parent == word.root: - next_p_subroot = word - if word.deprel.startswith('wrong-'): - word.deprel = word.deprel[6:] + if word in was_subroot: + del word.misc['Rehanged'] next_p_tree.steal_nodes(words) - self.choose_root(p_tree, g_tree) - next_p_subroots = next_p_tree.children - if len(next_p_subroots) > 1: - if next_p_subroot: - for false_subroot in (n for n in next_p_subroots if n != next_p_subroot): - false_subroot.parent = next_p_subroot - false_subroot.deprel = 'wrong-' + false_subroot.deprel - else: - self.choose_root(next_p_tree, document.bundles[bundle_no + 1].trees[0]) + self.choose_root(p_tree, was_subroot, g_tree) + self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0]) pred_trees.append(next_p_tree) bundle.add_tree(p_tree) break @@ -116,12 +114,16 @@ def extract_pred_trees(self, document): return pred_trees @staticmethod - def choose_root(p_tree, g_tree): + def choose_root(p_tree, was_subroot, g_tree): """Prevent multiple roots, which are forbidden in the evaluation script.""" - p_subroots = p_tree.children - if len(p_subroots) > 1: + possible_subroots = [n for n in p_tree.children if n in was_subroot] + if possible_subroots: g_subroot_form = g_tree.children[0].form - p_subroot = next((n for n in p_subroots if n.form == g_subroot_form), p_subroots[0]) - for false_subroot in (n for n in p_subroots if n != p_subroot): - false_subroot.parent = p_subroot - false_subroot.deprel = 'wrong-' + false_subroot.deprel + the_subroot = next((n for n in possible_subroots if n.form == g_subroot_form), possible_subroots[0]) + else: + the_subroot = p_tree.children[0] + the_subroot.misc['Rehanged'] = 'Yes' + for subroot in p_tree.children: + if subroot is not the_subroot: + subroot.parent = the_subroot + subroot.misc['Rehanged'] = 'Yes' From 8ca27c47b6b681c49151f9acaeac2804ad597ba1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 16 Apr 2018 16:32:57 +0200 Subject: [PATCH 0030/1201] add nickname to hw templates --- udapi/block/tutorial/addarticles.py | 4 ++++ udapi/block/tutorial/parse.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/udapi/block/tutorial/addarticles.py b/udapi/block/tutorial/addarticles.py index 20a4295f..96f0ba2f 100644 --- a/udapi/block/tutorial/addarticles.py +++ b/udapi/block/tutorial/addarticles.py @@ -1,4 +1,8 @@ """tutorial.AddArticles block template.""" +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want 
your results to be listed on the NPFL070 web (under that nickname).
+# Delete the line if you don't want to be listed on the web.
 from udapi.core.block import Block

 class AddArticles(Block):
diff --git a/udapi/block/tutorial/parse.py b/udapi/block/tutorial/parse.py
index 77928782..db732a12 100644
--- a/udapi/block/tutorial/parse.py
+++ b/udapi/block/tutorial/parse.py
@@ -9,11 +9,19 @@
 util.MarkDiff gold_zone=gold \
 write.TextModeTreesHtml marked_only=1 files=parse-diff.html
 """
+# nickname = xy123
+# TODO: make up a unique nickname and edit the previous line
+# if you want your results to be listed on the NPFL070 web (under that nickname).
+# Delete the line if you don't want to be listed on the web.
 from udapi.core.block import Block

 class Parse(Block):
 """Dependency parsing."""

+ def __init__(self, language='en', **kwargs):
+ super().__init__(**kwargs)
+ self.language = language
+
 def process_tree(self, root):
 # TODO: Your task: implement better heuristics than "right chain"
 for node in root.descendants:
From d3efc9e0ba7cc6463f7e81b0990bd8372b745248 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 17 Apr 2018 17:56:29 +0200
Subject: [PATCH 0031/1201] fix an edge case with the same form roots

Two clauses (merged into one sentence in the gold data)
may have the same subroot word form. Luke 9,48
---
 udapi/block/util/resegmentgold.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py
index 454eae79..2af649ca 100644
--- a/udapi/block/util/resegmentgold.py
+++ b/udapi/block/util/resegmentgold.py
@@ -118,8 +118,11 @@ def choose_root(p_tree, was_subroot, g_tree):
 """Prevent multiple roots, which are forbidden in the evaluation script."""
 possible_subroots = [n for n in p_tree.children if n in was_subroot]
 if possible_subroots:
- g_subroot_form = g_tree.children[0].form
- the_subroot = next((n for n in possible_subroots if n.form == g_subroot_form), possible_subroots[0])
+ the_subroot = possible_subroots[0]
+ g_subroot = g_tree.children[0]
+ possible_subroots = sorted([n for n in possible_subroots if n.form == g_subroot.form],
+ key=lambda n: abs(n.ord - g_subroot.ord))
+ the_subroot = possible_subroots[0] if possible_subroots else the_subroot
 else:
 the_subroot = p_tree.children[0]
 the_subroot.misc['Rehanged'] = 'Yes'
From a2acdec90fad67bc5efac4e16dd584cff83b7a80 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 17 Apr 2018 18:53:19 +0200
Subject: [PATCH 0032/1201] eval.conll18 reports AllTags and is more similar
 to the official evaluation script
---
 udapi/block/eval/conll18.py | 9 +++++++--
 udapi/block/util/resegmentgold.py | 7 +++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py
index 70e6de24..72612832 100644
--- a/udapi/block/eval/conll18.py
+++ b/udapi/block/eval/conll18.py
@@ -11,6 +11,7 @@
 $ udapy read.Conllu zone=gold files=gold.conllu \
 read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \
+ util.ResegmentGold \
 eval.Conll18
 Metric | Precision | Recall | F1 Score | AligndAcc
 -----------+-----------+-----------+-----------+-----------
@@ -159,6 +160,8 @@ def process_tree(self, tree):
 count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0
 count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0
 count['UFeats'] += 1 if feats_match[p_node] else 0
+ if feats_match[p_node] and p_node.upos == g_node.upos and p_node.xpos == g_node.xpos:
+ count['AllTags'] += 1
 if align_map.get(p_node.parent)
== g_node.parent and not p_node.misc['Rehanged']: count['UAS'] += 1 if p_node.udeprel == g_node.udeprel: @@ -168,7 +171,8 @@ def process_tree(self, tree): if g_node.lemma == '_' or g_node.lemma == p_node.lemma: count['BLEX'] += 1 if self._morpho_match(p_node, g_node, align_map, feats_match): - count['MLAS'] += 1 + if not p_node.misc['FuncChildMissing']: + count['MLAS'] += 1 self.total_count.update(count) if self.print_raw: @@ -202,7 +206,8 @@ def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) - metrics = ('Words', 'UPOS', 'XPOS', 'UFeats', 'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX') + metrics = ('Words', 'UPOS', 'XPOS', 'UFeats', 'AllTags', + 'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX') if self.print_counts: print("Metric | Correct | Gold | Predicted | Aligned") else: diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index 2af649ca..d1499cb7 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -4,6 +4,7 @@ from udapi.core.mwt import MWT from udapi.core.root import Root +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} class ResegmentGold(Block): """Sentence-align two zones (gold and pred) and resegment the pred zone. @@ -87,6 +88,12 @@ def process_document(self, document): for word in words: if word in was_subroot: del word.misc['Rehanged'] + if word.parent is not p_tree and word.parent not in words: + if word.udeprel in FUNCTIONAL: + word.parent.misc['FuncChildMissing'] = 'Yes' + for child in word.children: + if child not in words and child.udeprel in FUNCTIONAL: + word.misc['FuncChildMissing'] = 'Yes' next_p_tree.steal_nodes(words) self.choose_root(p_tree, was_subroot, g_tree) self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0]) From 50522c20c33ddeacfd326c8afeb4198cb9a8d728 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 May 2018 16:12:12 +0200 Subject: [PATCH 0033/1201] minor edit --- udapi/block/util/resegmentgold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index d1499cb7..a93f01f4 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -122,7 +122,7 @@ def extract_pred_trees(self, document): @staticmethod def choose_root(p_tree, was_subroot, g_tree): - """Prevent multiple roots, which are forbidden in the evaluation script.""" + """Prevent multiple roots, which are forbidden in CoNLL-U.""" possible_subroots = [n for n in p_tree.children if n in was_subroot] if possible_subroots: the_subroot = possible_subroots[0] From 812cd0f71a5667733b8cbbf756ac5aba12f04d84 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 May 2018 16:13:41 +0200 Subject: [PATCH 0034/1201] update usage in demo (filename->files) fixes #48 --- demo/python-demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/python-demo.sh b/demo/python-demo.sh index aefa17cf..d83e51d9 100755 --- a/demo/python-demo.sh +++ b/demo/python-demo.sh @@ -3,4 +3,4 @@ export PATH=../bin:$PATH export PYTHONPATH=../:$PYTHONPATH -udapy read.Conllu filename=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu +udapy read.Conllu files=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu From 6b7e808538754638971cba49f56a69be207ce6fd Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 31 May 2018 22:06:18 +0200 Subject: [PATCH 
0035/1201] allow to specify encoding for stdin
---
 udapi/core/files.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/udapi/core/files.py b/udapi/core/files.py
index 523fe303..7a711dfe 100644
--- a/udapi/core/files.py
+++ b/udapi/core/files.py
@@ -1,6 +1,7 @@
 """Files is a helper class for iterating over filenames."""
 import glob
+import io
 import sys
 import os.path
 import bz2
@@ -109,7 +110,7 @@ def next_filehandle(self):
 if filename is None:
 fhandle = None
 elif filename == '-':
- fhandle = sys.stdin
+ fhandle = io.TextIOWrapper(sys.stdin.buffer, encoding=self.encoding)
 elif filename == '':
 fhandle = self.filehandle
 else:
From 4d7c87f0672d809f0367f6cf6eefb90a746b51e5 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 31 May 2018 22:07:09 +0200
Subject: [PATCH 0036/1201] ignore BOM when reading utf8 files

Windows editors like to include BOM although it is deprecated for UTF8.
Ignoring BOM is better than throwing an error and suggesting that Windows users
(or anyone who has such BOM files) use

 sed -i '1s/^\xEF\xBB\xBF//' file.conllu

When writing conllu, we won't insert BOM, of course.
---
 udapi/core/basereader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py
index 9e3b6488..b5e159a2 100644
--- a/udapi/core/basereader.py
+++ b/udapi/core/basereader.py
@@ -12,7 +12,7 @@ class BaseReader(Block):
 """Base class for all reader blocks."""
 # pylint: disable=too-many-arguments
- def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8',
+ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig',
 sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs):
 super().__init__(**kwargs)
 if filehandle is not None:
From 471578e011799805a28cb3abc3078565a6240240 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 1 Jun 2018 01:32:19 +0200
Subject: [PATCH 0037/1201] getter should not change sent_id

`root._bundle` can be None only when loading/creating a tree.
During that process someone may call `node.address()`
(e.g. via `node.__str__` in debug prints or warnings).
But `node.address()` calls `node.root.address()` and that calls the `root.sent_id` getter.
If this getter sets `root._sent_id` to `'?' + zone`, the loading may fail
with strange unexpected results, because the loading uses `root._sent_id`
when loading CoNLL-U which contains sent_id in comments,
but having all sent_id='?' results in adding all trees into the same bundle
and failing because a tree with zone '?' already exists in that bundle.
---
 udapi/core/root.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/core/root.py b/udapi/core/root.py
index ae259718..7944cd55 100644
--- a/udapi/core/root.py
+++ b/udapi/core/root.py
@@ -48,7 +48,7 @@ def sent_id(self):
 if self._bundle is not None:
 self._sent_id = self._bundle.address() + zone
 else:
- self._sent_id = '?' + zone
+ return '?' + zone
 return self._sent_id

 @sent_id.setter
From 5918fbd4e084e38a14641123c923d3efa4d0d9d7 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 1 Jun 2018 02:14:04 +0200
Subject: [PATCH 0038/1201] an option to "fix" cycles in CoNLL-U by attaching
 to root

Usage: cat with_cycles.conllu | udapy -s read.Conllu fix_cycles=1 > fixed.conllu

Of course, this is not the proper correction, just a work-around
(UD guidelines do not allow multiple nodes attached to root).
This is useful e.g.
when you don't have time to fix the cycles manually and just want to extract some info from the invalid conllu. This way at least you can load the file into Udapi and e.g. try to fix the multiple roots automatically or detect if there are any other problems. --- udapi/block/read/conllu.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index a2fd0406..1b3383fb 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -18,7 +18,7 @@ class Conllu(BaseReader): """A reader of the CoNLL-U files.""" - def __init__(self, strict=False, separator='tab', empty_parent='warn', + def __init__(self, strict=False, separator='tab', empty_parent='warn', fix_cycles=False, attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): """Create the Conllu reader object. @@ -54,6 +54,7 @@ def __init__(self, strict=False, separator='tab', empty_parent='warn', self.strict = strict self.separator = separator self.empty_parent = empty_parent + self.fix_cycles = fix_cycles @staticmethod def parse_comment_line(line, root): @@ -174,6 +175,13 @@ def read_tree(self): for node_ord, node in enumerate(nodes[1:], 1): try: node.parent = nodes[parents[node_ord]] + # TODO add a special Exception class for cycles + except ValueError as e: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", e) + node.parent = root + else: + raise except IndexError: raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) From de8d5294f77c76c6ef7a55ffadb56bb4d82a281c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 12 Jun 2018 15:27:59 +0200 Subject: [PATCH 0039/1201] Let UDPipe ignore trees with no nodes fixes #49 --- udapi/tool/udpipe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index 8fe024c7..5d17ae97 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -22,14 +22,17 @@ def __init__(self, model): def tag_parse_tree(self, root): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + descendants = root.descendants + if not descendants: + return pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') - in_data = " ".join([n.form for n in root.descendants]) + in_data = " ".join([n.form for n in descendants]) out_data = pipeline.process(in_data, self.error) if self.error.occurred(): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = self.conllu_reader.read_tree() - nodes = [root] + root.descendants + nodes = [root] + descendants for parsed_node in parsed_root.descendants: node = nodes[parsed_node.ord] node.parent = nodes[parsed_node.parent.ord] From f41031def5a4a9bd3646c2d9d3ad2e7ed63a023f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 12 Jun 2018 16:03:55 +0200 Subject: [PATCH 0040/1201] options how to handle empty lines/trees read.Sentences ignore_empty_lines=1 # skip empty lines in the input All Udapi blocks now also have parameter `if_empty_tree`, with possible values: process (default), skip, skip_warn, fail, delete. 
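
A usage sketch (illustrative only; `ignore_empty_lines` and `if_empty_tree` are
the parameters introduced below, but `MyBlock` is a made-up example block):

    from udapi.core.block import Block

    class MyBlock(Block):
        def process_tree(self, tree):
            # With if_empty_tree='skip' or 'skip_warn', trees without nodes
            # never reach this method, so indexing the first node is safe.
            print(tree.descendants[0].form)

    block = MyBlock(if_empty_tree='skip_warn')  # warn about and skip empty trees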
---
 udapi/block/read/sentences.py | 12 ++++++++++++
 udapi/core/block.py | 25 +++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py
index 14840a50..c3a02ddd 100644
--- a/udapi/block/read/sentences.py
+++ b/udapi/block/read/sentences.py
@@ -6,6 +6,10 @@
 class Sentences(BaseReader):
 """A reader for plain-text sentence files (one sentence per line)."""

+ def __init__(self, ignore_empty_lines=False, **kwargs):
+ self.ignore_empty_lines = ignore_empty_lines
+ super().__init__(**kwargs)
+
 @staticmethod
 def is_multizone_reader():
 """Can this reader read bundles which contain more zones?
@@ -18,8 +22,16 @@ def read_tree(self, document=None):
 if self.filehandle is None:
 return None
 line = self.filehandle.readline()
+ # if readline() returns an empty string, the end of the file has been
+ # reached, while a blank line is represented by '\n'
+ # (or '\r\n' if reading a Windows file on a Unix machine).
 if line == '':
 return None
+ if self.ignore_empty_lines:
+ while line in {'\n', '\r\n'}:
+ line = self.filehandle.readline()
+ if line == '':
+ return None
 root = Root()
 root.text = line.rstrip()
 return root
diff --git a/udapi/core/block.py b/udapi/core/block.py
index 4ddbaf2b..67c299f0 100644
--- a/udapi/core/block.py
+++ b/udapi/core/block.py
@@ -3,10 +3,17 @@

 class Block(object):
- """The smallest processing unit for processing Universal Dependencies data."""
+ """The smallest processing unit for processing Universal Dependencies data.

- def __init__(self, zones='all'):
+ Parameters:
+ zones: which zone to process (default="all")
+ if_empty_tree: what to do when encountering a tree with no nodes.
+ Possible values are: process (default), skip, skip_warn, fail, delete.
+ """
+
+ def __init__(self, zones='all', if_empty_tree='process'):
 self.zones = zones
+ self.if_empty_tree = if_empty_tree

 def process_start(self):
 """A hook method that is executed before processing UD data."""
@@ -52,6 +59,20 @@ def after_process_document(self, document):
 pass

 def _should_process_tree(self, tree):
+ if self.if_empty_tree != 'process' and not tree.descendants:
+ if self.if_empty_tree == 'skip':
+ return False
+ elif self.if_empty_tree == 'delete':
+ tree.remove()
+ return False
+ elif self.if_empty_tree == 'skip_warn':
+ logging.warning("Tree %s is empty", tree)
+ return False
+ elif self.if_empty_tree == 'fail':
+ raise Exception("Tree %s is empty" % tree)
+ else:
+ raise ValueError("Unknown value for if_empty_tree: "
+ + self.if_empty_tree)
 if self.zones == 'all':
 return True
 if self.zones == '' and tree.zone == '':
From c8741af269105023564ccfe4c0f2509eedd824f2 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 23 Jun 2018 01:11:28 +0200
Subject: [PATCH 0041/1201] fix url of FileSaver.min.js
---
 udapi/block/write/html.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py
index b06c971a..148b29ee 100644
--- a/udapi/block/write/html.py
+++ b/udapi/block/write/html.py
@@ -14,7 +14,7 @@ class Html(BaseWriter):
 For offline use, we first need to download three JavaScript libraries::

 wget https://code.jquery.com/jquery-2.1.4.min.js
- wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js
+ wget https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js
 wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js
 udapy write.Html path_to_js=.
< file.conllu > file.html
 firefox file.html
@@ -65,7 +65,7 @@ def process_document(self, doc):
 if self.path_to_js == 'web':
 jquery = 'https://code.jquery.com/jquery-2.1.4.min.js'
- fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js'
+ fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js'
 js_t_v = 'https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js'
 else:
 jquery = self.path_to_js + '/jquery-2.1.4.min.js'
From 3814dd2ef06fb33d3b384db327f2c698660e1d2d Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 25 Jun 2018 00:10:58 +0200
Subject: [PATCH 0042/1201] write.Tikz as_tree=1
---
 udapi/block/write/tikz.py | 58 +++++++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py
index 19480e23..802487b5 100644
--- a/udapi/block/write/tikz.py
+++ b/udapi/block/write/tikz.py
@@ -10,6 +10,8 @@ class Tikz(BaseWriter):
 Usage::

 udapy write.Tikz < my.conllu > my.tex
+ # or for 2D tree-like rendering
+ udapy write.Tikz as_tree=1 < my.conllu > my.tex
 pdflatex my.tex
 xdg-open my.pdf
@@ -26,13 +28,18 @@ class Tikz(BaseWriter):
 `_ for details.

+ With ``as_tree=1``, there are two options for visualizing deprels:
+ either as labels positioned on the edges by uncommenting the relevant style definition,
+ or by adding ``deprel`` to the list of attributes, so deprels are above/below the words.
+ The latter is the default because the edge labels need manual tweaks to prevent overlapping.
+
 Alternatives:

 * use `write.TextModeTrees` and include it in a verbatim environment in LaTeX.
 * use `write.Html`, press "Save as SVG" button, convert to pdf and include in LaTeX.
 """

 def __init__(self, print_sent_id=True, print_text=True, print_preambule=True,
- attributes='form,upos', **kwargs):
+ attributes=None, as_tree=False, **kwargs):
 """Create the Tikz block object.
Args: @@ -46,15 +53,31 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, self.print_sent_id = print_sent_id self.print_text = print_text self.print_preambule = print_preambule - self.node_attributes = attributes.split(',') + if attributes is not None: + self.node_attributes = attributes.split(',') + elif as_tree: + self.node_attributes = 'form,upos,deprel'.split(',') + else: + self.node_attributes = 'form,upos'.split(',') + self.as_tree = as_tree def before_process_document(self, doc): super().before_process_document(doc) if self.print_preambule: - print(r'\documentclass{article}') + print(r'\documentclass[multi=dependency]{standalone}') print(r'\usepackage[T1]{fontenc}') print(r'\usepackage[utf8]{inputenc}') print(r'\usepackage{tikz-dependency}') + if self.as_tree: + print(r'\tikzset{depedge/.style = {blue,thick}, %,<-') + print(r' deplabel/.style = {opacity=0, %black, fill opacity=0.9, text opacity=1,') + print(r' % yshift=4pt, pos=0.1, inner sep=0, fill=white, font={\scriptsize}') + print(r' },') + print(r' depnode/.style = {draw,circle,fill,blue,inner sep=1.5pt},') + print(r' depguide/.style = {dashed,gray},') + print(r'}') + print(r'\newlength{\deplevel}\setlength{\deplevel}{8mm}') + print(r'\newlength{\depskip}\setlength{\depskip}{4mm}') print(r'\begin{document}') def after_process_document(self, doc): @@ -81,8 +104,7 @@ def process_tree(self, tree): lines = ['' for _ in self.node_attributes] for node in nodes: - values = [str(getattr(node, attr_name)) for attr_name in self.node_attributes] - values = [v if v != '_' else r'\_' for v in values] + values = [v.replace('_', r'\_') for v in node.get_attrs(self.node_attributes)] max_len = max(len(value) for value in values) for index, value in enumerate(values): if node.ord > 1: @@ -91,10 +113,24 @@ def process_tree(self, tree): for line in lines: print(line + r' \\') print(r'\end{deptext}') - for node in nodes: - if node.parent.is_root(): - print(r'\deproot{%d}{root}' % node.ord) - else: - print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.as_tree: + depths = [n._get_attr('depth') for n in nodes] + max_depth = max(depths) + for node in nodes: + print(r'\node (w%d) [yshift=\depskip+%s\deplevel,depnode] at (\wordref{1}{%d}) {};' + % (node.ord, max_depth - depths[node.ord - 1], node.ord)) + for node in nodes: + print(r'\draw[depguide] (w%d)--(\wordref{1}{%d});' % (node.ord, node.ord), end='') + if node.parent.is_root(): + print('') + else: + print(r' \draw[depedge] (w%d)--node[deplabel] {%s} (w%d);' + % (node.ord, node.deprel, node.parent.ord)) + else: + for node in nodes: + if node.parent.is_root(): + print(r'\deproot{%d}{root}' % node.ord) + else: + print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) print(r'\end{dependency}') - print('') # empty line marks a new paragraph in LaTeX + print('') # empty line marks a new paragraph in LaTeX, but multi=dependency causes newpage From 33af0f217259ac7249e21c0c3a0d8662ba7015d1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 25 Jun 2018 00:12:05 +0200 Subject: [PATCH 0043/1201] write.TextModeTrees layout=classic|compact|align-words|align --- udapi/block/write/textmodetrees.py | 76 +++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 54680e88..be673b2f 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -29,7 +29,7 @@ class TextModeTrees(BaseWriter): In 
scenario (examples of other parameters)::

- write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1
+ write.TextModeTrees indent=2 print_sent_id=0 print_sentence=1 layout=align
 write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0

 This block prints dependency trees in plain-text format.
@@ -47,7 +47,7 @@ class TextModeTrees(BaseWriter):
 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No
 11 . . PUNCT . _ 2 punct _ _

- will be printed (with the default parameters) as::
+ will be printed (with the default parameters plus hints=0) as::

 ─┮
 │ ╭─╼ I PRON nsubj
 ╰─┾ saw VERB root
 │ ╭─╼ a DET det
 ┡───┾ dog NOUN dobj
 ┡─╼ │ today NOUN nmod:tmod
 ┡─╼ │ , PUNCT punct
 │ │ ╭─╼ which DET nsubj
 │ │ ┢─╼ was VERB cop
 │ │ ┢─╼ a DET det
 │ ╰─┶ boxer NOUN acl:relcl
 ╰─╼ . PUNCT punct

+ With ``layout=compact``, the output will be (note the nodes "today" and ",")::
+
+ ─┮
+ │ ╭─╼ I PRON nsubj
+ ╰─┾ saw VERB root
+ │ ╭─╼ a DET det
+ ┡───┾ dog NOUN dobj
+ ┡─╼ │ today NOUN nmod:tmod
+ ┡─╼ │ , PUNCT punct
+ │ │ ╭─╼ which DET nsubj
+ │ │ ┢─╼ was VERB cop
+ │ │ ┢─╼ a DET det
+ │ ╰─┶ boxer NOUN acl:relcl
+ ╰─╼ . PUNCT punct
+
+ With ``layout=align-words``, the output will be::
+
+ ─┮
+ │ ╭─╼ I PRON nsubj
+ ╰─┾ saw VERB root
+ │ ╭─╼ a DET det
+ ┡───┾ dog NOUN dobj
+ ┡─╼ │ today NOUN nmod:tmod
+ ┡─╼ │ , PUNCT punct
+ │ │ ╭─╼ which DET nsubj
+ │ │ ┢─╼ was VERB cop
+ │ │ ┢─╼ a DET det
+ │ ╰─┶ boxer NOUN acl:relcl
+ ╰─╼ . PUNCT punct
+
+ And finally with ``layout=align``::
+
+ ─┮
+ │ ╭─╼ I PRON nsubj
+ ╰─┾ saw VERB root
+ │ ╭─╼ a DET det
+ ┡───┾ dog NOUN dobj
+ ┡─╼ │ today NOUN nmod:tmod
+ ┡─╼ │ , PUNCT punct
+ │ │ ╭─╼ which DET nsubj
+ │ │ ┢─╼ was VERB cop
+ │ │ ┢─╼ a DET det
+ │ ╰─┶ boxer NOUN acl:relcl
+ ╰─╼ . PUNCT punct

 Some non-projective trees cannot be printed without crossing edges.
 TextModeTrees uses a special "bridge" symbol ─╪─ to mark this::

 ─┮
 │ ╭─╼ 1
 ╰─┾─╪───╮ 2
 ╰─┶ 3 │
 │ ╰─╼ 4

- By default parameter ``color=auto``, so if the output is printed to the console
+ With ``color=auto`` (which is the default), if the output is printed to the console
 (not file or pipe), each node attribute is printed in a different color.
 If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes
 (or any other specified in the parameter `mark`), the node will be highlighted
@@ -88,7 +133,8 @@
 def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
 minimize_cross=True, color='auto', attributes='form,upos,deprel',
 print_undef_as='_', print_doc_meta=True, print_comments=False,
- mark='ToDo|ToDoOrigText|Bug|Mark', marked_only=False, hints=True, **kwargs):
+ mark='ToDo|ToDoOrigText|Bug|Mark', marked_only=False, hints=True,
+ layout='classic', **kwargs):
 """Create new TextModeTrees block object.

 Args:
@@ -117,6 +163,10 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind
 marked_only: print only trees containing one or more marked nodes/comments. Default=False.
 hints: use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes
 or follows its parent. Default=True. If False, plain ├ is used in both cases.
+ layout: 'classic' (default) shows word attributes immediately next to each node,
+ 'compact' never prints edges after (right to) words even in non-projectivities,
+ 'align-words' as 'compact' but all first attributes (forms by default) are aligned,
+ 'align' as 'align-words' but all attributes are aligned in columns.
""" super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -130,6 +180,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_comments = print_comments self.mark = mark self.marked_only = marked_only + self.layout = layout # _draw[is_bottommost][is_topmost] line = '─' * indent @@ -211,7 +262,8 @@ def process_tree(self, root): botmost = idx == max_idx if idx_node is node: self._add(idx, self._draw[botmost][topmost]) - self.add_node(idx, node) + if self.layout == 'classic': + self.add_node(idx, node) else: if idx_node.parent is not node: self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) @@ -219,7 +271,8 @@ def process_tree(self, root): self._add(idx, self._space[idx < node.ord][topmost or botmost]) if idx_node.is_leaf(): self._add(idx, self._horiz) - self.add_node(idx, idx_node) + if self.layout == 'classic': + self.add_node(idx, idx_node) else: stack.append(idx_node) @@ -227,6 +280,17 @@ def process_tree(self, root): if self.minimize_cross: stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) + if self.layout != 'classic': + columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] + for col_attrs in columns_attrs: + self.attrs = col_attrs + max_length = max(self.lengths) + for idx, node in enumerate(allnodes): + if self.layout.startswith('align'): + self._add(idx, ' ' * (max_length - self.lengths[idx])) + self.add_node(idx, node) + self.attrs = [a for sublist in columns_attrs for a in sublist] + # Print headers (if required) and the tree itself self.print_headers(root) for line in self.lines: From 36f4951afad91a01ae4df69d99f05558b3a1e900 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 3 Jul 2018 12:33:58 +0200 Subject: [PATCH 0044/1201] documentation error --- udapi/block/eval/conll17.py | 2 +- udapi/block/eval/conll18.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/eval/conll17.py b/udapi/block/eval/conll17.py index 12158e55..61e86383 100644 --- a/udapi/block/eval/conll17.py +++ b/udapi/block/eval/conll17.py @@ -25,7 +25,7 @@ For evaluating multiple systems and testsets (as in CoNLL2017) -stored in `systems/testset_name/system_name.conllu` you can use:: +stored in `systems/system_name/testset_name.conllu` you can use:: #!/bin/bash SYSTEMS=`ls systems` diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py index 72612832..22f42a42 100644 --- a/udapi/block/eval/conll18.py +++ b/udapi/block/eval/conll18.py @@ -26,7 +26,7 @@ For evaluating multiple systems and testsets (as in CoNLL2018) -stored in `systems/testset_name/system_name.conllu` you can use:: +stored in `systems/system_name/testset_name.conllu` you can use:: #!/bin/bash SYSTEMS=`ls systems` From a42acfa47a8daf9fa7ff84ac643804b161b99f2d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 4 Jul 2018 16:12:42 +0200 Subject: [PATCH 0045/1201] Ignore all spaces from Zs when matching gold and pred characters see https://github.com/ufal/conll2018/commit/c0c6c58da485f52e74f48 --- udapi/block/util/resegmentgold.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index a93f01f4..b39db31c 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -1,5 +1,6 @@ """util.ResegmentGold is a block for sentence alignment and re-segmentation of two zones.""" import logging +import unicodedata from udapi.core.block import Block from udapi.core.mwt import MWT from udapi.core.root 
import Root @@ -31,8 +32,10 @@ def process_document(self, document): for bundle_no, bundle in enumerate(document.bundles): g_tree = bundle.trees[0] p_tree = pred_trees.pop() - g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '') - p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '') + g_chars = ''.join(t.form for t in g_tree.token_descendants) + p_chars = ''.join(t.form for t in p_tree.token_descendants) + g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars)) + p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars)) if g_chars == p_chars: bundle.add_tree(p_tree) continue From 5305e9d441fd83a9b3fa88d1a29a526bb21f0516 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 4 Jul 2018 22:40:01 +0200 Subject: [PATCH 0046/1201] delete whitespace everywhere (fix the previous commit) --- udapi/block/util/resegmentgold.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index b39db31c..383510b6 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -20,6 +20,10 @@ def __init__(self, gold_zone='gold', **kwargs): super().__init__(**kwargs) self.gold_zone = gold_zone + @staticmethod + def _strip_spaces(string): + return ''.join(filter(lambda c: unicodedata.category(c) != "Zs", string)) + def process_document(self, document): if not document.bundles: return @@ -32,8 +36,8 @@ def process_document(self, document): for bundle_no, bundle in enumerate(document.bundles): g_tree = bundle.trees[0] p_tree = pred_trees.pop() - g_chars = ''.join(t.form for t in g_tree.token_descendants) - p_chars = ''.join(t.form for t in p_tree.token_descendants) + g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants)) + p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants)) g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars)) p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars)) if g_chars == p_chars: @@ -46,7 +50,7 @@ def process_document(self, document): if not pred_trees: raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars)) new_p_tree = pred_trees.pop() - p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '') + p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants)) moved_roots.extend(new_p_tree.children) p_tree.steal_nodes(new_p_tree.descendants) self.choose_root(p_tree, was_subroot, g_tree) @@ -62,7 +66,7 @@ def process_document(self, document): p_chars = '' tokens = p_tree.token_descendants for index, token in enumerate(tokens): - p_chars += token.form.replace(' ', '') + p_chars += self._strip_spaces(token.form) if len(p_chars) > len(g_chars): logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id) # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word From 09025ee167e1e00d0b1f54d6338547092c765dd1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Dec 2018 16:18:36 +0100 Subject: [PATCH 0047/1201] allow e.g. 
write.Tikz attributes=upos,misc[SpaceAfter] --- udapi/core/node.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index 8e42e991..6e399631 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -501,6 +501,10 @@ def _get_attr(self, name): # pylint: disable=too-many-return-statements return value if name == 'feats_split': return str(self.feats).split('|') + if name.startswith('feats['): + return self.feats[name[6:-1]] + if name.startswith('misc['): + return self.misc[name[5:-1]] return getattr(self, name) def get_attrs(self, attrs, undefs=None, stringify=True): From c47a15c177bea02dc38995ec80ff095fc09c7669 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Dec 2018 16:24:11 +0100 Subject: [PATCH 0048/1201] node.gloss as a r/w shortcut for node.misc["Gloss"] --- udapi/core/node.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index 6e399631..483f1fa2 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -676,6 +676,14 @@ def no_space_after(self): """Boolean property as a shortcut for `node.misc["SpaceAfter"] == "No"`.""" return self.misc["SpaceAfter"] == "No" + @property + def gloss(self): + """String property as a shortcut for `node.misc["Gloss"]`.""" + return self.misc["Gloss"] + + @gloss.setter + def gloss(self, new_gloss): + self.misc["Gloss"] = new_gloss class ListOfNodes(list): """Helper class for results of node.children and node.descendants. From 54c9101fb0c4dd18b26ca5b65eae74dca091aa54 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Dec 2018 16:47:17 +0100 Subject: [PATCH 0049/1201] allow `write.Tikz comment_attribute=text_en` --- udapi/block/write/tikz.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 802487b5..e03a3f17 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -39,7 +39,7 @@ class Tikz(BaseWriter): """ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, - attributes=None, as_tree=False, **kwargs): + attributes=None, as_tree=False, comment_attribute=None, **kwargs): """Create the Tikz block object. Args: @@ -48,6 +48,8 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, print_preambule: surround each document with LaTeX preambule (`documentclass` etc) and `end{document}` (default=True) attributes: comma-separated list of node attributes to print (each on a separate line). + as_tree: boolean - should print it as a 2D tree? + comment_attribute: which attribute to print as a string under each graph (e.g. 
text_en) """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -60,6 +62,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, else: self.node_attributes = 'form,upos'.split(',') self.as_tree = as_tree + self.comment_attribute = comment_attribute def before_process_document(self, doc): super().before_process_document(doc) @@ -78,6 +81,8 @@ def before_process_document(self, doc): print(r'}') print(r'\newlength{\deplevel}\setlength{\deplevel}{8mm}') print(r'\newlength{\depskip}\setlength{\depskip}{4mm}') + print(r'\newcommand{\deptrans}[1]{\node (t) at (\matrixref.south)[yshift=-1mm]' + " {``#1''}};}") print(r'\begin{document}') def after_process_document(self, doc): @@ -132,5 +137,12 @@ def process_tree(self, tree): print(r'\deproot{%d}{root}' % node.ord) else: print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.comment_attribute and tree.comment: + start_pos = tree.comment.find(self.comment_attribute + ' = ') + if start_pos != -1: + start_pos += len(self.comment_attribute) + 3 + end_pos = tree.comment.find('\n', start_pos) + print(r'\deptrans{' + tree.comment[start_pos:end_pos]) + print(r'\end{dependency}') print('') # empty line marks a new paragraph in LaTeX, but multi=dependency causes newpage From 31c1eed315bb27193565a0b6048b5577126cd19d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 22 Jan 2019 12:39:07 +0100 Subject: [PATCH 0050/1201] Fixing my own typo. --- udapi/block/write/tikz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index e03a3f17..58f53a3d 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -82,7 +82,7 @@ def before_process_document(self, doc): print(r'\newlength{\deplevel}\setlength{\deplevel}{8mm}') print(r'\newlength{\depskip}\setlength{\depskip}{4mm}') print(r'\newcommand{\deptrans}[1]{\node (t) at (\matrixref.south)[yshift=-1mm]' - " {``#1''}};}") + " {``#1''};}") print(r'\begin{document}') def after_process_document(self, doc): From 917ef46d9309705acfefe8650de5944a846021a3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 22 Apr 2019 14:43:11 -0700 Subject: [PATCH 0051/1201] prevent non-projectivities also in another (edge) case Prevent creating (or preserving) non-projectively attached punctuation in cases when it is neighboring another non-projectively attached node. Fixes #52 --- udapi/block/ud/fixpunct.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 7d5bb483..dea1035d 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -14,10 +14,6 @@ Since the punctuation should not have children, we should not create a non-projectivity if we check the root edges going to the right. -However, it is still possible that we will attach the punctuation non-projectively -by joining a non-projectivity that already exists. -For example, the left neighbor (node i-1) may have its parent at i-3, -and the node i-2 forms a gap (does not depend on i-3). """ from udapi.core.block import Block # pylint: disable=no-self-use @@ -128,17 +124,24 @@ def _fix_subord_punct(self, node): # Climb up from the candidates, until we would reach the root or "cross" the punctuation. # If the candidates' descendants span across the punctuation, we also stop # because climbing higher would cause a non-projectivity (the punct would be the gap). 
+ # We also stop if the candidate is attached non-projectively to its parent, + # because climbing higher would make the edge from the punctuation non-projective as well. + # For example, the left neighbor (node i-1) may have its parent at i-3, + # and the node i-2 forms a gap (does not depend on i-3) + # - in this case, the punctuation must be attached to the node i-1 (not i-3). l_path, r_path = [l_cand], [r_cand] if l_cand is None or l_cand.is_root(): l_cand = None else: while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) - and not node.precedes(l_cand.descendants(add_self=1)[-1])): + and not node.precedes(l_cand.descendants(add_self=1)[-1]) + and not l_cand.is_nonprojective()): l_cand = l_cand.parent l_path.append(l_cand) if r_cand is not None: while (not r_cand.parent.is_root() and node.precedes(r_cand.parent) - and not r_cand.descendants(add_self=1)[0].precedes(node)): + and not r_cand.descendants(add_self=1)[0].precedes(node) + and not r_cand.is_nonprojective()): r_cand = r_cand.parent r_path.append(r_cand) From 4e3e034d2b0d8ce07b4cd14395bef7250a8e7365 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 23 Apr 2019 17:43:00 -0700 Subject: [PATCH 0052/1201] yet another solution for not introducing non-projectivities in edge cases Fixes #52 --- udapi/block/ud/fixpunct.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index dea1035d..0d617edb 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -124,27 +124,28 @@ def _fix_subord_punct(self, node): # Climb up from the candidates, until we would reach the root or "cross" the punctuation. # If the candidates' descendants span across the punctuation, we also stop # because climbing higher would cause a non-projectivity (the punct would be the gap). - # We also stop if the candidate is attached non-projectively to its parent, - # because climbing higher would make the edge from the punctuation non-projective as well. - # For example, the left neighbor (node i-1) may have its parent at i-3, - # and the node i-2 forms a gap (does not depend on i-3) - # - in this case, the punctuation must be attached to the node i-1 (not i-3). l_path, r_path = [l_cand], [r_cand] if l_cand is None or l_cand.is_root(): l_cand = None else: while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) - and not node.precedes(l_cand.descendants(add_self=1)[-1]) - and not l_cand.is_nonprojective()): + and not node.precedes(l_cand.descendants(add_self=1)[-1])): l_cand = l_cand.parent l_path.append(l_cand) if r_cand is not None: while (not r_cand.parent.is_root() and node.precedes(r_cand.parent) - and not r_cand.descendants(add_self=1)[0].precedes(node) - and not r_cand.is_nonprojective()): + and not r_cand.descendants(add_self=1)[0].precedes(node)): r_cand = r_cand.parent r_path.append(r_cand) + # Filter out candidates which would lead to non-projectivities. + orig_parent = node.parent + l_path = [n for n in l_path if n and self._will_be_projective(node, n)] + r_path = [n for n in r_path if n and self._will_be_projective(node, n)] + l_cand = l_path[-1] if l_path else None + r_cand = r_path[-1] if r_path else None + node.parent = orig_parent + # Now select between l_cand and r_cand -- which will be the new parent? # The lower one. 
Note that if neither is descendant of the other and neither is None # (which can happen in rare non-projective cases), we arbitrarily prefer l_cand, @@ -172,6 +173,10 @@ def _fix_subord_punct(self, node): node.parent = cand node.deprel = "punct" + def _will_be_projective(self, node, cand): + node.parent = cand + return not node.is_nonprojective() + def _fix_paired_punct(self, root, opening_node, closing_punct): if self.check_paired_punct_upos and opening_node.upos != 'PUNCT': return From a058df75180befe63c53e76e80e1d5f48875283f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 23 Apr 2019 18:13:48 -0700 Subject: [PATCH 0053/1201] fix a bug in none.is_nonprojective_gap() --- udapi/core/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 483f1fa2..f1702e6f 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -667,7 +667,7 @@ def is_nonprojective_gap(self): if self.precedes(left_node.parent) and left_node.parent not in ancestors: return True for right_node in all_nodes[self.ord:]: - if right_node.parent.precedes(node) and right_node.parent not in ancestors: + if right_node.parent.precedes(self) and right_node.parent not in ancestors: return True return False From 7298c29226f2036e2609d50d1bc846cff2a9511e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 24 Apr 2019 13:53:55 -0700 Subject: [PATCH 0054/1201] Fix CoNLL-U export for sentence-initial empty nodes and also for multiple empty nodes on the same position in sentence. Fixes #53. --- udapi/block/write/conllu.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index cb69159d..4b931bac 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -47,9 +47,20 @@ def process_tree(self, tree): # pylint: disable=too-many-branches print('#' + comment.replace('\n', '\n#')) last_mwt_id = 0 + last_ord = 0 empty_nodes = list(tree.empty_nodes) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 for node in nodes: + # print all empty nodes which should go here + while empty_nodes: + next_empty_ord = int(float(empty_nodes[0].ord)) + if next_empty_ord > last_ord: + break + empty = empty_nodes.pop(0) + values = [str(getattr(empty, a)) for a in self.node_attributes] + values[6] = '_' + values[7] = '_' + print('\t'.join(values)) + mwt = node.multiword_token if mwt and node.ord > last_mwt_id: last_mwt_id = mwt.words[-1].ord @@ -63,13 +74,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches except AttributeError: values[6] = '0' print('\t'.join(values)) - if node.ord == next_empty_ord: - empty = empty_nodes.pop(0) - values = [str(getattr(empty, a)) for a in self.node_attributes] - values[6] = '_' - values[7] = '_' - print('\t'.join(values)) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 + last_ord = node.ord # Empty sentences are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), From 8c1a8a74495f59a6584f960b2c9e30097090de7a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 24 Apr 2019 18:39:28 -0700 Subject: [PATCH 0055/1201] Prevent non-projectivities in case of "[...]" where the three-dots token is a PUNCT with children (which is against the guidelines, but there is no other way), but if we forbid attaching it outside of the parenthesis, we effectively keep its original attachement, which could be non-projective. 
---
 udapi/block/ud/fixpunct.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index 0d617edb..e3e6319b 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -111,12 +111,12 @@ def _fix_subord_punct(self, node):
 if node.form in FINAL_PUNCT:
 r_cand = None
 while l_cand.ord > 0 and l_cand.upos == "PUNCT":
- if self._punct_type[l_cand.ord] == 'opening':
+ if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node:
 l_cand = None
 break
 l_cand = l_cand.prev_node
 while r_cand is not None and r_cand.upos == "PUNCT":
- if self._punct_type[r_cand.ord] == 'closing':
+ if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node:
 r_cand = None
 break
 r_cand = r_cand.next_node
From 02394f7978db281f09e830bfa8e2c01c2b4982ce Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 25 Apr 2019 11:21:27 -0700
Subject: [PATCH 0056/1201] util.Filter keep_node='Python expression'
---
 udapi/block/util/filter.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py
index 6d4118d6..811973ec 100644
--- a/udapi/block/util/filter.py
+++ b/udapi/block/util/filter.py
@@ -30,7 +30,7 @@ class Filter(Block):
 def __init__(self, # pylint: disable=too-many-arguments
 delete_tree=None, delete_tree_if_node=None, delete_subtree=None,
 keep_tree=None, keep_tree_if_node=None, keep_subtree=None,
- mark=None, **kwargs):
+ keep_node=None, mark=None, **kwargs):
 """Create the Filter block object.

 Args:
@@ -56,6 +56,10 @@ def __init__(self, # pylint: disable=too-many-arguments
 If no node in the tree was marked
 (i.e. only the root without any children remained), the whole tree will be deleted.

+ `keep_node`: Python expression to be evaluated for each node; if it is False,
+ the node will be deleted and its children rehanged to its parent.
+ Multiple nodes can be deleted (or kept) this way.
+
 `mark`: a string or None. This makes sense only with `keep_tree_if_node`, where
 the matched nodes are marked with `Mark=` in `node.misc`, so they will be highlighted
 if printed with `write.TextModeTrees`. Default=None.
@@ -71,6 +75,7 @@ def __init__(self, # pylint: disable=too-many-arguments
 self.keep_tree = keep_tree
 self.keep_tree_if_node = keep_tree_if_node
 self.keep_subtree = keep_subtree
+ self.keep_node = keep_node
 self.mark = mark

 def process_tree(self, tree): # pylint: disable=too-many-branches
@@ -118,8 +123,17 @@ def process_tree(self, tree): # pylint: disable=too-many-branches
 kept_subtrees.append(node)
 if not kept_subtrees:
 tree.remove()
+ return
 else:
 for node in kept_subtrees:
 node.parent = root
 for orig_subroot in [n for n in root.children if n not in kept_subtrees]:
 orig_subroot.remove()
+
+ if self.keep_node is not None:
+ nodes_to_delete = [node for node in tree.descendants if not eval(self.keep_node)]
+ if nodes_to_delete == tree.descendants:
+ tree.remove()
+ return
+ for node in nodes_to_delete:
+ node.remove(children='rehang')
From 1a3805b71c56a59cb7ab68e719bcd56aba63522c Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 25 Apr 2019 13:40:44 -0700
Subject: [PATCH 0057/1201] fix a few more rare cases of punct-nonproj-gap
---
 udapi/block/ud/fixpunct.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index e3e6319b..2f00756a 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -215,3 +215,13 @@ def _fix_pair(self, root, opening_node, closing_node):
 closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0]
 self._punct_type[opening_node.ord] = 'opening'
 self._punct_type[closing_node.ord] = 'closing'
+
+ # In rare cases, non-projective gaps may remain. Let's dirty fix these!
+ # E.g. in "the (lack of) reproducibility", the closing parenthesis
+ # should be attached to "of" rather than to "lack"
+ # -- breaking the paired-marks-have-same-parent rule
+ # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py).
+ if opening_node.is_nonprojective_gap():
+ opening_node.parent = opening_node.next_node
+ if closing_node.is_nonprojective_gap():
+ closing_node.parent = closing_node.prev_node
From 1d45aa3b6bf259aef299a7dea10e62b01b91d106 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 25 Apr 2019 16:10:14 -0700
Subject: [PATCH 0058/1201] Consider a pair of ' characters a paired single
 quotation only if upos=PUNCT

Fixes #54
---
 udapi/block/ud/fixpunct.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index 2f00756a..e3c8dc3b 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -45,7 +45,16 @@ class FixPunct(Block):
 """Make sure punctuation nodes are attached projectively."""

 def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwargs):
- """Create the ud.FixPunct block instance."""
+ """Create the ud.FixPunct block instance.
+
+ Args:
+ check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT.
+ The default is false, which means that paired punctuation is detected
+ based on the form alone, with the exception of the single quote / apostrophe
+ character, which is frequently ambiguous, so UPOS=PUNCT is always checked for it.
+ copy_to_enhanced: for all PUNCT nodes, let the enhanced dependencies be the same
+ as the basic dependencies.
+ """
 super().__init__(**kwargs)
 self._punct_type = None
 self.check_paired_punct_upos = check_paired_punct_upos
@@ -178,7 +187,8 @@ def _will_be_projective(self, node, cand):
 return not node.is_nonprojective()

 def _fix_paired_punct(self, root, opening_node, closing_punct):
- if self.check_paired_punct_upos and opening_node.upos != 'PUNCT':
+ if (self.check_paired_punct_upos
+ or opening_node.form == "'") and opening_node.upos != 'PUNCT':
 return
From d1d16529862bf537195259569b1a34108725de0e Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 30 Apr 2019 11:22:00 -0700
Subject: [PATCH 0059/1201] root should be among the ancestors to prevent
 false nonproj-gaps
---
 udapi/core/node.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udapi/core/node.py b/udapi/core/node.py
index f1702e6f..c0684e60 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -657,11 +657,11 @@ def is_nonprojective_gap(self):
 - this node is within span of X, i.e. it is between (word-order-wise)
 X's leftmost descendant (or X itself) and X's rightmost descendant (or X itself).
 """
- ancestors = set()
+ ancestors = set([self])
 node = self
 while node.parent:
- ancestors.add(node)
 node = node.parent
+ ancestors.add(node)
 all_nodes = node.descendants
 for left_node in all_nodes[:self.ord - 1]:
 if self.precedes(left_node.parent) and left_node.parent not in ancestors:
 return True
 for right_node in all_nodes[self.ord:]:
From 4b896f0f79a86ae9ef06e403f50caeb25ca83103 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 30 Apr 2019 13:13:01 -0700
Subject: [PATCH 0060/1201] even in edge cases we should not introduce
 punct-child bugs
---
 udapi/block/ud/fixpunct.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index e3c8dc3b..e9507c23 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -211,6 +211,7 @@ def _fix_pair(self, root, opening_node, closing_node):
 punct_heads.append(node)
 else:
 heads.append(node)
+
 # Punctuation should not have children, but if there is no other head candidate,
 # let's break this rule.
 if len(heads) == 0:
@@ -233,5 +234,13 @@
 # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py).
if opening_node.is_nonprojective_gap(): opening_node.parent = opening_node.next_node + while (opening_node.parent.ord < closing_node.ord - 1 + and (opening_node.parent.upos == 'PUNCT' or opening_node.is_nonprojective() + or opening_node.is_nonprojective_gap())): + opening_node.parent = opening_node.parent.next_node if closing_node.is_nonprojective_gap(): closing_node.parent = closing_node.prev_node + while (closing_node.parent.ord > opening_node.ord + 1 + and (closing_node.parent.upos == 'PUNCT' or closing_node.is_nonprojective() + or closing_node.is_nonprojective_gap())): + closing_node.parent = closing_node.parent.prev_node From cf0d7bbd8aa04d6789c556f0d52374f97fc3478b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 May 2019 17:24:00 -0700 Subject: [PATCH 0061/1201] fix a bug when printing subtrees --- udapi/block/write/textmodetrees.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index be673b2f..1f8163c5 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -268,7 +268,8 @@ def process_tree(self, root): if idx_node.parent is not node: self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: - self._add(idx, self._space[idx < node.ord][topmost or botmost]) + precedes_parent = idx < self._index_of[node.ord] + self._add(idx, self._space[precedes_parent][topmost or botmost]) if idx_node.is_leaf(): self._add(idx, self._horiz) if self.layout == 'classic': From 5b582f6b9a1a705767222c9b628ed4dbfd10186b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 May 2019 17:37:13 -0700 Subject: [PATCH 0062/1201] bugfix in ud.FixPunct we want to prevent punct-nonproj-gap bugs, i.e. nodes which are the *primary* cause of non-projective gaps, but we don't care about nodes which *inherited* it, i.e. already their parent is causing a non-projective gap. --- udapi/block/ud/fixpunct.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index e9507c23..2c26a294 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -186,6 +186,9 @@ def _will_be_projective(self, node, cand): node.parent = cand return not node.is_nonprojective() + def _causes_gap(self, node): + return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap() + def _fix_paired_punct(self, root, opening_node, closing_punct): if (self.check_paired_punct_upos or opening_node.form == "'") and opening_node.upos != 'PUNCT': @@ -232,15 +235,15 @@ def _fix_pair(self, root, opening_node, closing_node): # should be attached to "of" rather than to "lack" # -- breaking the paired-marks-have-same-parent rule # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py). 
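         # (Illustrative summary of the change below: _causes_gap(n) holds only for the
         #  *primary* offender, i.e. when n itself lies in a gap while its parent does not;
         #  nodes that merely inherited the gap from their parent are left as they are.)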
- if opening_node.is_nonprojective_gap(): + if self._causes_gap(opening_node): opening_node.parent = opening_node.next_node while (opening_node.parent.ord < closing_node.ord - 1 and (opening_node.parent.upos == 'PUNCT' or opening_node.is_nonprojective() - or opening_node.is_nonprojective_gap())): + or self._causes_gap(opening_node))): opening_node.parent = opening_node.parent.next_node - if closing_node.is_nonprojective_gap(): + if self._causes_gap(closing_node): closing_node.parent = closing_node.prev_node while (closing_node.parent.ord > opening_node.ord + 1 and (closing_node.parent.upos == 'PUNCT' or closing_node.is_nonprojective() - or closing_node.is_nonprojective_gap())): + or self._causes_gap(closing_node))): closing_node.parent = closing_node.parent.prev_node From cb2539abe259778d76240a1134269c3bdcc0b8bb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Jun 2019 13:14:17 -0700 Subject: [PATCH 0063/1201] fix a bug which prevented fixing of nested quotes fixes #55 --- udapi/block/ud/fixpunct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 2c26a294..96ec0ed5 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -198,7 +198,7 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): for node in root.descendants[opening_node.ord:]: if node.form == closing_punct: if nested_level > 0: - nested_level -= 0 + nested_level -= 1 else: self._fix_pair(root, opening_node, node) return From aea510f9724ea7b3ab4137e868b89114b5886221 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 23 Oct 2019 12:45:14 +0200 Subject: [PATCH 0064/1201] update UDPipe model defaults from 2.0 to 2.4 --- udapi/block/udpipe/base.py | 171 +++++++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 64 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 11e12b65..0c623ab6 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -3,70 +3,113 @@ from udapi.tool.udpipe import UDPipe KNOWN_MODELS = { - 'grc': 'models/udpipe/2.0/ancient_greek-proiel-ud-2.0-conll17-170315.udpipe', - 'grc_proiel': 'models/udpipe/2.0/ancient_greek-ud-2.0-conll17-170315.udpipe', - 'ar': 'models/udpipe/2.0/arabic-ud-2.0-conll17-170315.udpipe', - 'eu': 'models/udpipe/2.0/basque-ud-2.0-conll17-170315.udpipe', - 'bg': 'models/udpipe/2.0/bulgarian-ud-2.0-conll17-170315.udpipe', - 'ca': 'models/udpipe/2.0/catalan-ud-2.0-conll17-170315.udpipe', - 'zh': 'models/udpipe/2.0/chinese-ud-2.0-conll17-170315.udpipe', - 'hr': 'models/udpipe/2.0/croatian-ud-2.0-conll17-170315.udpipe', - 'cs_cac': 'models/udpipe/2.0/czech-cac-ud-2.0-conll17-170315.udpipe', - 'cs_cltt': 'models/udpipe/2.0/czech-cltt-ud-2.0-conll17-170315.udpipe', - 'cs': 'models/udpipe/2.0/czech-ud-2.0-conll17-170315.udpipe', - 'da': 'models/udpipe/2.0/danish-ud-2.0-conll17-170315.udpipe', - 'nl_lassysmall': 'models/udpipe/2.0/dutch-lassysmall-ud-2.0-conll17-170315.udpipe', - 'nl': 'models/udpipe/2.0/dutch-ud-2.0-conll17-170315.udpipe', - 'en_lines': 'models/udpipe/2.0/english-lines-ud-2.0-conll17-170315.udpipe', - 'en_partut': 'models/udpipe/2.0/english-partut-ud-2.0-conll17-170315.udpipe', - 'en': 'models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe', - 'et': 'models/udpipe/2.0/estonian-ud-2.0-conll17-170315.udpipe', - 'fi_ftb': 'models/udpipe/2.0/finnish-ftb-ud-2.0-conll17-170315.udpipe', - 'fi': 'models/udpipe/2.0/finnish-ud-2.0-conll17-170315.udpipe', - 'fr_partut': 
'models/udpipe/2.0/french-partut-ud-2.0-conll17-170315.udpipe', - 'fr_sequoia': 'models/udpipe/2.0/french-sequoia-ud-2.0-conll17-170315.udpipe', - 'fr': 'models/udpipe/2.0/french-ud-2.0-conll17-170315.udpipe', - 'gl_treegal': 'models/udpipe/2.0/galician-treegal-ud-2.0-conll17-170315.udpipe', - 'gl': 'models/udpipe/2.0/galician-ud-2.0-conll17-170315.udpipe', - 'de': 'models/udpipe/2.0/german-ud-2.0-conll17-170315.udpipe', - 'got': 'models/udpipe/2.0/gothic-ud-2.0-conll17-170315.udpipe', - 'el': 'models/udpipe/2.0/greek-ud-2.0-conll17-170315.udpipe', - 'he': 'models/udpipe/2.0/hebrew-ud-2.0-conll17-170315.udpipe', - 'hi': 'models/udpipe/2.0/hindi-ud-2.0-conll17-170315.udpipe', - 'hu': 'models/udpipe/2.0/hungarian-ud-2.0-conll17-170315.udpipe', - 'id': 'models/udpipe/2.0/indonesian-ud-2.0-conll17-170315.udpipe', - 'ga': 'models/udpipe/2.0/irish-ud-2.0-conll17-170315.udpipe', - 'it_partut': 'models/udpipe/2.0/italian-partut-ud-2.0-conll17-170315.udpipe', - 'it': 'models/udpipe/2.0/italian-ud-2.0-conll17-170315.udpipe', - 'ja': 'models/udpipe/2.0/japanese-ud-2.0-conll17-170315.udpipe', - 'kk': 'models/udpipe/2.0/kazakh-ud-2.0-conll17-170315.udpipe', - 'ko': 'models/udpipe/2.0/korean-ud-2.0-conll17-170315.udpipe', - 'la_ittb': 'models/udpipe/2.0/latin-ittb-ud-2.0-conll17-170315.udpipe', - 'la_proiel': 'models/udpipe/2.0/latin-proiel-ud-2.0-conll17-170315.udpipe', - 'la': 'models/udpipe/2.0/latin-ud-2.0-conll17-170315.udpipe', - 'lv': 'models/udpipe/2.0/latvian-ud-2.0-conll17-170315.udpipe', - 'no_bokmaal': 'models/udpipe/2.0/norwegian-bokmaal-ud-2.0-conll17-170315.udpipe', - 'no_nynorsk': 'models/udpipe/2.0/norwegian-nynorsk-ud-2.0-conll17-170315.udpipe', - 'cu': 'models/udpipe/2.0/old_church_slavonic-ud-2.0-conll17-170315.udpipe', - 'fa': 'models/udpipe/2.0/persian-ud-2.0-conll17-170315.udpipe', - 'pl': 'models/udpipe/2.0/polish-ud-2.0-conll17-170315.udpipe', - 'pt_br': 'models/udpipe/2.0/portuguese-br-ud-2.0-conll17-170315.udpipe', - 'pt': 'models/udpipe/2.0/portuguese-ud-2.0-conll17-170315.udpipe', - 'ro': 'models/udpipe/2.0/romanian-ud-2.0-conll17-170315.udpipe', - 'ru_syntagrus': 'models/udpipe/2.0/russian-syntagrus-ud-2.0-conll17-170315.udpipe', - 'ru': 'models/udpipe/2.0/russian-ud-2.0-conll17-170315.udpipe', - 'sk': 'models/udpipe/2.0/slovak-ud-2.0-conll17-170315.udpipe', - 'sl_sst': 'models/udpipe/2.0/slovenian-sst-ud-2.0-conll17-170315.udpipe', - 'sl': 'models/udpipe/2.0/slovenian-ud-2.0-conll17-170315.udpipe', - 'es_ancora': 'models/udpipe/2.0/spanish-ancora-ud-2.0-conll17-170315.udpipe', - 'es': 'models/udpipe/2.0/spanish-ud-2.0-conll17-170315.udpipe', - 'sv_lines': 'models/udpipe/2.0/swedish-lines-ud-2.0-conll17-170315.udpipe', - 'sv': 'models/udpipe/2.0/swedish-ud-2.0-conll17-170315.udpipe', - 'tr': 'models/udpipe/2.0/turkish-ud-2.0-conll17-170315.udpipe', - 'uk': 'models/udpipe/2.0/ukrainian-ud-2.0-conll17-170315.udpipe', - 'ur': 'models/udpipe/2.0/urdu-ud-2.0-conll17-170315.udpipe', - 'ug': 'models/udpipe/2.0/uyghur-ud-2.0-conll17-170315.udpipe', - 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', + 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'grc': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_perseus': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_proiel': 'models/udpipe/2.4/ancient_greek-proiel-ud-2.4-190531.udpipe', + 'ar': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe', + 'ar_padt': 
'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe',
+    'hy': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
+    'hy_armtdp': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
+    'eu': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
+    'eu_bdt': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
+    'be': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
+    'be_hse': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
+    'bg': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
+    'bg_btb': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
+    'ca': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
+    'ca_ancora': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
+    'zh': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
+    'zh_gsd': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
+    'lzh': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
+    'lzh_kyoto': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
+    'cop': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
+    'cop_scriptorium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
+    'hr': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
+    'hr_set': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
+    'cs': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
+    'cs_pdt': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
+    'cs_cac': 'models/udpipe/2.4/czech-cac-ud-2.4-190531.udpipe',
+    'cs_cltt': 'models/udpipe/2.4/czech-cltt-ud-2.4-190531.udpipe',
+    'cs_fictree': 'models/udpipe/2.4/czech-fictree-ud-2.4-190531.udpipe',
+    'da': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
+    'da_ddt': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
+    'nl': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
+    'nl_alpino': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
+    'nl_lassysmall': 'models/udpipe/2.4/dutch-lassysmall-ud-2.4-190531.udpipe',
+    'en': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
+    'en_ewt': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
+    'en_gum': 'models/udpipe/2.4/english-gum-ud-2.4-190531.udpipe',
+    'en_lines': 'models/udpipe/2.4/english-lines-ud-2.4-190531.udpipe',
+    'en_partut': 'models/udpipe/2.4/english-partut-ud-2.4-190531.udpipe',
+    'et_edt': 'models/udpipe/2.4/estonian-edt-ud-2.4-190531.udpipe',
+    'et_ewt': 'models/udpipe/2.4/estonian-ewt-ud-2.4-190531.udpipe',
+    'fi': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
+    'fi_tdt': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
+    'fi_ftb': 'models/udpipe/2.4/finnish-ftb-ud-2.4-190531.udpipe',
+    'fr_gsd': 'models/udpipe/2.4/french-gsd-ud-2.4-190531.udpipe',
+    'fr_partut': 'models/udpipe/2.4/french-partut-ud-2.4-190531.udpipe',
+    'fr_sequoia': 'models/udpipe/2.4/french-sequoia-ud-2.4-190531.udpipe',
+    'fr_spoken': 'models/udpipe/2.4/french-spoken-ud-2.4-190531.udpipe',
+    'gl_ctg': 'models/udpipe/2.4/galician-ctg-ud-2.4-190531.udpipe',
+    'gl_treegal': 'models/udpipe/2.4/galician-treegal-ud-2.4-190531.udpipe',
+    'de': 'models/udpipe/2.4/german-gsd-ud-2.4-190531.udpipe',
+    'got': 'models/udpipe/2.4/gothic-proiel-ud-2.4-190531.udpipe',
+    'el': 'models/udpipe/2.4/greek-gdt-ud-2.4-190531.udpipe',
+    'he': 'models/udpipe/2.4/hebrew-htb-ud-2.4-190531.udpipe',
+    'hi': 'models/udpipe/2.4/hindi-hdtb-ud-2.4-190531.udpipe',
+    'hu': 'models/udpipe/2.4/hungarian-szeged-ud-2.4-190531.udpipe',
+    'id': 'models/udpipe/2.4/indonesian-gsd-ud-2.4-190531.udpipe',
+    'ga': 'models/udpipe/2.4/irish-idt-ud-2.4-190531.udpipe',
+    'it_isdt':
'models/udpipe/2.4/italian-isdt-ud-2.4-190531.udpipe', + 'it_partut': 'models/udpipe/2.4/italian-partut-ud-2.4-190531.udpipe', + 'it_postwita': 'models/udpipe/2.4/italian-postwita-ud-2.4-190531.udpipe', + 'it_vit': 'models/udpipe/2.4/italian-vit-ud-2.4-190531.udpipe', + 'ja': 'models/udpipe/2.4/japanese-gsd-ud-2.4-190531.udpipe', + 'ko_gsd': 'models/udpipe/2.4/korean-gsd-ud-2.4-190531.udpipe', + 'ko_kaist': 'models/udpipe/2.4/korean-kaist-ud-2.4-190531.udpipe', + 'la_ittb': 'models/udpipe/2.4/latin-ittb-ud-2.4-190531.udpipe', + 'la_perseus': 'models/udpipe/2.4/latin-perseus-ud-2.4-190531.udpipe', + 'la_proiel': 'models/udpipe/2.4/latin-proiel-ud-2.4-190531.udpipe', + 'lv': 'models/udpipe/2.4/latvian-lvtb-ud-2.4-190531.udpipe', + 'lt_alksnis': 'models/udpipe/2.4/lithuanian-alksnis-ud-2.4-190531.udpipe', + 'lt_hse': 'models/udpipe/2.4/lithuanian-hse-ud-2.4-190531.udpipe', + 'mt': 'models/udpipe/2.4/maltese-mudt-ud-2.4-190531.udpipe', + 'mr': 'models/udpipe/2.4/marathi-ufal-ud-2.4-190531.udpipe', + 'sme': 'models/udpipe/2.4/north_sami-giella-ud-2.4-190531.udpipe', + 'no_bokmaal': 'models/udpipe/2.4/norwegian-bokmaal-ud-2.4-190531.udpipe', + 'no_nynorsklia': 'models/udpipe/2.4/norwegian-nynorsklia-ud-2.4-190531.udpipe', + 'no_nynorsk': 'models/udpipe/2.4/norwegian-nynorsk-ud-2.4-190531.udpipe', + 'cu': 'models/udpipe/2.4/old_church_slavonic-proiel-ud-2.4-190531.udpipe', + 'fro': 'models/udpipe/2.4/old_french-srcmf-ud-2.4-190531.udpipe', + 'orv': 'models/udpipe/2.4/old_russian-torot-ud-2.4-190531.udpipe', + 'fa': 'models/udpipe/2.4/persian-seraji-ud-2.4-190531.udpipe', + 'pl_lfg': 'models/udpipe/2.4/polish-lfg-ud-2.4-190531.udpipe', + 'pl_pdb': 'models/udpipe/2.4/polish-pdb-ud-2.4-190531.udpipe', + 'pt_bosque': 'models/udpipe/2.4/portuguese-bosque-ud-2.4-190531.udpipe', + 'pt_gsd': 'models/udpipe/2.4/portuguese-gsd-ud-2.4-190531.udpipe', + 'ro_nonstandard': 'models/udpipe/2.4/romanian-nonstandard-ud-2.4-190531.udpipe', + 'ro_rrt': 'models/udpipe/2.4/romanian-rrt-ud-2.4-190531.udpipe', + 'ru_gsd': 'models/udpipe/2.4/russian-gsd-ud-2.4-190531.udpipe', + 'ru_syntagrus': 'models/udpipe/2.4/russian-syntagrus-ud-2.4-190531.udpipe', + 'ru_taiga': 'models/udpipe/2.4/russian-taiga-ud-2.4-190531.udpipe', + 'sr': 'models/udpipe/2.4/serbian-set-ud-2.4-190531.udpipe', + 'sk': 'models/udpipe/2.4/slovak-snk-ud-2.4-190531.udpipe', + 'sl_ssj': 'models/udpipe/2.4/slovenian-ssj-ud-2.4-190531.udpipe', + 'sl_sst': 'models/udpipe/2.4/slovenian-sst-ud-2.4-190531.udpipe', + 'es_ancora': 'models/udpipe/2.4/spanish-ancora-ud-2.4-190531.udpipe', + 'es_gsd': 'models/udpipe/2.4/spanish-gsd-ud-2.4-190531.udpipe', + 'sv_lines': 'models/udpipe/2.4/swedish-lines-ud-2.4-190531.udpipe', + 'sv_talbanken': 'models/udpipe/2.4/swedish-talbanken-ud-2.4-190531.udpipe', + 'ta': 'models/udpipe/2.4/tamil-ttb-ud-2.4-190531.udpipe', + 'te': 'models/udpipe/2.4/telugu-mtg-ud-2.4-190531.udpipe', + 'tr': 'models/udpipe/2.4/turkish-imst-ud-2.4-190531.udpipe', + 'uk': 'models/udpipe/2.4/ukrainian-iu-ud-2.4-190531.udpipe', + 'ur': 'models/udpipe/2.4/urdu-udtb-ud-2.4-190531.udpipe', + 'ug': 'models/udpipe/2.4/uyghur-udt-ud-2.4-190531.udpipe', + 'vi': 'models/udpipe/2.4/vietnamese-vtb-ud-2.4-190531.udpipe', + 'wo': 'models/udpipe/2.4/wolof-wtb-ud-2.4-190531.udpipe', } From 531c1cfb54f158f069fbc626e038d160b1efc1c9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 23 Oct 2019 13:22:25 +0200 Subject: [PATCH 0065/1201] drop tests of Python 3.3 in Travis, add 3.6 and 3.7 --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff 
--git a/.travis.yml b/.travis.yml index d45259b8..8d5f2f69 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,9 @@ language: python python: - - "3.3" - "3.4" - "3.5" + - "3.6" + - "3.7" before_install: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update -qq From 4addd58c5761b77750e9a83ac62665eb0f58caca Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 23 Oct 2019 15:11:27 +0200 Subject: [PATCH 0066/1201] allow `udpipe.Cs resegment=1` --- udapi/block/udpipe/base.py | 41 ++++++++++++-------- udapi/tool/udpipe.py | 79 ++++++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 44 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 0c623ab6..2af62d27 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -1,6 +1,7 @@ """Block udpipe.Base for tagging and parsing using UDPipe.""" from udapi.core.block import Block from udapi.tool.udpipe import UDPipe +from udapi.core.bundle import Bundle KNOWN_MODELS = { 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', @@ -118,12 +119,12 @@ class Base(Block): # pylint: disable=too-many-arguments def __init__(self, model=None, model_alias=None, - tokenize=True, tag=True, parse=True, **kwargs): + tokenize=True, tag=True, parse=True, resegment=False, **kwargs): """Create the udpipe.En block object.""" super().__init__(**kwargs) self.model, self.model_alias = model, model_alias self._tool = None - self.tokenize, self.tag, self.parse = tokenize, tag, parse + self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment @property def tool(self): @@ -137,20 +138,30 @@ def tool(self): self._tool = UDPipe(model=self.model) return self._tool - def process_tree(self, root): + def process_document(self, doc): tok, tag, par = self.tokenize, self.tag, self.parse - if tok and tag and par: - return self.tool.tokenize_tag_parse_tree(root) - if not tok and tag and par: - return self.tool.tag_parse_tree(root) - # TODO - # return $self->tool->tokenize_tag_parse_tree($root) if $tok && $tag && $par; - # return $self->tool->tokenize_tag_tree($root) if $tok && $tag && !$par; - # return $self->tool->tokenize_tree($root) if $tok && !$tag && !$par; - # return $self->tool->tag_parse_tree($root) if !$tok && $tag && $par; - # return $self->tool->tag_tree($root) if !$tok && $tag && !$par; - # return $self->tool->parse_tree($root) if !$tok && !$tag && $par; - raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + old_bundles = doc.bundles + new_bundles = [] + for bundle in old_bundles: + for tree in bundle: + new_bundles.append(bundle) + if self._should_process_tree(tree): + if tok and tag and par: + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment) + if self.resegment and len(new_trees) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + tree.text = None + for i, new_tree in enumerate(new_trees[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_tree.zone = tree.zone + new_bundle.add_tree(new_tree) + new_bundles.append(new_bundle) + elif not tok and tag and par: + self.tool.tag_parse_tree(tree) + else: + raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + doc.bundles = new_bundles ''' Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index 5d17ae97..f42b2944 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ 
-4,6 +4,7 @@ from ufal.udpipe import Model, Pipeline, ProcessingError, Sentence # pylint: disable=no-name-in-module from udapi.core.resource import require_file from udapi.block.read.conllu import Conllu as ConlluReader +from udapi.core.root import Root class UDPipe: @@ -45,42 +46,62 @@ def tag_parse_tree(self, root): # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants - def tokenize_tag_parse_tree(self, root): - """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.""" + def tokenize_tag_parse_tree(self, root, resegment=False): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. + + If resegment=True, the returned list of Udapi trees may contain multiple trees. + """ if root.children: raise ValueError('Tree already contained nodes before tokenization') - # tokenization (I cannot turn off segmenter, so I need to join the segments) + # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions). self.tokenizer.setText(root.text) - u_sentence = Sentence() - is_another = self.tokenizer.nextSentence(u_sentence) - u_words = u_sentence.words - n_words = u_words.size() - 1 - if is_another: - u_sent_cont = Sentence() - while self.tokenizer.nextSentence(u_sent_cont): - n_cont = u_sent_cont.words.size() - 1 - for i in range(1, n_cont + 1): - u_w = u_sent_cont.words[i] + is_another = True + u_sentences = [] + while is_another: + u_sentence = Sentence() + is_another = self.tokenizer.nextSentence(u_sentence) + if is_another: + u_sentences.append(u_sentence) + + # If resegmentation was not required, we need to join the segments. + if not resegment and len(u_sentences) > 1: + first_sent = u_sentences[0] + n_words = first_sent.words.size() - 1 + for other_sent in u_sentences[1:]: + other_words = other_sent.words.size() - 1 + for i in range(1, other_words + 1): + u_w = other_sent.words[i] n_words += 1 u_w.id = n_words - u_words.append(u_w) + first_sent.words.append(u_w) + u_sentences = [first_sent] # tagging and parsing - self.tool.tag(u_sentence, Model.DEFAULT) - self.tool.parse(u_sentence, Model.DEFAULT) + for u_sentence in u_sentences: + self.tool.tag(u_sentence, Model.DEFAULT) + self.tool.parse(u_sentence, Model.DEFAULT) # converting UDPipe nodes to Udapi nodes - heads, nodes = [], [root] - for i in range(1, u_words.size()): - u_w = u_words[i] - node = root.create_child( - form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, - xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, - ) - node.misc = u_w.misc - heads.append(u_w.head) - nodes.append(node) - for node in nodes[1:]: - head = heads.pop(0) - node.parent = nodes[head] + new_root = root + trees = [] + for u_sentence in u_sentences: + if not new_root: + new_root = Root() + heads, nodes = [], [new_root] + u_words = u_sentence.words + for i in range(1, u_words.size()): + u_w = u_words[i] + node = new_root.create_child( + form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, + xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, + ) + node.misc = u_w.misc + heads.append(u_w.head) + nodes.append(node) + for node in nodes[1:]: + head = heads.pop(0) + node.parent = nodes[head] + trees.append(new_root) + new_root = None + return trees From 0ba159c33634b70b72b35aa57d8d09b5245bab9e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 27 Oct 2019 13:01:13 -0700 Subject: [PATCH 0067/1201] fix reading from a @filelist (which never worked) _token_to_filenames(token) now always returns a list (even if 
token represents a single filename) and string_to_filenames flattens the list of lists.

Fixes #56.
---
 udapi/core/files.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/udapi/core/files.py b/udapi/core/files.py
index 7a711dfe..3a13f230 100644
--- a/udapi/core/files.py
+++ b/udapi/core/files.py
@@ -7,6 +7,7 @@
 import bz2
 import gzip
 import lzma
+import itertools


 class Files(object):
@@ -63,7 +64,8 @@ def string_to_filenames(self, string):
             if not filenames:
                 raise RuntimeError('No filenames matched "%s" pattern' % pattern)
             return filenames
-        return [self._token_to_filenames(tok) for tok in string.replace(',', ' ').split()]
+        return list(itertools.chain.from_iterable(self._token_to_filenames(tok)
+                                                  for tok in string.replace(',', ' ').split()))

     @staticmethod
     def _token_to_filenames(token):
@@ -80,7 +82,7 @@ def _token_to_filenames(token):
             if directory != '.':
                 filenames = [f if f[0] != '/' else directory + '/' + f for f in filenames]
         else:
-            filenames = token
+            filenames = [token]
         return filenames

     @property

From 2c39b236f58fe579982ae5e782d94c619ea48e7e Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sun, 27 Oct 2019 14:05:27 -0700
Subject: [PATCH 0068/1201] store input filename in `document.meta['loaded_from']`

---
 udapi/core/basereader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py
index b5e159a2..f1e32d18 100644
--- a/udapi/core/basereader.py
+++ b/udapi/core/basereader.py
@@ -126,6 +126,8 @@ def process_document(self, document):
                         continue
                     self.finished = not self.files.has_next_file()
                     break
+            if trees_loaded == 0:
+                document.meta['loaded_from'] = self.filename
             add_to_the_last_bundle = 0
             trees_loaded += 1

From 752e5d77d7be1e50bdae46e2088837609160566c Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 6 Nov 2019 22:12:20 +0100
Subject: [PATCH 0069/1201] prevent sent_id ending with a slash (but with no
 zone)

---
 udapi/core/bundle.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py
index 5a9d6808..aa030259 100644
--- a/udapi/core/bundle.py
+++ b/udapi/core/bundle.py
@@ -31,8 +31,11 @@ def bundle_id(self):
     @bundle_id.setter
     def bundle_id(self, bundle_id):
         self._bundle_id = bundle_id
-        for tree in self.trees:
-            tree._sent_id = bundle_id + '/' + tree.zone  # pylint: disable=protected-access
+        if len(self.trees) == 1 and self.trees[0].zone == '':
+            self.trees[0]._sent_id = bundle_id
+        else:
+            for tree in self.trees:
+                tree._sent_id = bundle_id + '/' + tree.zone  # pylint: disable=protected-access

     def __str__(self):
         if self.bundle_id is None:

From 27879db2a0e055e4a53408ce9a3ffd0aa6e0e836 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 8 Nov 2019 14:57:50 +0100
Subject: [PATCH 0070/1201] add block segment.Simple for simple sentence
 segmentation

---
 udapi/block/segment/simple.py | 39 +++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 udapi/block/segment/simple.py

diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py
new file mode 100644
index 00000000..5842a6c5
--- /dev/null
+++ b/udapi/block/segment/simple.py
@@ -0,0 +1,39 @@
+"""Block segment.Simple"""
+from udapi.core.block import Block
+from udapi.core.bundle import Bundle
+import re
+
+# We don't want to introduce the extra "regex" dependency for \p{Lu} support.
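+# (A sketch of the avoided alternative, assuming the third-party "regex" package were
+#  installed: p = regex.compile(r'\p{Lu}') would match any uppercase letter directly,
+#  instead of building the explicit character class below via str.isupper().)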
+# import sys
+# pLu = '[{}]'.format("".join([chr(i) for i in range(sys.maxunicode) if chr(i).isupper()]))
+# p = re.compile(pLu)
+
+
+class Simple(Block):
+    """"Base segmenter, splits on sentence-final segmentation followed by uppercase."""
+
+    @staticmethod
+    def segment_string(string):
+        """A method to be overridden in subclasses."""
+        return re.sub(r'([.!?])(["“»›]?) (["„«¿¡‹(]?)(\d|[ČĎŇÓŘŠŤÚŽA-Z])', r'\1\2\n\3\4', string).split('\n')
+
+
+    def process_document(self, doc):
+        old_bundles = doc.bundles
+        new_bundles = []
+        for bundle in old_bundles:
+            for tree in bundle:
+                new_bundles.append(bundle)
+                if self._should_process_tree(tree):
+                    if tree.children:
+                        raise ValueError("Segmenting already tokenized text is not supported.")
+                    sentences = self.segment_string(tree.text)
+                    orig_bundle_id = bundle.bundle_id
+                    bundle.bundle_id = orig_bundle_id + '-1'
+                    if len(sentences) > 1:
+                        tree.text = sentences[0]
+                        for i, sentence in enumerate(sentences[1:], 2):
+                            new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
+                            new_bundle.create_tree(tree.zone).text = sentence
+                            new_bundles.append(new_bundle)
+        doc.bundles = new_bundles
\ No newline at end of file

From fd454c27567c7bab4545c4ce577722eed68ebf57 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 18 Nov 2019 07:23:59 +0100
Subject: [PATCH 0071/1201] improve segment.Simple

---
 udapi/block/segment/simple.py | 50 +++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py
index 5842a6c5..b4f2bbe4 100644
--- a/udapi/block/segment/simple.py
+++ b/udapi/block/segment/simple.py
@@ -3,27 +3,55 @@
 from udapi.core.bundle import Bundle
 import re

-# We don't want to introduce the extra "regex" dependency for \p{Lu} support.
-# import sys
-# pLu = '[{}]'.format("".join([chr(i) for i in range(sys.maxunicode) if chr(i).isupper()]))
-# p = re.compile(pLu)
-
-
 class Simple(Block):
-    """"Base segmenter, splits on sentence-final segmentation followed by uppercase."""
+    """"Heuristic segmenter, splits on sentence-final punctuation followed by uppercase."""

     @staticmethod
-    def segment_string(string):
-        """A method to be overridden in subclasses."""
-        return re.sub(r'([.!?])(["“»›]?)
(["„«¿¡‹(]?)(\d|[ČĎŇÓŘŠŤÚŽA-Z])', r'\1\2\n\3\4', string).split('\n') + def is_nonfinal_abbrev(token): + """Is a given token an abbreviation (without the final period) which cannot end a sentence?""" + if re.search('(např|e.g.)$', token): + return True + return False + + + def is_boundary(self, first, second): + """Is there a sentence boundary between the first and second token?""" + if first[-1] in '"“»›)': + first = first[:-1] + if second[0] in '"„«¿¡‹(': + second = second[1:] + if not second[0].isupper() or second[0].isdigit(): + return False + if not first[-1] in '.!?': + return False + if first[-1] == '.': + if len(first) == 2 and first[0].isupper(): + return False + if self.is_nonfinal_abbrev(first[:-1]): + return False + return True + + + def segment_string(self, string): + """Return a list of sentences in a given string.""" + tokens = string.split(' ') + previous = tokens[0] + segments = [previous] + for token in tokens[1:]: + if self.is_boundary(previous, token): + segments.append(token) + else: + segments[-1] += ' ' + token + previous = token + return segments def process_document(self, doc): old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: + new_bundles.append(bundle) for tree in bundle: - new_bundles.append(bundle) if self._should_process_tree(tree): if tree.children: raise ValueError("Segmenting already tokenized text is not supported.") From 516addae5ba1694af45985f033d863ec72c35107 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 18 Nov 2019 07:25:43 +0100 Subject: [PATCH 0072/1201] allow udpipe.base parse=0 and various bugfixes --- udapi/block/udpipe/base.py | 5 +++-- udapi/core/bundle.py | 2 ++ udapi/tool/udpipe.py | 29 +++++++++++++++++------------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 2af62d27..34028a4a 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -146,8 +146,9 @@ def process_document(self, doc): for tree in bundle: new_bundles.append(bundle) if self._should_process_tree(tree): - if tok and tag and par: - new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment) + if tok: + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment, + tag=self.tag, parse=self.parse) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index aa030259..4df1deb2 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -100,6 +100,8 @@ def add_tree(self, root): def remove(self): """Remove a bundle from the document.""" self._document.bundles = [bundle for bundle in self._document.bundles if bundle != self] + for i, bundle in enumerate(self._document.bundles): + bundle.number = i def address(self): """Return bundle_id or '?' if missing.""" diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index f42b2944..c81a7562 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -37,7 +37,7 @@ def tag_parse_tree(self, root): for parsed_node in parsed_root.descendants: node = nodes[parsed_node.ord] node.parent = nodes[parsed_node.parent.ord] - for attr in 'upos xpos lemma feats'.split(): + for attr in 'upos xpos lemma feats deprel'.split(): setattr(node, attr, getattr(parsed_node, attr)) # TODO: benchmark which solution is the fastest one. E.g. 
we could also do @@ -46,7 +46,7 @@ def tag_parse_tree(self, root): # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants - def tokenize_tag_parse_tree(self, root, resegment=False): + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. If resegment=True, the returned list of Udapi trees may contain multiple trees. @@ -78,9 +78,13 @@ def tokenize_tag_parse_tree(self, root, resegment=False): u_sentences = [first_sent] # tagging and parsing - for u_sentence in u_sentences: - self.tool.tag(u_sentence, Model.DEFAULT) - self.tool.parse(u_sentence, Model.DEFAULT) + if tag: + for u_sentence in u_sentences: + self.tool.tag(u_sentence, Model.DEFAULT) + if parse: + self.tool.parse(u_sentence, Model.DEFAULT) + elif parse: + raise ValueError('Combination parse=True tag=False is not allowed.') # converting UDPipe nodes to Udapi nodes new_root = root @@ -94,14 +98,15 @@ def tokenize_tag_parse_tree(self, root, resegment=False): u_w = u_words[i] node = new_root.create_child( form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, - xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, + xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc, ) - node.misc = u_w.misc - heads.append(u_w.head) - nodes.append(node) - for node in nodes[1:]: - head = heads.pop(0) - node.parent = nodes[head] + if parse: + heads.append(u_w.head) + nodes.append(node) + if parse: + for node in nodes[1:]: + head = heads.pop(0) + node.parent = nodes[head] trees.append(new_root) new_root = None return trees From 997b6f0b5e1181ed9a535d6eb5c0479ab22082f4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 18 Nov 2019 07:26:54 +0100 Subject: [PATCH 0073/1201] segment.Merge merges sentences ending with semicolons --- udapi/block/segment/merge.py | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 udapi/block/segment/merge.py diff --git a/udapi/block/segment/merge.py b/udapi/block/segment/merge.py new file mode 100644 index 00000000..9ada45f1 --- /dev/null +++ b/udapi/block/segment/merge.py @@ -0,0 +1,46 @@ +"""Block segment.Merge""" +from udapi.core.block import Block + +class Merge(Block): + """"Re-segmenter merging selected sentences (trees). + + This class merges sentences ending with semicolons, + but it can be used as a base class for merging based on different criteria + by overriding one of the `should_*` methods. + """ + + @staticmethod + def should_merge_tokens(first, second): + """Is there actually a sentence boundary between the first and second node?""" + if first.form[-1] == ';': + return True + return False + + def should_merge_bundles(self, first_bundle, second_bundle): + """Is there actually a sentence boundary between the first and second bundle?""" + first_tree = self._get_our_tree(first_bundle) + second_tree = self._get_our_tree(second_bundle) + return self.should_merge_tokens(first_tree.descendants[-1], second_tree.descendants[0]) + + + def _get_our_tree(self, bundle): + for tree in bundle: + if self._should_process_tree(tree): + return tree + raise ValueError("Bundle %s contains no tree to process." 
% bundle.address()) + + + def process_document(self, doc): + old_bundles = doc.bundles + prev_bundle = old_bundles[0] + new_bundles = [prev_bundle] + for bundle in old_bundles[1:]: + if self.should_merge_bundles(prev_bundle, bundle): + for tree in bundle: + prev_tree = prev_bundle.get_tree(tree.zone) + prev_tree.steal_nodes(tree.descendants) + prev_tree.text = prev_tree.compute_text() + else: + new_bundles.append(bundle) + prev_bundle = bundle + doc.bundles = new_bundles \ No newline at end of file From e5c51e66e341c003b7e092f3498310f2125f0a18 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 20 Nov 2019 18:32:59 +0100 Subject: [PATCH 0074/1201] rename newspeak.PreVele to demo.Newspeak --- udapi/block/{newspeak/prevele.py => demo/newspeak.py} | 6 +++--- udapi/block/newspeak/__init__.py | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename udapi/block/{newspeak/prevele.py => demo/newspeak.py} (93%) delete mode 100644 udapi/block/newspeak/__init__.py diff --git a/udapi/block/newspeak/prevele.py b/udapi/block/demo/newspeak.py similarity index 93% rename from udapi/block/newspeak/prevele.py rename to udapi/block/demo/newspeak.py index dd616571..6be2caf5 100644 --- a/udapi/block/newspeak/prevele.py +++ b/udapi/block/demo/newspeak.py @@ -1,10 +1,10 @@ -"""newspeak.PreVele block for 1984-like newspeak-ization. +"""demo.Newspeak block for 1984-like newspeak-ization of Czech. This is just a demo/draft. Usage: $ echo 'Nejhorší žena je lepší než nejlepší muž.' | \ - udapy -q read.Sentences udpipe.Cs newspeak.PreVele write.Sentences + udapy -q read.Sentences udpipe.Cs demo.Newspeak write.Sentences Převelenedobrá žena je veledobrá než převeledobrý muž. """ from udapi.core.block import Block @@ -18,7 +18,7 @@ } -class PreVele(Block): +class Newspeak(Block): """Change all comparatives to vele-x and superlatives to převele-x.""" def __init__(self, morphodita_path='models/morphodita/cs/', diff --git a/udapi/block/newspeak/__init__.py b/udapi/block/newspeak/__init__.py deleted file mode 100644 index e69de29b..00000000 From e7a649cefe737d0c6b67d7edd210b5587dc88c25 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 23 Nov 2019 22:54:45 +0100 Subject: [PATCH 0075/1201] fix loading empty trees --- udapi/block/read/conllu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 1b3383fb..83f7a1c3 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -164,8 +164,10 @@ def read_tree(self): # they need to create one artificial node and mark it with Empty=Yes. # In that case, we will delete this node, so the tree will have just the (technical) root. # See also udapi.block.write.Conllu, which is compatible with this trick. - if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes': + if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': nodes.pop() + root._children = [] + root._descendants = [] # Set dependency parents (now, all nodes of the tree are created). 
# TODO: parent setter checks for cycles, but this is something like O(n*log n) From c0f3cd2dd6e0e547e85180b18332398083222bbe Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 25 Nov 2019 01:02:45 +0100 Subject: [PATCH 0076/1201] first draft of a block for Silvie --- udapi/block/demo/complexity.py | 267 +++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 udapi/block/demo/complexity.py diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py new file mode 100644 index 00000000..a5d972e9 --- /dev/null +++ b/udapi/block/demo/complexity.py @@ -0,0 +1,267 @@ +"""demo.Complexity prints statistics on syntactic complexity. +""" +from udapi.core.basewriter import BaseWriter +from collections import deque + + +def non_punct(nodes): + return [n for n in nodes if n.upos != 'PUNCT'] + + +def is_np(node): + return node.upos in ("NOUN", "PROPN") or (node.upos == "PRON" and node.feats["PronType"] == "Prs" and not node.feats["Poss"]) + + +def is_vp(node): + """E.g. prosili, naléhali a žadonili => 1 coordinated verb phrase, head “prosili”. + + [POS == “VERB”, [deprel == “conj”, POS == “VERB”]], unique coordination heads + TODO: zahrnout i non-VERB? + - vznikla a byla přijata(conj,ADJ,parent=vznikla) + - je(cop,AUX) nešťastný(ADJ) a nechá(conj,VERB,parent=nešťastný) se nalákat + - "podařilo se to a dokladem(ClauseHead,NOUN,conj,parent=podařilo) je(cop,AUX,parent=dokladem)" + - omezit se jen na (či využít) ClauseHead, nebo zahrnout i non-finite verbs (koordinace infinitivů či příčestí)? + "stihl(ClauseHead) napsat(VerbForm=Inf) a publikovat(VerbForm=Inf)" ... napsat ani publikovat nejsou ClauseHead + "rozhodl se ukončit a ukazuje(ClauseHead,parent=ukončit)" správně by mělo být parent=rozhodl, ale parser dělá chyby. + - Parsing vůbec dělá mnoho chyb v koordinacích, takže je vhodné podmínky velmi omezit. + """ + return node.upos == "VERB" or node.misc["ClauseHead"] + + +def is_relcl(node): + """Is a given node a head of a relative clause? + + Unfortunatelly, UDPipe 2.4 produces just acl instead of acl:relcl. + """ + if node.deprel == 'acl:relcl': + return True + return node.udeprel == 'acl' and next((c for c in node.children if 'Rel' in c.feats['PronType']), None) + + +def is_postponed_nom_mod(node): + """Is a given node a postponed nominal modifier? + + Silvie: [(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)), child with higher word order than parent + [deprel != “conj”, POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)] + + TODO: Tohle hledá v češtině zcela běžné jevy jako "vznik díla". Nechceme hledat něco jiného? + """ + return node.udeprel != 'conj' and is_np(node) and node.parent.precedes(node) and is_np(node.parent) + + +def is_postponed_adj_mod(node): + # TODO můžeme rozlišovat holý přívlastek ("písní ruských") a rozvitý ("milenec známý z pozdějšího zpracování") + return node.parent.precedes(node) and is_np(node.parent) and node.upos == 'ADJ' #and not node.children + + +def is_complex_nominal(node): + """[(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)) 2x descendant [deprel != “conj”]] + TODO: punct, case, cc a dep taky ignorovat? + TODO: opravdu descendants a ne children? (descendants snadno roste nad všechny meze, je-li tam třeba vedlejší věta) + TODO: beztak bude chtít odfiltrovat copuly: "Jádrem tvorby jsou sbírky." - Jádrem má 3 děti. + TODO: a nezvýšit ten limit z 2x aspoň na 3x? 
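+    Illustrative example (assuming the usual UD analysis): in "stará píseň o lásce",
+    the head "píseň" gets two qualifying children ("stará" as amod, "lásce" as nmod,
+    with the case-marking "o" filtered out), so it counts as a complex nominal.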
+ """ + return is_np(node) and len([n for n in node.children if n.deprel not in ('conj', 'punct', 'case', 'cc', 'dep', 'cop')]) > 1 + +class Complexity(BaseWriter): + + def __init__(self, matches=False, **kwargs): + super().__init__(**kwargs) + self.matches = matches + + + def process_tree(self, root): + print("# " + root.text) + + allnodes = root.descendants + depth, clause_depth = {0: 0}, {0: 0} + queue = deque(root.children) + clause_heads = [] + while queue: + node = queue.popleft() + depth[node.ord] = depth[node.parent.ord] + 1 + clause_depth[node.ord] = clause_depth[node.parent.ord] + if self.is_finite_clause_head(node): + node.misc['ClauseHead'] = 1 + clause_heads.append(node) + clause_depth[node.ord] += 1 + queue.extend(node.children) + max_depth = sorted(depth.values())[-1] + max_clause_depth = sorted(clause_depth.values())[-1] + + t_units = self.get_t_units([n for n in root.children if n.deprel == 'root']) + total_t_units_length = sum(len(t_unit) for t_unit in t_units) + mean_t_unit_length = total_t_units_length / (len(t_units) or 1) # TODO co reportovat, když věta nemá žádné t-units? + + if not self.matches: + print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='') + + self.report("clauses", [[n] for n in clause_heads], 'subtree') + self.report("adjectivized_predicates", [[n] for n in allnodes if self.is_adjectivized_predicate(n)]) + self.report("controlled_predicates", [[n] for n in allnodes if self.is_controlled_predicate(n)]) + self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause') + self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp)) + self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np)) + self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET"))) + self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV")) + self.report("t-units", t_units) + self.report("complex_t-units", self.get_complex_t_units(root)) + # TODO: najde "básně a písně" a "rychtář a rychtářka" UDPipe kdovíproč určil jako ADV a ADV. Zkontrolovat, máme-li nejlepší možný UDPipe model. + self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause') + self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)]) + # TODO postponed_adj_mod si přidal Martin + #self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)]) + self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)]) + + if not self.matches: + # TODO: pro total koordinace asi nemá smysl reportovat matches, jen total count? 
+ self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True)) + + nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE'] + brackets = str(len([n for n in allnodes if n.form == '('])) + dashes = str(len([n for n in allnodes if n.form in '-–—―'])) # hyphen, en-dash, em-dash, horizonatal bar + colons = str(len([n for n in allnodes if n.form == ':'])) + semicolons = str(len([n for n in allnodes if n.form == ';'])) + print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons])) + + + def report(self, category, groups, expand_type='no'): + if self.matches: + for group in groups: + self.print_match(category, group, expand_type) + else: + print("\t" + str(len(groups)), end='') + + + def expand_subtree(self, nodes, expand_type): + if expand_type == 'no': + return nodes + if len(nodes) > 1: + raise Exception("expanding more than one node not implemented yet") + if expand_type == 'subtree': + return nodes[0].descendants(add_self=True) + #if expand_type == 'subtree_except_conj': + #result = nodes + #for child in group.children: + #if child.udeprel != 'conj': + #result.extend(child.descendants(add_self=True)) + #return = sorted(result, key=lambda n: n.ord) + if expand_type == 'subtree_within_clause': + stack = [n for n in nodes[0].children if n.udeprel != 'conj'] + while stack: + node = stack.pop() + if not node.misc["ClauseHead"]: + nodes.append(node) + stack.extend(node.children()) + return sorted(nodes, key=lambda n: n.ord) + raise ValueError("unknown expand value " + expand_type) + + + def print_match(self, category, group, expand_type='no'): + nodes = self.expand_subtree(group, expand_type) + lemmas = " ".join(n.lemma for n in nodes) + tags = " ".join(n.upos for n in nodes) + n_tokens = str(len(non_punct(nodes))) + print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens])) + + + def is_finite_clause_head(self, node): + """Is a given node a head of a finite clause? + + Silvie: [(POS == „VERB“ & feats:Verbform == „Fin“ | Verbform == „Part“} ) ] OR [(POS in {„ADJ“, „NOUN“, „PROPN“}, [child POS == „AUX“)]] + - POS == „VERB“ je zbytečné, protože VerbForm=Part je nastaveno i u ADJ ("je nucen" apod.) + - child POS == „AUX“ zase matchuje i např. na "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby." + - adjectivized predicates (převažující(VerbForm=Part) básně) by neměly být určeny jako clause_head + + * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl, + but for better robustness we include these as well. + * Similarly "dep" and "orphan" are mostly parsing errors. + * TODO: by uncommenting the nsubj/csubj line, we find few more real clause heads, but also some false positives. 
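+        Illustrative example (assuming the usual UD analysis): in "Petr řekl, že přijde",
+        both "řekl" (deprel root) and "přijde" (deprel ccomp) pass the is_finite_verb test,
+        so both are detected as finite clause heads.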
+ """ + # TODO appos + if ((node.udeprel in {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl', 'parataxis', 'amod', 'dep', 'orphan'} and self.is_finite_verb(node)) + #or any(c.udeprel in {'nsubj', 'csubj'} for c in node.children) + or (any(c.udeprel == 'cop' for c in node.children) and node.udeprel != 'xcomp')): + return True + xcomp_child = next((c for c in node.children if c.udeprel == 'xcomp'), None) + return xcomp_child and any(c.udeprel == 'cop' for c in xcomp_child.children) + + + # TODO: zahrnout i: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf) + def is_finite_verb(self, node): + return (node.feats['VerbForm'] in {'Fin', 'Part'} and + (node.upos == 'VERB' or + node.upos == 'ADJ' and any(c.deprel == 'aux:pass' for c in node.children))) + + + def is_adjectivized_predicate(self, node): + """E.g. kouřící komín, zbitý kluk + + Silvie: [(POS == „ADJ“ & feats:VerbForm == „Part“), parent [POS in {„NOUN“, „PROPN“}] ] + - parent [POS in {„NOUN“, „PROPN“}] zamezí případům jako + "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" + "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" asi chyba parsingu, mělo být parent=myšlení? + - dotaz naopak matchuje na "způsob, jakým jsou popsány", proto přidávám podmínku not node.misc["ClauseHead"] + """ + return (node.feats["VerbForm"] == "Part" + and node.upos == "ADJ" + and (node.parent.upos in {"NOUN","PROPN"} or (node.udeprel == "conj" and node.parent.upos == "ADJ")) + and not node.misc["ClauseHead"]) + + + def is_controlled_predicate(self, node): + """E.g. Mohli jsme odejít i zůstat. + + TODO: Chceme zahrnout i druhý a další člen koordinace, např. "stihl napsat a publikovat", + tedy node.udeprel == "conj" and node.parent.udeprel == "xcomp"? + """ + return node.deprel == "xcomp" + + + def get_main_clauses(self, root): + main_heads = [] + for main_head in root.children: + main_heads.append(main_head) + main_heads.extend(n for n in main_head.children if n.udeprel == 'conj') + return [[n] for n in main_heads] + + def get_coord_phrase(self, root, phrase_type_function): + results = [] + for node in root.descendants: + if phrase_type_function(node): + conjuncts = [n for n in node.children if n.udeprel == 'conj' and phrase_type_function(n)] + if conjuncts: + conjunctions = [] + for conj in conjuncts: + # TODO multiword conjunctions (udeprel=flat)? + conjunctions.extend([n for n in conj.children if n.udeprel == 'cc']) + results.append(sorted([node] + conjuncts + conjunctions, key=lambda n: n.ord)) + return results + + + def get_t_units(self, main_heads): + results = [] + for main_head in main_heads: + main_clause = [main_head] + dep_heads = [] + stack = main_head.children + while stack: + node = stack.pop() + if node.misc["ClauseHead"]: + dep_heads.append(node) + else: + main_clause.append(node) + stack.extend(node.children) + main_clause = sorted(main_clause, key=lambda n: n.ord) + + for dep_clause_head in dep_heads: + results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) # TODO subtree_within_clause? + return results + + + def get_complex_t_units(self, root): + results = [] + for node in root.descendants: + if node.deprel != 'root' and node.misc["ClauseHead"]: # TODO: exclude the main clause? 
+ results += self.get_t_units([node]) + return results From c3221e2ce0718f77de69adeb9462628bf16bee6d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 25 Nov 2019 13:06:37 +0100 Subject: [PATCH 0077/1201] tidy demo.Complexity --- udapi/block/demo/complexity.py | 237 +++++++++++++++++---------------- 1 file changed, 119 insertions(+), 118 deletions(-) diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py index a5d972e9..020f8b17 100644 --- a/udapi/block/demo/complexity.py +++ b/udapi/block/demo/complexity.py @@ -13,7 +13,7 @@ def is_np(node): def is_vp(node): - """E.g. prosili, naléhali a žadonili => 1 coordinated verb phrase, head “prosili”. + """E.g. prosili, naléhali a žadonili => 1 coordinated verb phrase, head “prosili”. [POS == “VERB”, [deprel == “conj”, POS == “VERB”]], unique coordination heads TODO: zahrnout i non-VERB? @@ -30,20 +30,20 @@ def is_vp(node): def is_relcl(node): """Is a given node a head of a relative clause? - + Unfortunatelly, UDPipe 2.4 produces just acl instead of acl:relcl. """ if node.deprel == 'acl:relcl': return True - return node.udeprel == 'acl' and next((c for c in node.children if 'Rel' in c.feats['PronType']), None) + return node.udeprel == 'acl' and any('Rel' in c.feats['PronType'] for c in node.children) def is_postponed_nom_mod(node): """Is a given node a postponed nominal modifier? - + Silvie: [(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)), child with higher word order than parent [deprel != “conj”, POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)] - + TODO: Tohle hledá v češtině zcela běžné jevy jako "vznik díla". Nechceme hledat něco jiného? """ return node.udeprel != 'conj' and is_np(node) and node.parent.precedes(node) and is_np(node.parent) @@ -61,68 +61,67 @@ def is_complex_nominal(node): TODO: beztak bude chtít odfiltrovat copuly: "Jádrem tvorby jsou sbírky." - Jádrem má 3 děti. TODO: a nezvýšit ten limit z 2x aspoň na 3x? """ - return is_np(node) and len([n for n in node.children if n.deprel not in ('conj', 'punct', 'case', 'cc', 'dep', 'cop')]) > 1 + return is_np(node) and len([n for n in node.descendants if n.deprel not in ('conj', 'punct', 'case', 'cc', 'dep', 'cop')]) > 1 -class Complexity(BaseWriter): - def __init__(self, matches=False, **kwargs): - super().__init__(**kwargs) - self.matches = matches +def is_finite_clause_head(node): + """Is a given node a head of a finite clause? + Silvie: [(POS == „VERB“ & feats:Verbform == „Fin“ | Verbform == „Part“} ) ] OR [(POS in {„ADJ“, „NOUN“, „PROPN“}, [child POS == „AUX“)]] + - POS == „VERB“ je zbytečné, protože VerbForm=Part je nastaveno i u ADJ ("je nucen" apod.) + - child POS == „AUX“ zase matchuje i např. na "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby." 
+ - adjectivized predicates (převažující(VerbForm=Part) básně) by neměly být určeny jako clause_head - def process_tree(self, root): - print("# " + root.text) - - allnodes = root.descendants - depth, clause_depth = {0: 0}, {0: 0} - queue = deque(root.children) - clause_heads = [] - while queue: - node = queue.popleft() - depth[node.ord] = depth[node.parent.ord] + 1 - clause_depth[node.ord] = clause_depth[node.parent.ord] - if self.is_finite_clause_head(node): - node.misc['ClauseHead'] = 1 - clause_heads.append(node) - clause_depth[node.ord] += 1 - queue.extend(node.children) - max_depth = sorted(depth.values())[-1] - max_clause_depth = sorted(clause_depth.values())[-1] + * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl, + but for better robustness we include these as well. + * Similarly "dep" and "orphan" are mostly parsing errors. + * TODO: by uncommenting the nsubj/csubj line, we find few more real clause heads, but also some false positives. + """ + # TODO appos + if ((node.udeprel in {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl', 'parataxis', 'amod', 'dep', 'orphan'} + and is_finite_verb(node)) + #or any(c.udeprel in {'nsubj', 'csubj'} for c in node.children) + or (any(c.udeprel == 'cop' for c in node.children) and node.udeprel != 'xcomp')): + return True + xcomp_child = next((c for c in node.children if c.udeprel == 'xcomp'), None) + return xcomp_child and any(c.udeprel == 'cop' for c in xcomp_child.children) - t_units = self.get_t_units([n for n in root.children if n.deprel == 'root']) - total_t_units_length = sum(len(t_unit) for t_unit in t_units) - mean_t_unit_length = total_t_units_length / (len(t_units) or 1) # TODO co reportovat, když věta nemá žádné t-units? - - if not self.matches: - print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='') - - self.report("clauses", [[n] for n in clause_heads], 'subtree') - self.report("adjectivized_predicates", [[n] for n in allnodes if self.is_adjectivized_predicate(n)]) - self.report("controlled_predicates", [[n] for n in allnodes if self.is_controlled_predicate(n)]) - self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause') - self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp)) - self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np)) - self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET"))) - self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV")) - self.report("t-units", t_units) - self.report("complex_t-units", self.get_complex_t_units(root)) - # TODO: najde "básně a písně" a "rychtář a rychtářka" UDPipe kdovíproč určil jako ADV a ADV. Zkontrolovat, máme-li nejlepší možný UDPipe model. - self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause') - self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)]) - # TODO postponed_adj_mod si přidal Martin - #self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)]) - self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)]) - if not self.matches: - # TODO: pro total koordinace asi nemá smysl reportovat matches, jen total count? 
- self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True)) - - nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE'] - brackets = str(len([n for n in allnodes if n.form == '('])) - dashes = str(len([n for n in allnodes if n.form in '-–—―'])) # hyphen, en-dash, em-dash, horizonatal bar - colons = str(len([n for n in allnodes if n.form == ':'])) - semicolons = str(len([n for n in allnodes if n.form == ';'])) - print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons])) +# TODO: zahrnout i: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf) +def is_finite_verb(node): + return (node.feats['VerbForm'] in {'Fin', 'Part'} and + (node.upos == 'VERB' or + node.upos == 'ADJ' and any(c.deprel == 'aux:pass' for c in node.children))) + + +def is_adjectivized_predicate(node): + """E.g. kouřící komín, zbitý kluk + + Silvie: [(POS == „ADJ“ & feats:VerbForm == „Part“), parent [POS in {„NOUN“, „PROPN“}] ] + - parent [POS in {„NOUN“, „PROPN“}] zamezí případům jako + "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" + "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" asi chyba parsingu, mělo být parent=myšlení? + - dotaz naopak matchuje na "způsob, jakým jsou popsány", proto přidávám podmínku not node.misc["ClauseHead"] + """ + return (node.feats["VerbForm"] == "Part" + and node.upos == "ADJ" + and (node.parent.upos in {"NOUN","PROPN"} or (node.udeprel == "conj" and node.parent.upos == "ADJ")) + and not node.misc["ClauseHead"]) + + +def is_controlled_predicate(node): + """E.g. Mohli jsme odejít i zůstat. + + TODO: Chceme zahrnout i druhý a další člen koordinace, např. "stihl napsat a publikovat", + tedy node.udeprel == "conj" and node.parent.udeprel == "xcomp"? + """ + return node.deprel == "xcomp" + +class Complexity(BaseWriter): + + def __init__(self, matches=False, **kwargs): + super().__init__(**kwargs) + self.matches = matches def report(self, category, groups, expand_type='no'): @@ -165,59 +164,6 @@ def print_match(self, category, group, expand_type='no'): print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens])) - def is_finite_clause_head(self, node): - """Is a given node a head of a finite clause? - - Silvie: [(POS == „VERB“ & feats:Verbform == „Fin“ | Verbform == „Part“} ) ] OR [(POS in {„ADJ“, „NOUN“, „PROPN“}, [child POS == „AUX“)]] - - POS == „VERB“ je zbytečné, protože VerbForm=Part je nastaveno i u ADJ ("je nucen" apod.) - - child POS == „AUX“ zase matchuje i např. na "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby." - - adjectivized predicates (převažující(VerbForm=Part) básně) by neměly být určeny jako clause_head - - * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl, - but for better robustness we include these as well. - * Similarly "dep" and "orphan" are mostly parsing errors. - * TODO: by uncommenting the nsubj/csubj line, we find few more real clause heads, but also some false positives. 
- """ - # TODO appos - if ((node.udeprel in {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl', 'parataxis', 'amod', 'dep', 'orphan'} and self.is_finite_verb(node)) - #or any(c.udeprel in {'nsubj', 'csubj'} for c in node.children) - or (any(c.udeprel == 'cop' for c in node.children) and node.udeprel != 'xcomp')): - return True - xcomp_child = next((c for c in node.children if c.udeprel == 'xcomp'), None) - return xcomp_child and any(c.udeprel == 'cop' for c in xcomp_child.children) - - - # TODO: zahrnout i: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf) - def is_finite_verb(self, node): - return (node.feats['VerbForm'] in {'Fin', 'Part'} and - (node.upos == 'VERB' or - node.upos == 'ADJ' and any(c.deprel == 'aux:pass' for c in node.children))) - - - def is_adjectivized_predicate(self, node): - """E.g. kouřící komín, zbitý kluk - - Silvie: [(POS == „ADJ“ & feats:VerbForm == „Part“), parent [POS in {„NOUN“, „PROPN“}] ] - - parent [POS in {„NOUN“, „PROPN“}] zamezí případům jako - "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" - "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" asi chyba parsingu, mělo být parent=myšlení? - - dotaz naopak matchuje na "způsob, jakým jsou popsány", proto přidávám podmínku not node.misc["ClauseHead"] - """ - return (node.feats["VerbForm"] == "Part" - and node.upos == "ADJ" - and (node.parent.upos in {"NOUN","PROPN"} or (node.udeprel == "conj" and node.parent.upos == "ADJ")) - and not node.misc["ClauseHead"]) - - - def is_controlled_predicate(self, node): - """E.g. Mohli jsme odejít i zůstat. - - TODO: Chceme zahrnout i druhý a další člen koordinace, např. "stihl napsat a publikovat", - tedy node.udeprel == "conj" and node.parent.udeprel == "xcomp"? - """ - return node.deprel == "xcomp" - - def get_main_clauses(self, root): main_heads = [] for main_head in root.children: @@ -225,6 +171,7 @@ def get_main_clauses(self, root): main_heads.extend(n for n in main_head.children if n.udeprel == 'conj') return [[n] for n in main_heads] + def get_coord_phrase(self, root, phrase_type_function): results = [] for node in root.descendants: @@ -238,7 +185,7 @@ def get_coord_phrase(self, root, phrase_type_function): results.append(sorted([node] + conjuncts + conjunctions, key=lambda n: n.ord)) return results - + # TODO koordinace hlavních i vedlejších vět def get_t_units(self, main_heads): results = [] for main_head in main_heads: @@ -253,15 +200,69 @@ def get_t_units(self, main_heads): main_clause.append(node) stack.extend(node.children) main_clause = sorted(main_clause, key=lambda n: n.ord) - + for dep_clause_head in dep_heads: - results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) # TODO subtree_within_clause? + results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) return results - + # TODO complex t-unit má jinou definici: 3 klauze def get_complex_t_units(self, root): results = [] for node in root.descendants: if node.deprel != 'root' and node.misc["ClauseHead"]: # TODO: exclude the main clause? 
results += self.get_t_units([node]) return results + + + def process_tree(self, root): + print("# " + root.text) + + allnodes = root.descendants + depth, clause_depth = {0: 0}, {0: 0} + queue = deque(root.children) + clause_heads = [] + while queue: + node = queue.popleft() + depth[node.ord] = depth[node.parent.ord] + 1 + clause_depth[node.ord] = clause_depth[node.parent.ord] + if is_finite_clause_head(node): + node.misc['ClauseHead'] = 1 + clause_heads.append(node) + clause_depth[node.ord] += 1 + queue.extend(node.children) + max_depth = sorted(depth.values())[-1] + max_clause_depth = sorted(clause_depth.values())[-1] + + t_units = self.get_t_units([n for n in root.children if n.deprel == 'root']) + total_t_units_length = sum(len(t_unit) for t_unit in t_units) + mean_t_unit_length = total_t_units_length / (len(t_units) or 1) # TODO co reportovat, když věta nemá žádné t-units? + + if not self.matches: + print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='') + + self.report("clauses", [[n] for n in clause_heads], 'subtree') + self.report("adjectivized_predicates", [[n] for n in allnodes if is_adjectivized_predicate(n)]) + self.report("controlled_predicates", [[n] for n in allnodes if is_controlled_predicate(n)]) + self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause') + self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp)) + self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np)) + self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET"))) + self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV")) + self.report("t-units", t_units) + self.report("complex_t-units", self.get_complex_t_units(root)) + # TODO: najde "básně a písně" a "rychtář a rychtářka" UDPipe kdovíproč určil jako ADV a ADV. Zkontrolovat, máme-li nejlepší možný UDPipe model. + self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause') + self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)]) + self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)]) + self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)]) + + if not self.matches: + # TODO: pro total koordinace asi nemá smysl reportovat matches, jen total count? 
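+ # (English translation of the TODO above: for the total coordination count,
+ # reporting individual matches probably makes no sense; only a total count would.)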
+ self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True)) + + nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE'] + brackets = str(len([n for n in allnodes if n.form == '('])) + dashes = str(len([n for n in allnodes if n.form in '-–—―'])) # hyphen, en-dash, em-dash, horizonatal bar + colons = str(len([n for n in allnodes if n.form == ':'])) + semicolons = str(len([n for n in allnodes if n.form == ';'])) + print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons])) From c700ea1e276b9a3b63a0c16ede00daa5c95e7863 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 10 Dec 2019 14:27:37 +0100 Subject: [PATCH 0078/1201] fix a bug in eval.F1 --- udapi/block/eval/f1.py | 66 ++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index a4f93a1b..982e4190 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -112,9 +112,32 @@ def process_tree(self, tree): pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] - common = find_lcs(pred_tokens, gold_tokens) - if self.focus is not None: + # lcs("abc", "acb") can be either "ab" or "ac". + # We want to prefer the LCS with the highest number of non-focused tokens. + # E.g. if focus="," then lcs("a,c", "ac,") should be "ac" and the comma should be evaluated + # as non-aligned, i.e. eval.F1 should return precision=recall=f1=0 for this sentence. + if self.focus is None: + common = find_lcs(pred_tokens, gold_tokens) + else: + nf_pred_tokens = [x for x in pred_tokens if not self.focus.fullmatch(x)] + nf_gold_tokens = [x for x in gold_tokens if not self.focus.fullmatch(x)] + nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) + i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] + while i < len(pred_tokens) and j < len(gold_tokens): + while nf_common[c] != pred_tokens[i]: + un_pred.append(pred_tokens[i]) + i += 1 + while nf_common[c] != gold_tokens[j]: + un_gold.append(gold_tokens[j]) + j += 1 + common += find_lcs(un_pred, un_gold) + un_pred, un_gold = [], [] + while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: + i, j, c = i+1, j+1, c+1 + if c == len(nf_common): + common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + break common = [x for x in common if self.focus.fullmatch(x)] pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] @@ -172,22 +195,29 @@ def process_end(self): # difflib.SequenceMatcher does not compute LCS, so let's implement it here -# TODO: make faster by trimming common prefix and sufix def find_lcs(x, y): """Find longest common subsequence.""" m, n = len(x), len(y) - C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m + 1): - for j in range(1, n + 1): - C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) - index = C[m][n] - lcs = [None] * index - while m > 0 and n > 0: - if x[m - 1] == y[n - 1]: - lcs[index - 1] = x[m - 1] - m, n, index = m - 1, n - 1, index - 1 - elif C[m - 1][n] > C[m][n - 1]: - m -= 1 - else: - n -= 1 - return lcs + if m == 0 or n == 0: + return [] + elif x[0] == y[0]: + i = 1 + while i < min(m, n) and x[i] == y[i]: + i += 1 + return x[:i] + (find_lcs(x[i:], y[i:]) if i < min(m, n) else []) + else: + C = [[0] * (n + 1) for _ in range(m + 1)] + 
for i in range(1, m + 1): + for j in range(1, n + 1): + C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m - 1] == y[n - 1]: + lcs[index - 1] = x[m - 1] + m, n, index = m - 1, n - 1, index - 1 + elif C[m - 1][n] > C[m][n - 1]: + m -= 1 + else: + n -= 1 + return lcs From dd0fe498a8ca6a8177a8c8fdcb106a55fb1024d4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 12 Dec 2019 14:52:09 +0100 Subject: [PATCH 0079/1201] use `is` instead of `==` for node identity checks, it is faster --- udapi/core/node.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index c0684e60..2b1006fd 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -254,17 +254,17 @@ def parent(self, new_parent): (from the list of original parent's children). """ # If the parent is already assigned, return. - if self.parent == new_parent: + if self.parent is new_parent: return # The node itself couldn't be assigned as a parent. - if self == new_parent: + if self is new_parent: raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s' % self) # Check if the current Node is not an antecedent of the new parent. climbing_node = new_parent while not climbing_node.is_root(): - if climbing_node == self: + if climbing_node is self: raise ValueError('Setting the parent of %s to %s would lead to a cycle.' % (self, new_parent)) climbing_node = climbing_node.parent @@ -272,10 +272,10 @@ def parent(self, new_parent): # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. if self._parent: - self._parent._children = [node for node in self.parent.children if node != self] + self._parent._children = [node for node in self.parent.children if node is not self] # TODO: .root is currently computed, so it is quite slow old_root, new_root = self._parent.root, climbing_node - if old_root != new_root: + if old_root is not new_root: raise ValueError('Cannot move nodes between trees with parent setter, ' 'use new_root.steal_nodes(nodes_to_be_moved) instead') # Set the new parent. @@ -342,7 +342,7 @@ def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" climber = self.parent while climber: - if climber == node: + if climber is node: return True climber = climber.parent return False @@ -392,7 +392,7 @@ def remove(self, children=None): `warn` means to issue a warning if any children are present and delete them. `rehang_warn` means to rehang and warn:-). 
""" - self.parent._children = [child for child in self.parent.children if child != self] + self.parent._children = [child for child in self.parent.children if child is not self] if children is not None and self.children: if children.startswith('rehang'): for child in self.children: @@ -413,7 +413,7 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): reference_ord = reference_node.ord if reference_subtree: - for node in [n for n in reference_node.descendants() if n != self]: + for node in [n for n in reference_node.descendants() if n is not self]: if (after and node.ord > reference_ord) or (not after and node.ord < reference_ord): reference_ord = node.ord From 05e63a375de5b4e86f389fecc89b238c4feae5dd Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 12 Dec 2019 14:55:54 +0100 Subject: [PATCH 0080/1201] nicer error message for `node.parent = None` --- udapi/core/node.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 2b1006fd..dd3d2980 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -257,9 +257,11 @@ def parent(self, new_parent): if self.parent is new_parent: return - # The node itself couldn't be assigned as a parent. + # The node itself couldn't be assigned as a parent. None cannot be used as parent. if self is new_parent: raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s' % self) + if new_parent is None: + raise ValueError('Cannot set None as parent: %s' % self) # Check if the current Node is not an antecedent of the new parent. climbing_node = new_parent From 19b4e680b2ff7bbbec6060b9b97bcb1949f3196d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Mar 2020 20:20:01 +0100 Subject: [PATCH 0081/1201] return `sys.stdout` always back to its original value and close the filehandle (unless it is the real __stdout__). Fixes #57. 
--- udapi/core/basewriter.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index ed574c95..956d08c9 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -42,6 +42,7 @@ def before_process_document(self, document): logging.info('Writing to filehandle.') sys.stdout = self.files.filehandle return + old_filehandle = sys.stdout if self.orig_files == '-': if self.docname_as_file: docname = document.meta.get('docname', None) @@ -52,23 +53,23 @@ def before_process_document(self, document): logging.warning('docname_as_file=1 but the document contains no docname') else: sys.stdout = sys.__stdout__ - return - - old_filehandle = sys.stdout - if old_filehandle.fileno != sys.stdout.fileno: + else: + filename = self.next_filename() + if filename is None: + raise RuntimeError('There are more documents to save than filenames given (%s)' + % self.orig_files) + elif filename == '-': + logging.info('Writing to stdout.') + sys.stdout = sys.__stdout__ + else: + logging.info('Writing to file %s.', filename) + sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) + if old_filehandle.fileno() not in (sys.stdout.fileno(), sys.__stdout__.fileno()): old_filehandle.close() - filename = self.next_filename() - if filename is None: - raise RuntimeError('There are more documents to save than filenames given (%s)' - % self.orig_files) - elif filename == '-': - logging.info('Writing to stdout.') - sys.stdout = sys.__stdout__ - else: - logging.info('Writing to file %s.', filename) - sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) def after_process_document(self, document): - if self.orig_files == '': + sys.stdout.flush() + if sys.stdout.fileno() != sys.__stdout__.fileno(): + sys.stdout.close() sys.stdout = sys.__stdout__ From a09e43340c6cc4c71bf82948a25c74280681c81a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 27 Mar 2020 00:09:30 +0100 Subject: [PATCH 0082/1201] Jupyter stdout doesn't support fileno() and __stdout__ != stdout --- udapi/core/basewriter.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 956d08c9..1d64e874 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -13,6 +13,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' newline='\n', **kwargs): super().__init__(**kwargs) self.orig_files = files + self.orig_stdout = sys.stdout if filehandle is not None: files = None self.orig_files = '' @@ -52,7 +53,7 @@ def before_process_document(self, document): else: logging.warning('docname_as_file=1 but the document contains no docname') else: - sys.stdout = sys.__stdout__ + sys.stdout = self.orig_stdout else: filename = self.next_filename() if filename is None: @@ -60,16 +61,16 @@ def before_process_document(self, document): % self.orig_files) elif filename == '-': logging.info('Writing to stdout.') - sys.stdout = sys.__stdout__ + sys.stdout = self.orig_stdout else: logging.info('Writing to file %s.', filename) sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) - if old_filehandle.fileno() not in (sys.stdout.fileno(), sys.__stdout__.fileno()): + if old_filehandle not in (sys.stdout, self.orig_stdout): old_filehandle.close() def after_process_document(self, document): sys.stdout.flush() - if sys.stdout.fileno() != sys.__stdout__.fileno(): + if sys.stdout != 
self.orig_stdout: sys.stdout.close() - sys.stdout = sys.__stdout__ + sys.stdout = self.orig_stdout From 29e1edab9823083b8440af872d9a9db4e100e3b8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Apr 2020 14:45:48 +0200 Subject: [PATCH 0083/1201] Processing enhanced dependencies does not work.\ --- udapi/core/node.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index dd3d2980..02b6f049 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -228,7 +228,11 @@ def deps(self): return self._deps for raw_dependency in self._raw_deps.split('|'): - head, deprel = raw_dependency.split(':') + # Deprel itself may contain one or more ':' (subtypes). + pieces = raw_dependency.split(':') + head = pieces[0] + deprel = ':'.join(pieces[1:]) + ###!!! The following line will throw an exception if the head is an empty node, e.g., '7.1'. parent = nodes[int(head)] self._deps.append({'parent': parent, 'deprel': deprel}) From 33b0d260dc085ed09626cda052bf9184a83ba226 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 20 Apr 2020 14:48:43 +0200 Subject: [PATCH 0084/1201] Another quick patch of node.deps. --- udapi/core/node.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 02b6f049..ee5bc24e 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -232,8 +232,15 @@ def deps(self): pieces = raw_dependency.split(':') head = pieces[0] deprel = ':'.join(pieces[1:]) - ###!!! The following line will throw an exception if the head is an empty node, e.g., '7.1'. - parent = nodes[int(head)] + # Empty nodes have to be located differently than normal nodes. + if '.' in head: + matching = [x for x in self.root.empty_nodes if x.ord == head] + if len(matching) > 0: + parent = matching[0] + else: + parent = None ###!!! what should we do here? + else: + parent = nodes[int(head)] self._deps.append({'parent': parent, 'deprel': deprel}) return self._deps From 912af2639c1fae9d2d7648f1b9a745b234d02edb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 4 May 2020 11:41:59 +0200 Subject: [PATCH 0085/1201] prevent ud.FixPunct to create multiple sub-roots Fixes #60 --- udapi/block/ud/fixpunct.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 96ec0ed5..55097997 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -61,7 +61,9 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar self.copy_to_enhanced = copy_to_enhanced def process_tree(self, root): - # First, make sure no PUNCT has children + # First, make sure no PUNCT has children. + # This may introduce multiple subroots, which will be fixed later on + # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: while node.parent.upos == "PUNCT": node.parent = node.parent.parent @@ -84,15 +86,25 @@ def process_tree(self, root): if node.upos == "PUNCT" and not self._punct_type[node.ord]: self._fix_subord_punct(node) - # Finally, check if root is still marked with deprel=root. - # This may not hold if the original root was a paired punctuation, which was rehanged. 
- for node in root.children: - if node.udeprel != 'root': - node.udeprel = 'root' - for another_node in root.descendants: - if another_node.parent != root and another_node.udeprel == 'root': - another_node.udeprel = 'punct' + # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". + # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. + # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). + if len(root.children) > 1: + selected_subroot = next((n for n in root.children if n.udeprel == 'root'), root.children[0]) + for a_subroot in root.children: + if a_subroot != selected_subroot: + a_subroot.parent = selected_subroot + + # Check if the subroot is still marked with deprel=root. + # This may not hold if the original subroot was a paired punctuation, which was rehanged. + if root.children[0].udeprel != 'root': + root.children[0].udeprel = 'root' + for another_node in root.children[0].descendants: + if another_node.udeprel == 'root': + another_node.udeprel = 'punct' + + # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. if self.copy_to_enhanced: for node in root.descendants: if node.upos == "PUNCT": @@ -135,7 +147,7 @@ def _fix_subord_punct(self, node): # because climbing higher would cause a non-projectivity (the punct would be the gap). l_path, r_path = [l_cand], [r_cand] if l_cand is None or l_cand.is_root(): - l_cand = None + l_cand, l_path = None, [] else: while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) and not node.precedes(l_cand.descendants(add_self=1)[-1])): From 1e4004f577f3c6e471528ce4b87dd570ce8f2706 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 11 May 2020 14:15:47 +0200 Subject: [PATCH 0086/1201] ud.FixPunct should not introduce punct-nonproj-gap errors (in addition to punct-nonproj errors, as defined and checked by the UD validator and ud.MarkBugs). Fixes #52 --- udapi/block/ud/fixpunct.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 55097997..cc34a0d0 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -159,7 +159,8 @@ def _fix_subord_punct(self, node): r_cand = r_cand.parent r_path.append(r_cand) - # Filter out candidates which would lead to non-projectivities. + # Filter out candidates which would lead to non-projectivities, i.e. bugs + # punct-nonproj and punct-nonproj-gap as checked by the UD validator and ud.MarkBugs. orig_parent = node.parent l_path = [n for n in l_path if n and self._will_be_projective(node, n)] r_path = [n for n in r_path if n and self._will_be_projective(node, n)] @@ -196,7 +197,7 @@ def _fix_subord_punct(self, node): def _will_be_projective(self, node, cand): node.parent = cand - return not node.is_nonprojective() + return not node.is_nonprojective() and not self._causes_gap(node) def _causes_gap(self, node): return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap() From 2385ff1218726157b4f0cba1bb6d3a905ec5562b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 29 Jun 2020 23:05:01 +0200 Subject: [PATCH 0087/1201] support Udpipe segmentation w/o tokenization/tagging/parsing e.g. `udpipe.En tag=0 parse=0 tokenize=0 resegment=1` Also, even when `tokenize=1 resegment=1`, the newly created trees should have `tree.text` set to the correct sentence string. 
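
A rough Python sketch of the new segment-only mode (the input file name is
illustrative; the udpipe.En block downloads its model on first use):

```
from udapi.core.document import Document
from udapi.block.read.sentences import Sentences
from udapi.block.udpipe.en import En

doc = Document()
# read.Sentences creates one tree per input line, setting only tree.text.
Sentences(files="raw.txt").apply_on_document(doc)
# With everything but resegmentation switched off, lines containing several
# sentences are split into separate bundles; no tokens are created.
En(tokenize=False, tag=False, parse=False, resegment=True).apply_on_document(doc)
```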
--- udapi/block/udpipe/base.py | 12 +++++++++++- udapi/tool/udpipe.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 34028a4a..3ec4a131 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -152,7 +152,6 @@ def process_document(self, doc): if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' - tree.text = None for i, new_tree in enumerate(new_trees[1:], 2): new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) new_tree.zone = tree.zone @@ -160,6 +159,17 @@ def process_document(self, doc): new_bundles.append(new_bundle) elif not tok and tag and par: self.tool.tag_parse_tree(tree) + elif not tok and not tag and not par and self.resegment: + sentences = self.tool.segment_text(tree.text) + if len(sentences) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + tree.text = sentences[0] + for i, sentence in enumerate(sentences[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_tree = new_bundle.create_tree(zone=tree.zone) + new_tree.text = sentence + new_bundles.append(new_bundle) else: raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) doc.bundles = new_bundles diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index c81a7562..c08785da 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -92,6 +92,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): for u_sentence in u_sentences: if not new_root: new_root = Root() + new_root.text = u_sentence.getText() heads, nodes = [], [new_root] u_words = u_sentence.words for i in range(1, u_words.size()): @@ -110,3 +111,15 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): trees.append(new_root) new_root = None return trees + + def segment_text(self, text): + """Segment the provided text into sentences.""" + self.tokenizer.setText(text) + is_another = True + sentences = [] + while is_another: + u_sentence = Sentence() + is_another = self.tokenizer.nextSentence(u_sentence) + if is_another: + sentences.append(u_sentence.getText()) + return sentences From 000205820e632461f36455b48fd60af4c58ef3cb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 21 Oct 2020 10:40:23 +0200 Subject: [PATCH 0088/1201] Added a block to fix pseudocopulas in a UD treebank. --- udapi/block/ud/fixpseudocop.py | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 udapi/block/ud/fixpseudocop.py diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py new file mode 100644 index 00000000..ab07eaaa --- /dev/null +++ b/udapi/block/ud/fixpseudocop.py @@ -0,0 +1,39 @@ +"""Block to fix annotation of verbs that are currently treated as copulas + but they should be treated as normal verbs (with secondary predication) + instead.""" +from udapi.core.block import Block +import logging +import re + +class FixPseudoCop(Block): + + def __init__(self, lemma, **kwargs): + """Create the ud.FixPseudoCop block instance. 
+ + Args: + lemma: the lemma of the pseudocopula that should be fixed + """ + super().__init__(**kwargs) + self.lemma = lemma + + def process_node(self, node): + pseudocop = self.lemma + if node.lemma == pseudocop and node.udeprel == "cop": + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node From 2fb4e005c76473bbd9ee64cf40a235078f1f09b3 Mon Sep 17 00:00:00 2001 From: Francesco Mambrini Date: Sat, 7 Nov 2020 14:30:08 +0100 Subject: [PATCH 0089/1201] Issue 62: proposed fix and test --- udapi/core/node.py | 4 ++- udapi/core/tests/test_enhdeps.py | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 udapi/core/tests/test_enhdeps.py diff --git a/udapi/core/node.py b/udapi/core/node.py index dd3d2980..0851f202 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -195,7 +195,9 @@ def raw_deps(self): if self._deps is not None: serialized_deps = [] for secondary_dependence in self._deps: - serialized_deps.append('%d:%s' % (secondary_dependence[ + # serialized_deps.append('%d:%s' % (secondary_dependence[ + # 'parent'].ord, secondary_dependence['deprel'])) + serialized_deps.append('{}:{}'.format(secondary_dependence[ 'parent'].ord, secondary_dependence['deprel'])) self._raw_deps = '|'.join(serialized_deps) return self._raw_deps diff --git a/udapi/core/tests/test_enhdeps.py b/udapi/core/tests/test_enhdeps.py new file mode 100644 index 00000000..fd80b3f2 --- /dev/null +++ b/udapi/core/tests/test_enhdeps.py @@ -0,0 +1,62 @@ +import unittest +import os +import udapi + +from udapi.core.root import Root +from udapi.core.node import Node, find_minimal_common_treelet +from udapi.core.document import Document +from udapi.block.read.conllu import Conllu as ConlluReader +from udapi.block.write.conllu import Conllu as ConlluWriter + + +class TestEnhDeps(unittest.TestCase): + """Unit tests for udapi.core.node and enhanced dependecies. + Tests the behaviour with empty nodes (with decimal ord, such as 0.1, 2.3 etc.) as well""" + + @classmethod + def setUpClass(cls): + cls.doc = Document() + cls.data = os.path.join(os.path.dirname(udapi.__file__), "core", "tests", "data", "enh_deps.conllu") + cls.doc.load_conllu(cls.data) + cls.tree = cls.doc.bundles[0].get_tree() + cls.nodes = cls.tree.descendants + cls.add_empty_node(cls.tree, 3) + + @staticmethod + def add_empty_node(tree, ord_before, decimal=1): + """Add an empty node to tree after the node with index `ord_before`. 
+ Empty node will receive ord=`ord_before`.`decimal`""" + e = tree.create_empty_child() + e.ord = float('{}.{}'.format(ord_before, decimal)) + e.form = "E{}".format(e.ord) + + def test_datapath(self): + self.assertTrue(os.path.isfile(self.data)) + + def test_nodes(self): + self.assertEqual(6, len(self.nodes)) + + def test_ord_type(self): + self.assertIsNot(str, type(self.nodes[0].ord)) + + def test_create_empty(self): + writer = ConlluWriter() + writer.apply_on_document(self.doc) + # self.tree.print_subtree() + self.assertGreater(len(self.tree.empty_nodes), 0) + + def test_regular_deps(self): + + n = self.nodes[0] + self.assertEqual("0:root|2:amod", n.raw_deps) + + def test_create_deps2empty(self): + e = self.tree.empty_nodes[0] + h = self.nodes[1] + d = self.nodes[5] + e.deps.append({'parent': h, 'deprel':'dep:e2h'}) + d.deps.append({'parent': e, 'deprel': 'dep:d2e'}) + self.assertEqual("2:dep:e2h", e.raw_deps, ) + self.assertEqual("5:conj|3.1:dep:d2e", d.raw_deps) + + From fd1e4a0806877f602c35cdad767c59c6ea49ee2c Mon Sep 17 00:00:00 2001 From: Francesco Mambrini Date: Sat, 7 Nov 2020 14:38:54 +0100 Subject: [PATCH 0090/1201] Issue 62: cleaned up previous code --- udapi/core/node.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 0851f202..b51e5505 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -195,8 +195,6 @@ def raw_deps(self): if self._deps is not None: serialized_deps = [] for secondary_dependence in self._deps: - # serialized_deps.append('%d:%s' % (secondary_dependence[ - # 'parent'].ord, secondary_dependence['deprel'])) serialized_deps.append('{}:{}'.format(secondary_dependence[ 'parent'].ord, secondary_dependence['deprel'])) self._raw_deps = '|'.join(serialized_deps) From 928d25f1f19cdd6689d64c63a6edd9d7941cb47d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 17 Nov 2020 23:04:45 +0100 Subject: [PATCH 0091/1201] VerbForm is not obligatory in Austronesian languages (#64). --- udapi/block/ud/markbugs.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 1969278a..c8b06328 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -8,6 +8,13 @@ Usage: udapy -s ud.MarkBugs < in.conllu > marked.conllu 2> log.txt +Some tests may be customized for individual languages if the language code is +available as the zone id. The zone id can be provided in the sentence id after +the slash (e.g., "sent_id = s125/en" for English), or as a parameter of the +reader: + +udapy -s read.Conllu zone=en ud.MarkBugs < in.conllu > marked.conllu 2> log.txt + Errors are both logged to stderr and marked within the nodes' MISC field, e.g. `node.misc['Bug'] = 'aux-chain'`, so the output conllu file can be searched for "Bug=" occurences. @@ -109,7 +116,10 @@ def process_node(self, node): for i_upos, i_feat in REQUIRED_FEATURE_FOR_UPOS.items(): if upos == i_upos and not feats[i_feat]: - self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) + # Some languages do not distinguish finite and non-finite forms of verbs. + # The VerbForm feature is not obligatory in those languages. 
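+ # (ISO codes: id = Indonesian, tl = Tagalog, hil = Hiligaynon, ifb = Batad Ifugao;
+ # all of them are Austronesian.)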
+ if not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: + self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': if upos not in ('VERB', 'AUX'): From 3dd3af69a5314e32ed71def7a3e8cf81226b3571 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 17 Nov 2020 23:09:10 +0100 Subject: [PATCH 0092/1201] Fix: The new condition should affect only VerbForm (#64). --- udapi/block/ud/markbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index c8b06328..cbd57eef 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. - if not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: + if i_feat != "VerbForm" or not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': From 0e809365578514e1d76b784f96c73fd798ce6e6e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 20 Nov 2020 01:17:56 +0100 Subject: [PATCH 0093/1201] make Udapi easier to use from IPython For example to print subjects with more than 4 children: ``` import udapi doc = udapi.Document("UD_English/sample.conllu") subjects = [n for n in doc.nodes if n.deprel == 'nsubj'] for subject in subjects: if len(subject.children) > 4: subject.print_subtree() ``` --- udapi/__init__.py | 1 + udapi/core/document.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/udapi/__init__.py b/udapi/__init__.py index e69de29b..598df7fd 100644 --- a/udapi/__init__.py +++ b/udapi/__init__.py @@ -0,0 +1 @@ +from .core.document import Document diff --git a/udapi/core/document.py b/udapi/core/document.py index 778e5bd9..c7e7b870 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -9,11 +9,14 @@ class Document(object): """Document is a container for Universal Dependency trees.""" - def __init__(self): + def __init__(self, filename=None): + """Create a new Udapi document. 
Optionally, load the CoNLL-U file specified in `filename`.""" self.bundles = [] self._highest_bundle_id = 0 self.meta = {} self.json = {} + if filename is not None: + self.load_conllu(filename) def __iter__(self): return iter(self.bundles) @@ -47,3 +50,18 @@ def to_conllu_string(self): writer = ConlluWriter(filehandle=fh) writer.apply_on_document(self) return fh.getvalue() + + @property + def trees(self): + """An iterator over all trees in the document.""" + for bundle in self: + for tree in bundle: + yield tree + + @property + def nodes(self): + """An iterator over all nodes in the document.""" + for bundle in self: + for tree in bundle: + for node in tree.descendants: + yield node From 0a39dcd80da4b3402405b330c1794ace619efff8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 21 Nov 2020 03:45:11 +0100 Subject: [PATCH 0094/1201] further tricks for IPython Allow e.g.: ``` import udapi doc = udapi.Document("one-sentence-per-line.txt") udapi.create_block("udpipe.En").run(doc) udapi.create_block("util.See", node='node.is_nonprojective()', n=2).run(doc) ``` --- udapi/__init__.py | 1 + udapi/core/block.py | 5 +++++ udapi/core/document.py | 19 +++++++++++++++++-- udapi/core/run.py | 6 ++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/udapi/__init__.py b/udapi/__init__.py index 598df7fd..afdd8025 100644 --- a/udapi/__init__.py +++ b/udapi/__init__.py @@ -1 +1,2 @@ from .core.document import Document +from .core.run import create_block diff --git a/udapi/core/block.py b/udapi/core/block.py index 67c299f0..3292866f 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -38,6 +38,11 @@ def process_bundle(self, bundle): if self._should_process_tree(tree): self.process_tree(tree) + def run(self, document): + self.process_start() + self.apply_on_document(document) + self.process_end() + def apply_on_document(self, document): self.before_process_document(document) self.process_document(document) diff --git a/udapi/core/document.py b/udapi/core/document.py index c7e7b870..b0e17d2a 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -4,19 +4,34 @@ from udapi.core.bundle import Bundle from udapi.block.read.conllu import Conllu as ConlluReader from udapi.block.write.conllu import Conllu as ConlluWriter +from udapi.block.read.sentences import Sentences as SentencesReader class Document(object): """Document is a container for Universal Dependency trees.""" def __init__(self, filename=None): - """Create a new Udapi document. Optionally, load the CoNLL-U file specified in `filename`.""" + """Create a new Udapi document. + + Args: + filename: load the specified file. + Only `*.conlu` (using `udapi.block.read.conllu`) + and `*.txt` (using `udapi.block.read.sentences`) filenames are supported. + No pre-processing is applied, so when loading the document from a *.txt file, + `Document("a.txt").nodes` will be empty and you need to run tokenization first. + """ self.bundles = [] self._highest_bundle_id = 0 self.meta = {} self.json = {} if filename is not None: - self.load_conllu(filename) + if filename.endswith(".conllu"): + self.load_conllu(filename) + elif filename.endswith(".txt"): + reader = SentencesReader(files=filename) + reader.apply_on_document(self) + else: + raise ValueError("Only *.conllu and *.txt are supported. 
Provided: " + filename) def __iter__(self): return iter(self.bundles) diff --git a/udapi/core/run.py b/udapi/core/run.py index c730a1a7..0a08504c 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -171,3 +171,9 @@ def execute(self): def scenario_string(self): """Return the scenario string.""" return "\n".join(self.args.scenario) + + +def create_block(block, **kwargs): + """A factory function for creating new block instances (handy for IPython).""" + blocks = _import_blocks([block], [kwargs]) + return blocks[0] From f1b4d8ae36557feb0f93306a2a7d1b780abc8d88 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Nov 2020 22:13:27 +0100 Subject: [PATCH 0095/1201] rename node.print_subtree() to node.draw() for consistency with `bundle.draw()` and `doc.draw()` --- udapi/block/write/textmodetrees.py | 4 ++-- udapi/core/bundle.py | 5 +++++ udapi/core/document.py | 21 ++++++++++++++++++--- udapi/core/node.py | 5 +++++ udapi/core/tests/test_enhdeps.py | 2 +- udapi/core/tests/test_node.py | 12 ++++++------ 6 files changed, 37 insertions(+), 12 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 1f8163c5..fd29e72e 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -123,7 +123,7 @@ class TextModeTrees(BaseWriter): (by reveresing the background and foreground colors). This block's method `process_tree` can be called on any node (not only root), - which is useful for printing subtrees using ``node.print_subtree()``, + which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. SEE ALSO @@ -207,7 +207,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.lengths = [] # We want to be able to call process_tree not only on root node, - # so this block can be called from node.print_subtree(**kwargs) + # so this block can be called from node.print_draw(**kwargs) # on any node and print its subtree. Thus, we cannot assume that # allnodes[idx].ord == idx. Instead of node.ord, we'll use index_of[node.ord], # which is its index within the printed subtree. diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index 4df1deb2..591b8a84 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -3,6 +3,7 @@ import re from udapi.core.root import Root +from udapi.block.write.textmodetrees import TextModeTrees VALID_ZONE_REGEX = re.compile("^[a-z-]*(_[A-Za-z0-9-]+)?$") @@ -106,3 +107,7 @@ def remove(self): def address(self): """Return bundle_id or '?' if missing.""" return self.bundle_id if self.bundle_id is not None else '?' 
+ + def draw(self, **kwargs): + """Pretty print the trees using TextModeTrees.""" + TextModeTrees(**kwargs).process_bundle(self) diff --git a/udapi/core/document.py b/udapi/core/document.py index b0e17d2a..25699398 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -1,11 +1,12 @@ """Document class is a container for UD trees.""" import io +import contextlib from udapi.core.bundle import Bundle from udapi.block.read.conllu import Conllu as ConlluReader from udapi.block.write.conllu import Conllu as ConlluWriter from udapi.block.read.sentences import Sentences as SentencesReader - +from udapi.block.write.textmodetrees import TextModeTrees class Document(object): """Document is a container for Universal Dependency trees.""" @@ -36,6 +37,16 @@ def __init__(self, filename=None): def __iter__(self): return iter(self.bundles) + def __getitem__(self, key): + return self.bundles[key] + + def __str__(self): + """Pretty print the whole document using write.TextModeTrees.""" + fh = io.StringIO() + with contextlib.redirect_stdout(fh): + TextModeTrees(color=True).run(self) + return fh.getvalue() + def create_bundle(self): """Create a new bundle and add it at the end of the document.""" self._highest_bundle_id += 1 @@ -62,8 +73,8 @@ def from_conllu_string(self, string): def to_conllu_string(self): """Return the document as a conllu-formatted string.""" fh = io.StringIO() - writer = ConlluWriter(filehandle=fh) - writer.apply_on_document(self) + with contextlib.redirect_stdout(fh): + ConlluWriter().apply_on_document(self) return fh.getvalue() @property @@ -80,3 +91,7 @@ def nodes(self): for tree in bundle: for node in tree.descendants: yield node + + def draw(self, **kwargs): + """Pretty print the trees using TextModeTrees.""" + TextModeTrees(**kwargs).run(self) diff --git a/udapi/core/node.py b/udapi/core/node.py index de58c9be..32061b16 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -600,6 +600,11 @@ def compute_text(self, use_mwt=True): return string.rstrip() def print_subtree(self, **kwargs): + """deprecated name for draw()""" + logging.warning("node.print_subtree() is deprecated, use node.draw() instead.") + TextModeTrees(**kwargs).process_tree(self) + + def draw(self, **kwargs): """Print ASCII visualization of the dependency structure of this subtree. This method is useful for debugging. 
diff --git a/udapi/core/tests/test_enhdeps.py b/udapi/core/tests/test_enhdeps.py index fd80b3f2..80fe0209 100644 --- a/udapi/core/tests/test_enhdeps.py +++ b/udapi/core/tests/test_enhdeps.py @@ -42,7 +42,7 @@ def test_ord_type(self): def test_create_empty(self): writer = ConlluWriter() writer.apply_on_document(self.doc) - # self.tree.print_subtree() + # self.tree.draw() self.assertGreater(len(self.tree.empty_nodes), 0) def test_regular_deps(self): diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 52e94722..7d811456 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -56,8 +56,8 @@ def test_topology(self): self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6]) self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6]) - def test_print_subtree(self): - """Test print_subtree() method, which uses udapi.block.write.textmodetrees.""" + def test_draw(self): + """Test the draw() method, which uses udapi.block.write.textmodetrees.""" doc = Document() data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu') doc.load_conllu(data_filename) @@ -98,16 +98,16 @@ def test_print_subtree(self): try: sys.stdout = capture = io.StringIO() - root.print_subtree(color=False) + root.draw(color=False) self.assertEqual(capture.getvalue(), expected1) capture.seek(0) capture.truncate() - root.print_subtree(color=False, attributes='form,feats,misc', - print_sent_id=False, print_text=False) + root.draw(color=False, attributes='form,feats,misc', + print_sent_id=False, print_text=False) self.assertEqual(capture.getvalue(), expected2) capture.seek(0) capture.truncate() - root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0) + root3.draw(color=False, attributes='form', print_sent_id=0, print_text=0) self.assertEqual(capture.getvalue(), expected3) finally: sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type From e94591fce7f68d0e317fca5628c966e766f777ad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 1 Dec 2020 08:06:47 +0100 Subject: [PATCH 0096/1201] tutorial/01-visualizing.ipynb --- tutorial/01-visualizing.ipynb | 554 ++++++++++++++++++++++++++++++++++ 1 file changed, 554 insertions(+) create mode 100644 tutorial/01-visualizing.ipynb diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb new file mode 100644 index 00000000..382bb11f --- /dev/null +++ b/tutorial/01-visualizing.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "Udapi is an API and framework for processing [Universal Dependencies](http://universaldependencies.org/). In this tutorial, we will focus on the Python version of Udapi. Perl and Java versions are [available](http://udapi.github.io/) as well, but they are missing some of the features.\n", + "\n", + "Udapi can be used from the shell (e.g. Bash), using the wrapper script `udapy`. It can be also used as a library, from Python, IPython or Jupyter notebooks. We will show both of these ways bellow.\n", + "\n", + "This tutorial uses Details sections for extra info (if you want to know more or if you run into problems). You need to click on it to show its content.\n", + "
Details\n", + "It is a substitute for footnotes. The content may be long and showing it in the main text may be distracting.\n", + "
\n", + "\n", + "### Install (upgrade) Udapi\n", + "First, make sure you have the newest version of Udapi. If you have already installed Udapi [using git clone](https://github.com/udapi/udapi-python#install-udapi-for-developers), just run `git pull`. If you have not installed Udapi yet, run\n", + "
Details\n", + "
    \n", + "
  • The command below installs Udapi from GitHub (from the master branch). With pip3 install --user --upgrade udapi, you can install the latest version released on PyPI (which may be older).\n",
    "
  • The exclamation mark (!) in Jupyter or IPython means that the following command will be executed by the system shell (e.g. Bash).\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, make sure you can run the command-line interface `udapy`, e.g. by printing the help message." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: udapy [optional_arguments] scenario\r\n", + "\r\n", + "udapy - Python interface to Udapi - API for Universal Dependencies\r\n", + "\r\n", + "Examples of usage:\r\n", + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\r\n", + " udapy -T < sample.conllu | less -R\r\n", + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\r\n", + "\r\n", + "positional arguments:\r\n", + " scenario A sequence of blocks and their parameters.\r\n", + "\r\n", + "optional arguments:\r\n", + " -h, --help show this help message and exit\r\n", + " -q, --quiet Warning, info and debug messages are suppressed. Only fatal errors are reported.\r\n", + " -v, --verbose Warning, info and debug messages are printed to the STDERR.\r\n", + " -s, --save Add write.Conllu to the end of the scenario\r\n", + " -T, --save_text_mode_trees\r\n", + " Add write.TextModeTrees color=1 to the end of the scenario\r\n", + " -H, --save_html Add write.TextModeTreesHtml color=1 to the end of the scenario\r\n", + " -A, --save_all_attributes\r\n", + " Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)\r\n", + " -C, --save_comments Add print_comments=1 (to be used after -T and -H)\r\n", + " -M, --marked_only Add marked_only=1 to the end of the scenario (to be used after -T and -H)\r\n", + " -N, --no_color Add color=0 to the end of the scenario, this overrides color=1 of -T and -H\r\n", + "\r\n", + "See http://udapi.github.io\r\n" + ] + } + ], + "source": [ + "!udapy -h" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details: If the previous command fails with \"udapy: command not found\"\n", + "This means that Udapi is not properly installed. When installing Udapi with pip3 --user, it is installed into ~/.local/lib/python3.6/site-packages/udapi/ (or similar depending on your Python version) and the wrapper into ~/.local/bin. Thus you need to\n", + "
\n",
+    "export PATH=\"$HOME/.local/bin/:$PATH\"\n",
+    "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browse CoNLL-U files\n", + "### Get sample UD data\n", + "\n", + "Download and extract [ud20sample.tgz](http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz). There are just 100 sentences for each of the 70 treebanks (`sample.conllu`), plus 4 bigger files (`train.conllu` and `dev.conllu`) for German, English, French and Czech. For full UD ([2.0](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983) or [newer](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424)), go to [Lindat](https://lindat.cz)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-01 07:53:37-- http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "Resolving ufal.mff.cuni.cz (ufal.mff.cuni.cz)... 195.113.20.52\n", + "Connecting to ufal.mff.cuni.cz (ufal.mff.cuni.cz)|195.113.20.52|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4670982 (4,5M) [application/x-gzip]\n", + "Saving to: ‘ud20sample.tgz.1’\n", + "\n", + "ud20sample.tgz.1 100%[===================>] 4,45M 1,49MB/s in 3,0s \n", + "\n", + "2020-12-01 07:53:40 (1,49 MB/s) - ‘ud20sample.tgz.1’ saved [4670982/4670982]\n", + "\n", + "/home/martin/udapi/python/notebook/sample\n" + ] + } + ], + "source": [ + "!wget http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "!tar -xf ud20sample.tgz\n", + "%cd sample" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's choose one of the sample files and see the raw [CoNLL-U format](https://universaldependencies.org/format.html).\n", + "
Details: executing from Bash, IPython, Jupyter\n", + "
    \n", + "
  • If you see a \"No such file or directory\" error, make sure you executed the previous cell. Note that the cd command is not prefixed by an exclamation mark because that would run in a sub-shell, which \"forgets\" the changed directory when finished. It is prefixed by a percent sign, which marks it as IPython magic.\n",
    "
  • cat is another IPython magic command, this time an alias for the shell command of the same name, which prints a given file (so you can prefix cat with an exclamation mark, if you prefer). With automagic on, you can use it without the percent sign.\n",
    "
  • In this tutorial, we use | head to show just the first 10 lines of the output (thus keeping the ipynb file size small). You can ignore the \"cat: write error: Broken pipe\" warning.\n",
    "
  • When using Jupyter, you can omit the | head because long outputs are automatically wrapped in a text box with a scrollbar.\n", + "
  • When running this from IPython or Bash, you can use a pager: less UD_Ancient_Greek/sample.conllu\n", + "
\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# newdoc id = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "1\tἐρᾷ\tἐράω\tVERB\tv3spia---\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\r\n", + "2\tμὲν\tμέν\tADV\td--------\t_\t1\tadvmod\t_\t_\r\n", + "3\tἁγνὸς\tἁγνός\tADJ\ta-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t4\tnmod\t_\t_\r\n", + "4\tοὐρανὸς\tοὐρανός\tNOUN\tn-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t1\tnsubj\t_\t_\r\n", + "5\tτρῶσαι\tτιτρώσκω\tVERB\tv--ana---\tTense=Past|VerbForm=Inf|Voice=Act\t1\txcomp\t_\t_\r\n", + "6\tχθόνα\tχθών\tNOUN\tn-s---fa-\tCase=Acc|Gender=Fem|Number=Sing\t5\tobj\t_\tSpaceAfter=No\r\n", + "7\t,\t,\tPUNCT\tu--------\t_\t1\tpunct\t_\t_\r\n", + "cat: write error: Broken pipe\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files with `udapy -T`\n", + "While the CoNLL-U format was designed with readibility (by both machines and humans) on mind, it may be still a bit difficult to read and interpret by humans. Let's visualize the dependency tree structure using ASCII-art by piping the conllu file into `udapy -T`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-12-01 08:00:33,276 [ INFO] execute - No reader specified, using read.Conllu\n", + "2020-12-01 08:00:33,276 [ INFO] execute - ---- ROUND ----\n", + "2020-12-01 08:00:33,276 [ INFO] execute - Executing block Conllu\n", + "2020-12-01 08:00:33,305 [ INFO] execute - Executing block TextModeTrees\n", + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\n", + "loaded_from = -\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\n", + "─┮\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─╼ \u001b[33mδὲ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m │\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡───────────────┾ \u001b[33mλαμβάνει\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -T | head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "
Details:\n", + "
    \n", + "
  • You may be used to seeing dependency trees where the root node is at the top and words are ordered horizontally (left to right). Here, the root is on the left and words are ordered vertically (top to bottom).\n",
+    "
  • The colors are implemented using the colorama package and ANSI escape codes. When running this from IPython or Bash and using less, you need to instruct it to display the colors with -R:\n", + "\n", + "cat UD_Ancient_Greek/sample.conllu | udapy -T | less -R\n", + "\n", + "
  • You can also use udapy -T -N to disable the colors.\n", + "
  • udapy -q suppresses all Udapi messages (warnings, info, debug) printed on the standard error output, so only fatal errors are printed. By default, only debug messages are suppressed; these can be printed with udapy -v. A combined example is sketched below.\n",
+    "
  • But you already know this because you have read udapy -h, am I right?\n", + "
\n", + "
\n", + "\n", + "`udapy -T` is a shortcut for `udapy write.TextModeTrees color=1`, where `write.TextModeTrees` is a so-called *block* (a basic Udapi processing unit) and `color=1` is its parameter. See [the documentation](https://udapi.readthedocs.io/en/latest/udapi.block.write.html#module-udapi.block.write.textmodetrees) (or even [the source code](https://github.com/udapi/udapi-python/blob/master/udapi/block/write/textmodetrees.py) of `write.TextModeTrees` to learn about further parameters. Now, let's print also the LEMMA and MISC columns and display the columns vertically aligned using parameters `layout=align attributes=form,lemma,upos,deprel,misc`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "loaded_from = -\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "─┮ \r\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[36mἐράω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m _\u001b[0m\r\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[36mμέν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[36mἁγνός\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m _\u001b[0m\r\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[36mοὐρανός\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[36mτιτρώσκω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m _\u001b[0m\r\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[36mχθών\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[36m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[36mἔρως\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─╼ │ \u001b[33mδὲ\u001b[0m \u001b[36mδέ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m _\u001b[0m\r\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[36mγαῖα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " ┡───┾ \u001b[33mλαμβάνει\u001b[0m \u001b[36mλαμβάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m _\u001b[0m\r\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[36mγάμος\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[36mτυγχάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[36m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + "\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -q write.TextModeTrees color=1 layout=align attributes=form,lemma,upos,deprel,misc | head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files from IPython/Jupyter\n", + "So far, we were using Udapi only via its command-line interface `udapy`, which is handy, but not very Pythonic. So let's now use Udapi as a library and load the English conllu sample file into a document `doc` and visualize the sixth tree (i.e. `doc[5]` in zero-based indexing)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮\n", + " │ ╭─╼ \u001b[33mThe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ ╭─┶ \u001b[33mthird\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnsubj:pass\u001b[0m\n", + " │ ┢─╼ \u001b[33mwas\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux\u001b[0m\n", + " │ ┢─╼ \u001b[33mbeing\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux:pass\u001b[0m\n", + " ╰─┾ \u001b[33mrun\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " │ ╭─╼ \u001b[33mby\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ ┢─╼ \u001b[33mthe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " ┡─┾ \u001b[33mhead\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobl\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mof\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ │ ┢─╼ \u001b[33man\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ │ ┢─╼ \u001b[33minvestment\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mcompound\u001b[0m\n", + " │ ╰─┶ \u001b[33mfirm\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ╰─╼ \u001b[33m.\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "import udapi\n", + "doc = udapi.Document(\"UD_English/sample.conllu\")\n", + "doc[5].draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details:\n", + "
    \n", + "
  • doc = udapi.Document(filename) is a shortcut for\n", + "
    \n",
    +    "import udapi.core.document\n",
    +    "doc = udapi.core.document.Document(filename)\n",
    +    "
    \n", + "
  • We can print the whole document using doc.draw().\n", + "
  • doc.draw(**kwargs) is a shortcut for creating a write.TextModeTrees block and applying it to the document:\n",
+    "
    \n",
    +    "import udapi.block.write.textmodetrees\n",
    +    "block = udapi.block.write.textmodetrees.TextModeTrees(**kwargs)\n",
    +    "block.run(doc)\n",
    +    "
    \n", + "
\n", + "
\n", + "\n", + "The `draw()` method takes the same parameters as the `write.TextModeTrees` block, so we can for example display only the node ID (aka `ord`, i.e. word-order index), form and [universal (morpho-syntactic) features](https://universaldependencies.org/u/feat/index.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮ \n", + " │ ╭─╼ \u001b[32m1\u001b[0m \u001b[33mThe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " │ ╭─┶ \u001b[32m2\u001b[0m \u001b[33mthird\u001b[0m Degree=Pos|NumType=Ord\u001b[0m\n", + " │ ┢─╼ \u001b[32m3\u001b[0m \u001b[33mwas\u001b[0m Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\u001b[0m\n", + " │ ┢─╼ \u001b[32m4\u001b[0m \u001b[33mbeing\u001b[0m VerbForm=Ger\u001b[0m\n", + " ╰─┾ \u001b[32m5\u001b[0m \u001b[33mrun\u001b[0m Tense=Past|VerbForm=Part|Voice=Pass\u001b[0m\n", + " │ ╭─╼ \u001b[32m6\u001b[0m \u001b[33mby\u001b[0m _\u001b[0m\n", + " │ ┢─╼ \u001b[32m7\u001b[0m \u001b[33mthe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " ┡─┾ \u001b[32m8\u001b[0m \u001b[33mhead\u001b[0m Number=Sing\u001b[0m\n", + " │ │ ╭─╼ \u001b[32m9\u001b[0m \u001b[33mof\u001b[0m _\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m10\u001b[0m \u001b[33man\u001b[0m Definite=Ind|PronType=Art\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m11\u001b[0m \u001b[33minvestment\u001b[0m Number=Sing\u001b[0m\n", + " │ ╰─┶ \u001b[32m12\u001b[0m \u001b[33mfirm\u001b[0m Number=Sing\u001b[0m\n", + " ╰─╼ \u001b[32m13\u001b[0m \u001b[33m.\u001b[0m _\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "doc[5].draw(layout=\"align\", attributes=\"ord,form,feats\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document representation in Udapi\n", + "\n", + "Udapi [document](https://github.com/udapi/udapi-python/blob/master/udapi/core/document.py) consists of a sequence of so-called *bundles*, mirroring a sequence of sentences in a typical natural language text.\n", + "\n", + "A [bundle](https://github.com/udapi/udapi-python/blob/master/udapi/core/bundle.py) corresponds to a sentence,\n", + "possibly in multiple versions or with different representations, such as sentence-tuples from parallel corpora, or paraphrases in the same language or alternative analyses (e.g. parses produced by different parsers). If there are more trees in a bundle, they must be distinguished by a so-called *zone* (a label which contains the language code).\n", + "\n", + "Each tree is represented by a special (artificial) [root](https://github.com/udapi/udapi-python/blob/master/udapi/core/root.py) node, which is added to the top of a CoNLL-U tree in the Udapi model. The root node bears the ID of a given tree/sentence (`sent_id`) and its word order (`ord`) is 0. Technically, Root is subclass of Node, with some extra methods.\n", + "\n", + "The [Node](https://github.com/udapi/udapi-python/blob/master/udapi/core/node.py) class corresponds to a node\n", + "of a dependency tree. It provides access to all the CoNLL-U-defined attributes (`ord`, `form`, `lemma`, `upos`, `xpos`, `feats`, `deprel`, `deps`, `misc`). 
There are methods for tree traversal (`parent`, `root`, `children`, `descendants`); word-order traversal (`next_node`, `prev_node`); tree manipulation (`parent` setter) including word-order changes (`shift_after_node(x)`, `shift_before_subtree(x)`, etc.); and utility methods: `is_descendant_of(x)`, `is_nonprojective()`, `precedes(x)`, `is_leaf()`, `is_root()`, `get_attrs([])`, `compute_text()`, `draw()`.\n",
+    "\n",
+    "## Exercise 1: Count prepositions and postpositions\n",
+    "[Prepositions and postpositions](https://en.wikipedia.org/wiki/Preposition_and_postposition) are together called *adpositions* and are assigned the [ADP](https://universaldependencies.org/u/pos/ADP.html) universal part-of-speech tag (`upos`) in UD. Some languages (e.g. English) use mostly prepositions, others mostly postpositions.\n",
+    "* Do you know any English postpositions?\n",
+    "* Guess the typical adposition type (i.e. whether a given language uses more prepositions or postpositions) for at least 10 languages of your choice (from those in UD2.0).\n",
+    "* Complete the following code and find out how many prepositions and postpositions are in `UD_English/sample.conllu` (which has been loaded into `doc`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prepositions, postpositions = 0, 0\n",
+    "# Iterate over all nodes in the document (in all trees)\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\":\n",
+    "        # TODO: fix this code to actually distinguish prepositions and postpositions\n",
+    "        prepositions += 1\n",
+    "# Print the results\n",
+    "prepositions, postpositions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you don't know how to proceed, click on the following hints.\n",
+    "
Hint 1:\n",
+    "In some dependency grammars, adpositions govern nouns (i.e. the adposition is the *parent* of a given noun node). In other dependency grammars, adpositions depend on nouns (i.e. the noun is the *parent* of a given adposition). Find out which style is used by UD. Check the UD documentation or inspect some of the tree visualizations and guess.\n",
+    "
\n", + "
Hint 2:\n",
+    "See the Node documentation and find out how to obtain the dependency parent and the dependency children of a node. Note that these are properties of a given node, rather than methods, so you should not write parentheses () after the property name.\n",
+    "
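For example (a sketch, where node is any node obtained from doc.nodes):\n",
+    "\n",
+    "parent = node.parent      # a Node object; property access, no parentheses\n",
+    "children = node.children  # a list of Node objects\n",
+    "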
\n", + "
Hint 3:\n",
+    "doc.nodes iterates over all nodes in the document, sorted by word order, but exploiting this directly would be cumbersome. Find a method of Node to detect the relative word order of two nodes (within the same tree/sentence).\n",
+    "
\n", + "
Hint 4:\n", + "Use node.parent and node.precedes(another_node).\n", + "The latter is a shortcut for node.ord < another_node.ord.\n", + "
\n", + "
Solution:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\":\n",
+    "        if node.precedes(node.parent):\n",
+    "            prepositions += 1\n",
+    "        else:\n",
+    "            postpositions += 1\n",
+    "
\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2: Explore English postpositions\n", + "The previous exercise indicates there are 7 occurrences of postpositions in the English sample. Find these 7 occurrences and visualize them using `node.draw()`. Count which adpositions (`lemma`) with which dependency relations (`deprel`) are responsible for these occurrences. Recompute these statistics on the bigger English training data. Can you explain these occurrences? What are the reasons? Is any occurrence an annotation error?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For the statistics, you may find useful: count[\"any string\"] += 1\n", + "import collections\n", + "count = collections.Counter()\n", + "big_doc = udapi.Document(\"UD_English/train.conllu\")\n", + "\n", + "for node in doc.nodes:\n", + " # TODO detect postposition\n", + " pass\n", + "\n", + "# Print the statistics\n", + "count.most_common()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Solution 1:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node):\n",
+    "        node.parent.draw()\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "
\n", + "
\n", + "
Hint 1:\n",
+    "We can see there are many particles of phrasal verbs, e.g. \"busted up\".\n",
+    "These seem to be correctly annotated as ADP according to the UD guidelines.\n",
+    "Let's filter out those cases, focus on the rest, and switch to the big train data.\n",
+    "
\n", + "
Solution 2:\n", + "
\n",
+    "count = collections.Counter()\n",
+    "for node in big_doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node) and node.parent.upos != \"VERB\":\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "count.most_common()\n",
+    "
\n", + "Alternatively to node.parent.upos != \"VERB\",\n", + "you could also filter out node.deprel != \"compound:prt\",\n", + "or directly focus on node.deprel == \"case\"\n", + "
\n", + "
Partial answer:\n",
+    "Most of the occurrences are actually annotated correctly,\n",
+    "although they are not typically considered postpositions.\n",
+    "For example, node.deprel == \"fixed\" is used for multi-word adpositions,\n",
+    "such as \"because of\", where \"of\" depends on \"because\" for technical (and consistency) reasons,\n",
+    "but the whole multi-word adposition precedes its governing noun, so it is actually a multi-word preposition.\n",
+    "\n",
+    "What about the remaining occurrences, after filtering out node.deprel not in {\"compound:prt\", \"fixed\"}?\n",
+    "
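A sketch for exploring these remaining cases (reusing big_doc, collections and count from the cells above):\n",
+    "\n",
+    "count = collections.Counter()\n",
+    "for node in big_doc.nodes:\n",
+    "    if (node.upos == \"ADP\" and node.parent.precedes(node)\n",
+    "            and node.deprel not in {\"compound:prt\", \"fixed\"}):\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "count.most_common()\n",
+    "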
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next tutorial, [02-blocks.ipynb](02-blocks.ipynb), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From e9091688e6f60e7408c1e44d87832172d469ead8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 1 Dec 2020 08:22:18 +0100 Subject: [PATCH 0097/1201] tutorial README.md --- tutorial/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tutorial/README.md diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..05e96d59 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1,9 @@ +# Udapi tutorial + +To run this tutorial, install [Jupyter Notebook](https://jupyter.org/install.html) (or JupyterLab) and run `jupyter notebook` from this directory. + +Don't display the tutorial `ipynb` files on GitHub because it cannot render the collapsible Details, Hints and Solution sections, so you would miss important parts of the tutorial. +If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: + +- [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) +- [02-blocks.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-blocks.ipynb) From 5eb27a80fc3b1563050a37c31275e31661a02c45 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Wed, 27 Jan 2021 22:23:55 +0100 Subject: [PATCH 0098/1201] a block that prints most frequent values for each attribute stored in the MISC field --- udapi/block/corefud/__init__.py | 0 udapi/block/corefud/miscstats.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 udapi/block/corefud/__init__.py create mode 100644 udapi/block/corefud/miscstats.py diff --git a/udapi/block/corefud/__init__.py b/udapi/block/corefud/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py new file mode 100644 index 00000000..4f4e86c0 --- /dev/null +++ b/udapi/block/corefud/miscstats.py @@ -0,0 +1,32 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStats(Block): + """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" + + def __init__(self, maxvalues=10): + + """Create the corefud.MiscStats + + Args: + maxvalues: the number of most frequent values + to be printed for each attribute. 
+ + """ + self.maxvalues = maxvalues + self.valuecounter = {} + self.zones = 'all' + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + + def process_end(self): + for attrname in self.valuecounter: + print("MISC attribute: "+attrname) + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + print(" "+str(value)+" "+str(freq)) From 1def22ea2da0fd4ced8cb999c7fb180e69309702 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 27 Jan 2021 23:42:58 +0100 Subject: [PATCH 0099/1201] call super() constructor --- udapi/block/corefud/miscstats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py index 4f4e86c0..e7aabc03 100644 --- a/udapi/block/corefud/miscstats.py +++ b/udapi/block/corefud/miscstats.py @@ -5,7 +5,7 @@ class MiscStats(Block): """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" - def __init__(self, maxvalues=10): + def __init__(self, maxvalues=10, **kwargs): """Create the corefud.MiscStats @@ -14,9 +14,9 @@ def __init__(self, maxvalues=10): to be printed for each attribute. """ + super().__init__(**kwargs) self.maxvalues = maxvalues self.valuecounter = {} - self.zones = 'all' def process_node(self,node): for attrname in node.misc: From 8a4d1b2a6467aaa84aafb0009726d636db8584e6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 27 Jan 2021 23:43:52 +0100 Subject: [PATCH 0100/1201] allow `util.See stats=misc_split` similarly to feats_split --- udapi/core/node.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 32061b16..3c917e29 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -514,6 +514,8 @@ def _get_attr(self, name): # pylint: disable=too-many-return-statements return value if name == 'feats_split': return str(self.feats).split('|') + if name == 'misc_split': + return str(self.misc).split('|') if name.startswith('feats['): return self.feats[name[6:-1]] if name.startswith('misc['): @@ -554,7 +556,7 @@ def get_attrs(self, attrs, undefs=None, stringify=True): elif name.startswith('r_'): nodes, name = [self.next_node], name[2:] for node in (n for n in nodes if n is not None): - if name == 'feats_split': + if name in {'feats_split', 'misc_split'}: values.extend(node._get_attr(name)) else: values.append(node._get_attr(name)) From 70525351aea920c8e0f0e0dbd6da6523dc68bb77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Feb 2021 08:41:38 +0100 Subject: [PATCH 0101/1201] allow `d = udapi.Document(filename="my.txt", rstrip="\n")` so that spaces and `\r` at ends of lines are preserved, which is needed in some character-indexed formats. --- udapi/block/read/sentences.py | 15 ++++++++++++--- udapi/core/document.py | 6 ++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index c3a02ddd..758b4980 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -4,10 +4,19 @@ class Sentences(BaseReader): - """A reader for plain-text sentences (one sentence per line) files.""" + r"""A reader for plain-text sentences (one sentence per line) files. 
- def __init__(self, ignore_empty_lines=False, **kwargs): + Args: + ignore_empty_lines: if True, delete empty lines from the input. + Default=False. + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base resegment=1` keeps these characters in `SpacesAfter`. + """ + def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): self.ignore_empty_lines = ignore_empty_lines + self.rstrip = rstrip super().__init__(**kwargs) @staticmethod @@ -33,5 +42,5 @@ def read_tree(self, document=None): if line == '': return None root = Root() - root.text = line.rstrip() + root.text = line.rstrip(self.rstrip) return root diff --git a/udapi/core/document.py b/udapi/core/document.py index 25699398..0f5241c5 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -11,7 +11,7 @@ class Document(object): """Document is a container for Universal Dependency trees.""" - def __init__(self, filename=None): + def __init__(self, filename=None, **kwargs): """Create a new Udapi document. Args: @@ -20,6 +20,8 @@ def __init__(self, filename=None): and `*.txt` (using `udapi.block.read.sentences`) filenames are supported. No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. + You can pass additional parameters for `udapi.block.read.sentences` + (`ignore_empty_lines` and `rstrip`). """ self.bundles = [] self._highest_bundle_id = 0 @@ -29,7 +31,7 @@ def __init__(self, filename=None): if filename.endswith(".conllu"): self.load_conllu(filename) elif filename.endswith(".txt"): - reader = SentencesReader(files=filename) + reader = SentencesReader(files=filename, **kwargs) reader.apply_on_document(self) else: raise ValueError("Only *.conllu and *.txt are supported. Provided: " + filename) From 18a5f173d15008a3cf3221b05ff3b25c6a1387b2 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 2 Feb 2021 17:21:53 +0100 Subject: [PATCH 0102/1201] "udapy -X" for adding params to the end of the scenario --- bin/udapy | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/udapy b/bin/udapy index 9bb58f53..eaaa00cc 100755 --- a/bin/udapy +++ b/bin/udapy @@ -42,6 +42,10 @@ argparser.add_argument( argparser.add_argument( "-N", "--no_color", action="store_true", help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") +argparser.add_argument( + "-X", "--extra", action="append", + help="Add a specified parameter (or a block name) to the end of the scenario\n" + "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") argparser.add_argument( 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") @@ -74,6 +78,8 @@ if __name__ == "__main__": args.scenario = args.scenario + ['marked_only=1'] if args.no_color: args.scenario = args.scenario + ['color=0'] + if args.extra: + args.scenario += args.extra runner = Run(args) # udapy is often piped to head etc., e.g. 
From 4c7c8e10dc0f5151fae5b6222b84cf84b47121c6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 2 Feb 2021 18:46:03 +0100 Subject: [PATCH 0103/1201] improved ud.SetSpaceAfter and added an English-specific subclass fixes #67 --- udapi/block/ud/en/__init__.py | 0 udapi/block/ud/en/setspaceafter.py | 46 ++++++++++++++++++++++++++++++ udapi/block/ud/setspaceafter.py | 28 ++++++++++-------- 3 files changed, 62 insertions(+), 12 deletions(-) create mode 100644 udapi/block/ud/en/__init__.py create mode 100644 udapi/block/ud/en/setspaceafter.py diff --git a/udapi/block/ud/en/__init__.py b/udapi/block/ud/en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/en/setspaceafter.py b/udapi/block/ud/en/setspaceafter.py new file mode 100644 index 00000000..1ebc3054 --- /dev/null +++ b/udapi/block/ud/en/setspaceafter.py @@ -0,0 +1,46 @@ +"""Block ud.en.SetSpaceAfter for heuristic setting of SpaceAfter=No in English. + +Usage:: + + udapy -s ud.en.SetSpaceAfter < in.conllu > fixed.conllu + +Author: Martin Popel +""" +import udapi.block.ud.setspaceafter + + +class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): + """Block for heuristic setting of the SpaceAfter=No MISC attribute in English. + + """ + + def process_tree(self, root): + nodes = root.descendants + for i, node in enumerate(nodes[:-1]): + next_form = nodes[i + 1].form + + # Contractions like "don't" and possessive suffix 's should be annotated as MWT. + # However, older UD_English-EWT versions did not follow this rule and even v2.7 + # contains some forgotten occurrences, so let's handle these as well. + if next_form in {"n't", "'s"}: + self.mark_no_space(node) + + # Parsers may distinguish opening and closing single quotes by XPOS. + elif node.form == "'" and node.xpos == "``": + self.mark_no_space(node) + elif next_form == "'" and nodes[i + 1].xpos == "''": + self.mark_no_space(node) + + + # hyphen-compounds + elif node.form == '-' and i: + if ((nodes[i - 1] is node.parent or nodes[i - 1].parent is node.parent) and + (nodes[i + 1] is node.parent or nodes[i + 1].parent is node.parent)): + self.mark_no_space(nodes[i - 1]) + self.mark_no_space(node) + + # $200 + elif node.form == '$' and nodes[i + 1].upos == 'NUM': + self.mark_no_space(node) + + super().process_tree(root) diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py index e796bf0d..04c9fffb 100644 --- a/udapi/block/ud/setspaceafter.py +++ b/udapi/block/ud/setspaceafter.py @@ -13,10 +13,11 @@ class SetSpaceAfter(Block): """Block for heuristic setting of the SpaceAfter=No MISC attribute.""" - def __init__(self, not_after='¡¿([{„', not_before='.,;:!?}])', fix_text=True, **kwargs): + def __init__(self, not_after='¡ ¿ ( [ { „ /', not_before='. , ; : ! ? } ] ) / ?? ??? !! !!! ... …', + fix_text=True, extra_not_after='', extra_not_before='', **kwargs): super().__init__(**kwargs) - self.not_after = not_after - self.not_before = not_before + self.not_after = (not_after + ' ' + extra_not_after).split(' ') + self.not_before = (not_before + ' ' + extra_not_before).split(' ') self.fix_text = fix_text self.changed = False @@ -26,7 +27,7 @@ def process_tree(self, root): self.changed = False # Undirected double quotes are ambiguous. - # If there is an even number of quotes in a sentence, supposed they are not nested + # If there is an even number of quotes in a sentence, suppose they are not nested # and treat odd-indexed ones as opening and even-indexed ones as closing. # Otherwise (odd number, e.g. 
when quoting multiple sentences), don't remove any space.
         matching_quotes = not bool(count_of_form['"'] % 2)
@@ -36,22 +37,25 @@ def process_tree(self, root):
         # Some languages use directed „quotes“ and some “quotes”,
         # so the symbol “ (U+201C) is ambiguous and we heuristically check for presence of „.
         if count_of_form['„']:
-            not_before += '“'
+            not_before += ['“']
         else:
-            not_after += '“'
+            not_after += ['“']
 
         for i, node in enumerate(nodes[:-1]):
             next_form = nodes[i + 1].form
             if node.form in self.not_after or next_form in not_before:
                 self.mark_no_space(node)
-            if matching_quotes and node.form == '"':
-                if odd_indexed_quote:
+            if node.form == '"':
+                if matching_quotes:
+                    if odd_indexed_quote:
+                        self.mark_no_space(node)
+                    elif i:
+                        self.mark_no_space(nodes[i - 1])
+                    odd_indexed_quote = not odd_indexed_quote
+                elif i == 0:
                     self.mark_no_space(node)
-                elif i:
-                    self.mark_no_space(nodes[i - 1])
-                odd_indexed_quote = not odd_indexed_quote
 
-        if matching_quotes and nodes[-1].form == '"':
+        if nodes[-1].form == '"':
             self.mark_no_space(nodes[-2])
 
         if self.fix_text and self.changed:

From fee50975f11aec6659eb0ea130da1aabef2bc2ec Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 3 Feb 2021 06:28:16 +0100
Subject: [PATCH 0105/1201] bundle.document instead of bundle.document() for
 consistency

---
 udapi/core/bundle.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py
index 591b8a84..110ed42c 100644
--- a/udapi/core/bundle.py
+++ b/udapi/core/bundle.py
@@ -46,6 +46,7 @@ def __str__(self):
     def __iter__(self):
         return iter(self.trees)
 
+    @property
     def document(self):
         """Returns the document in which the bundle is contained."""
         return self._document

From c4567359290f406ea6b71c48857ff2975037ce88 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 3 Feb 2021 06:33:02 +0100
Subject: [PATCH 0106/1201] basic support for coreference

---
 udapi/core/basewriter.py |   2 +
 udapi/core/coref.py      | 193 +++++++++++++++++++++++++++++++++++++++
 udapi/core/document.py   |  18 ++++
 udapi/core/node.py       |  17 ++++
 udapi/core/root.py       |   4 +
 5 files changed, 234 insertions(+)
 create mode 100644 udapi/core/coref.py

diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py
index 1d64e874..3f28d155 100644
--- a/udapi/core/basewriter.py
+++ b/udapi/core/basewriter.py
@@ -2,6 +2,7 @@
 import sys
 import logging
 
+import udapi.core.coref
 from udapi.core.block import Block
 from udapi.core.files import Files
 
@@ -39,6 +40,7 @@ def next_filename(self):
         return self.files.next_filename()
 
     def before_process_document(self, document):
+        udapi.core.coref.store_coref_to_misc(document)
         if self.orig_files == '':
             logging.info('Writing to filehandle.')
             sys.stdout = self.files.filehandle
diff --git a/udapi/core/coref.py b/udapi/core/coref.py
new file mode 100644
index 00000000..7fae4387
--- /dev/null
+++ b/udapi/core/coref.py
@@ -0,0 +1,193 @@
+"""Classes for handling coreference."""
+import logging
+
+class CorefMention(object):
+    """Class for representing a mention (instance of an entity)."""
+    __slots__ = ['_head', '_cluster', '_bridging', '_words']
+
+    def __init__(self, head, cluster=None):
+        self._head = head
+        self._cluster = cluster
+        if cluster is not None:
+            cluster._mentions.append(self)
+        self._bridging = None
+        self._words = []
+
+    @property
+    def head(self):
+        return self._head
+
+    # TODO change head - make sure it is already within the span (_words) or add it? 
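+
+    # A usage sketch of this API (illustrative names; assumes a document with
+    # coreference annotation in MISC is loaded as `doc`):
+    #     cluster = doc.coref_clusters['c1']
+    #     mention = cluster.mentions[0]
+    #     print(mention.head, mention.span, mention.words)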
+
+    @property
+    def cluster(self):
+        return self._cluster
+
+    @cluster.setter
+    def cluster(self, new_cluster):
+        if self._cluster is not None:
+            raise NotImplementedError('changing the cluster of a mention not supported yet')
+        self._cluster = new_cluster
+        new_cluster._mentions.append(self)
+
+    @property
+    def bridging(self):
+        return self._bridging
+
+    # TODO add/edit bridging
+
+    @property
+    def words(self):
+        return self._words
+
+    @words.setter
+    def words(self, new_words):
+        if self.head not in new_words:
+            raise ValueError(f"Head {self.head} not in new_words {new_words}")
+        for old_word in self._words:
+            old_word._mentions.remove(self)
+        self._words = new_words
+        for new_word in new_words:
+            new_word._mentions.append(self)
+
+    @property
+    def span(self):
+        def _nums_to_ranges(nums):
+            lo, hi = nums[0], nums[0]
+            for num in nums[1:]:
+                if num == hi + 1:
+                    hi = num
+                else:
+                    yield (lo, hi)
+                    lo, hi = num, num
+            yield (lo, hi)
+
+        if not self._words:
+            return ''
+        ords = sorted(n.ord for n in self._words)
+        if len(ords) == 1:
+            return str(ords[0])
+        first, last = ords[0], ords[-1]
+        if ords == list(range(first, last+1)):
+            return "%g-%g" % (first, last)
+        return ','.join( '%g' % r[0] if r[0]==r[1] else '%g-%g' % r for r in _nums_to_ranges(ords))
+
+    @span.setter
+    def span(self, new_span):
+        ranges = []
+        for span_str in new_span.split(','):
+            if '-' not in span_str:
+                lo = hi = float(span_str)
+            else:
+                lo, hi = (float(x) for x in span_str.split('-'))
+            ranges.append((lo, hi))
+
+        def _num_in_ranges(num):
+            for (lo, hi) in ranges:
+                if num > hi:
+                    return False
+                if num >= lo:
+                    return True
+            return False
+
+        new_words = [w for w in self._head.root.descendants_and_empty if _num_in_ranges(w.ord)]
+        self.words = new_words
+
+
+class CorefCluster(object):
+    """Class for representing all mentions of a given entity."""
+    __slots__ = ['_cluster_id', '_mentions', 'cluster_type', '_split_ante']
+
+    def __init__(self, cluster_id, cluster_type=None):
+        self._cluster_id = cluster_id
+        self._mentions = []
+        self.cluster_type = cluster_type
+        self._split_ante = None
+
+    @property
+    def cluster_id(self):
+        return self._cluster_id
+
+    @property
+    def mentions(self):
+        return self._mentions
+
+    def create_mention(self, head, mention_words=None, mention_span=None):
+        if mention_words is not None and mention_span is not None:
+            raise ValueError("Cannot set both mention_words and mention_span")
+        mention = CorefMention(head, self)
+        if mention_words is not None:
+            mention.words = mention_words
+        elif mention_span is not None:
+            mention.span = mention_span
+        return mention
+
+    @property
+    def split_ante(self):
+        return self._split_ante
+
+    # TODO add/edit split_ante
+
+    # TODO adapt depending on how mention.bridging is implemented (callable list subclass)
+    def all_bridging(self):
+        for m in self._mentions:
+            if m._bridging:
+                for b in m._bridging:
+                    yield b
+
+
+def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs):
+    clusters = head.root.bundle.document.coref_clusters
+    if not cluster_id:
+        counter = 1
+        while clusters.get('c%d' % counter):
+            counter += 1
+        cluster_id = 'c%d' % counter
+    elif clusters.get(cluster_id):
+        raise ValueError("Cluster with id %s already exists" % cluster_id)
+    cluster = CorefCluster(cluster_id, cluster_type)
+    cluster.create_mention(head, **kwargs)
+    clusters[cluster_id] = cluster
+    return cluster
+
+
+def load_coref_from_misc(doc):
+    clusters = {}
+    for node in doc.nodes:
+        cluster_id = node.misc["ClusterId"]
+        if cluster_id:
+            cluster = 
clusters.get(cluster_id) + if cluster is None: + cluster = CorefCluster(cluster_id) + clusters[cluster_id] = cluster + mention = CorefMention(node, cluster) + if node.misc["MentionSpan"]: + mention.span = node.misc["MentionSpan"] + cluster_type = node.misc["ClusterType"] + if cluster_type is not None: + if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: + logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") + cluster.cluster_type = cluster_type + # TODO deserialize Bridging and Split + mention._bridging = node.misc["Bridging"] + mention._split_ante = node.misc["Split"] + doc._coref_clusters = clusters + +# TODO don't recompute the serialization if not needed, i.e. if doc._is_coref_stored_in_misc is True, +# but make it False after each edit of coref or change of ord (reordering, new nodes, deleted nodes...). +def store_coref_to_misc(doc): + if not doc._coref_clusters: + return + for node in doc.nodes: + del node.misc["ClusterId"] + del node.misc["MentionSpan"] + del node.misc["ClusterType"] + del node.misc["Bridging"] + del node.misc["Split"] + for cluster in doc._coref_clusters.values(): + for mention in cluster.mentions: + head = mention.head + head.misc["ClusterId"] = cluster.cluster_id + head.misc["MentionSpan"] = mention.span + head.misc["ClusterType"] = cluster.cluster_type + head.misc["Bridging"] = mention.bridging + head.misc["Split"] = cluster.split_ante diff --git a/udapi/core/document.py b/udapi/core/document.py index 0f5241c5..36edb856 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -2,6 +2,7 @@ import io import contextlib +import udapi.core.coref from udapi.core.bundle import Bundle from udapi.block.read.conllu import Conllu as ConlluReader from udapi.block.write.conllu import Conllu as ConlluWriter @@ -27,6 +28,7 @@ def __init__(self, filename=None, **kwargs): self._highest_bundle_id = 0 self.meta = {} self.json = {} + self._coref_clusters = None if filename is not None: if filename.endswith(".conllu"): self.load_conllu(filename) @@ -97,3 +99,19 @@ def nodes(self): def draw(self, **kwargs): """Pretty print the trees using TextModeTrees.""" TextModeTrees(**kwargs).run(self) + + def _load_coref(self): + """De-serialize coreference-related objects (CorefMention, CorefCluster). + + This internal method will be called automatically whenever any coref-related method is called. + It iterates through all nodes in the document and creates the objects based on the info in MISC + (stored in attributes ClusterId, MentionSpan, ClusterType, Split, Bridging). + """ + if self._coref_clusters is None: + udapi.core.coref.load_coref_from_misc(self) + + @property + def coref_clusters(self): + """A dict mapping ClusterId to a CorefCluster object.""" + self._load_coref() + return self._coref_clusters diff --git a/udapi/core/node.py b/udapi/core/node.py index 3c917e29..00b18c9d 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -5,6 +5,7 @@ """ import logging +import udapi.core.coref from udapi.block.write.textmodetrees import TextModeTrees from udapi.core.dualdict import DualDict from udapi.core.feats import Feats @@ -76,6 +77,7 @@ class Node(object): '_parent', # Parent node. '_children', # Ord-ordered list of child nodes. '_mwt', # Multi-word token in which this word participates. 
+ '_mentions', # List of udapi.core.coref.CorefMention objects whose span includes this node ] def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments @@ -94,6 +96,7 @@ def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many self._parent = None self._children = list() self._mwt = None + self._mentions = list() def __str__(self): """Pretty print of the Node object.""" @@ -705,6 +708,20 @@ def gloss(self): def gloss(self, new_gloss): self.misc["Gloss"] = new_gloss + @property + def coref_mentions(self): + self.root.bundle.document._load_coref() + return self._mentions + + @property + def coref_clusters(self): + self.root.bundle.document._load_coref() + return [m.cluster for m in self._mentions if m.cluster is not None] + + def create_coref_cluster(self, **kwargs): + return udapi.core.coref.create_coref_cluster(head=self, **kwargs) + + class ListOfNodes(list): """Helper class for results of node.children and node.descendants. diff --git a/udapi/core/root.py b/udapi/core/root.py index 7944cd55..364c6845 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -235,6 +235,10 @@ def token_descendants(self): result.append(node) return result + @property + def descendants_and_empty(self): + return sorted(self._descendants + self.empty_nodes, key=lambda n: float(n.ord)) + def steal_nodes(self, nodes): """Move nodes from another tree to this tree (append).""" old_root = nodes[0].root From bf25a8ae3d986ec1df68a773d4914a10e237a0eb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 3 Feb 2021 17:52:21 +0100 Subject: [PATCH 0106/1201] bump to 0.2.3 - require Python 3.6+ due to f-strings - Travis test with Python 3.6-3.9 - expect pip older than 2016, which knows python_requires --- .travis.yml | 4 ++-- CHANGES.txt | 6 +++++- README.md | 2 +- setup.py | 10 ++-------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8d5f2f69..92714116 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ language: python python: - - "3.4" - - "3.5" - "3.6" - "3.7" + - "3.8" + - "3.9" before_install: - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update -qq diff --git a/CHANGES.txt b/CHANGES.txt index dbcd9702..73418e3b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,10 +2,14 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. +0.2.3 2021-02-04 + - support for enhanced dependencies and coreference + - requires Python 3.6+ due to f-strings + 0.2.2 2018-01-08 - support for loading/storing documents from/to strings - allow private modules (starting with dot instead of udapi.block) - MorphoDiTa wrapper udapi/tool/morphodita.py - root.sent_id returns always the same as root.address() -0.2.1 2017-10-23 the first PyPI release \ No newline at end of file +0.2.1 2017-10-23 the first PyPI release diff --git a/README.md b/README.md index a082cc00..3bf52eec 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Python framework for processing Universal Dependencies data [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.3 or higher. +- You need Python 3.6 or higher. - If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)) and install UDPipe with `pip3 install --user --upgrade ufal.udpipe`. 
diff --git a/setup.py b/setup.py index 7197d909..804ebd59 100644 --- a/setup.py +++ b/setup.py @@ -2,15 +2,9 @@ from setuptools import setup, find_packages -# python_requires is supported by pip only from November 2016, -# so let's check the Python version also the old way. -import sys -if sys.version_info < (3, 3): - raise SystemExit('Udapi requires Python 3.3 or higher.') - setup( name='udapi', - version='0.2.2', + version='0.2.3', description='Python framework for processing Universal Dependencies data', long_description=( 'Udapi is an open-source framework providing API for processing ' @@ -27,7 +21,7 @@ scripts=['bin/udapy'], tests_require=['pytest'], install_requires=['colorama', 'termcolor'], - python_requires='>=3.3', + python_requires='>=3.6', license='GPL 2 or newer', platforms='any', ) From 156b66ed2903757a2ffa51944be65713a68f27ab Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 4 Feb 2021 22:03:36 +0100 Subject: [PATCH 0107/1201] create_mention improvements - If no head is specified, the first word from mention_words will be used instead. - If mention_words are provided, they must contain the head. - docstring --- udapi/core/coref.py | 102 ++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 7fae4387..162cc15b 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -45,7 +45,7 @@ def words(self, new_words): raise ValueError(f"Head {self.head} not in new_words {new_words}") for old_word in self._words: old_word._mentions.remove(self) - self._words = new_words + self._words = new_words # TODO sorted for new_word in new_words: new_word._mentions.append(self) @@ -63,7 +63,7 @@ def _nums_to_ranges(nums): if not self._words: return '' - ords = sorted(n.ord for n in self._words) + ords = sorted(n.ord for n in self._words) # TODO vypustit sorted if len(ords) == 1: return str(ords[0]) first, last = ords[0], ords[-1] @@ -73,24 +73,7 @@ def _nums_to_ranges(nums): @span.setter def span(self, new_span): - ranges = [] - for span_str in new_span.split(','): - if '-' not in span_str: - lo = hi = float(span_str) - else: - lo, hi = (float(x) for x in span_str.split('-')) - ranges.append((lo, hi)) - - def _num_in_ranges(num): - for (lo, hi) in ranges: - if num > hi: - return False - if num >= lo: - return True - return False - - new_words = [w for w in self._head.root.descendants_and_empty if _num_in_ranges(w.ord)] - self.words = new_words + self.words = span_to_nodes(self._head.root, new_span) class CorefCluster(object): @@ -109,15 +92,41 @@ def cluster_id(self): @property def mentions(self): + #TODO return sorted(self._mentions, key=lambda x:... return self._mentions - def create_mention(self, head, mention_words=None, mention_span=None): - if mention_words is not None and mention_span is not None: - raise ValueError("Cannot set both mention_words and mention_span") + def create_mention(self, head=None, mention_words=None, mention_span=None): + """Create a new CoreferenceMention object within this CorefCluster. + + Args: + head: a node where the annotation about this CorefMention will be stored in MISC. + The head is supposed to be the linguistic head of the mention, + i.e. the highest node in the dependency tree, + but if such information is not available (yet), + it can be any node within the mention_words. + If no head is specified, the first word from mention_words will be used instead. + mention_words: a list of nodes of the mention. 
+ This argument is optional, but if provided, it must contain the head. + The nodes can be both normal nodes or empty nodes. + mention_span: an alternative way how to specify mention_words + using a string such as "3-5,6,7.1-7.2". + (which means, there is an empty node 5.1 and normal node 7, + which are not part of the mention). + At most one of the args mention_words and mention_span can be specified. + """ + if mention_words and mention_span: + raise ValueError("Cannot specify both mention_words and mention_span") + if head and mention_words and head not in mention_words: + raise ValueError(f"Head {head} is not among the specified mention_words") + if head is None and mention_words is None: + raise ValueError("Either head or mention_words must be specified") + if head is None: + head = mention_words[0] + mention = CorefMention(head, self) - if mention_words is not None: + if mention_words: mention.words = mention_words - elif mention_span is not None: + if mention_span: mention.span = mention_span return mention @@ -172,8 +181,7 @@ def load_coref_from_misc(doc): mention._split_ante = node.misc["Split"] doc._coref_clusters = clusters -# TODO don't recompute the serialization if not needed, i.e. if doc._is_coref_stored_in_misc is True, -# but make it False after each edit of coref or change of ord (reordering, new nodes, deleted nodes...). + def store_coref_to_misc(doc): if not doc._coref_clusters: return @@ -191,3 +199,43 @@ def store_coref_to_misc(doc): head.misc["ClusterType"] = cluster.cluster_type head.misc["Bridging"] = mention.bridging head.misc["Split"] = cluster.split_ante + + +def span_to_nodes(root, span): + ranges = [] + for span_str in span.split(','): + if '-' not in span_str: + lo = hi = float(span_str) + else: + lo, hi = (float(x) for x in span_str.split('-')) + ranges.append((lo, hi)) + + def _num_in_ranges(num): + for (lo, hi) in ranges: + if num > hi: + return False + if num >= lo: + return True + return False + + return [w for w in root.descendants_and_empty if _num_in_ranges(w.ord)] + + +def nodes_to_span(nodes): + def _nums_to_ranges(nums): + lo, hi = nums[0], nums[0] + for num in nums[1:]: + if num == hi + 1: + hi = num + else: + yield (lo, hi) + lo, hi = num, num + yield (lo, hi) + + ords = sorted(n.ord for n in nodes) # TODO vypustit sorted + if len(ords) == 1: + return str(ords[0]) + first, last = ords[0], ords[-1] + if ords == list(range(first, last+1)): + return "%g-%g" % (first, last) + return ','.join( '%g' % r[0] if r[0]==r[1] else '%g-%g' % r for r in _nums_to_ranges(ords)) From 5086ca0b70943e19003bb9e51e3947d4f429e2ca Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 4 Feb 2021 22:51:04 +0100 Subject: [PATCH 0108/1201] report span correctly, e.g. 
"3-5,6" if there is 5.1 which is not part of the span --- udapi/core/coref.py | 60 ++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 162cc15b..2a47b3ea 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -51,25 +51,7 @@ def words(self, new_words): @property def span(self): - def _nums_to_ranges(nums): - lo, hi = nums[0], nums[0] - for num in nums[1:]: - if num == hi + 1: - hi = num - else: - yield (lo, hi) - lo, hi = num, num - yield (lo, hi) - - if not self._words: - return '' - ords = sorted(n.ord for n in self._words) # TODO vypustit sorted - if len(ords) == 1: - return str(ords[0]) - first, last = ords[0], ords[-1] - if ords == list(range(first, last+1)): - return "%g-%g" % (first, last) - return ','.join( '%g' % r[0] if r[0]==r[1] else '%g-%g' % r for r in _nums_to_ranges(ords)) + return nodes_to_span(self._words) @span.setter def span(self, new_span): @@ -222,20 +204,26 @@ def _num_in_ranges(num): def nodes_to_span(nodes): - def _nums_to_ranges(nums): - lo, hi = nums[0], nums[0] - for num in nums[1:]: - if num == hi + 1: - hi = num - else: - yield (lo, hi) - lo, hi = num, num - yield (lo, hi) - - ords = sorted(n.ord for n in nodes) # TODO vypustit sorted - if len(ords) == 1: - return str(ords[0]) - first, last = ords[0], ords[-1] - if ords == list(range(first, last+1)): - return "%g-%g" % (first, last) - return ','.join( '%g' % r[0] if r[0]==r[1] else '%g-%g' % r for r in _nums_to_ranges(ords)) + """Converts a list of nodes into a string specifying ranges of their ords. + + For example, nodes with ords 3, 4, 5 and 7 will be converted to "3-5,7". + The function handles also empty nodes, so e.g. 3.1, 3.2 and 3.3 will be converted to "3.1-3.3". + Note that empty nodes may form gaps in the span, so if a given tree contains + an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 + are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". + This means that the implementation needs to iterate of all nodes + in a given tree (root.descendants_and_empty) to check for such gaps. + """ + if not nodes: + return '' + all_nodes = nodes[0].root.descendants_and_empty + i, found, ranges = -1, 0, [] + while i + 1 < len(all_nodes) and found < len(nodes): + i += 1 + if all_nodes[i] in nodes: + lo = all_nodes[i].ord + while i < len(all_nodes) and all_nodes[i] in nodes: + i, found = i + 1, found + 1 + hi = all_nodes[i - 1].ord + ranges.append(f"{lo}-{hi}" if hi > lo else f"{lo}") + return ','.join(ranges) From 75e59cf7c5c9b6132eff480246977cfd39ca7a26 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 4 Feb 2021 22:56:36 +0100 Subject: [PATCH 0109/1201] cluster_mention.head = new_head --- udapi/core/coref.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 2a47b3ea..d00eb058 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -16,7 +16,11 @@ def __init__(self, head, cluster=None): def head(self): return self._head - # TODO change head - make sure it is already within the span (_words) or add it? 
+ @head.setter + def head(self, new_head): + if new_head not in self._words: + raise ValueError(f"New head {new_head} not in mention words") + self._head = new_head @property def cluster(self): @@ -42,7 +46,7 @@ def words(self): @words.setter def words(self, new_words): if self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words}") + raise ValueError(f"Head {self.head} not in new_words") for old_word in self._words: old_word._mentions.remove(self) self._words = new_words # TODO sorted From 64264ec98815782ab136f536010fe0b33ae878dc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 15:50:36 +0100 Subject: [PATCH 0110/1201] rename misc["Split"] to misc["SplitAnte"] and store it in CorefCluster, not CorefMention --- udapi/core/coref.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index d00eb058..32e0a87a 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -162,9 +162,9 @@ def load_coref_from_misc(doc): if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type - # TODO deserialize Bridging and Split + # TODO deserialize Bridging and SplitAnte mention._bridging = node.misc["Bridging"] - mention._split_ante = node.misc["Split"] + cluster._split_ante = node.misc["SplitAnte"] doc._coref_clusters = clusters @@ -176,7 +176,7 @@ def store_coref_to_misc(doc): del node.misc["MentionSpan"] del node.misc["ClusterType"] del node.misc["Bridging"] - del node.misc["Split"] + del node.misc["SplitAnte"] for cluster in doc._coref_clusters.values(): for mention in cluster.mentions: head = mention.head @@ -184,7 +184,7 @@ def store_coref_to_misc(doc): head.misc["MentionSpan"] = mention.span head.misc["ClusterType"] = cluster.cluster_type head.misc["Bridging"] = mention.bridging - head.misc["Split"] = cluster.split_ante + head.misc["SplitAnte"] = cluster.split_ante def span_to_nodes(root, span): From e82ff0eb27c72bba1fdc7c6ab439ed6c94dafb77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 16:21:26 +0100 Subject: [PATCH 0111/1201] more pythonic --- udapi/core/node.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 00b18c9d..7aaa3984 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -232,16 +232,13 @@ def deps(self): for raw_dependency in self._raw_deps.split('|'): # Deprel itself may contain one or more ':' (subtypes). - pieces = raw_dependency.split(':') - head = pieces[0] - deprel = ':'.join(pieces[1:]) + head, deprel = raw_dependency.split(':', maxsplit=1) # Empty nodes have to be located differently than normal nodes. if '.' in head: - matching = [x for x in self.root.empty_nodes if x.ord == head] - if len(matching) > 0: - parent = matching[0] - else: - parent = None ###!!! what should we do here? 
+ try: + parent = next(x for x in self.root.empty_nodes if x.ord == head) + except StopIteration: + raise ValueError(f'Empty node with ord={head} not found') else: parent = nodes[int(head)] self._deps.append({'parent': parent, 'deprel': deprel}) From 459d4ecd35b8ac37bb19d5dc664b11fa6e2b5a4e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 21:25:44 +0100 Subject: [PATCH 0112/1201] create_empty_child() creates EmptyNode - EmptyNode is a subclass of Node - root is stored in empty nodes, not computed (because empty nodes have parent=None and root may be unreachable) - node.create_empty_child(deprel='...') deprel is now required - it is the enhanced UD deprel to be stored in DEPS - argument `after` specifies the position (ord) of the newly created empty node - root.create_empty_child() is a faster version, which does not set `deps` and `ord`. --- udapi/core/node.py | 46 ++++++++++++++++++++++++++++++++++++++++++---- udapi/core/root.py | 15 ++++++++++++++- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 7aaa3984..cc1c7110 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -369,12 +369,36 @@ def create_child(self, **kwargs): new_node.parent = self return new_node - def create_empty_child(self, **kwargs): - """Create and return a new empty node child of the current node.""" - new_node = Node(**kwargs) - self.root.empty_nodes.append(new_node) + def create_empty_child(self, deprel, after=True, **kwargs): + """Create and return a new empty node child of the current node. + + Args: + deprel: the enhanced dependency relation (required to be stored in DEPS) + form, lemma, upos, xpos, feats, misc: as in Node, the default is '_' + after: position the newly created empty node after this `node`? + If True (default), the `new_node.ord` will be `node.ord + 0.1`, + unless there is already an empty node with such ord, + in which case it will be `node.ord + 0.2` etc. + If False, the new node will be placed immediately before `node`. + """ + new_node = EmptyNode(root=self.root, **kwargs) + new_node.deps = [{'parent': self, 'deprel': deprel}] # self.enh_children.append(new_node) TODO # new_node.enh_parents.append(self) TODO + base_ord = self.ord if after else self.ord - 1 + new_ord = base_ord + 0.1 + for empty in self.root.empty_nodes: + if empty.ord > new_ord: + break + if empty.ord == new_ord: + #if isinstance(new_ord, OrdTuple); + # new_ord.increase() + #elif new_ord == base_ord + 0.9: + # new_ord = OrdTuple(base_ord, 10) + #else: + new_ord = round(new_ord+0.1, 1) + new_node.ord = new_ord + self.root.empty_nodes.append(new_node) return new_node # TODO: make private: _unordered_descendants @@ -719,6 +743,20 @@ def create_coref_cluster(self, **kwargs): return udapi.core.coref.create_coref_cluster(head=self, **kwargs) +class EmptyNode(Node): + """Class for representing empty nodes (for ellipsis in enhanced UD).""" + __slots__ = ['_root'] + + def __init__(self, root, form='_', lemma='_', upos='_', xpos='_', feats='_', misc='_'): + super().__init__(form=form, lemma=lemma, upos=upos, + xpos=xpos, feats=feats, deprel='_', misc=misc) + self._root = root + + @property + def root(self): + return self._root + + class ListOfNodes(list): """Helper class for results of node.children and node.descendants. 
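
A usage sketch of the new API (illustrative only; `verb` stands for any node in a tree):

    empty = verb.create_empty_child(deprel='nsubj', form='he')
    # empty.ord is verb.ord + 0.1 (or + 0.2 etc. if that slot is already taken)
    # and empty's DEPS point to verb with the 'nsubj' relation
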
diff --git a/udapi/core/root.py b/udapi/core/root.py index 364c6845..a47bcf8e 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -1,7 +1,7 @@ """Root class represents the technical root node in each tree.""" import logging -from udapi.core.node import Node, ListOfNodes +from udapi.core.node import Node, EmptyNode, ListOfNodes from udapi.core.mwt import MWT # 7 instance attributes is too low (CoNLL-U has 10 columns) @@ -142,6 +142,19 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Attempts at changing the word order of root result in Exception.""" raise Exception('Technical root cannot be shifted as it is always the first node') + def create_empty_child(self, **kwargs): + """Create and return a new empty node within this tree. + + This root-specific implementation overrides `Node.create_empty_child()'. + It is faster because it does not set `deps` and `ord` of the newly created node. + It is up to the user to set up these attributes correctly. + It is used in `udapi.block.read.conllu` (where speed is important and thus, + only `raw_deps` are set up instead of `deps`). + """ + new_node = EmptyNode(root=self, **kwargs) + self.empty_nodes.append(new_node) + return new_node + # TODO document whether misc is a string or dict or it can be both def create_multiword_token(self, words=None, form=None, misc=None): """Create and return a new multi-word token (MWT) in this tree. From 82a6d2dbab862f42a5e9bb0e3e10d0d57bd7f549 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 21:48:25 +0100 Subject: [PATCH 0113/1201] fix write.Conllu, so that None attributes are converted to _ In Python, empty attributes should be None (it is more memory efficient than '_'), the underscore is just a matter of CoNLL-U serialization. --- udapi/block/write/conllu.py | 3 ++- udapi/core/node.py | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 4b931bac..50cf366d 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -56,7 +56,8 @@ def process_tree(self, tree): # pylint: disable=too-many-branches if next_empty_ord > last_ord: break empty = empty_nodes.pop(0) - values = [str(getattr(empty, a)) for a in self.node_attributes] + values = [getattr(empty, attr_name) for attr_name in self.node_attributes] + values = ['_' if v is None else str(v) for v in values] values[6] = '_' values[7] = '_' print('\t'.join(values)) diff --git a/udapi/core/node.py b/udapi/core/node.py index cc1c7110..680009c9 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -1,6 +1,6 @@ """Node class and related classes and functions. -In addition to class `Node`, this module contains class `ListOfNodes` +In addition to class `Node`, this module contains also classes `EmptyNode` and `ListOfNodes` and function `find_minimal_common_treelet`. 
""" import logging @@ -747,9 +747,8 @@ class EmptyNode(Node): """Class for representing empty nodes (for ellipsis in enhanced UD).""" __slots__ = ['_root'] - def __init__(self, root, form='_', lemma='_', upos='_', xpos='_', feats='_', misc='_'): - super().__init__(form=form, lemma=lemma, upos=upos, - xpos=xpos, feats=feats, deprel='_', misc=misc) + def __init__(self, root, form=None, lemma=None, upos=None, xpos=None, feats=None, misc=None): + super().__init__(form=form, lemma=lemma, upos=upos, xpos=xpos, feats=feats, misc=misc) self._root = root @property From dc09ee577f7f42ed7004fbe0c75d955a8c098be3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 22:42:41 +0100 Subject: [PATCH 0114/1201] store ord in node._ord internally so the setter can be overriden in EmptyNode --- udapi/core/node.py | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 680009c9..6fb0d331 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -28,9 +28,10 @@ class Node(object): Attributes `form`, `lemma`, `upos`, `xpos` and `deprel` are public attributes of type `str`, so you can use e.g. `node.lemma = node.form`. - `node.ord` is a int type public attribute for storing the node's word order index, + `node.ord` is a int type property for storing the node's word-order index, but assigning to it should be done with care, so the non-root nodes have `ord`s 1,2,3... It is recommended to use one of the `node.shift_*` methods for reordering nodes. + Note that `EmptyNode`s (subclass of `Node`) have decimal ords (and no `shift_*` methods). For changing dependency structure (topology) of the tree, there is the `parent` property, e.g. `node.parent = node.parent.parent` and `node.create_child()` method. @@ -63,8 +64,10 @@ class Node(object): # TODO: Benchmark memory and speed of slots vs. classic dict. # With Python 3.5 split dict, slots may not be better. # TODO: Should not we include __weakref__ in slots? + # TODO: Benchmark using node._ord instead node.ord in this file + # TODO: Benchmark storing root in node._root (as it is done in EmptyNode) __slots__ = [ - 'ord', # Word-order index of the node (root has 0). + '_ord', # Word-order index of the node (root has 0). 'form', # Word form or punctuation symbol. 'lemma', # Lemma of word form. 'upos', # Universal PoS tag. @@ -83,7 +86,7 @@ class Node(object): def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments xpos=None, feats=None, deprel=None, misc=None): """Create a new node and initialize its attributes using the keyword arguments.""" - self.ord = None + self._ord = None self.form = form self.lemma = lemma self.upos = upos @@ -102,6 +105,15 @@ def __str__(self): """Pretty print of the Node object.""" return "node<%s, %s>" % (self.address(), self.form) + # ord is implemented as a property, so that it can be overriden in EmptyNode and Root + @property + def ord(self): + return self._ord + + @ord.setter + def ord(self, new_ord): + self._ord = new_ord + @property def udeprel(self): """Return the universal part of dependency relation, e.g. `acl` instead of `acl:relcl`. 
@@ -755,6 +767,30 @@ def __init__(self, root, form=None, lemma=None, upos=None, xpos=None, feats=None def root(self): return self._root + @property + def parent(self): + return None + + @parent.setter + def parent(self, _): + """Attempts at setting parent of EmptyNode result in AttributeError exception.""" + raise AttributeError('EmptyNode cannot have a (basic-UD) parent.') + + @property + def ord(self): + return self._ord + + @ord.setter + def ord(self, new_ord): + """Empty node's ord setter accepts float and str.""" + if isinstance(new_ord, str): + self._ord = float(new_ord) + elif isinstance(new_ord, float): + self._ord = new_ord + else: + raise ValueError('Only str and float are allowed for EmptyNode ord setter,' + f' but {type(new_ord)} was given.') + class ListOfNodes(list): """Helper class for results of node.children and node.descendants. From e339642ee8bde3346426966c6f7f5b3627fc028c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Feb 2021 22:55:17 +0100 Subject: [PATCH 0115/1201] class OrdTuple, so that 1.9 < OrdTuple('1.10') --- udapi/core/node.py | 66 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 6fb0d331..1564c070 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -1,9 +1,10 @@ """Node class and related classes and functions. -In addition to class `Node`, this module contains also classes `EmptyNode` and `ListOfNodes` -and function `find_minimal_common_treelet`. +In addition to class `Node`, this module contains also classes +`EmptyNode`, `OrdTuple` and `ListOfNodes` and function `find_minimal_common_treelet`. """ import logging +import functools import udapi.core.coref from udapi.block.write.textmodetrees import TextModeTrees @@ -403,12 +404,12 @@ def create_empty_child(self, deprel, after=True, **kwargs): if empty.ord > new_ord: break if empty.ord == new_ord: - #if isinstance(new_ord, OrdTuple); - # new_ord.increase() - #elif new_ord == base_ord + 0.9: - # new_ord = OrdTuple(base_ord, 10) - #else: - new_ord = round(new_ord+0.1, 1) + if isinstance(new_ord, OrdTuple): + new_ord.increase() + elif new_ord == base_ord + 0.9: + new_ord = OrdTuple(base_ord, 10) + else: + new_ord = round(new_ord+0.1, 1) new_node.ord = new_ord self.root.empty_nodes.append(new_node) return new_node @@ -792,6 +793,55 @@ def ord(self, new_ord): f' but {type(new_ord)} was given.') +@functools.total_ordering +class OrdTuple: + """Class for the rare case of 9+ consecutive empty nodes, i.e. ords x.10, x.11 etc. + + Ord 1.10 cannot be stored as float, which would result in 1.1. 
+    We thus store it as a tuple (1,10) wrapped in OrdTuple, so that comparisons work,
+    e.g.: 1.9 < OrdTuple('1.10') < 2
+    """
+    __slots__ = ('_key',)
+
+    def __init__(self, string):
+        m = re.match(r'(\d+)\.(\d+)$', string)
+        if not m:
+            raise ValueError(f"Ord {string} does not match \\d+.\\d+")
+        major, minor = int(m.group(1)), int(m.group(2))
+        if minor == 0:
+            raise ValueError(f"Ord {string} should be stored as int")
+        if minor < 10:
+            raise ValueError(f"Ord {string} should be stored as float")
+        self._key = (major, minor)
+
+    def __repr__(self):
+        return f"{self._key[0]}.{self._key[1]}"
+
+    def __eq__(self, other):
+        if isinstance(other, int):
+            return False
+        elif isinstance(other, float):
+            return self._key == (int(other), int(10*other - 10*int(other)))
+        elif isinstance(other, OrdTuple):
+            return self._key == other._key
+        else:
+            raise ValueError(f"OrdTuple cannot be compared with {type(other)}")
+
+    def __lt__(self, other):
+        if isinstance(other, int):
+            return self._key < (other, 0)
+        elif isinstance(other, float):
+            return self._key < (int(other), int(10*other - 10*int(other)))
+        elif isinstance(other, OrdTuple):
+            return self._key < other._key
+        else:
+            raise ValueError(f"OrdTuple cannot be compared with {type(other)}")
+
+    def increase(self):
+        """Increment the decimal part of this ord."""
+        self._key = (self._key[0], self._key[1]+1)
+
+
 class ListOfNodes(list):
     """Helper class for results of node.children and node.descendants.

From 6dbc65c5d090b30f7645d1d739443d6a51cd3e15 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 6 Feb 2021 00:05:42 +0100
Subject: [PATCH 0116/1201] mention.words can be an empty list at any time

---
 udapi/core/coref.py | 4 ++--
 udapi/core/node.py  | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 32e0a87a..460230b8 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -18,7 +18,7 @@ def head(self):

     @head.setter
     def head(self, new_head):
-        if new_head not in self._words:
+        if self._words and new_head not in self._words:
             raise ValueError(f"New head {new_head} not in mention words")
         self._head = new_head

@@ -45,7 +45,7 @@ def words(self):

     @words.setter
     def words(self, new_words):
-        if self.head not in new_words:
+        if new_words and self.head not in new_words:
             raise ValueError(f"Head {self.head} not in new_words")
         for old_word in self._words:
             old_word._mentions.remove(self)
diff --git a/udapi/core/node.py b/udapi/core/node.py
index 1564c070..def2ee18 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -22,7 +22,6 @@
 # The set of public attributes/properties and methods of Node was well-thought.
 # pylint: disable=too-many-instance-attributes,too-many-public-methods
-
 class Node(object):
     """Class for representing nodes in Universal Dependency trees.
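
[Illustration, not part of the patch series] A minimal sketch of the ordering
semantics that the OrdTuple class above is meant to guarantee, assuming OrdTuple
is imported from udapi.core.node where PATCH 0115 defines it:

    from udapi.core.node import OrdTuple

    # The motivating comparison from the subject line of PATCH 0115:
    assert 1.9 < OrdTuple('1.10') < 2
    # Mixed int/float/OrdTuple ords sort in the expected word order.
    ords = [1, 1.1, 1.9, OrdTuple('1.10'), OrdTuple('1.11'), 2]
    assert ords == sorted(ords)
    # increase() bumps the decimal part: 1.10 -> 1.11.
    t = OrdTuple('1.10')
    t.increase()
    assert t == OrdTuple('1.11')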
From 1d88f035416a75b5d46377c9dd0a6e7d739fad94 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 6 Feb 2021 00:29:42 +0100 Subject: [PATCH 0117/1201] overload __lt__, so we can use node1 < node2 --- udapi/block/demo/complexity.py | 8 ++++---- udapi/core/node.py | 10 +++++++--- udapi/core/root.py | 9 +++++---- udapi/core/tests/test_enhdeps.py | 1 + udapi/core/tests/test_node.py | 4 ++++ 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py index 020f8b17..99e8a046 100644 --- a/udapi/block/demo/complexity.py +++ b/udapi/block/demo/complexity.py @@ -144,7 +144,7 @@ def expand_subtree(self, nodes, expand_type): #for child in group.children: #if child.udeprel != 'conj': #result.extend(child.descendants(add_self=True)) - #return = sorted(result, key=lambda n: n.ord) + #return = sorted(result) if expand_type == 'subtree_within_clause': stack = [n for n in nodes[0].children if n.udeprel != 'conj'] while stack: @@ -152,7 +152,7 @@ def expand_subtree(self, nodes, expand_type): if not node.misc["ClauseHead"]: nodes.append(node) stack.extend(node.children()) - return sorted(nodes, key=lambda n: n.ord) + return sorted(nodes) raise ValueError("unknown expand value " + expand_type) @@ -182,7 +182,7 @@ def get_coord_phrase(self, root, phrase_type_function): for conj in conjuncts: # TODO multiword conjunctions (udeprel=flat)? conjunctions.extend([n for n in conj.children if n.udeprel == 'cc']) - results.append(sorted([node] + conjuncts + conjunctions, key=lambda n: n.ord)) + results.append(sorted([node] + conjuncts + conjunctions)) return results # TODO koordinace hlavních i vedlejších vět @@ -199,7 +199,7 @@ def get_t_units(self, main_heads): else: main_clause.append(node) stack.extend(node.children) - main_clause = sorted(main_clause, key=lambda n: n.ord) + main_clause = sorted(main_clause) for dep_clause_head in dep_heads: results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) diff --git a/udapi/core/node.py b/udapi/core/node.py index def2ee18..f0a4f5c0 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -22,6 +22,7 @@ # The set of public attributes/properties and methods of Node was well-thought. # pylint: disable=too-many-instance-attributes,too-many-public-methods +@functools.total_ordering class Node(object): """Class for representing nodes in Universal Dependency trees. @@ -114,6 +115,9 @@ def ord(self): def ord(self, new_ord): self._ord = new_ord + def __lt__(self, other): + return self.ord < other.ord + @property def udeprel(self): """Return the universal part of dependency relation, e.g. `acl` instead of `acl:relcl`. @@ -307,7 +311,7 @@ def parent(self, new_parent): self._parent = new_parent # Append the current node to the new parent children. - new_parent._children = sorted(new_parent.children + [self], key=lambda child: child.ord) + new_parent._children = sorted(new_parent.children + [self]) @property def children(self): @@ -361,7 +365,7 @@ def descendants(self): nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] See documentation of ListOfNodes for details. 
""" - return ListOfNodes(sorted(self.unordered_descendants(), key=lambda n: n.ord), origin=self) + return ListOfNodes(sorted(self.unordered_descendants()), origin=self) def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" @@ -884,7 +888,7 @@ def __call__(self, add_self=False, following_only=False, preceding_only=False): result = [x for x in result if x.ord <= self.origin.ord] if following_only: result = [x for x in result if x.ord >= self.origin.ord] - return sorted(result, key=lambda node: node.ord) + return sorted(result) def find_minimal_common_treelet(*args): diff --git a/udapi/core/root.py b/udapi/core/root.py index a47bcf8e..484fa319 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -153,6 +153,7 @@ def create_empty_child(self, **kwargs): """ new_node = EmptyNode(root=self, **kwargs) self.empty_nodes.append(new_node) + self.empty_nodes.sort() return new_node # TODO document whether misc is a string or dict or it can be both @@ -186,7 +187,7 @@ def _update_ordering(self): Update also the list of all tree nodes stored in root._descendants. This method is automatically called after node removal or reordering. """ - self._descendants = sorted(self.unordered_descendants(), key=lambda node: node.ord) + self._descendants = sorted(self.unordered_descendants()) for (new_ord, node) in enumerate(self._descendants, 1): node.ord = new_ord @@ -250,7 +251,7 @@ def token_descendants(self): @property def descendants_and_empty(self): - return sorted(self._descendants + self.empty_nodes, key=lambda n: float(n.ord)) + return sorted(self._descendants + self.empty_nodes) def steal_nodes(self, nodes): """Move nodes from another tree to this tree (append).""" @@ -258,7 +259,7 @@ def steal_nodes(self, nodes): for node in nodes[1:]: if node.root != old_root: raise ValueError("steal_nodes(nodes) was called with nodes from several trees") - nodes = sorted(nodes, key=lambda n: n.ord) + nodes = sorted(nodes) whole_tree = nodes == old_root.descendants new_ord = len(self._descendants) # pylint: disable=protected-access @@ -268,7 +269,7 @@ def steal_nodes(self, nodes): if not whole_tree: for child in [n for n in node.children if n not in nodes]: child._parent = old_root - old_root._children = sorted(old_root.children + [child], key=lambda n: n.ord) + old_root._children = sorted(old_root.children + [child]) node._children = [n for n in node.children if n in nodes] if node.parent == old_root or (not whole_tree and node.parent not in nodes): node.parent._children = [n for n in node.parent._children if n != node] diff --git a/udapi/core/tests/test_enhdeps.py b/udapi/core/tests/test_enhdeps.py index 80fe0209..3f473bf3 100644 --- a/udapi/core/tests/test_enhdeps.py +++ b/udapi/core/tests/test_enhdeps.py @@ -58,5 +58,6 @@ def test_create_deps2empty(self): d.deps.append({'parent': e, 'deprel': 'dep:d2e'}) self.assertEqual("2:dep:e2h", e.raw_deps, ) self.assertEqual("5:conj|3.1:dep:d2e", d.raw_deps) + self.assertEqual(self.tree.descendants_and_empty, self.nodes[:3] + [e] + self.nodes[3:]) diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 7d811456..23ac0402 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -52,6 +52,10 @@ def test_topology(self): # ords and reorderings self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6]) + self.assertTrue(nodes[0].precedes(nodes[1])) + self.assertTrue(nodes[0] < nodes[1]) + self.assertFalse(nodes[0] > nodes[1]) + self.assertTrue(nodes[0] <= nodes[0]) 
nodes[0].shift_after_node(nodes[1]) self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6]) self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6]) From 87ac4206ae1413df17ff8efc0d5f5676876cf9c9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 6 Feb 2021 02:43:22 +0100 Subject: [PATCH 0118/1201] each node now stores an explicit reference to the root (node._root) Benchmark "NewTreex" shows 0.5% more memory, but total time 6% faster (rehanging, i.e. changing a parent is 21% faster, next_node 65% faster): experiment|TIME |MAXMEM |load |save |iterN|rehang|remove|add |reorder| ----------|-----:|------:|-----:|----:|----:|-----:|-----:|----:|------:| udapi |40.507|832.815|17.066|4.521|0.715|3.502 |2.348 |2.868|4.174 | udapi_new |37.958|837.302|16.720|3.846|0.251|2.766 |2.284 |2.699|4.106 | --- udapi/core/node.py | 31 +++++++++++-------------------- udapi/core/root.py | 3 ++- udapi/core/tests/test_node.py | 2 +- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index f0a4f5c0..239b250c 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -66,7 +66,6 @@ class Node(object): # With Python 3.5 split dict, slots may not be better. # TODO: Should not we include __weakref__ in slots? # TODO: Benchmark using node._ord instead node.ord in this file - # TODO: Benchmark storing root in node._root (as it is done in EmptyNode) __slots__ = [ '_ord', # Word-order index of the node (root has 0). 'form', # Word form or punctuation symbol. @@ -80,13 +79,15 @@ class Node(object): '_feats', # Morphological features as udapi.core.feats.Feats object. '_parent', # Parent node. '_children', # Ord-ordered list of child nodes. + '_root', # Technical root of the tree '_mwt', # Multi-word token in which this word participates. '_mentions', # List of udapi.core.coref.CorefMention objects whose span includes this node ] - def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments + def __init__(self, root, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments xpos=None, feats=None, deprel=None, misc=None): """Create a new node and initialize its attributes using the keyword arguments.""" + self._root = root self._ord = None self.form = form self.lemma = lemma @@ -106,6 +107,10 @@ def __str__(self): """Pretty print of the Node object.""" return "node<%s, %s>" % (self.address(), self.form) + @property + def root(self): + return self._root + # ord is implemented as a property, so that it can be overriden in EmptyNode and Root @property def ord(self): @@ -336,14 +341,6 @@ def children(self): """ return ListOfNodes(self._children, origin=self) - @property - def root(self): - """Return the (technical) root node of the whole tree.""" - node = self - while node.parent: - node = node.parent - return node - @property def descendants(self): """Return a list of all descendants of the current node. 
@@ -378,7 +375,7 @@ def is_descendant_of(self, node): def create_child(self, **kwargs): """Create and return a new child of the current node.""" - new_node = Node(**kwargs) + new_node = Node(root=self._root, **kwargs) new_node.ord = len(self.root._descendants) + 1 self.root._descendants.append(new_node) self.children.append(new_node) @@ -761,15 +758,6 @@ def create_coref_cluster(self, **kwargs): class EmptyNode(Node): """Class for representing empty nodes (for ellipsis in enhanced UD).""" - __slots__ = ['_root'] - - def __init__(self, root, form=None, lemma=None, upos=None, xpos=None, feats=None, misc=None): - super().__init__(form=form, lemma=lemma, upos=upos, xpos=xpos, feats=feats, misc=misc) - self._root = root - - @property - def root(self): - return self._root @property def parent(self): @@ -795,6 +783,9 @@ def ord(self, new_ord): raise ValueError('Only str and float are allowed for EmptyNode ord setter,' f' but {type(new_ord)} was given.') + def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): + """Attempts at changing the word order of EmptyNode result in NotImplemented exception.""" + raise NotImplemented('Empty nodes cannot be re-order using shift* methods yet.') @functools.total_ordering class OrdTuple: diff --git a/udapi/core/root.py b/udapi/core/root.py index 484fa319..06d018f4 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -18,7 +18,7 @@ class Root(Node): def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): """Create new root node.""" # Call constructor of the parent object. - super().__init__() + super().__init__(root=self) self.ord = 0 self.form = '' @@ -266,6 +266,7 @@ def steal_nodes(self, nodes): for node in nodes: new_ord += 1 node.ord = new_ord + node._root = self if not whole_tree: for child in [n for n in node.children if n not in nodes]: child._parent = old_root diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 23ac0402..f38ca585 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -118,7 +118,7 @@ def test_draw(self): def test_feats(self): """Test the morphological featrues.""" - node = Node() + node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' self.assertEqual(str(node.feats), '_') From 27b8be77c36b3581a22fd8a37f2fdcdd25e81619 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 6 Feb 2021 03:43:50 +0100 Subject: [PATCH 0119/1201] keep mention.word always sorted, better error msg when parsing span --- udapi/core/coref.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 460230b8..59c87d18 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -49,7 +49,7 @@ def words(self, new_words): raise ValueError(f"Head {self.head} not in new_words") for old_word in self._words: old_word._mentions.remove(self) - self._words = new_words # TODO sorted + self._words = sorted(new_words) for new_word in new_words: new_word._mentions.append(self) @@ -190,10 +190,13 @@ def store_coref_to_misc(doc): def span_to_nodes(root, span): ranges = [] for span_str in span.split(','): - if '-' not in span_str: - lo = hi = float(span_str) - else: - lo, hi = (float(x) for x in span_str.split('-')) + try: + if '-' not in span_str: + lo = hi = float(span_str) + else: + lo, hi = (float(x) for x in span_str.split('-')) + except ValueError as e: + raise ValueError(f"Cannot parse '{span}': {e}") ranges.append((lo, hi)) def _num_in_ranges(num): From 
a0f677551e78f19852c1682f2a6f3c5e7a4afaf4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 6 Feb 2021 14:28:52 +0100 Subject: [PATCH 0120/1201] bug fix in parsing spans --- udapi/core/coref.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 59c87d18..a3777698 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -201,11 +201,11 @@ def span_to_nodes(root, span): def _num_in_ranges(num): for (lo, hi) in ranges: - if num > hi: + if num < lo: return False - if num >= lo: + if num <= hi: return True - return False + return False return [w for w in root.descendants_and_empty if _num_in_ranges(w.ord)] From 76ba743616125d84f48c63443eb5fdbe7d680861 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 6 Feb 2021 19:02:01 +0100 Subject: [PATCH 0121/1201] support for ClusterId[1] reading+writing support for multiple clusters in one node stored using layered attributes --- udapi/core/coref.py | 52 +++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index a3777698..613261d1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,4 +1,5 @@ """Classes for handling coreference.""" +import re class CorefMention(object): """Class for representing a mention (instance of an entity).""" @@ -148,43 +149,62 @@ def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): def load_coref_from_misc(doc): clusters = {} for node in doc.nodes: + index, index_str = 0, "" cluster_id = node.misc["ClusterId"] - if cluster_id: + if not cluster_id: + index, index_str = 1, "[1]" + cluster_id = node.misc["ClusterId[1]"] + while cluster_id: cluster = clusters.get(cluster_id) if cluster is None: cluster = CorefCluster(cluster_id) clusters[cluster_id] = cluster mention = CorefMention(node, cluster) - if node.misc["MentionSpan"]: - mention.span = node.misc["MentionSpan"] - cluster_type = node.misc["ClusterType"] + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + cluster_type = node.misc["ClusterType" + index_str] if cluster_type is not None: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type # TODO deserialize Bridging and SplitAnte - mention._bridging = node.misc["Bridging"] - cluster._split_ante = node.misc["SplitAnte"] + mention._bridging = node.misc["Bridging" + index_str] + cluster._split_ante = node.misc["SplitAnte" + index_str] + index += 1 + index_str = f"[{index}]" + cluster_id = node.misc["ClusterId" + index_str] doc._coref_clusters = clusters def store_coref_to_misc(doc): if not doc._coref_clusters: return + attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte") for node in doc.nodes: - del node.misc["ClusterId"] - del node.misc["MentionSpan"] - del node.misc["ClusterType"] - del node.misc["Bridging"] - del node.misc["SplitAnte"] + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] for cluster in doc._coref_clusters.values(): for mention in cluster.mentions: head = mention.head - head.misc["ClusterId"] = cluster.cluster_id - head.misc["MentionSpan"] = mention.span - head.misc["ClusterType"] = cluster.cluster_type - head.misc["Bridging"] = mention.bridging - head.misc["SplitAnte"] = cluster.split_ante + if 
head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = cluster.cluster_id + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = cluster.cluster_type + head.misc["Bridging" + index_str] = mention.bridging + head.misc["SplitAnte" + index_str] = cluster.split_ante def span_to_nodes(root, span): From ac3f69e0e656ac984e3a3ea9a2e7a2a4c3fe01ac Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 7 Feb 2021 21:17:21 +0100 Subject: [PATCH 0122/1201] bug fixes: node.ord is never a string now, it can be int, float or OrdTuple root.create_empty_child() should not sort the empty nodes because they may not have ord filled. In contrast, node.create_empty_child() should sort the empty nodes. --- udapi/core/node.py | 3 ++- udapi/core/root.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 239b250c..6c2ef03f 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -257,7 +257,7 @@ def deps(self): # Empty nodes have to be located differently than normal nodes. if '.' in head: try: - parent = next(x for x in self.root.empty_nodes if x.ord == head) + parent = next(x for x in self.root.empty_nodes if str(x.ord) == head) except StopIteration: raise ValueError(f'Empty node with ord={head} not found') else: @@ -412,6 +412,7 @@ def create_empty_child(self, deprel, after=True, **kwargs): new_ord = round(new_ord+0.1, 1) new_node.ord = new_ord self.root.empty_nodes.append(new_node) + self.root.empty_nodes.sort() return new_node # TODO: make private: _unordered_descendants diff --git a/udapi/core/root.py b/udapi/core/root.py index 06d018f4..2d76e9ea 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -153,7 +153,6 @@ def create_empty_child(self, **kwargs): """ new_node = EmptyNode(root=self, **kwargs) self.empty_nodes.append(new_node) - self.empty_nodes.sort() return new_node # TODO document whether misc is a string or dict or it can be both From 542707fb94047fe39977ea84307184bbd7cc312a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 7 Feb 2021 21:21:03 +0100 Subject: [PATCH 0123/1201] draft of ordering of CorefMention objects --- udapi/core/coref.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 613261d1..51cb7240 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,6 +1,8 @@ """Classes for handling coreference.""" import re +import functools +@functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" __slots__ = ['_head', '_cluster', '_bridging', '_words'] @@ -13,6 +15,25 @@ def __init__(self, head, cluster=None): self._bridging = None self._words = [] + def __lt__(self, other): + """Does this mention precedes (word-order wise) the `other` mention? + + This method defines a total ordering of all mentions + (within one cluster or across different clusters). + The position is primarily defined by the first word in each mention + (or by the head if mention.words are missing). + If two mentions start at the same word, + their order is defined by the last word in their span + -- the shorter mention precedes the longer one. 
+ """ + node1 = self._words[0] if self._words else self._head + node2 = other._words[0] if other._words else other._head + if node1 is node2: + node1 = self._words[-1] if self._words else self._head + node2 = other._words[-1] if other._words else other._head + return node1 > node2 + return node1 < node2 + @property def head(self): return self._head From b27a153d5298e5dab89f807dcf4360e37febab16 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 8 Feb 2021 14:11:01 +0100 Subject: [PATCH 0124/1201] write.TextModeTrees prints empty nodes by default --- udapi/block/write/textmodetrees.py | 21 +++++++++++++++------ udapi/core/node.py | 13 +++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index fd29e72e..f3dad456 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -132,7 +132,7 @@ class TextModeTrees(BaseWriter): def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', - print_undef_as='_', print_doc_meta=True, print_comments=False, + print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, mark='ToDo|ToDoOrigText|Bug|Mark', marked_only=False, hints=True, layout='classic', **kwargs): """Create new TextModeTrees block object. @@ -156,6 +156,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind print_undef_as: What should be printed instead of undefined attribute values (if any)? print_doc_meta: Print `document.meta` metadata before each document? print_comments: Print comments (other than sent_id and text)? + print_empty: Print empty nodes? mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. If `print_comments and re.search(r'^ (%s) = ' % mark, root.comment, re.M)` the comment is highlighted. 
@@ -178,6 +179,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_undef_as = print_undef_as self.print_doc_meta = print_doc_meta self.print_comments = print_comments + self.print_empty = print_empty self.mark = mark self.marked_only = marked_only self.layout = layout @@ -222,11 +224,11 @@ def _compute_gaps(self, node): self._gaps[node.ord] = rmost - lmost - descs return lmost, rmost, descs + 1 - def should_print_tree(self, root): + def should_print_tree(self, root, allnodes): """Should this tree be printed?""" if not self.marked_only: return True - if any(self.is_marked(n) for n in root.descendants(add_self=1)): + if any(self.is_marked(n) for n in allnodes): return True if not self.print_comments or root.comment is None or self.mark_re is None: return False @@ -234,8 +236,11 @@ def should_print_tree(self, root): def process_tree(self, root): """Print the tree to (possibly redirected) sys.stdout.""" - allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root): + if self.print_empty: + allnodes = [root] + root.descendants_and_empty + else: + allnodes = root.descendants(add_self=1) + if not self.should_print_tree(root, allnodes): return self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) @@ -281,7 +286,11 @@ def process_tree(self, root): if self.minimize_cross: stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) - if self.layout != 'classic': + if self.layout == 'classic': + for idx, node in enumerate(allnodes): + if node.is_empty(): + self.add_node(idx, node) + else: columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] for col_attrs in columns_attrs: self.attrs = col_attrs diff --git a/udapi/core/node.py b/udapi/core/node.py index 6c2ef03f..6dd6301f 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -433,6 +433,15 @@ def is_root(): """ return False + @staticmethod + def is_empty(): + """Is the current node an empty node? + + Returns False for all Node instances. + True is returned only by instances of the EmptyNode subclass. + """ + return False + def remove(self, children=None): """Delete this node and all its descendants. 
@@ -760,6 +769,10 @@ def create_coref_cluster(self, **kwargs): class EmptyNode(Node): """Class for representing empty nodes (for ellipsis in enhanced UD).""" + def is_empty(self): + """Return True for all EmptyNode instances.""" + return True + @property def parent(self): return None From 5f1497028e15a667d8d120269d66b5bc6d0ff6cb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 8 Feb 2021 17:07:17 +0100 Subject: [PATCH 0125/1201] 18% faster loading * l.append(x) is faster than l += [x] * l.sort(); l2=l; is faster than l2 = sorted(l) --- udapi/core/coref.py | 18 +++++++++++++++--- udapi/core/node.py | 9 +++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 51cb7240..b593b898 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -69,11 +69,23 @@ def words(self): def words(self, new_words): if new_words and self.head not in new_words: raise ValueError(f"Head {self.head} not in new_words") + kept_words = [] for old_word in self._words: - old_word._mentions.remove(self) - self._words = sorted(new_words) + if old_word in new_words: + kept_words.append(old_word) + else: + old_word._mentions.remove(self) + #for old_word in self._words: + # old_word._mentions.remove(self) + new_words.sort() + self._words = new_words for new_word in new_words: - new_word._mentions.append(self) + if new_word not in kept_words: + #new_word._mentions.append(self) + if new_word._mentions and self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions = sorted(new_word._mentions + [self]) @property def span(self): diff --git a/udapi/core/node.py b/udapi/core/node.py index 6dd6301f..34d06609 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -316,7 +316,11 @@ def parent(self, new_parent): self._parent = new_parent # Append the current node to the new parent children. - new_parent._children = sorted(new_parent.children + [self]) + if new_parent._children and self > new_parent._children[-1]: + new_parent._children.append(self) + else: + new_parent._children.append(self) + new_parent._children.sort() @property def children(self): @@ -893,7 +897,8 @@ def __call__(self, add_self=False, following_only=False, preceding_only=False): result = [x for x in result if x.ord <= self.origin.ord] if following_only: result = [x for x in result if x.ord >= self.origin.ord] - return sorted(result) + result.sort() + return result def find_minimal_common_treelet(*args): From 44c291b930fa591477c87457a17c0e76e6ee22ea Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 9 Feb 2021 09:53:07 +0100 Subject: [PATCH 0126/1201] huge speedup: "NewTreex" benchmark 40s -> 25s The main trick is to prevent creating new lists (and memory allocations) wherever possible. E.g. `node.children` creates a new `ListOfNodes` object with a copy of the list of children and `node.children(add_self=True)` creates one more copy (and the previous copy is thrown away for gc). Thus internally, we can call node._children (which currently does not guarantee sorted result), which creates no extra list. `node.children(add_self=True)` was changed so it creates just a single new list. Further speedup is possible in future. Very minor speedup is due to direct usage of attributes instead of overloaded properties, e.g. node._ord instead of node.ord. This is not worth the effort in user blocks, but internally in the core API it makes a (small) difference in total. 
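
[Illustration, not part of the patch series] The micro-optimizations cited in
PATCH 0125 above can be reproduced with the standard timeit module; absolute
numbers are machine-dependent, only the relative ordering matters:

    import timeit

    # l.append(x) avoids building a throwaway one-element list, unlike l += [x].
    print(timeit.timeit('l.append(1)', setup='l = []', number=10**6))
    print(timeit.timeit('l += [1]', setup='l = []', number=10**6))
    # sorted(l) allocates a new list on every call; l.sort() works in place.
    print(timeit.timeit('l2 = sorted(l)', setup='l = list(range(1000))', number=10**4))
    print(timeit.timeit('l.sort(); l2 = l', setup='l = list(range(1000))', number=10**4))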
--- udapi/core/block.py | 2 +- udapi/core/document.py | 2 +- udapi/core/node.py | 193 +++++++++++++++++++++-------------------- udapi/core/root.py | 2 +- 4 files changed, 103 insertions(+), 96 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index 3292866f..64b8bcc5 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -29,7 +29,7 @@ def process_node(self, _): def process_tree(self, tree): """Process a UD tree""" - for node in tree.descendants: + for node in tree._descendants: self.process_node(node) def process_bundle(self, bundle): diff --git a/udapi/core/document.py b/udapi/core/document.py index 36edb856..6bf2e55d 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -93,7 +93,7 @@ def nodes(self): """An iterator over all nodes in the document.""" for bundle in self: for tree in bundle: - for node in tree.descendants: + for node in tree._descendants: yield node def draw(self, **kwargs): diff --git a/udapi/core/node.py b/udapi/core/node.py index 34d06609..1bf118c7 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -121,7 +121,7 @@ def ord(self, new_ord): self._ord = new_ord def __lt__(self, other): - return self.ord < other.ord + return self._ord < other._ord @property def udeprel(self): @@ -220,7 +220,7 @@ def raw_deps(self): serialized_deps = [] for secondary_dependence in self._deps: serialized_deps.append('{}:{}'.format(secondary_dependence[ - 'parent'].ord, secondary_dependence['deprel'])) + 'parent']._ord, secondary_dependence['deprel'])) self._raw_deps = '|'.join(serialized_deps) return self._raw_deps @@ -243,7 +243,7 @@ def deps(self): """ if self._deps is None: # Obtain a list of all nodes in the dependency tree. - nodes = [self.root] + self.root.descendants() + nodes = [self._root] + self._root._descendants # Create a list of secondary dependencies. self._deps = list() @@ -257,7 +257,7 @@ def deps(self): # Empty nodes have to be located differently than normal nodes. if '.' in head: try: - parent = next(x for x in self.root.empty_nodes if str(x.ord) == head) + parent = next(x for x in self._root.empty_nodes if str(x._ord) == head) except StopIteration: raise ValueError(f'Empty node with ord={head} not found') else: @@ -286,7 +286,7 @@ def parent(self, new_parent): (from the list of original parent's children). """ # If the parent is already assigned, return. - if self.parent is new_parent: + if self._parent is new_parent: return # The node itself couldn't be assigned as a parent. None cannot be used as parent. @@ -301,14 +301,13 @@ def parent(self, new_parent): if climbing_node is self: raise ValueError('Setting the parent of %s to %s would lead to a cycle.' % (self, new_parent)) - climbing_node = climbing_node.parent + climbing_node = climbing_node._parent # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. if self._parent: - self._parent._children = [node for node in self.parent.children if node is not self] - # TODO: .root is currently computed, so it is quite slow - old_root, new_root = self._parent.root, climbing_node + self._parent._children = [node for node in self._parent._children if node is not self] + old_root, new_root = self._parent._root, climbing_node if old_root is not new_root: raise ValueError('Cannot move nodes between trees with parent setter, ' 'use new_root.steal_nodes(nodes_to_be_moved) instead') @@ -316,11 +315,12 @@ def parent(self, new_parent): self._parent = new_parent # Append the current node to the new parent children. 
- if new_parent._children and self > new_parent._children[-1]: - new_parent._children.append(self) - else: - new_parent._children.append(self) - new_parent._children.sort() + new_parent._children.append(self) +# if not new_parent._children or self > new_parent._children[-1]: +# new_parent._children.append(self) +# else: +# new_parent._children.append(self) +# new_parent._children.sort() @property def children(self): @@ -366,24 +366,24 @@ def descendants(self): nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] See documentation of ListOfNodes for details. """ - return ListOfNodes(sorted(self.unordered_descendants()), origin=self) + return ListOfNodes(self.unordered_descendants(), origin=self) def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" - climber = self.parent + climber = self._parent while climber: if climber is node: return True - climber = climber.parent + climber = climber._parent return False def create_child(self, **kwargs): """Create and return a new child of the current node.""" new_node = Node(root=self._root, **kwargs) - new_node.ord = len(self.root._descendants) + 1 - self.root._descendants.append(new_node) - self.children.append(new_node) - new_node.parent = self + new_node._ord = len(self._root._descendants) + 1 + self._root._descendants.append(new_node) + self._children.append(new_node) + new_node._parent = self return new_node def create_empty_child(self, deprel, after=True, **kwargs): @@ -398,32 +398,35 @@ def create_empty_child(self, deprel, after=True, **kwargs): in which case it will be `node.ord + 0.2` etc. If False, the new node will be placed immediately before `node`. """ - new_node = EmptyNode(root=self.root, **kwargs) + new_node = EmptyNode(root=self._root, **kwargs) new_node.deps = [{'parent': self, 'deprel': deprel}] # self.enh_children.append(new_node) TODO # new_node.enh_parents.append(self) TODO - base_ord = self.ord if after else self.ord - 1 + base_ord = self._ord if after else self._ord - 1 new_ord = base_ord + 0.1 - for empty in self.root.empty_nodes: - if empty.ord > new_ord: + for empty in self._root.empty_nodes: + if empty._ord > new_ord: break - if empty.ord == new_ord: + if empty._ord == new_ord: if isinstance(new_ord, OrdTuple): new_ord.increase() elif new_ord == base_ord + 0.9: new_ord = OrdTuple(base_ord, 10) else: new_ord = round(new_ord+0.1, 1) - new_node.ord = new_ord - self.root.empty_nodes.append(new_node) - self.root.empty_nodes.sort() + new_node._ord = new_ord + if not self._root.empty_nodes or new_node > self._root.empty_nodes[-1]: + self._root.empty_nodes.append(new_node) + else: + self._root.empty_nodes.append(new_node) + self._root.empty_nodes.sort() return new_node # TODO: make private: _unordered_descendants def unordered_descendants(self): """Return a list of all descendants in any order.""" descendants = [] - for child in self.children: + for child in self._children: descendants.append(child) descendants.extend(child.unordered_descendants()) return descendants @@ -456,40 +459,45 @@ def remove(self, children=None): `warn` means to issue a warning if any children are present and delete them. `rehang_warn` means to rehang and warn:-). 
""" - self.parent._children = [child for child in self.parent.children if child is not self] - if children is not None and self.children: + self._parent._children = [child for child in self._parent._children if child is not self] + if children is not None and self._children: if children.startswith('rehang'): - for child in self.children: - child.parent = self.parent + for child in self._children: + child.parent = self._parent # TODO child._parent = self._parent if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) - self.root._update_ordering() + self._root._update_ordering() # TODO: make private: _shift def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Internal method for changing word order.""" - nodes_to_move = [self] - if move_subtree: - nodes_to_move.extend(self.descendants()) - - reference_ord = reference_node.ord + nodes_to_move = self.unordered_descendants() + nodes_to_move.append(self) + else: + nodes_to_move = [self] + reference_ord = reference_node._ord if reference_subtree: - for node in [n for n in reference_node.descendants() if n is not self]: - if (after and node.ord > reference_ord) or (not after and node.ord < reference_ord): - reference_ord = node.ord + if after: + for node in reference_node.unordered_descendants(): + if node._ord > reference_ord and node is not self: + reference_ord = node._ord + else: + for node in reference_node.unordered_descendants(): + if node._ord < reference_ord and node is not self: + reference_ord = node._ord common_delta = 0.5 if after else -0.5 # TODO: can we use some sort of epsilon instead of choosing a silly # upper bound for out-degree? for node_to_move in nodes_to_move: - node_to_move.ord = reference_ord + common_delta + \ - (node_to_move.ord - self.ord) / 100000. + node_to_move._ord = reference_ord + common_delta + \ + (node_to_move._ord - self._ord) / 100000. - self.root._update_ordering() + self._root._update_ordering() # TODO add without_children kwarg def shift_after_node(self, reference_node): @@ -519,48 +527,48 @@ def shift_before_subtree(self, reference_node, without_children=0): @property def prev_node(self): """Return the previous node according to word order.""" - new_ord = self.ord - 1 + new_ord = self._ord - 1 if new_ord < 0: return None if new_ord == 0: - return self.root - return self.root._descendants[new_ord - 1] + return self._root + return self._root._descendants[new_ord - 1] @property def next_node(self): """Return the following node according to word order.""" # Note that all_nodes[n].ord == n+1 try: - return self.root._descendants[self.ord] + return self._root._descendants[self._ord] except IndexError: return None def precedes(self, node): """Does this node precedes another `node` in word order (`self.ord < node.ord`)?""" - return self.ord < node.ord + return self._ord < node._ord def is_leaf(self): """Is this node a leaf, ie. 
a node without any children?""" - return not self.children + return not self._children def _get_attr(self, name): # pylint: disable=too-many-return-statements if name == 'dir': - if self.parent.is_root(): + if self._parent.is_root(): return 'root' - return 'left' if self.precedes(self.parent) else 'right' + return 'left' if self.precedes(self._parent) else 'right' if name == 'edge': - if self.parent.is_root(): + if self._parent.is_root(): return 0 - return self.ord - self.parent.ord + return self._ord - self._parent._ord if name == 'children': - return len(self.children) + return len(self._children) if name == 'siblings': - return len(self.parent.children) - 1 + return len(self._parent._children) - 1 if name == 'depth': value = 0 tmp = self while not tmp.is_root(): - tmp = tmp.parent + tmp = tmp._parent value += 1 return value if name == 'feats_split': @@ -599,7 +607,7 @@ def get_attrs(self, attrs, undefs=None, stringify=True): for name in attrs: nodes = [self] if name.startswith('p_'): - nodes, name = [self.parent], name[2:] + nodes, name = [self._parent], name[2:] elif name.startswith('c_'): nodes, name = self.children, name[2:] elif name.startswith('l_'): @@ -641,8 +649,8 @@ def compute_text(self, use_mwt=True): for node in self.descendants(add_self=not self.is_root()): mwt = node.multiword_token if use_mwt and mwt: - if node.ord > last_mwt_id: - last_mwt_id = mwt.words[-1].ord + if node._ord > last_mwt_id: + last_mwt_id = mwt.words[-1]._ord string += mwt.form if mwt.misc['SpaceAfter'] != 'No': string += ' ' @@ -679,7 +687,7 @@ def address(self): e.g. s123/en_udpipe#4. If zone is empty, the slash is excluded as well, e.g. s123#4. """ - return '%s#%d' % (self.root.address() if self.root else '?', self.ord) + return '%s#%d' % (self._root.address() if self._root else '?', self._ord) @property def multiword_token(self): @@ -701,13 +709,13 @@ def is_nonprojective(self): and the total number of nodes in the span. """ # Root and its children are always projective - parent = self.parent + parent = self._parent if not parent or parent.is_root(): return False # Edges between neighboring nodes are always projective. # Check it now to make it a bit faster. - ord1, ord2 = self.ord, parent.ord + ord1, ord2 = self._ord, parent._ord if ord1 > ord2: ord1, ord2 = ord2, ord1 distance = ord2 - ord1 @@ -715,7 +723,7 @@ def is_nonprojective(self): return False # Get all the descendants of parent that are in the span of the edge. - span = [n for n in parent.descendants if n.ord > ord1 and n.ord < ord2] + span = [n for n in parent.unordered_descendants() if n._ord > ord1 and n._ord < ord2] # For projective edges, span must include all the nodes between parent and self. 
return len(span) != distance - 1 @@ -730,15 +738,15 @@ def is_nonprojective_gap(self): """ ancestors = set([self]) node = self - while node.parent: - node = node.parent + while node._parent: + node = node._parent ancestors.add(node) - all_nodes = node.descendants - for left_node in all_nodes[:self.ord - 1]: - if self.precedes(left_node.parent) and left_node.parent not in ancestors: + all_nodes = node._descendants + for left_node in all_nodes[:self._ord - 1]: + if self.precedes(left_node._parent) and left_node._parent not in ancestors: return True - for right_node in all_nodes[self.ord:]: - if right_node.parent.precedes(self) and right_node.parent not in ancestors: + for right_node in all_nodes[self._ord:]: + if right_node._parent.precedes(self) and right_node._parent not in ancestors: return True return False @@ -758,12 +766,12 @@ def gloss(self, new_gloss): @property def coref_mentions(self): - self.root.bundle.document._load_coref() + self._root.bundle.document._load_coref() return self._mentions @property def coref_clusters(self): - self.root.bundle.document._load_coref() + self._root.bundle.document._load_coref() return [m.cluster for m in self._mentions if m.cluster is not None] def create_coref_cluster(self, **kwargs): @@ -876,7 +884,7 @@ class ListOfNodes(list): nodes = node.children(add_self=True, following_only=True) """ - def __init__(self, iterable, origin): + def __init__(self, iterable, origin, skip_sort=False): """Create a new ListOfNodes. Args: @@ -884,21 +892,20 @@ def __init__(self, iterable, origin): origin: a node which is the parent/ancestor of these nodes """ super().__init__(iterable) + if not skip_sort: + self.sort() self.origin = origin def __call__(self, add_self=False, following_only=False, preceding_only=False): """Returns a subset of nodes contained in this list as specified by the args.""" - if not add_self and not following_only and not preceding_only: - return self - result = list(self) if add_self: - result.append(self.origin) + self.append(self.origin) + self.sort() if preceding_only: - result = [x for x in result if x.ord <= self.origin.ord] + return [x for x in self if x._ord <= self.origin._ord] if following_only: - result = [x for x in result if x.ord >= self.origin.ord] - result.sort() - return result + return [x for x in self if x._ord >= self.origin._ord] + return self def find_minimal_common_treelet(*args): @@ -917,7 +924,7 @@ def find_minimal_common_treelet(*args): """ nodes = list(args) # The input nodes are surely in the treelet, let's mark this with "1". - in_treelet = {node.ord: 1 for node in nodes} + in_treelet = {node._ord: 1 for node in nodes} # Step 1: Find a node (`highest`) which is governing all the input `nodes`. # It may not be the lowest such node, however. @@ -936,14 +943,14 @@ def find_minimal_common_treelet(*args): highest = None while len(nodes) > 1: node = nodes.pop(0) # TODO deque - parent = node.parent + parent = node._parent if parent is None: highest = node - elif in_treelet.get(parent.ord, False): - in_treelet[parent.ord] = 1 + elif in_treelet.get(parent._ord, False): + in_treelet[parent._ord] = 1 else: - new_nodes[parent.ord] = parent - in_treelet[parent.ord] = node + new_nodes[parent._ord] = parent + in_treelet[parent._ord] = node nodes.append(parent) # In most cases, `nodes` now contain just one node -- the one we were looking for. @@ -954,11 +961,11 @@ def find_minimal_common_treelet(*args): # If the `highest` node is unsure, climb down using poiners stored in `in_treelet`. 
# All such nodes which were rejected as true members of the minimal common treelet # must be deleted from the set of newly added nodes `new_nodes`. - child = in_treelet[highest.ord] + child = in_treelet[highest._ord] while child != 1: - del new_nodes[highest.ord] + del new_nodes[highest._ord] highest = child - child = in_treelet[highest.ord] + child = in_treelet[highest._ord] # We return the root of the minimal common treelet plus all the newly added nodes. return (highest, new_nodes.values()) diff --git a/udapi/core/root.py b/udapi/core/root.py index 2d76e9ea..7482d1e2 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -112,7 +112,7 @@ def descendants(self): The nodes are sorted by their ord. This root-specific implementation returns all the nodes in the tree except the root itself. """ - return ListOfNodes(self._descendants, origin=self) + return ListOfNodes(self._descendants, origin=self, skip_sort=True) def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument? From 4bb190883a6c19218fd8e54217a4bf70c694d7b6 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 9 Feb 2021 00:00:37 +0100 Subject: [PATCH 0127/1201] bugfix: if resegment==False, the text attribute should contain the whole text, not just the first sentence after segmentation --- udapi/tool/udpipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index c08785da..18f6b2ca 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -1,5 +1,6 @@ """Wrapper for UDPipe (more pythonic than ufal.udpipe).""" import io +import sys from ufal.udpipe import Model, Pipeline, ProcessingError, Sentence # pylint: disable=no-name-in-module from udapi.core.resource import require_file @@ -92,7 +93,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): for u_sentence in u_sentences: if not new_root: new_root = Root() - new_root.text = u_sentence.getText() + new_root.text = u_sentence.getText() if resegment else root.text heads, nodes = [], [new_root] u_words = u_sentence.words for i in range(1, u_words.size()): From bac4e88f08b3b4979a28a5827a5da28eb8ab1aa7 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 9 Feb 2021 11:37:45 +0100 Subject: [PATCH 0128/1201] minor speedup --- udapi/core/coref.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index b593b898..e5f3d3d5 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -75,17 +75,15 @@ def words(self, new_words): kept_words.append(old_word) else: old_word._mentions.remove(self) - #for old_word in self._words: - # old_word._mentions.remove(self) new_words.sort() self._words = new_words for new_word in new_words: if new_word not in kept_words: - #new_word._mentions.append(self) - if new_word._mentions and self > new_word._mentions[-1]: + if not new_word._mentions or self > new_word._mentions[-1]: new_word._mentions.append(self) else: - new_word._mentions = sorted(new_word._mentions + [self]) + new_word._mentions.append(self) + new_word._mentions.sort() @property def span(self): From 2089000fb8f9d83271544b42eeae36572d0497a1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 9 Feb 2021 23:41:11 +0100 Subject: [PATCH 0129/1201] keep node._children always sorted This makes the NewTreex benchmark slightly slower, but this is due to the fact it almost never reads node.children, while it often changes the list of children (load, add, remove, rehang, reorder). 
We can make the loading faster in future.
---
 udapi/core/node.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/udapi/core/node.py b/udapi/core/node.py
index 1bf118c7..08eca6bc 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -315,12 +315,11 @@ def parent(self, new_parent):
         self._parent = new_parent
 
         # Append the current node to the new parent children.
-        new_parent._children.append(self)
-#        if not new_parent._children or self > new_parent._children[-1]:
-#            new_parent._children.append(self)
-#        else:
-#            new_parent._children.append(self)
-#            new_parent._children.sort()
+        if not new_parent._children or self > new_parent._children[-1]:
+            new_parent._children.append(self)
+        else:
+            new_parent._children.append(self)
+            new_parent._children.sort()
 
     @property
     def children(self):
@@ -343,7 +342,7 @@ def children(self):
             nodes4 = [n for n in node.children if n.ord < node.ord] + [node]
         See documentation of ListOfNodes for details.
         """
-        return ListOfNodes(self._children, origin=self)
+        return ListOfNodes(self._children, origin=self, skip_sort=True)
 
     @property
     def descendants(self):
@@ -862,6 +861,12 @@ def increase(self):
         self._key = (self.key[0], self._key[1]+1)
 
 
+# Implementation note on ListOfNodes
+# We could inherit from collections.abc.Sequence, store the list in self._data
+# and implement __getitem__ and __len__ by delegating it to self._data.
+# I thought it could be faster because we prevent copying of the list in super().__init__(iterable).
+# In practice, it is slower because of the delegation: native list's __getitem__ is C-optimized.
+# So let's just inherit from list.
 class ListOfNodes(list):
     """Helper class for results of node.children and node.descendants.
 
@@ -883,6 +888,7 @@ class ListOfNodes(list):
         nodes = node.children()
         nodes = node.children(add_self=True, following_only=True)
     """
+    __slots__ = ('origin',)
 
     def __init__(self, iterable, origin, skip_sort=False):
         """Create a new ListOfNodes.
@@ -890,6 +896,7 @@ def __init__(self, iterable, origin, skip_sort=False):
 
         Args:
             iterable: a list of nodes
             origin: a node which is the parent/ancestor of these nodes
+            skip_sort: is the data already sorted?
         """
         super().__init__(iterable)
         if not skip_sort:

From 0cb07cdb22551783a5c92934a2913ea43acb2fe2 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Feb 2021 01:28:35 +0100
Subject: [PATCH 0130/1201] minor speedup by preventing recursion in
 unordered_descendants()

Implementing it as a generator was actually a bit slower, both using
`yield from child.unordered_descendants()` and using a stack and
yielding always a single node.
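
For illustration, the stack-based traversal that this commit settles on can be
sketched outside udapi as follows; the minimal Node class below is hypothetical
and mimics only the `_children` attribute of udapi nodes:

    class Node:
        """Toy stand-in for udapi.core.node.Node (only _children is mimicked)."""
        def __init__(self):
            self._children = []

    def unordered_descendants(node):
        """Collect all descendants without recursion, using an explicit stack."""
        stack = list(node._children)
        descendants = list(stack)
        while stack:
            n = stack.pop()
            if n._children:
                stack.extend(n._children)
                descendants.extend(n._children)
        return descendants

    # Tiny check: root -> a -> c and root -> b gives three descendants.
    root, a, b, c = Node(), Node(), Node(), Node()
    root._children, a._children = [a, b], [c]
    assert set(unordered_descendants(root)) == {a, b, c}

Each node enters `descendants` exactly once: either in the initial copy of the
root's children or when its parent is popped from the stack.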
--- udapi/core/node.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 08eca6bc..b4c60559 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -424,10 +424,13 @@ def create_empty_child(self, deprel, after=True, **kwargs): # TODO: make private: _unordered_descendants def unordered_descendants(self): """Return a list of all descendants in any order.""" - descendants = [] - for child in self._children: - descendants.append(child) - descendants.extend(child.unordered_descendants()) + stack = list(self._children) + descendants = list(stack) + while(stack): + n = stack.pop() + if n._children: + stack.extend(n._children) + descendants.extend(n._children) return descendants @staticmethod From cb5c8808e09907de61079961eeb1bce2b37a41b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Feb 2021 03:07:00 +0100 Subject: [PATCH 0131/1201] minor speedup: node.descendants builds ListOfNodes to prevent one list copying Thus no skip_sort is needed in ListOfNodes' __init__. --- udapi/core/node.py | 17 +++++++++++------ udapi/core/root.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index b4c60559..caa95129 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -342,7 +342,7 @@ def children(self): nodes4 = [n for n in node.children if n.ord < node.ord] + [node] See documentation of ListOfNodes for details. """ - return ListOfNodes(self._children, origin=self, skip_sort=True) + return ListOfNodes(self._children, origin=self) @property def descendants(self): @@ -365,7 +365,15 @@ def descendants(self): nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] See documentation of ListOfNodes for details. """ - return ListOfNodes(self.unordered_descendants(), origin=self) + stack = list(self._children) + descendants = ListOfNodes(stack, origin=self) + while(stack): + n = stack.pop() + if n._children: + stack.extend(n._children) + descendants.extend(n._children) + descendants.sort() + return descendants def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" @@ -893,17 +901,14 @@ class ListOfNodes(list): """ __slots__ = ('origin',) - def __init__(self, iterable, origin, skip_sort=False): + def __init__(self, iterable, origin): """Create a new ListOfNodes. Args: iterable: a list of nodes origin: a node which is the parent/ancestor of these nodes - skip_sort: is the data already sorted? """ super().__init__(iterable) - if not skip_sort: - self.sort() self.origin = origin def __call__(self, add_self=False, following_only=False, preceding_only=False): diff --git a/udapi/core/root.py b/udapi/core/root.py index 7482d1e2..2d76e9ea 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -112,7 +112,7 @@ def descendants(self): The nodes are sorted by their ord. This root-specific implementation returns all the nodes in the tree except the root itself. """ - return ListOfNodes(self._descendants, origin=self, skip_sort=True) + return ListOfNodes(self._descendants, origin=self) def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument? From d701452314d04471d856e0f582f4b4cb90519d0a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Feb 2021 04:28:51 +0100 Subject: [PATCH 0132/1201] inline root._update_ordering() It was used only in node.remove() and node.shift() and in both cases it could be optimized. 
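
The renumbering that replaces root._update_ordering() is just a sort followed by
an enumerate. A minimal sketch of the idiom, using a hypothetical node type
(real udapi nodes compare by word order natively, so no sort key would be needed):

    class N:
        __slots__ = ('ord',)
        def __init__(self, ord_):
            self.ord = ord_

    descendants = [N(3), N(1), N(7)]           # out of order, with gaps
    descendants.sort(key=lambda n: n.ord)
    for new_ord, node in enumerate(descendants, 1):
        node.ord = new_ord                     # renumber to 1, 2, 3 without gaps
    assert [n.ord for n in descendants] == [1, 2, 3]

Inlining this at the two call sites lets later commits specialize each copy,
e.g. renumbering only the tail of the list after a removed node.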
--- udapi/core/node.py | 11 +++++++++-- udapi/core/root.py | 10 ---------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index caa95129..0471dd30 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -477,7 +477,11 @@ def remove(self, children=None): if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) - self._root._update_ordering() + + self._root._descendants = sorted(self._root.unordered_descendants()) + for (new_ord, node) in enumerate(self._root._descendants, 1): + node.ord = new_ord + # TODO: make private: _shift def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): @@ -507,7 +511,10 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): node_to_move._ord = reference_ord + common_delta + \ (node_to_move._ord - self._ord) / 100000. - self._root._update_ordering() + self._root._descendants.sort() + for (new_ord, node) in enumerate(self._root._descendants, 1): + node.ord = new_ord + # TODO add without_children kwarg def shift_after_node(self, reference_node): diff --git a/udapi/core/root.py b/udapi/core/root.py index 2d76e9ea..3fbe5fca 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -180,16 +180,6 @@ def multiword_tokens(self, mwts): """Set the list of all multi-word tokens in this tree.""" self._mwts = mwts - def _update_ordering(self): - """Update the ord attribute of all nodes. - - Update also the list of all tree nodes stored in root._descendants. - This method is automatically called after node removal or reordering. - """ - self._descendants = sorted(self.unordered_descendants()) - for (new_ord, node) in enumerate(self._descendants, 1): - node.ord = new_ord - def get_sentence(self, if_missing='detokenize'): """Return either the stored `root.text` or (if None) `root.compute_text()`. From ee1c4a32734a88a27df62a47cb90c10ebd619e6e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Feb 2021 05:00:44 +0100 Subject: [PATCH 0133/1201] speedup: when node.remove() deletes just node --- udapi/core/node.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 0471dd30..ea2b11d0 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -478,9 +478,21 @@ def remove(self, children=None): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) - self._root._descendants = sorted(self._root.unordered_descendants()) - for (new_ord, node) in enumerate(self._root._descendants, 1): - node.ord = new_ord + # When self is the only node being removed, it is faster to root._descendants.remove(self) + # and update the ords only where necessary (from self._ord further). + # When removing also its children+descendants, it is faster to recompute root._descendants + # and update all ords (computing leftmost descendant of self would be too slow). + if not self._children: + try: + self._root._descendants.remove(self) + except ValueError: + pass # self may be an already deleted node e.g. 
if n.remove() called twice + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + else: + self._root._descendants = sorted(self._root.unordered_descendants()) + for (new_ord, node) in enumerate(self._root._descendants, 1): + node.ord = new_ord # TODO: make private: _shift From 34ea72b7a7e3677e74f8658e232e70eac3b9cd3b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Feb 2021 09:55:32 +0100 Subject: [PATCH 0134/1201] 10% faster loading of CoNLL-U files --- udapi/block/read/conllu.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 83f7a1c3..e9adbae1 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -5,6 +5,7 @@ from udapi.core.basereader import BaseReader from udapi.core.root import Root +from udapi.core.node import Node # Compile a set of regular expressions that will be searched over the lines. # The equal sign after sent_id was added to the specification in UD v2.0. @@ -131,7 +132,8 @@ def read_tree(self): empty.raw_deps = fields[8] # TODO continue - node = root.create_child() + node = Node(root=root) + root._descendants.append(node) # TODO slow implementation of speed-critical loading for (n_attribute, attribute_name) in enumerate(self.node_attributes): @@ -170,22 +172,31 @@ def read_tree(self): root._descendants = [] # Set dependency parents (now, all nodes of the tree are created). - # TODO: parent setter checks for cycles, but this is something like O(n*log n) - # if done for each node. It could be done faster if the whole tree is checked at once. - # Also parent setter removes the node from its old parent's list of children, - # this could be skipped here by not using `node = root.create_child()`. for node_ord, node in enumerate(nodes[1:], 1): try: - node.parent = nodes[parents[node_ord]] - # TODO add a special Exception class for cycles - except ValueError as e: - if self.fix_cycles: - logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", e) - node.parent = root - else: - raise + parent = nodes[parents[node_ord]] except IndexError: raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) + node._parent = root + root._children.append(node) + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node.children: + climbing = parent._parent + while climbing: + if climbing == node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) # Create multi-word tokens. for fields in mwts: From 1ada00a6e51d372663ded6a0886c029317cc4c18 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 02:10:51 +0100 Subject: [PATCH 0135/1201] explanatory comment --- udapi/core/node.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index ea2b11d0..2cfdb5c4 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -365,6 +365,9 @@ def descendants(self): nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] See documentation of ListOfNodes for details. 
""" + # The following code is equivalent to + # ListOfNodes(sorted(self.unordered_descendants()), origin=self) + # but it is faster because there is no extra copying of lists of nodes. stack = list(self._children) descendants = ListOfNodes(stack, origin=self) while(stack): From 15ed2bafbb04ed29bfc5cbc54a164ed43a26fa67 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 04:09:31 +0100 Subject: [PATCH 0136/1201] make reordering (node.shift*) 50% faster --- udapi/core/node.py | 71 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 2cfdb5c4..65d050a2 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -502,8 +502,7 @@ def remove(self, children=None): def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Internal method for changing word order.""" if move_subtree: - nodes_to_move = self.unordered_descendants() - nodes_to_move.append(self) + nodes_to_move = self.descendants(add_self=True) else: nodes_to_move = [self] @@ -518,17 +517,65 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): if node._ord < reference_ord and node is not self: reference_ord = node._ord - common_delta = 0.5 if after else -0.5 - - # TODO: can we use some sort of epsilon instead of choosing a silly - # upper bound for out-degree? - for node_to_move in nodes_to_move: - node_to_move._ord = reference_ord + common_delta + \ - (node_to_move._ord - self._ord) / 100000. + # convert shift_after_node to shift_before_node + reference_ord += 1 if after else 0 + + first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord + all_nodes = self._root._descendants + + # If there are no "gaps" in nodes_to_move (e.g. when it is projective), + # we can make the shifting a bit faster and simpler. + if last_ord - first_ord + 1 == len(nodes_to_move): + # First, move a node from position src_ord to position trg_ord RIGHT-ward. + trg_ord, src_ord = last_ord, first_ord - 1 + while src_ord >= reference_ord: + all_nodes[trg_ord - 1] = all_nodes[src_ord - 1] + all_nodes[trg_ord-1]._ord = trg_ord + trg_ord, src_ord = trg_ord - 1, src_ord - 1 + # Second, move a node from position src_ord to position trg_ord LEFT-ward. + trg_ord, src_ord = first_ord, last_ord + 1 + while src_ord < reference_ord: + all_nodes[trg_ord - 1] = all_nodes[src_ord - 1] + all_nodes[trg_ord - 1]._ord = trg_ord + trg_ord, src_ord = trg_ord + 1, src_ord + 1 + # Third, move nodes_to_move to trg_ord RIGHT-ward. + trg_ord = reference_ord if reference_ord < first_ord else trg_ord + for node in nodes_to_move: + all_nodes[trg_ord - 1], node._ord = node, trg_ord + trg_ord += 1 + return - self._root._descendants.sort() - for (new_ord, node) in enumerate(self._root._descendants, 1): - node.ord = new_ord + # First, move a node from position src_ord to position trg_ord RIGHT-ward. + # src_ord iterates decreasingly over nodes which are not moving. + trg_ord, src_ord, mov_ord = last_ord, last_ord - 1, len(nodes_to_move) - 2 + while src_ord >= reference_ord: + while all_nodes[src_ord - 1] is nodes_to_move[mov_ord]: + mov_ord, src_ord = mov_ord - 1, src_ord - 1 + if src_ord < reference_ord: + break + else: + all_nodes[trg_ord - 1] = all_nodes[src_ord - 1] + all_nodes[trg_ord - 1]._ord = trg_ord + trg_ord, src_ord = trg_ord - 1, src_ord - 1 + + # Second, move a node from position src_ord to position trg_ord LEFT-ward. + # src_ord iterates increasingly over nodes which are not moving. 
+ trg_ord, src_ord, mov_ord = first_ord, first_ord + 1, 1 + while src_ord < reference_ord: + while mov_ord < len(nodes_to_move) and all_nodes[src_ord - 1] is nodes_to_move[mov_ord]: + mov_ord, src_ord = mov_ord + 1, src_ord + 1 + if src_ord >= reference_ord: + break + else: + all_nodes[trg_ord - 1] = all_nodes[src_ord - 1] + all_nodes[trg_ord - 1]._ord = trg_ord + trg_ord, src_ord = trg_ord + 1, src_ord + 1 + + # Third, move nodes_to_move to trg_ord RIGHT-ward. + trg_ord = reference_ord if reference_ord < first_ord else trg_ord + for node in nodes_to_move: + all_nodes[trg_ord - 1], node._ord = node, trg_ord + trg_ord += 1 # TODO add without_children kwarg From 1fd8d643c28c79d15d6eb2f7e7f9cd17cb977ce9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 04:52:18 +0100 Subject: [PATCH 0137/1201] faster parent setter --- udapi/core/node.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 65d050a2..fa5aa095 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -289,26 +289,19 @@ def parent(self, new_parent): if self._parent is new_parent: return - # The node itself couldn't be assigned as a parent. None cannot be used as parent. - if self is new_parent: - raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s' % self) + # Check for None new_parent and cycles. if new_parent is None: - raise ValueError('Cannot set None as parent: %s' % self) - - # Check if the current Node is not an antecedent of the new parent. - climbing_node = new_parent - while not climbing_node.is_root(): - if climbing_node is self: - raise ValueError('Setting the parent of %s to %s would lead to a cycle.' - % (self, new_parent)) - climbing_node = climbing_node._parent + raise ValueError(f'Cannot set None as parent: {self}') + if self is new_parent: + raise ValueError(f'Cannot set a node as its own parent (cycle are forbidden): {self}') + if self._children and new_parent.is_descendant_of(self): + raise ValueError(f'Setting the parent of {self} to {new_parent} would lead to a cycle.') # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. if self._parent: - self._parent._children = [node for node in self._parent._children if node is not self] - old_root, new_root = self._parent._root, climbing_node - if old_root is not new_root: + self._parent._children.remove(self) + if self._parent._root is not new_parent._root: raise ValueError('Cannot move nodes between trees with parent setter, ' 'use new_root.steal_nodes(nodes_to_be_moved) instead') # Set the new parent. 
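
Both this setter and the loading code rely on the append-or-sort idiom
introduced in PATCH 0129 for keeping node._children sorted. A sketch of the
idiom on plain integers; bisect.insort would be an alternative, at the price of
shifting list elements on every out-of-order insert:

    def add_keeping_sorted(items, new):
        # Appending at the end (the common case when nodes arrive in word
        # order) is O(1); otherwise append and re-sort, which Timsort
        # handles quickly on nearly sorted lists.
        if not items or new > items[-1]:
            items.append(new)
        else:
            items.append(new)
            items.sort()

    children = []
    for ord_ in (1, 2, 5, 3):
        add_keeping_sorted(children, ord_)
    assert children == [1, 2, 3, 5]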
From 2813afbc6abbbd17f5177b174bafcfd9a9457bdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 08:28:05 +0100 Subject: [PATCH 0138/1201] refactor shift* methods (and rename internal shift() to _shift_before_ord()) --- udapi/core/node.py | 48 ++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index fa5aa095..b5febc03 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -490,28 +490,12 @@ def remove(self, children=None): for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord - - # TODO: make private: _shift - def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): + def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" - if move_subtree: - nodes_to_move = self.descendants(add_self=True) - else: + if without_children: nodes_to_move = [self] - - reference_ord = reference_node._ord - if reference_subtree: - if after: - for node in reference_node.unordered_descendants(): - if node._ord > reference_ord and node is not self: - reference_ord = node._ord - else: - for node in reference_node.unordered_descendants(): - if node._ord < reference_ord and node is not self: - reference_ord = node._ord - - # convert shift_after_node to shift_before_node - reference_ord += 1 if after else 0 + else: + nodes_to_move = self.descendants(add_self=True) first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord all_nodes = self._root._descendants @@ -570,23 +554,25 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 - - # TODO add without_children kwarg - def shift_after_node(self, reference_node): + def shift_after_node(self, reference_node, without_children=False): """Shift this node after the reference_node.""" - self.shift(reference_node, after=1, move_subtree=1, reference_subtree=0) + self._shift_before_ord(reference_node._ord + 1, without_children=without_children) - def shift_before_node(self, reference_node): + def shift_before_node(self, reference_node, without_children=False): """Shift this node after the reference_node.""" - self.shift(reference_node, after=0, move_subtree=1, reference_subtree=0) + self._shift_before_ord(reference_node._ord, without_children=without_children) - def shift_after_subtree(self, reference_node, without_children=0): + def shift_after_subtree(self, reference_node, without_children=False): """Shift this node (and its subtree) after the subtree rooted by reference_node. Args: without_children: shift just this node without its subtree? """ - self.shift(reference_node, after=1, move_subtree=not without_children, reference_subtree=1) + ref_ord = reference_node._ord + for node in reference_node.unordered_descendants(): + if node._ord > ref_ord and node is not self: + ref_ord = node._ord + self._shift_before_ord(ref_ord + 1, without_children=without_children) def shift_before_subtree(self, reference_node, without_children=0): """Shift this node (and its subtree) before the subtree rooted by reference_node. @@ -594,7 +580,11 @@ def shift_before_subtree(self, reference_node, without_children=0): Args: without_children: shift just this node without its subtree? 
""" - self.shift(reference_node, after=0, move_subtree=not without_children, reference_subtree=1) + ref_ord = reference_node._ord + for node in reference_node.unordered_descendants(): + if node._ord < ref_ord and node is not self: + ref_ord = node._ord + self._shift_before_ord(ref_ord, without_children=without_children) @property def prev_node(self): From b9e755e14efced9ed2befebf7de650ba26059475 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 08:49:11 +0100 Subject: [PATCH 0139/1201] faster node.remove() --- udapi/core/node.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index b5febc03..b21643f5 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -465,11 +465,14 @@ def remove(self, children=None): `warn` means to issue a warning if any children are present and delete them. `rehang_warn` means to rehang and warn:-). """ - self._parent._children = [child for child in self._parent._children if child is not self] + self._parent._children.remove(self) if children is not None and self._children: if children.startswith('rehang'): for child in self._children: - child.parent = self._parent # TODO child._parent = self._parent + child._parent = self._parent + self._parent._children.extend(self._children) + self._parent._children.sort() + self._children.clear() if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) From 3409ecb17bf6b99c5c3d8b327e0b6d0bde59e74f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 09:50:05 +0100 Subject: [PATCH 0140/1201] faster reordering of a single node, check reference_node.is_descendant_of(self) --- udapi/core/node.py | 59 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index b21643f5..6cbf5471 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -373,11 +373,12 @@ def descendants(self): def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" - climber = self._parent - while climber: - if climber is node: - return True - climber = climber._parent + if node._children: + climber = self._parent + while climber: + if climber is node: + return True + climber = climber._parent return False def create_child(self, **kwargs): @@ -495,13 +496,27 @@ def remove(self, children=None): def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" - if without_children: - nodes_to_move = [self] - else: - nodes_to_move = self.descendants(add_self=True) + all_nodes = self._root._descendants + # Moving a single node can be faster than nodes_to_move = [self] + if without_children or not self._children: + my_ord = self._ord + if reference_ord > my_ord + 1: + for i_ord in range(my_ord, reference_ord - 1): + all_nodes[i_ord - 1] = all_nodes[i_ord] + all_nodes[i_ord - 1]._ord = i_ord + all_nodes[reference_ord - 2] = self + self._ord = reference_ord - 1 + elif reference_ord < my_ord: + for i_ord in range(my_ord, reference_ord, -1): + all_nodes[i_ord - 1] = all_nodes[i_ord - 2] + all_nodes[i_ord - 1]._ord = i_ord + all_nodes[reference_ord - 1] = self + self._ord = reference_ord + return + + nodes_to_move = self.descendants(add_self=True) first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord - all_nodes = self._root._descendants # If there are no "gaps" in nodes_to_move (e.g. 
when it is projective), # we can make the shifting a bit faster and simpler. @@ -557,32 +572,48 @@ def _shift_before_ord(self, reference_ord, without_children=False): all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 - def shift_after_node(self, reference_node, without_children=False): + def shift_after_node(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node after the reference_node.""" + if not without_children and reference_node.is_descendant_of(self): + if skip_if_descendant: + return + raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.') self._shift_before_ord(reference_node._ord + 1, without_children=without_children) - def shift_before_node(self, reference_node, without_children=False): + def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node after the reference_node.""" + if not without_children and reference_node.is_descendant_of(self): + if skip_if_descendant: + return + raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.') self._shift_before_ord(reference_node._ord, without_children=without_children) - def shift_after_subtree(self, reference_node, without_children=False): + def shift_after_subtree(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node (and its subtree) after the subtree rooted by reference_node. Args: without_children: shift just this node without its subtree? """ + if not without_children and reference_node.is_descendant_of(self): + if skip_if_descendant: + return + raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.') ref_ord = reference_node._ord for node in reference_node.unordered_descendants(): if node._ord > ref_ord and node is not self: ref_ord = node._ord self._shift_before_ord(ref_ord + 1, without_children=without_children) - def shift_before_subtree(self, reference_node, without_children=0): + def shift_before_subtree(self, reference_node, without_children=0, skip_if_descendant=False): """Shift this node (and its subtree) before the subtree rooted by reference_node. Args: without_children: shift just this node without its subtree? """ + if not without_children and reference_node.is_descendant_of(self): + if skip_if_descendant: + return + raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.') ref_ord = reference_node._ord for node in reference_node.unordered_descendants(): if node._ord < ref_ord and node is not self: From 6a22df0bc63f6b94db517b87f3e789ebf2d928f7 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 18:01:58 +0100 Subject: [PATCH 0141/1201] read.Conllu is not faster, but supports just standard CoNLL-U For loading CoNLL-X and CoNLL-2009 and other CoNLL-like formats, users need to use a new subclass: read.Conll. 
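
Assuming the parameters described in the docstring below, a CoNLL-X file could
be converted to (approximate) CoNLL-U from the command line, in the same style
as the udapy examples used elsewhere in this series; the file names here are
placeholders:

    $> udapy read.Conll files=input.conllx \
             attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_ \
             write.Conllu > output.conllu

As the docstring warns, the resulting upos, feats and deprel values remain
language-specific, so a further conversion step is still needed for valid UD.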
---
 udapi/block/read/conll.py  |  99 ++++++++++++++++++++++++++++++
 udapi/block/read/conllu.py | 120 ++++++++++++++-----------------------
 2 files changed, 144 insertions(+), 75 deletions(-)
 create mode 100644 udapi/block/read/conll.py

diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py
new file mode 100644
index 00000000..7591f924
--- /dev/null
+++ b/udapi/block/read/conll.py
@@ -0,0 +1,99 @@
+""""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009)."""
+import json
+import logging
+import re
+
+import udapi.block.read.conllu
+from udapi.core.root import Root
+from udapi.core.node import Node
+
+
+class Conll(udapi.block.read.conllu.Conllu):
+    """A reader of the CoNLL-U files."""
+
+    def __init__(self, separator='tab',
+                 attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs):
+        """Create the Conll reader object.
+
+        This is a subclass of udapi.block.read.conllu.Conllu,
+        which adds support for arbitrary column names, thus supporting not only CoNLL-U,
+        but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats.
+
+        Args:
+        separator: How are the columns separated?
+            Default='tab' is the only possibility in valid CoNLL-U files.
+            'space' means one or more whitespaces (this does not allow forms with space).
+            'doublespace' means two or more spaces.
+        attributes: comma-separated list of column names in the input files
+            (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc')
+            Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U).
+            For ignoring a column, use "_" as its name.
+            Column "ord" marks the column with 1-based word-order number/index (usually called ID).
+            Column "head" marks the column with dependency parent index (word-order number).
+
+            For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use
+            `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_`
+            but note that attributes upos, feats and deprel will contain language-specific values,
+            not valid according to UD guidelines and a further conversion will be needed.
+            You will lose the projective_HEAD and projective_DEPREL attributes.
+
+            For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`.
+            You will lose the predicted_* attributes and semantic/predicate annotation.
+
+            TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]`
+            for feats which do not use the name1=value1|name2=value2 format.
+        """
+        super().__init__(**kwargs)
+        self.node_attributes = attributes.split(',')
+        self.separator = separator
+
+    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
+    # Maybe the code could be refactored, but it is speed-critical,
+    # so benchmarking is needed because calling extra methods may result in slowdown.
+
+    def parse_node_line(self, line, root, nodes, parents, mwts):
+        if self.separator == 'tab':
+            fields = line.split('\t')
+        elif self.separator == 'space':
+            fields = line.split()
+        elif self.separator == 'doublespace':
+            fields = re.split(' +', line)
+        else:
+            raise ValueError('separator=%s is not valid' % self.separator)
+        if len(fields) != len(self.node_attributes):
+            if self.strict:
+                raise RuntimeError('Wrong number of columns in %r' % line)
+            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
+        # multi-word tokens will be processed later
+        if '-' in fields[0]:
+            mwts.append(fields)
+            return
+        if '.'
in fields[0]: + empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], misc=fields[9]) + empty.ord = fields[0] + empty.raw_deps = fields[8] # TODO + return + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + if attribute_name == 'head': + try: + parents.append(int(fields[n_attribute])) + except ValueError as exception: + if not self.strict and fields[n_attribute] == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + setattr(node, 'ord', int(fields[n_attribute])) + elif attribute_name == 'deps': + setattr(node, 'raw_deps', fields[n_attribute]) + elif attribute_name != '_': + setattr(node, attribute_name, fields[n_attribute]) + + nodes.append(node) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index e9adbae1..698fd0b6 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -19,41 +19,17 @@ class Conllu(BaseReader): """A reader of the CoNLL-U files.""" - def __init__(self, strict=False, separator='tab', empty_parent='warn', fix_cycles=False, - attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): + def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs): """Create the Conllu reader object. Args: strict: raise an exception if errors found (default=False, i.e. a robust mode) - separator: How are the columns separated? - Default='tab' is the only possibility in valid CoNLL-U files. - 'space' means one or more whitespaces (this does not allow forms with space). - 'doublespace' means two or more spaces. - empty_parent: What to do if HEAD is _? Default=warn - issue a warning and attach to the root + empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. - attributes: comma-separated list of column names in the input files - (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc') - Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U). - For ignoring a column, use "_" as its name. - Column "ord" marks the column with 1-based word-order number/index (usualy called ID). - Column "head" marks the column with dependency parent index (word-order number). - - For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use - `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_` - but note attributes that upos, feats and deprel will contain language-specific values, - not valid according to UD guidelines and a further conversion will be needed. - You will loose the projective_HEAD and projective_DEPREL attributes. - - For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`. - You will loose the predicted_* attributes and semantic/predicate annotation. - - TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]` - for feats which do not use the name1=value1|name2=value2 format. 
+ fix_cycles: fix cycles by attaching a node in the cycle to the root """ super().__init__(**kwargs) - self.node_attributes = attributes.split(',') self.strict = strict - self.separator = separator self.empty_parent = empty_parent self.fix_cycles = fix_cycles @@ -88,9 +64,48 @@ def parse_comment_line(line, root): container = root.json['__doc__'] container[json_match.group(2)] = json.loads(json_match.group(3)) return - root.comment += line[1:] + "\n" + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != 10: + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (10 - len(fields))) + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' in fields[0]: + empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], misc=fields[9]) + empty.ord = fields[0] + empty.raw_deps = fields[8] # TODO + return + + for i in range(1, 10): + if fields[i] == '_': + fields[i] = None + + # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc + node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9]) + root._descendants.append(node) + node._ord = int(fields[0]) + if fields[8] is not None: + node.raw_deps = fields[8] + try: + parents.append(int(fields[6])) + except ValueError as exception: + if not self.strict and fields[6] == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + + nodes.append(node) + # pylint: disable=too-many-locals,too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, # so benchmarking is needed because calling extra methods may result in slowdown. @@ -109,52 +124,7 @@ def read_tree(self): if line[0] == '#': self.parse_comment_line(line, root) else: - if self.separator == 'tab': - fields = line.split('\t') - elif self.separator == 'space': - fields = line.split() - elif self.separator == 'doublespace': - fields = re.split(' +', line) - else: - raise ValueError('separator=%s is not valid' % self.separator) - if len(fields) != len(self.node_attributes): - if self.strict: - raise RuntimeError('Wrong number of columns in %r' % line) - fields.extend(['_'] * (len(self.node_attributes) - len(fields))) - # multi-word tokens will be processed later - if '-' in fields[0]: - mwts.append(fields) - continue - if '.' 
in fields[0]:
-                    empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
-                                                    xpos=fields[4], feats=fields[5], misc=fields[9])
-                    empty.ord = fields[0]
-                    empty.raw_deps = fields[8]  # TODO
-                    continue
-
-                node = Node(root=root)
-                root._descendants.append(node)
-
-                # TODO slow implementation of speed-critical loading
-                for (n_attribute, attribute_name) in enumerate(self.node_attributes):
-                    if attribute_name == 'head':
-                        try:
-                            parents.append(int(fields[n_attribute]))
-                        except ValueError as exception:
-                            if not self.strict and fields[n_attribute] == '_':
-                                if self.empty_parent == 'warn':
-                                    logging.warning("Empty parent/head index in '%s'", line)
-                                parents.append(0)
-                            else:
-                                raise exception
-                    elif attribute_name == 'ord':
-                        setattr(node, 'ord', int(fields[n_attribute]))
-                    elif attribute_name == 'deps':
-                        setattr(node, 'raw_deps', fields[n_attribute])
-                    elif attribute_name != '_':
-                        setattr(node, attribute_name, fields[n_attribute])
-
-                nodes.append(node)
+                self.parse_node_line(line, root, nodes, parents, mwts)
 
         # If no nodes were read from the filehandle (so only root remained in nodes),
         # we return None as a sign of failure (end of file or more than one empty line).
@@ -187,7 +157,7 @@ def read_tree(self):
             elif node.children:
                 climbing = parent._parent
                 while climbing:
-                    if climbing == node:
+                    if climbing is node:
                         if self.fix_cycles:
                             logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent)
                             parent = root

From 5f77044cdda993ffb94b6c330e0d72729e0b0f3f Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 12 Feb 2021 18:03:47 +0100
Subject: [PATCH 0142/1201] bugfix: empty root.text should be loaded from
 CoNLL-U as well

otherwise loading and storing a CoNLL-U file with a sentence with no nodes
results in duplicating the "# text =" comment.
---
 udapi/block/read/conllu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index 698fd0b6..655146fc 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -11,7 +11,7 @@
 # The equal sign after sent_id was added to the specification in UD v2.0.
 # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released).
 RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)')
-RE_TEXT = re.compile(r'^# text\s*=\s*(.+)')
+RE_TEXT = re.compile(r'^# text\s*=\s*(.*)')
 RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?')
 RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)')

From ab651f537bfe0216d70d41bfa847da88188ccc49 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 12 Feb 2021 19:04:33 +0100
Subject: [PATCH 0143/1201] mwt.ord_range is a property, which internally
 sorts mwt.words

Preserving correct MWT and coref mentions while deleting/reordering nodes
is tricky - noting as TODOs.
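
Since ord_range changes from a method to a property, call sites drop the
parentheses (mwt.ord_range() becomes mwt.ord_range). The pattern in isolation,
on a hypothetical class rather than udapi's MWT:

    class Span:
        def __init__(self, ords):
            self.ords = ords

        @property
        def ord_range(self):
            """Sort lazily on each access, since words may have been reordered."""
            self.ords.sort()
            return '%d-%d' % (self.ords[0], self.ords[-1])

    span = Span([3, 2])
    assert span.ord_range == '2-3'   # attribute access, no parentheses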
--- udapi/block/write/conllu.py | 4 ++-- udapi/core/mwt.py | 12 +++++++++--- udapi/core/node.py | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 50cf366d..3ef30e5f 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -64,10 +64,10 @@ def process_tree(self, tree): # pylint: disable=too-many-branches mwt = node.multiword_token if mwt and node.ord > last_mwt_id: - last_mwt_id = mwt.words[-1].ord - print('\t'.join([mwt.ord_range(), + print('\t'.join([mwt.ord_range, mwt.form if mwt.form is not None else '_', '_\t_\t_\t_\t_\t_\t_', str(mwt.misc)])) + last_mwt_id = mwt.words[-1].ord values = [getattr(node, attr_name) for attr_name in self.node_attributes] values = ['_' if v is None else str(v) for v in values] try: diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 289adcdb..dd3d4916 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -26,21 +26,27 @@ def misc(self): def misc(self, value): self._misc.set_mapping(value) + @property def ord_range(self): """Return a string suitable for the first column of CoNLL-U.""" + self.words.sort() return "%d-%d" % (self.words[0].ord, self.words[-1].ord) def remove(self): """Delete this multi-word token (but keep its words).""" for word in self.words: word._mwt = None # pylint: disable=W0212 - self.root.multiword_tokens = [tok for tok in self.root.multiword_tokens if tok != self] + self.root.multiword_tokens.remove(self) def address(self): """Full (document-wide) id of the multi-word token.""" return self.root.address + '#' + self.ord_range # TODO: node.remove() should check if the node is not part of any MWT -# TODO: mwt.words.append(node) and node.shift* should check if the MWT does not contain gaps +# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported +# TODO: Make mwt._words privat and provide a setter +# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().) +# TODO: words.setter and node.shift* should check if the MWT does not contain gaps # and is still multi-word -# TODO: check if one word is not included in multiple multi-word tokens +# TODO: Make sure mwt.words are always sorted (even after node.shift*). +# TODO: Check if one word is not included in multiple multi-word tokens. 
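
One of the TODOs above, checking that a multi-word token covers a contiguous
span and is still multi-word, could look roughly like this sketch over plain
ord values (not actual udapi code):

    def is_valid_mwt_span(ords):
        """True iff the ords form an unbroken range of length at least two."""
        ords = sorted(ords)
        return len(ords) >= 2 and ords[-1] - ords[0] == len(ords) - 1

    assert is_valid_mwt_span([4, 2, 3])
    assert not is_valid_mwt_span([2, 4])   # gap at ord 3
    assert not is_valid_mwt_span([5])      # a single word is not multi-word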
diff --git a/udapi/core/node.py b/udapi/core/node.py index 6cbf5471..4ced6137 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -490,6 +490,8 @@ def remove(self, children=None): for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): node.ord = new_ord else: + # TODO nodes_to_remove = self.unordered_descendants() + # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord From 812828507863b3463bdddcb74d377de25f239a79 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 19:32:12 +0100 Subject: [PATCH 0144/1201] speedup: misc and feats (in Node and MWT) are stored as None if missing --- udapi/core/mwt.py | 9 +++++++-- udapi/core/node.py | 20 ++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index dd3d4916..525b9bb0 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -9,7 +9,7 @@ class MWT(object): def __init__(self, words=None, form=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form - self._misc = DualDict(misc) + self._misc = DualDict(misc) if misc else None self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 @@ -20,11 +20,16 @@ def misc(self): See `udapi.core.node.Node` for details. """ + if self._misc is None: + self._misc = DualDict() return self._misc @misc.setter def misc(self, value): - self._misc.set_mapping(value) + if self._misc is None: + self._misc = DualDict(value) + else: + self._misc.set_mapping(value) @property def ord_range(self): diff --git a/udapi/core/node.py b/udapi/core/node.py index 4ced6137..eae9fb70 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -93,9 +93,9 @@ def __init__(self, root, form=None, lemma=None, upos=None, # pylint: disable=to self.lemma = lemma self.upos = upos self.xpos = xpos - self._feats = Feats(feats) + self._feats = Feats(feats) if feats else None self.deprel = deprel - self._misc = DualDict(misc) + self._misc = DualDict(misc) if misc else None self._raw_deps = '_' self._deps = None self._parent = None @@ -176,11 +176,16 @@ def feats(self): For details about the implementation and other methods (e.g. `node.feats.is_plural()`), see ``udapi.core.feats.Feats`` which is a subclass of `DualDict`. """ + if self._feats is None: + self._feats = Feats() return self._feats @feats.setter def feats(self, value): - self._feats.set_mapping(value) + if self._feats is None: + self._feats = Feats(value) + else: + self._feats.set_mapping(value) @property def misc(self): @@ -203,11 +208,16 @@ def misc(self): For details about the implementation, see ``udapi.core.dualdict.DualDict``. """ + if self._misc is None: + self._misc = DualDict() return self._misc @misc.setter def misc(self, value): - self._misc.set_mapping(value) + if self._misc is None: + self._misc = DualDict(value) + else: + self._misc.set_mapping(value) @property def raw_deps(self): @@ -892,6 +902,8 @@ def parent(self, _): """Attempts at setting parent of EmptyNode result in AttributeError exception.""" raise AttributeError('EmptyNode cannot have a (basic-UD) parent.') + # The ord getter is the same as in Node, but it must be defined, + # so that we can override the ord setter. 
@property def ord(self): return self._ord From c46dd7ca6929246027e0ac1b563d7a0330454ce6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 21:46:24 +0100 Subject: [PATCH 0145/1201] faster write.Conllu and node.deps + node.raw_deps --- udapi/block/write/conllu.py | 40 ++++++++++++++++++------------------- udapi/core/node.py | 22 ++++++++++---------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 3ef30e5f..18a696f2 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -12,10 +12,6 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.print_text = print_text self.print_empty_trees = print_empty_trees - # A list of Conllu columns. - self.node_attributes = ["ord", "form", "lemma", "upos", "xpos", - "feats", "parent", "deprel", "raw_deps", "misc"] - def process_tree(self, tree): # pylint: disable=too-many-branches nodes = tree.descendants @@ -52,29 +48,33 @@ def process_tree(self, tree): # pylint: disable=too-many-branches for node in nodes: # print all empty nodes which should go here while empty_nodes: - next_empty_ord = int(float(empty_nodes[0].ord)) + next_empty_ord = empty_nodes[0]._ord if next_empty_ord > last_ord: break empty = empty_nodes.pop(0) - values = [getattr(empty, attr_name) for attr_name in self.node_attributes] - values = ['_' if v is None else str(v) for v in values] - values[6] = '_' - values[7] = '_' - print('\t'.join(values)) + print('\t'.join('_' if v is None else v for v in + (str(node._ord), node.form, node.lemma, node.upos, node.xpos, + '_' if node._feats is None else str(node.feats), '_\t_', + node.raw_deps, '_' if node._misc is None else str(node.misc)))) mwt = node.multiword_token - if mwt and node.ord > last_mwt_id: - print('\t'.join([mwt.ord_range, - mwt.form if mwt.form is not None else '_', - '_\t_\t_\t_\t_\t_\t_', str(mwt.misc)])) - last_mwt_id = mwt.words[-1].ord - values = [getattr(node, attr_name) for attr_name in self.node_attributes] - values = ['_' if v is None else str(v) for v in values] + if mwt and node._ord > last_mwt_id: + print('\t'.join((mwt.ord_range, + '_' if mwt.form is None else mwt.form, + '_\t_\t_\t_\t_\t_\t_', + '_' if node._misc is None else str(mwt.misc)))) + last_mwt_id = mwt.words[-1]._ord + try: - values[6] = str(node.parent.ord) + head = str(node._parent._ord) except AttributeError: - values[6] = '0' - print('\t'.join(values)) + head = '0' + + print('\t'.join('_' if v is None else v for v in + (str(node._ord), node.form, node.lemma, node.upos, node.xpos, + '_' if node._feats is None else str(node.feats), head, node.deprel, + node.raw_deps, '_' if node._misc is None else str(node.misc)))) + last_ord = node.ord # Empty sentences are not allowed in CoNLL-U, diff --git a/udapi/core/node.py b/udapi/core/node.py index eae9fb70..a89cfeae 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -226,12 +226,12 @@ def raw_deps(self): After the access to the raw enhanced dependencies, provide the serialization if they were deserialized already. 
""" - if self._deps is not None: - serialized_deps = [] - for secondary_dependence in self._deps: - serialized_deps.append('{}:{}'.format(secondary_dependence[ - 'parent']._ord, secondary_dependence['deprel'])) - self._raw_deps = '|'.join(serialized_deps) + if self._raw_deps is not None: + return self._raw_deps + if not self._deps: + self._raw_deps = '_' + return '_' + self._raw_deps = '|'.join(f"{dep['parent']._ord}:{dep['deprel']}" for dep in self._deps) return self._raw_deps @raw_deps.setter @@ -239,9 +239,9 @@ def raw_deps(self, value): """Set serialized enhanced dependencies (the new value is a string). When updating raw secondary dependencies, - delete the current version of the deserialized data. + the current version of the deserialized data is deleted. """ - self._raw_deps = str(value) + self._raw_deps = value self._deps = None @property @@ -252,14 +252,15 @@ def deps(self): provide the deserialization of the raw data and save deps to the list. """ if self._deps is None: + if self._raw_deps == '_': + return [] + # Obtain a list of all nodes in the dependency tree. nodes = [self._root] + self._root._descendants # Create a list of secondary dependencies. self._deps = list() - if self._raw_deps == '_': - return self._deps for raw_dependency in self._raw_deps.split('|'): # Deprel itself may contain one or more ':' (subtypes). @@ -280,6 +281,7 @@ def deps(self): def deps(self, value): """Set deserialized enhanced dependencies (the new value is a list of dicts).""" self._deps = value + self._raw_deps = None @property def parent(self): From e677f85009d2a3f6213360a16f39824d12051f20 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 12 Feb 2021 23:49:49 +0100 Subject: [PATCH 0146/1201] partial revert of deps/raw_deps caching, so that tests pass Tests use node.deps.append(dep) instead of node.deps = node.deps + [dep] --- udapi/core/node.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index a89cfeae..0f094d5e 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -226,12 +226,13 @@ def raw_deps(self): After the access to the raw enhanced dependencies, provide the serialization if they were deserialized already. """ - if self._raw_deps is not None: - return self._raw_deps - if not self._deps: - self._raw_deps = '_' - return '_' - self._raw_deps = '|'.join(f"{dep['parent']._ord}:{dep['deprel']}" for dep in self._deps) + # TODO: node.deps.append(dep) should be hooked and + # mark the serialized cache dirty, i.e. self._raw_deps = None. + # Afterwards, we can use the following optimization + #if self._raw_deps is not None: + # return self._raw_deps + if self._deps is not None: + self._raw_deps = '|'.join(f"{dep['parent']._ord}:{dep['deprel']}" for dep in self._deps) return self._raw_deps @raw_deps.setter @@ -252,16 +253,15 @@ def deps(self): provide the deserialization of the raw data and save deps to the list. """ if self._deps is None: + # Create a list of secondary dependencies. + self._deps = list() + if self._raw_deps == '_': - return [] + return self._deps # Obtain a list of all nodes in the dependency tree. nodes = [self._root] + self._root._descendants - # Create a list of secondary dependencies. - self._deps = list() - - for raw_dependency in self._raw_deps.split('|'): # Deprel itself may contain one or more ':' (subtypes). 
head, deprel = raw_dependency.split(':', maxsplit=1) From b7b5c6859a7d6e899a31eae29628aa02b8493f28 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 13 Feb 2021 00:05:13 +0100 Subject: [PATCH 0147/1201] faster exceptions don't serialize node.address until we need to actually print the error message --- udapi/core/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 0f094d5e..0a3a716e 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -303,11 +303,11 @@ def parent(self, new_parent): # Check for None new_parent and cycles. if new_parent is None: - raise ValueError(f'Cannot set None as parent: {self}') + raise ValueError('Cannot set None as parent: %s', self) if self is new_parent: - raise ValueError(f'Cannot set a node as its own parent (cycle are forbidden): {self}') + raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s', self) if self._children and new_parent.is_descendant_of(self): - raise ValueError(f'Setting the parent of {self} to {new_parent} would lead to a cycle.') + raise ValueError('Setting the parent of %s to %s would lead to a cycle.', (self, new_parent)) # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. From ec5a40573263f53c42fcade67356ba5fe1eca6a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 13 Feb 2021 12:52:30 +0100 Subject: [PATCH 0148/1201] write.TextModeTrees should handle empty nodes even if drawing a subtree --- udapi/block/write/textmodetrees.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index f3dad456..c8e619b9 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -3,6 +3,7 @@ import sys import colorama +import collections from termcolor import colored from udapi.core.basewriter import BaseWriter @@ -237,7 +238,13 @@ def should_print_tree(self, root, allnodes): def process_tree(self, root): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: - allnodes = [root] + root.descendants_and_empty + if root.is_root(): + allnodes = [root] + root.descendants_and_empty + else: + allnodes = root.descendants(add_self=1) + empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] + allnodes.extend(empty) + allnodes.sort() else: allnodes = root.descendants(add_self=1) if not self.should_print_tree(root, allnodes): @@ -248,7 +255,7 @@ def process_tree(self, root): # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = [0, ] * (1 + len(root.root.descendants)) + self._gaps = collections.Counter() self._compute_gaps(root) # Precompute lines for printing From 2356102357afa951e0191b911334eec904bc6f09 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 13 Feb 2021 14:46:05 +0100 Subject: [PATCH 0149/1201] CycleError for faster and better cycle detection handling While logging.warning('My message: %s', node) stringify-ies node into the message according to %s, raise ValueError('My message: %s', node) does not work like this. We need our own exception class with overridden __str__. The reason for stringifying the message only in __str__ is speed. When users catch the exception, they may not need to print the message. Current str(node) is relatively slow because it calls node.address(), which calls node.root.address(). 
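
The deferred formatting described above can be demonstrated in isolation. The
class below is a hypothetical sketch of the idea, not the CycleError from the
diff that follows; the assertion in Noisy.__str__ proves that merely catching
the exception never triggers the interpolation:

    class LazyMessageError(Exception):
        """Store the % template and its arguments; interpolate only in __str__."""
        def __init__(self, template, *args):
            super().__init__(template)
            self.template = template
            self.arguments = args

        def __str__(self):
            return self.template % self.arguments

    class Noisy:
        def __str__(self):
            raise AssertionError('str() must not be called when merely catching')

    try:
        raise LazyMessageError('cycle detected at %s', Noisy())
    except LazyMessageError:
        pass   # caught and ignored: the costly str(Noisy) never runs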
--- udapi/__init__.py | 1 + udapi/core/node.py | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/udapi/__init__.py b/udapi/__init__.py index afdd8025..6c281c0f 100644 --- a/udapi/__init__.py +++ b/udapi/__init__.py @@ -1,2 +1,3 @@ from .core.document import Document from .core.run import create_block +from .core.node import CycleError diff --git a/udapi/core/node.py b/udapi/core/node.py index 0a3a716e..cf9a8f31 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -1,7 +1,8 @@ """Node class and related classes and functions. -In addition to class `Node`, this module contains also classes -`EmptyNode`, `OrdTuple` and `ListOfNodes` and function `find_minimal_common_treelet`. +In addition to class `Node`, this module contains also helper classes +`CycleError`, `EmptyNode`, `OrdTuple` and `ListOfNodes` +and function `find_minimal_common_treelet`. """ import logging import functools @@ -303,11 +304,11 @@ def parent(self, new_parent): # Check for None new_parent and cycles. if new_parent is None: - raise ValueError('Cannot set None as parent: %s', self) + raise ValueError(f'Cannot set None as parent: {self}') if self is new_parent: - raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s', self) + raise CycleError('Cannot set a node as its own parent (cycle are forbidden): %s', self) if self._children and new_parent.is_descendant_of(self): - raise ValueError('Setting the parent of %s to %s would lead to a cycle.', (self, new_parent)) + raise CycleError('Setting the parent of %s to %s would lead to a cycle.', self, new_parent) # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. @@ -888,6 +889,19 @@ def create_coref_cluster(self, **kwargs): return udapi.core.coref.create_coref_cluster(head=self, **kwargs) +class CycleError(Exception): + '''A cycle in the dependency tree detected (or would be created).''' + def __init__(self, message, node1, node2=None): + self.message = message + self.node1 = node1 + self.node2 = node2 + super().__init__(message) + + def __str__(self): + if self.node2 is None: + return self.message % self.node1 + return self.message % (self.node1, self.node2) + class EmptyNode(Node): """Class for representing empty nodes (for ellipsis in enhanced UD).""" From 8d58745c9964a9d3b910b12b27675cd7b05fbb12 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 17 Feb 2021 01:06:24 +0100 Subject: [PATCH 0150/1201] write.Conllu prints empty nodes correctly and faster --- udapi/block/write/conllu.py | 50 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 18a696f2..429c2f42 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -13,7 +13,11 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.print_empty_trees = print_empty_trees def process_tree(self, tree): # pylint: disable=too-many-branches - nodes = tree.descendants + empty_nodes = tree.empty_nodes + if empty_nodes: + nodes = sorted(tree._descendants + empty_nodes) + else: + nodes = tree._descendants # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0 # we need to skip the whole tree (including possible comments). 
@@ -22,42 +26,25 @@ def process_tree(self, tree):  # pylint: disable=too-many-branches
         if self.print_sent_id:
             if tree.newdoc:
-                value = ' id = ' + tree.newdoc if tree.newdoc is not True else ''
-                print('# newdoc' + value)
+                print('# newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else ''))
             if tree.newpar:
-                value = ' id = ' + tree.newpar if tree.newpar is not True else ''
-                print('# newpar' + value)
-            print('# sent_id = ' + tree.address())
+                print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else ''))
+            print('# sent_id = ' + tree.sent_id)
         if self.print_text:
-            print("# text = " + tree.get_sentence())
+            print('# text = ' + (tree.text or tree.compute_text()))

         if tree.json:
             for key, value in sorted(tree.json.items()):
-                print("# json_%s = %s"
-                      % (key, json.dumps(value, ensure_ascii=False, sort_keys=True)))
+                print(f"# json_{key} = {json.dumps(value, ensure_ascii=False, sort_keys=True)}")

         comment = tree.comment
         if comment:
-            comment = comment.rstrip()
-            print('#' + comment.replace('\n', '\n#'))
+            print('#' + comment.rstrip().replace('\n', '\n#'))

         last_mwt_id = 0
-        last_ord = 0
-        empty_nodes = list(tree.empty_nodes)
         for node in nodes:
-            # print all empty nodes which should go here
-            while empty_nodes:
-                next_empty_ord = empty_nodes[0]._ord
-                if next_empty_ord > last_ord:
-                    break
-                empty = empty_nodes.pop(0)
-                print('\t'.join('_' if v is None else v for v in
-                                (str(node._ord), node.form, node.lemma, node.upos, node.xpos,
-                                 '_' if node._feats is None else str(node.feats), '_\t_',
-                                 node.raw_deps, '_' if node._misc is None else str(node.misc))))
-
-            mwt = node.multiword_token
+            mwt = node._mwt
             if mwt and node._ord > last_mwt_id:
                 print('\t'.join((mwt.ord_range,
                                  '_' if mwt.form is None else mwt.form,
@@ -65,18 +52,19 @@ def process_tree(self, tree):  # pylint: disable=too-many-branches
                                  '_' if node._misc is None else str(mwt.misc))))
                 last_mwt_id = mwt.words[-1]._ord

-            try:
-                head = str(node._parent._ord)
-            except AttributeError:
-                head = '0'
+            if node._parent is None:
+                head = '_'  # Empty nodes
+            else:
+                try:
+                    head = str(node._parent._ord)
+                except AttributeError:
+                    head = '0'

             print('\t'.join('_' if v is None else v for v in
                             (str(node._ord), node.form, node.lemma, node.upos, node.xpos,
                              '_' if node._feats is None else str(node.feats), head, node.deprel,
                              node.raw_deps, '_' if node._misc is None else str(node.misc))))

-            last_ord = node.ord
-
         # Empty sentences are not allowed in CoNLL-U,
         # but with print_empty_trees==1 (which is the default),
         # we will print an artificial node, so we can print the comments.

From a2862496221410514b8e5290723b8b3dcd7cde5e Mon Sep 17 00:00:00 2001
From: michnov
Date: Fri, 19 Feb 2021 20:50:01 +0100
Subject: [PATCH 0151/1201] Whitespaces can be stored in the conllu file, so
 that the original text can be reconstructed (#71)

* Keep the information on spaces

The basic whitespace tokenizer keeps the extended information on whitespace
from now on. This is done the same way as UDPipe does it, i.e. using the
following MISC attributes:
- SpaceAfter=No
- SpacesAfter='\t\s\n'
- SpacesBefore='\s\s\s'

* escaping whitespaces in SpacesAfter and SpacesBefore attrs
* Whitespace filling can be enabled by a parameter
* use `fill_spaces` to fill in the extra whitespace MISC features
* its usage and combination with `read.Sentences` documented
* the init parameter `tokenizer_params` was committed by mistake => reverting
* `fill_spaces=True` -> `normalize_spaces=False`
* the parameter renamed to match the parameter in UDPipe
* fix: if normalize_spaces=True, SpaceAfter=No is never set for the last
  token in the sentence
* fixes after Martin's code review
* bugfix, missing self
---
 udapi/block/tokenize/onwhitespace.py | 86 ++++++++++++++++++++++------
 1 file changed, 68 insertions(+), 18 deletions(-)

diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py
index 5451b3a1..b25005a9 100644
--- a/udapi/block/tokenize/onwhitespace.py
+++ b/udapi/block/tokenize/onwhitespace.py
@@ -1,9 +1,40 @@
 """Block tokenize.OnWhitespace"""
+import re
 from udapi.core.block import Block

 class OnWhitespace(Block):
-    """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No."""
+    """Base tokenizer, splits on whitespaces, fills SpaceAfter=No.
+
+    Use the parameter `normalize_spaces=False` to preserve all whitespaces in the sentence
+    in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field.
+    It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following
+    whitespace is marked by `SpaceAfter=No` and a single following space results in no
+    whitespace-related markup.
+    If loading the text using `read.Sentences` and all whitespaces need to be preserved
+    (in order to be able to reconstruct the original document), the `read.Sentences` block
+    must be called with `rstrip=\n` or `rstrip=\r\n` to prevent stripping the trailing
+    whitespace, e.g.::
+        $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace normalize_spaces=0 write.Conllu
+
+        # sent_id = 1
+        # text = Hello world
+        1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s
+        2 world _ _ _ _ 0 _ _ _
+    Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is
+    followed by a single space.
+
+    Parameters
+    ----------
+    normalize_spaces : bool
+        preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default True)
+    """
+
+    escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})
+
+    def __init__(self, normalize_spaces=True, **kwargs):
+        super().__init__(**kwargs)
+        self.normalize_spaces = normalize_spaces

     @staticmethod
     def tokenize_sentence(string):
@@ -13,24 +44,23 @@ def tokenize_sentence(string):
     def process_tree(self, root):
         if root.children:
             raise ValueError('Tree %s is already tokenized.' % root)
-        sentence = ' '.join(root.text.split())
+        #sentence = ' '.join(root.text.split())
+        sentence = root.text
         tokens = self.tokenize_sentence(sentence)
+
+        # Check if there are any spaces before the first token
+        spaces_before = ""
+        m = re.match(r'\s+', sentence)
+        if m:
+            spaces_before = m.group(0)
+            sentence = sentence[len(spaces_before):]
+
         for i, token in enumerate(tokens, 1):
-            space_after = False
+            spaces_after = ""

-            # Delete the token from the begining of the sentence.
-            if sentence.startswith(token):
-                sentence = sentence[len(token):]
-                # This is the expected case. The sentence starts with the token.
- # If it is followed by a space, delete the space and set space_after=True. - if not len(sentence): - space_after = True - elif sentence.startswith(' '): - space_after = True - sentence = sentence[1:] - else: - # The token (returned from tokenization) does not match the start of sentence. - # E.g. '. . . word' is tokenized as '... word'. + # The token (returned from tokenization) does not match the start of sentence. + # E.g. '. . . word' is tokenized as '... word'. + if not sentence.startswith(token): # Let's delete the start of sentence anyway, # using a non-greedy regex and the expected next token # returned from the tokenization. @@ -40,8 +70,28 @@ def process_tree(self, root): # $sentence = $rest if (defined $rest); raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence)) + # Delete the token from the begining of the sentence. + sentence = sentence[len(token):] + + # Set the SpaceAfter and SpacesAfter properly + m = re.match(r'\s+', sentence) + if m is not None: + spaces_after = m.group(0) + sentence = sentence[len(spaces_after):] + + # normalize whitespace + if self.normalize_spaces: + spaces_before = "" + # spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0 + spaces_after = "" if not len(spaces_after) and len(sentence) else " " + # create a new node node = root.create_child(form=token) node.ord = i - if not space_after: - node.misc = 'SpaceAfter=No' + + if i == 1 and spaces_before: + node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table) + if not spaces_after: + node.misc["SpaceAfter"] = 'No' + elif spaces_after != " ": + node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table) From 014e0cffa1d64b981fd6666ad325e447fc2cec0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 18 Feb 2021 11:32:20 +0100 Subject: [PATCH 0152/1201] disable garbage collection when using Udapi via udapy --- bin/udapy | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/bin/udapy b/bin/udapy index eaaa00cc..c7ed8bba 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,5 +1,7 @@ #!/usr/bin/env python3 - +import os +import gc +import atexit import logging import argparse @@ -46,6 +48,12 @@ argparser.add_argument( "-X", "--extra", action="append", help="Add a specified parameter (or a block name) to the end of the scenario\n" "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") +argparser.add_argument( + "--gc", action="store_true", + help="By default, udapy disables Python garbage collection and at-exit cleanup\n" + "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" + "when processing many files and running out of memory, you can disable this\n" + "optimization (i.e. enable garbage collection) with 'udapy --gc'.") argparser.add_argument( 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") @@ -64,6 +72,20 @@ logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(mes # Process and provide the scenario. if __name__ == "__main__": + + # Disabling garbage collections makes the whole processing much faster. + # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup + # (atexit hooks are called in reversed order of their registration, + # so flushing stdio buffers etc. will be still done before the os._exit(0) call). 
+ # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 + # Is it safe to disable GC? + # OS will free the memory allocated by this process after it ends anyway. + # The udapy wrapper is aimed for one-time tasks, not a long-running server, + # so in a typical case a document is loaded and almost no memory is freed before the end. + # Udapi documents have a many cyclic references, so running GC is quite slow. + if not args.gc: + gc.disable() + atexit.register(os._exit, 0) if args.save: args.scenario = args.scenario + ['write.Conllu'] if args.save_text_mode_trees: From a0e7ae535a99e4c6fd6fbeac40ae3472e1020d71 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 18 Feb 2021 11:50:24 +0100 Subject: [PATCH 0153/1201] speedup loading by 15% even when not using Udapi via udapy When using Udapi via udapy, gc is disabled by default. But even when gc is not globally disabled, we can get a big speedup: - temporarily disabling garbage collection during CoNLL-U (or another reader) loading - run gc.collect() after the loading is done (all-generation GC after loading makes future node.create_child() calls faster) --- udapi/core/basereader.py | 199 +++++++++++++++++++++------------------ 1 file changed, 106 insertions(+), 93 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index f1e32d18..77ebc0db 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -1,7 +1,5 @@ """BaseReader is the base class for all reader blocks.""" -import re -import logging - +import gc from udapi.core.block import Block from udapi.core.files import Files @@ -91,99 +89,114 @@ def filtered_read_tree(self): # Maybe the code could be refactored, but it is speed-critical, # so benchmarking is needed because calling extra methods may result in slowdown. def process_document(self, document): - orig_bundles = document.bundles[:] - last_bundle_id = '' - bundle = None - - # There may be a tree left in the buffer when reading the last doc. - if self._buffer: - root = self._buffer - self._buffer = None - if orig_bundles: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() - bundle = orig_bundles.pop(0) - else: - bundle = document.create_bundle() - if root._sent_id is not None: - bundle.bundle_id = root._sent_id.split('/', 1)[0] - bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc - - filehandle = self.filehandle - if filehandle is None: - filehandle = self.next_filehandle() - if filehandle is None: - self.finished = True - return - - trees_loaded = 0 - while True: - root = self.filtered_read_tree() - if root is None: - if trees_loaded == 0 and self.files.has_next_file(): - filehandle = self.next_filehandle() - continue - self.finished = not self.files.has_next_file() - break - if trees_loaded == 0: - document.meta['loaded_from'] = self.filename - add_to_the_last_bundle = 0 - trees_loaded += 1 - - if self.ignore_sent_id: - root._sent_id = None - if root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - - if self.zone != 'keep': - root.zone = self.zone - - # The `# newdoc` comment in CoNLL-U marks a start of a new document. - if root.newdoc: - if not bundle and root.newdoc is not True: + # Temporarily disabling garbage collection makes the loading much faster. 
+ gc_was_enabled = gc.isenabled() + gc.disable() + try: + orig_bundles = document.bundles[:] + last_bundle_id = '' + bundle = None + + # There may be a tree left in the buffer when reading the last doc. + if self._buffer: + root = self._buffer + self._buffer = None + if orig_bundles: + # TODO list.pop(0) is inefficient, use collections.deque.popleft() + bundle = orig_bundles.pop(0) + else: + bundle = document.create_bundle() + if root._sent_id is not None: + bundle.bundle_id = root._sent_id.split('/', 1)[0] + bundle.add_tree(root) + if root.newdoc and root.newdoc is not True: document.meta["docname"] = root.newdoc - if bundle and self.split_docs: - self._buffer = root - if orig_bundles: - logging.warning("split_docs=1 but the doc had contained %d bundles", - len(orig_bundles)) - self.finished = False + + filehandle = self.filehandle + if filehandle is None: + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True return - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number: - self._buffer = root + trees_loaded = 0 + while True: + root = self.filtered_read_tree() + if root is None: + if trees_loaded == 0 and self.files.has_next_file(): + filehandle = self.next_filehandle() + continue + self.finished = not self.files.has_next_file() + break + if trees_loaded == 0: + document.meta['loaded_from'] = self.filename + add_to_the_last_bundle = 0 + trees_loaded += 1 + + if self.ignore_sent_id: + root._sent_id = None + if root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + + if self.zone != 'keep': + root.zone = self.zone + + # The `# newdoc` comment in CoNLL-U marks a start of a new document. + if root.newdoc: + if not bundle and root.newdoc is not True: + document.meta["docname"] = root.newdoc + if bundle and self.split_docs: + self._buffer = root + if orig_bundles: + logging.warning("split_docs=1 but the doc had contained %d bundles", + len(orig_bundles)) + self.finished = False + return + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number: + self._buffer = root + if orig_bundles: + logging.warning("bundles_per_doc=%d but the doc had contained %d bundles", + self.bundles_per_doc, len(orig_bundles)) + return + if orig_bundles: - logging.warning("bundles_per_doc=%d but the doc had contained %d bundles", - self.bundles_per_doc, len(orig_bundles)) + # TODO list.pop(0) is inefficient, use collections.deque.popleft() + bundle = orig_bundles.pop(0) + if last_bundle_id and last_bundle_id != bundle.bundle_id: + logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.', + bundle.bundle_id, last_bundle_id) + else: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + # If bundles_per_doc is set and we have read the specified number of bundles, + # we should end the current document and return. + # However, if the reader supports reading multiple zones, we can never know + # if the current bundle has ended or there will be another tree for this bundle. 
+ # So in case of multizone readers we need to read one extra tree + # and store it in the buffer (and include it into the next document). + if self.bundles_per_doc and self.bundles_per_doc == bundle.number \ + and not self.is_multizone_reader(): return - if orig_bundles: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() - bundle = orig_bundles.pop(0) - if last_bundle_id and last_bundle_id != bundle.bundle_id: - logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.', - bundle.bundle_id, last_bundle_id) - else: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - # If bundles_per_doc is set and we have read the specified number of bundles, - # we should end the current document and return. - # However, if the reader supports reading multiple zones, we can never know - # if the current bundle has ended or there will be another tree for this bundle. - # So in case of multizone readers we need to read one extra tree - # and store it in the buffer (and include it into the next document). - if self.bundles_per_doc and self.bundles_per_doc == bundle.number \ - and not self.is_multizone_reader(): - return + # Running garbage collector now takes about 0.36s for a 720k-words (68MiB) conllu file + # but it makes further processing (where new objects are created) much faster, + # e.g. 0.85s when creating 65k new nodes. + # If garbage collection was already disabled (e.g. in udapy), everything is even faster + # (but no memory with cyclic references is ever freed before the process exits) + # and in that case we don't want to enable gc here. + finally: + if gc_was_enabled: + gc.enable() + gc.collect() From 6ee5259b9ac87da2baed49ee4945018e50286a72 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 18 Feb 2021 12:08:14 +0100 Subject: [PATCH 0154/1201] inlined parse_node_line(): 2% faster loading --- udapi/block/read/conllu.py | 80 +++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 655146fc..3026c2c1 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -66,46 +66,6 @@ def parse_comment_line(line, root): return root.comment += line[1:] + "\n" - def parse_node_line(self, line, root, nodes, parents, mwts): - fields = line.split('\t') - if len(fields) != 10: - if self.strict: - raise RuntimeError('Wrong number of columns in %r' % line) - fields.extend(['_'] * (10 - len(fields))) - # multi-word tokens will be processed later - if '-' in fields[0]: - mwts.append(fields) - return - if '.' 
in fields[0]: - empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], - xpos=fields[4], feats=fields[5], misc=fields[9]) - empty.ord = fields[0] - empty.raw_deps = fields[8] # TODO - return - - for i in range(1, 10): - if fields[i] == '_': - fields[i] = None - - # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc - node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], - xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9]) - root._descendants.append(node) - node._ord = int(fields[0]) - if fields[8] is not None: - node.raw_deps = fields[8] - try: - parents.append(int(fields[6])) - except ValueError as exception: - if not self.strict and fields[6] == '_': - if self.empty_parent == 'warn': - logging.warning("Empty parent/head index in '%s'", line) - parents.append(0) - else: - raise exception - - nodes.append(node) - # pylint: disable=too-many-locals,too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, # so benchmarking is needed because calling extra methods may result in slowdown. @@ -124,7 +84,45 @@ def read_tree(self): if line[0] == '#': self.parse_comment_line(line, root) else: - self.parse_node_line(line, root, nodes, parents, mwts) + fields = line.split('\t') + if len(fields) != 10: + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (10 - len(fields))) + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + continue + if '.' in fields[0]: + empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], misc=fields[9]) + empty.ord = fields[0] + empty.raw_deps = fields[8] # TODO + continue + + for i in range(1, 10): + if fields[i] == '_': + fields[i] = None + + # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc + node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9]) + root._descendants.append(node) + node._ord = int(fields[0]) + if fields[8] is not None: + node.raw_deps = fields[8] + try: + parents.append(int(fields[6])) + except ValueError as exception: + if not self.strict and fields[6] == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + + nodes.append(node) + # If no nodes were read from the filehandle (so only root remained in nodes), # we return None as a sign of failure (end of file or more than one empty line). From 4b34090dce1719935e9f3d8ccf6d1a6d3337258c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 18 Feb 2021 12:14:52 +0100 Subject: [PATCH 0155/1201] faster conversion of '_' to None: 8% faster loading - `for i in range(1,10)` is slower than `for i in [1,2,3,4,5,6,7,8,9]`. - We need just some of the columns to be converted. 
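Micro-claims like these are easy to re-check with timeit; a minimal sketch
(the timings, like the 8% figure above, depend on the machine and Python
version, so treat them as indicative only):

    import timeit

    setup = "fields = ['_'] * 10"
    for stmt in ("for i in range(1, 10): fields[i]",
                 "for i in [1, 2, 3, 4, 5, 6, 7, 8, 9]: fields[i]"):
        print(stmt, '->', timeit.timeit(stmt, setup=setup, number=1_000_000))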
--- udapi/block/read/conllu.py | 12 +++++++----- udapi/core/node.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 3026c2c1..850be530 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -100,16 +100,19 @@ def read_tree(self): empty.raw_deps = fields[8] # TODO continue - for i in range(1, 10): - if fields[i] == '_': - fields[i] = None + if fields[3] == '_': + fields[3] = None + if fields[4] == '_': + fields[4] = None + if fields[7] == '_': + fields[7] = None # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9]) root._descendants.append(node) node._ord = int(fields[0]) - if fields[8] is not None: + if fields[8] != '_': node.raw_deps = fields[8] try: parents.append(int(fields[6])) @@ -123,7 +126,6 @@ def read_tree(self): nodes.append(node) - # If no nodes were read from the filehandle (so only root remained in nodes), # we return None as a sign of failure (end of file or more than one empty line). if len(nodes) == 1: diff --git a/udapi/core/node.py b/udapi/core/node.py index cf9a8f31..1b22f3dc 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -94,9 +94,9 @@ def __init__(self, root, form=None, lemma=None, upos=None, # pylint: disable=to self.lemma = lemma self.upos = upos self.xpos = xpos - self._feats = Feats(feats) if feats else None + self._feats = Feats(feats) if feats and feats != '_' else None self.deprel = deprel - self._misc = DualDict(misc) if misc else None + self._misc = DualDict(misc) if misc and misc != '_' else None self._raw_deps = '_' self._deps = None self._parent = None From 3e44b732dec082a18a2ea2a97fd3e712d0fe1d0b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 20 Feb 2021 04:03:06 +0100 Subject: [PATCH 0156/1201] 9% faster loading by reading the whole file at once at the cost of 17% more memory and an additional method `read_trees()` as an alternative to `read_tree()` (which needs to be implemented in all readers anyway) --- udapi/block/read/conllu.py | 23 +++++++++---- udapi/core/basereader.py | 68 ++++++++++++++++++++++++++++++++++---- udapi/core/document.py | 6 ++-- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 850be530..0f750bb7 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -66,21 +66,30 @@ def parse_comment_line(line, root): return root.comment += line[1:] + "\n" - # pylint: disable=too-many-locals,too-many-branches,too-many-statements - # Maybe the code could be refactored, but it is speed-critical, - # so benchmarking is needed because calling extra methods may result in slowdown. + def read_trees(self): + return [self.read_tree_from_lines(s.split('\n')) for s in + self.filehandle.read().split('\n\n') if s] + def read_tree(self): if self.filehandle is None: return None + lines = [] + for line in self.filehandle: + line = line.rstrip() + if line == '': + break + lines.append(line) + return self.read_tree_from_lines(lines) + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. 
+ def read_tree_from_lines(self, lines): root = Root() nodes = [root] parents = [0] mwts = [] - for line in self.filehandle: - line = line.rstrip() - if line == '': - break + for line in lines: if line[0] == '#': self.parse_comment_line(line, root) else: diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 77ebc0db..4f14d32c 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -59,7 +59,7 @@ def next_filehandle(self): return self.files.next_filehandle() def read_tree(self): - """Load one (more) tree from self.files and return its root. + """Load one (more) tree from self.filehandle and return its root. This method must be overriden in all readers. Usually it is the only method that needs to be implemented. @@ -67,6 +67,14 @@ def read_tree(self): """ raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__) + def read_trees(self): + """Load all trees from self.filehandle and return a list of their roots. + + This method may be overriden in a reader if a faster alternative to read_tree() is needed. + The implementation in this base clases raises `NotImplementedError`. + """ + raise NotImplementedError("Class %s doesn't implement read_trees" % self.__class__.__name__) + def filtered_read_tree(self): """Load and return one more tree matching the `sent_id_filter`. @@ -85,6 +93,53 @@ def filtered_read_tree(self): tree.sent_id, self.sent_id_filter) tree = self.read_tree() + def try_fast_load(self, document): + """Try to use self.read_trees() if possible and return True, otherwise False.""" + if document.bundles or self.bundles_per_doc or self.sent_id_filter or self.split_docs: + return False + if self.filehandle is None: + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return True + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if filehandle is None: + self.finished = True + return True + # pylint: disable=too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, # so benchmarking is needed because calling extra methods may result in slowdown. @@ -93,16 +148,16 @@ def process_document(self, document): gc_was_enabled = gc.isenabled() gc.disable() try: + if self.try_fast_load(document): + return orig_bundles = document.bundles[:] - last_bundle_id = '' - bundle = None + bundle, last_bundle_id = None, '' # There may be a tree left in the buffer when reading the last doc. 
if self._buffer: root = self._buffer self._buffer = None if orig_bundles: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() bundle = orig_bundles.pop(0) else: bundle = document.create_bundle() @@ -130,12 +185,12 @@ def process_document(self, document): break if trees_loaded == 0: document.meta['loaded_from'] = self.filename - add_to_the_last_bundle = 0 + add_to_the_last_bundle = False trees_loaded += 1 if self.ignore_sent_id: root._sent_id = None - if root._sent_id is not None: + elif root._sent_id is not None: parts = root._sent_id.split('/', 1) bundle_id = parts[0] if len(parts) == 2: @@ -168,7 +223,6 @@ def process_document(self, document): return if orig_bundles: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() bundle = orig_bundles.pop(0) if last_bundle_id and last_bundle_id != bundle.bundle_id: logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.', diff --git a/udapi/core/document.py b/udapi/core/document.py index 6bf2e55d..16961975 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -61,13 +61,11 @@ def create_bundle(self): def load_conllu(self, filename=None): """Load a document from a conllu-formatted file.""" - reader = ConlluReader(files=filename) - reader.apply_on_document(self) + ConlluReader(files=filename).process_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" - writer = ConlluWriter(files=filename) - writer.apply_on_document(self) + ConlluWriter(files=filename).apply_on_document(self) def from_conllu_string(self, string): """Load a document from a conllu-formatted string.""" From d466683c947b18b5b9dd7b39f584a15a0e44b558 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 20 Feb 2021 04:12:44 +0100 Subject: [PATCH 0157/1201] util.Wc tsv=1 for easier-to-parse output --- udapi/block/util/wc.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 403daf5f..137c95e9 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -5,10 +5,15 @@ class Wc(Block): """Special block for printing statistics (word count etc).""" - def __init__(self, **kwargs): - """Create the Wc block object.""" + def __init__(self, tsv=False, **kwargs): + """Create the Wc block object. 
+ + Params: + tsv: print just tab-separated-values (trees, words, tokens, MWTs, empty nodes) + """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.tsv = tsv def process_tree(self, tree): self.trees += 1 @@ -19,8 +24,11 @@ def process_tree(self, tree): self.empty += len(tree.empty_nodes) def process_end(self): - print('%8d trees\n%8d words' % (self.trees, self.words)) - if self.mwts: - print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) - if self.empty: - print('%8d empty nodes' % self.empty) + if self.tsv: + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + else: + print('%8d trees\n%8d words' % (self.trees, self.words)) + if self.mwts: + print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) + if self.empty: + print('%8d empty nodes' % self.empty) From b6d64982756af9a071390698accfd2b37991eb37 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 20 Feb 2021 05:25:30 +0100 Subject: [PATCH 0158/1201] allow udapi.Document('my.conllu', empty_parent='ignore') --- udapi/core/document.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/core/document.py b/udapi/core/document.py index 16961975..58b4b34e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -31,7 +31,7 @@ def __init__(self, filename=None, **kwargs): self._coref_clusters = None if filename is not None: if filename.endswith(".conllu"): - self.load_conllu(filename) + self.load_conllu(filename, **kwargs) elif filename.endswith(".txt"): reader = SentencesReader(files=filename, **kwargs) reader.apply_on_document(self) @@ -59,9 +59,9 @@ def create_bundle(self): bundle.number = len(self.bundles) return bundle - def load_conllu(self, filename=None): + def load_conllu(self, filename=None, **kwargs): """Load a document from a conllu-formatted file.""" - ConlluReader(files=filename).process_document(self) + ConlluReader(files=filename, **kwargs).process_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" From a977dcfd6a843758bf669db5c05480a18fd97268 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 21 Feb 2021 13:00:24 +0100 Subject: [PATCH 0159/1201] bugfix --- udapi/core/basereader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 4f14d32c..5d991a29 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -136,7 +136,7 @@ def try_fast_load(self, document): bundle.add_tree(root) self.next_filehandle() - if filehandle is None: + if self.filehandle is None: self.finished = True return True From df098c35fbb5f81338ccfaf8cfbac9f3e85fdee8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 22 Feb 2021 15:12:15 +0100 Subject: [PATCH 0160/1201] bugfix: allow (again) `n.is_descendant_of(None)` which returns False, of course --- udapi/core/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 1b22f3dc..2640ab91 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -386,7 +386,7 @@ def descendants(self): def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" - if node._children: + if node and node._children: climber = self._parent while climber: if climber is node: From f24e9d27096074baa9710a27c78089097c985719 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 22 Feb 2021 18:30:09 +0100 Subject: [PATCH 
0161/1201] deploy udapi 0.2.3 --- CHANGES.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 73418e3b..c79a5d82 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,9 +2,10 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. -0.2.3 2021-02-04 +0.2.3 2021-02-22 - support for enhanced dependencies and coreference - requires Python 3.6+ due to f-strings + - speed-up (benchmark 40.5s -> 10.4s) 0.2.2 2018-01-08 - support for loading/storing documents from/to strings From acd214e659dc5986b42384e00d91db49b8ce86c5 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Mon, 22 Feb 2021 21:19:21 +0100 Subject: [PATCH 0162/1201] a block for removing listed temporary attributes after primary corefud conversions --- udapi/block/corefud/removemisc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 udapi/block/corefud/removemisc.py diff --git a/udapi/block/corefud/removemisc.py b/udapi/block/corefud/removemisc.py new file mode 100644 index 00000000..6ca1e87a --- /dev/null +++ b/udapi/block/corefud/removemisc.py @@ -0,0 +1,17 @@ +from udapi.core.block import Block +import re + +class RemoveMisc(Block): + """Deleting all temporary attributes after primary conversions""" + + def __init__(self, attrnames='', **kwargs): + """ Arg: attrnames = comma-separated list of Misc attributes to be deleted""" + super().__init__(**kwargs) + self.attrs4deletion = set(attrnames.split(',')) + + def process_node(self,node): + for attrname in list(node.misc): + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if shortattrname in self.attrs4deletion: + del node.misc[attrname] + From 09208bccd38461153652ec7321e5d7b7f62e8495 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 23 Feb 2021 02:31:04 +0100 Subject: [PATCH 0163/1201] modern Python packaging --- CHANGES.txt | 2 +- pyproject.toml | 6 ++++++ setup.cfg | 29 +++++++++++++++++++++++++++++ setup.py | 29 +++-------------------------- 4 files changed, 39 insertions(+), 27 deletions(-) create mode 100644 pyproject.toml create mode 100644 setup.cfg diff --git a/CHANGES.txt b/CHANGES.txt index c79a5d82..77d72548 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,7 +2,7 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. 
-0.2.3 2021-02-22 +0.2.3 2021-02-23 - support for enhanced dependencies and coreference - requires Python 3.6+ due to f-strings - speed-up (benchmark 40.5s -> 10.4s) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..374b58cb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..4e96f81a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,29 @@ +[metadata] +name = udapi +version = 0.2.3 +author = Martin Popel +author_email = popel@ufal.mff.cuni.cz +description = Python framework for processing Universal Dependencies data +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/udapi/udapi-python +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) + Operating System :: OS Independent + +[options] +packages = find: +python_requires = >=3.6 +include_package_data = True +scripts = + bin/udapy +install_requires = + colorama + termcolor + +[options.extras_require] +test = + pytest + + diff --git a/setup.py b/setup.py index 804ebd59..7f1a1763 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,4 @@ -#!/usr/bin/env python3 +from setuptools import setup -from setuptools import setup, find_packages - -setup( - name='udapi', - version='0.2.3', - description='Python framework for processing Universal Dependencies data', - long_description=( - 'Udapi is an open-source framework providing API for processing ' - 'Universal Dependencies data. It is available in Python, Perl and Java. ' - 'Udapi is suitable both for full-fledged applications and fast ' - 'prototyping: visualization of dependency trees, format conversions, ' - 'querying, editing and transformations, validity tests, dependency ' - 'parsing, evaluation etc.' 
- ), - author='Martin Popel', - author_email='popel@ufal.mff.cuni.cz', - url='https://github.com/udapi/udapi-python', - packages=find_packages(), - scripts=['bin/udapy'], - tests_require=['pytest'], - install_requires=['colorama', 'termcolor'], - python_requires='>=3.6', - license='GPL 2 or newer', - platforms='any', -) +if __name__ == "__main__": + setup() From e737b0d773eddc88f033537d4c2f103dd7f0483a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 23 Feb 2021 13:58:22 +0100 Subject: [PATCH 0164/1201] draft of coref statistics --- udapi/block/corefud/stats.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 udapi/block/corefud/stats.py diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py new file mode 100644 index 00000000..049e6e05 --- /dev/null +++ b/udapi/block/corefud/stats.py @@ -0,0 +1,33 @@ +from udapi.core.block import Block +from collections import Counter +import pprint + +class Stats(Block): + """Block corefud.Stats prints various coreference-related statistics.""" + + def __init__(self, m_len_max=5, **kwargs): + super().__init__(**kwargs) + self.m_len_max = m_len_max + + self.counter = Counter() + self.mentions = 0 + self.clusters = 0 + self.nodes = 0 + self.longest_mention = 0 + + def process_document(self, doc): + self.nodes += len(list(doc.nodes)) + for cluster in doc.coref_clusters.values(): + self.clusters += 1 + for mention in cluster.mentions: + self.mentions += 1 + words = len(mention.words) + self.longest_mention = max(words, self.longest_mention) + self.counter[f"m_len_{min(words, self.m_len_max)}"] += 1 + + def process_end(self): + #pprint.pprint(self.counter) + mentions_per1k = 1000 * self.mentions / self.nodes + percents = [100 * self.counter[f"m_len_{i}"]/self.mentions for i in range(self.m_len_max + 1)] + print(f"{self.mentions:6} & {mentions_per1k:6.0f} & {self.longest_mention:6} & " + + " & ".join(f"{p:3.0f}" for p in percents) + r" \\") From 449be8f4bbe542dd7a058d68f8f0f45712bc237f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 23 Feb 2021 23:32:51 +0100 Subject: [PATCH 0165/1201] forgotten import --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index e5f3d3d5..3aed44e7 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,6 +1,7 @@ """Classes for handling coreference.""" import re import functools +import logging @functools.total_ordering class CorefMention(object): From a69ec13d9becefa7cffc8dbf4a2d3e1dc33db209 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 24 Feb 2021 01:07:47 +0100 Subject: [PATCH 0166/1201] improved coref stats --- udapi/block/corefud/stats.py | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index 049e6e05..a85738b1 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -1,33 +1,46 @@ from udapi.core.block import Block from collections import Counter -import pprint class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, **kwargs): + def __init__(self, m_len_max=5, focus='non-singletons', **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max + self.focus = focus self.counter = Counter() self.mentions = 0 self.clusters = 0 - self.nodes = 0 + self.total_nodes = 0 self.longest_mention = 0 + self.m_words = 0 + self.m_empty = 0 def process_document(self, doc): - 
self.nodes += len(list(doc.nodes)) + self.total_nodes += len(list(doc.nodes)) for cluster in doc.coref_clusters.values(): + if len(cluster.mentions) == 1: + if self.focus == 'non-singletons': + continue + elif self.focus == 'singletons': + continue + self.clusters += 1 for mention in cluster.mentions: self.mentions += 1 - words = len(mention.words) - self.longest_mention = max(words, self.longest_mention) - self.counter[f"m_len_{min(words, self.m_len_max)}"] += 1 + all_words = len(mention.words) + non_empty = len([w for w in mention.words if not w.is_empty()]) + self.m_words += all_words + self.m_empty += all_words - non_empty + self.longest_mention = max(non_empty, self.longest_mention) + self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 def process_end(self): - #pprint.pprint(self.counter) - mentions_per1k = 1000 * self.mentions / self.nodes - percents = [100 * self.counter[f"m_len_{i}"]/self.mentions for i in range(self.m_len_max + 1)] - print(f"{self.mentions:6} & {mentions_per1k:6.0f} & {self.longest_mention:6} & " - + " & ".join(f"{p:3.0f}" for p in percents) + r" \\") + mentions = 1 if self.mentions == 0 else self.mentions + total_nodes = 1 if self.total_nodes == 0 else self.total_nodes + mentions_per1k = 1000 * self.mentions / total_nodes + + percents = [100 * self.counter[f"m_len_{i}"] / mentions for i in range(self.m_len_max + 1)] + print(f"{self.mentions:7,} & {mentions_per1k:6.0f} & {self.longest_mention:6} & " + + " & ".join(f"{p:5.1f}" for p in percents) + r" \\") From 7e3887eaf1cfae8ec0cd8c9796add34c38974e35 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 24 Feb 2021 01:52:25 +0100 Subject: [PATCH 0167/1201] coref cluster stats --- udapi/block/corefud/stats.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index a85738b1..0d791969 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,9 +4,10 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, focus='non-singletons', **kwargs): + def __init__(self, m_len_max=5, c_len_max=5, focus='non-singletons', **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max + self.c_len_max = c_len_max self.focus = focus self.counter = Counter() @@ -14,16 +15,20 @@ def __init__(self, m_len_max=5, focus='non-singletons', **kwargs): self.clusters = 0 self.total_nodes = 0 self.longest_mention = 0 + self.longest_cluster = 0 self.m_words = 0 self.m_empty = 0 def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) for cluster in doc.coref_clusters.values(): - if len(cluster.mentions) == 1: - if self.focus == 'non-singletons': + len_mentions = len(cluster.mentions) + self.longest_cluster = max(len_mentions, self.longest_cluster) + self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 + if len_mentions == 1: + if self.focus.startswith('non-singletons'): continue - elif self.focus == 'singletons': + elif self.focus.startswith('singletons'): continue self.clusters += 1 @@ -38,9 +43,16 @@ def process_document(self, doc): def process_end(self): mentions = 1 if self.mentions == 0 else self.mentions + clusters = 1 if self.clusters == 0 else self.clusters total_nodes = 1 if self.total_nodes == 0 else self.total_nodes - mentions_per1k = 1000 * self.mentions / total_nodes - percents = [100 * self.counter[f"m_len_{i}"] / mentions for i in range(self.m_len_max + 1)] - print(f"{self.mentions:7,} & 
{mentions_per1k:6.0f} & {self.longest_mention:6} & " - + " & ".join(f"{p:5.1f}" for p in percents) + r" \\") + if 'clusters' in self.focus: + clusters_per1k = 1000 * self.clusters / total_nodes + percents = [100 * self.counter[f"c_len_{i}"] / clusters for i in range(1, self.c_len_max + 1)] + print(f"{self.clusters:7,} & {clusters_per1k:6.0f} & {self.longest_cluster:6} & " + + " & ".join(f"{p:5.1f}" for p in percents) + r" \\") + if self.focus != 'clusters': + mentions_per1k = 1000 * self.mentions / total_nodes + percents = [100 * self.counter[f"m_len_{i}"] / mentions for i in range(self.m_len_max + 1)] + print(f"{self.mentions:7,} & {mentions_per1k:6.0f} & {self.longest_mention:6} & " + + " & ".join(f"{p:5.1f}" for p in percents) + r" \\") From 9161cbdec054700d6cfbb28664df720038b03363 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 24 Feb 2021 15:58:45 +0100 Subject: [PATCH 0168/1201] redesign corefud.Stats --- udapi/block/corefud/stats.py | 59 +++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index 0d791969..586c0236 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,11 +4,18 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, c_len_max=5, focus='non-singletons', **kwargs): + def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_clusters=True, + exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max self.c_len_max = c_len_max - self.focus = focus + self.report_mentions = report_mentions + self.report_clusters = report_clusters + self.exclude_singletons = exclude_singletons + self.exclude_nonsingletons = exclude_nonsingletons + self.style = style + if style not in 'tex human'.split(): + raise ValueError(f'Unknown style f{style}') self.counter = Counter() self.mentions = 0 @@ -23,13 +30,12 @@ def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) for cluster in doc.coref_clusters.values(): len_mentions = len(cluster.mentions) + if len_mentions == 1 and self.exclude_singletons: + continue + elif len_mentions > 1 and self.exclude_nonsingletons: + continue self.longest_cluster = max(len_mentions, self.longest_cluster) self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 - if len_mentions == 1: - if self.focus.startswith('non-singletons'): - continue - elif self.focus.startswith('singletons'): - continue self.clusters += 1 for mention in cluster.mentions: @@ -42,17 +48,28 @@ def process_document(self, doc): self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 def process_end(self): - mentions = 1 if self.mentions == 0 else self.mentions - clusters = 1 if self.clusters == 0 else self.clusters - total_nodes = 1 if self.total_nodes == 0 else self.total_nodes - - if 'clusters' in self.focus: - clusters_per1k = 1000 * self.clusters / total_nodes - percents = [100 * self.counter[f"c_len_{i}"] / clusters for i in range(1, self.c_len_max + 1)] - print(f"{self.clusters:7,} & {clusters_per1k:6.0f} & {self.longest_cluster:6} & " - + " & ".join(f"{p:5.1f}" for p in percents) + r" \\") - if self.focus != 'clusters': - mentions_per1k = 1000 * self.mentions / total_nodes - percents = [100 * self.counter[f"m_len_{i}"] / mentions for i in range(self.m_len_max + 1)] - print(f"{self.mentions:7,} & {mentions_per1k:6.0f} & {self.longest_mention:6} & " - + " 
& ".join(f"{p:5.1f}" for p in percents) + r" \\") + mentions_nonzero = 1 if self.mentions == 0 else self.mentions + clusters_nonzero = 1 if self.clusters == 0 else self.clusters + total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes + + columns =[ ] + if self.report_clusters: + columns += [('clusters', f"{self.clusters:7,}"), + ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), + ('longest_cluster', f"{self.longest_cluster:6}")] + for i in range(1, self.c_len_max + 1): + percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero + columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) + if self.report_mentions: + columns += [('mentions', f"{self.mentions:7,}"), + ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), + ('longest_mention', f"{self.longest_mention:6}")] + for i in range(0, self.m_len_max + 1): + percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero + columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + + if self.style == 'tex': + print(" & ".join(c[1] for c in columns)) + elif self.style == 'human': + for c in columns: + print(f"{c[0]:>15} = {c[1].strip():>10}") From f0252f7d4e67dfe4a9f5e236f66000ff33a3b6ae Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Wed, 24 Feb 2021 02:33:51 +0100 Subject: [PATCH 0169/1201] Improvements in Simple segmenter 1. deal with multiple consecutive spaces in input sentences or the space at the start/end of the sentence => no sentence boundary 2. if an abbreviation of the first name is a first word of a quoted segment, delete the starting quotation mark to find out if the word consists of two chars --- udapi/block/segment/simple.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py index b4f2bbe4..8e4c38f9 100644 --- a/udapi/block/segment/simple.py +++ b/udapi/block/segment/simple.py @@ -16,6 +16,8 @@ def is_nonfinal_abbrev(token): def is_boundary(self, first, second): """Is there a sentence boundary between the first and second token?""" + if not first or not second: + return False if first[-1] in '"“»›)': first = first[:-1] if second[0] in '"„«¿¡‹(': @@ -25,6 +27,9 @@ def is_boundary(self, first, second): if not first[-1] in '.!?': return False if first[-1] == '.': + # correctly count length in "„A. 
Merkel" + if first[0] in '"„«¿¡‹(': + first = first[1:] if len(first) == 2 and first[0].isupper(): return False if self.is_nonfinal_abbrev(first[:-1]): @@ -39,6 +44,7 @@ def segment_string(self, string): segments = [previous] for token in tokens[1:]: if self.is_boundary(previous, token): + segments[-1] += ' ' segments.append(token) else: segments[-1] += ' ' + token @@ -64,4 +70,4 @@ def process_document(self, doc): new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) new_bundle.create_tree(tree.zone).text = sentence new_bundles.append(new_bundle) - doc.bundles = new_bundles \ No newline at end of file + doc.bundles = new_bundles From 7cb814e7ded1418a6f82d1d299052322e8ddfdf7 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 16:58:59 +0100 Subject: [PATCH 0170/1201] added the keep_spaces parameter + doc extended --- udapi/block/segment/simple.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py index 8e4c38f9..5f4a8423 100644 --- a/udapi/block/segment/simple.py +++ b/udapi/block/segment/simple.py @@ -4,7 +4,20 @@ import re class Simple(Block): - """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase.""" + """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase. + The exceptions are: + 1) abbreviations of names, e.g. "A. Merkel" + 2) predefined list of nonfinal abbreviations, e.g. "e.g." + + Parameters + ---------- + keep_spaces : bool + do not strip whitespaces from the `text` attribute of the sentences created by segmentation + """ + + def __init__(self, keep_spaces=False, **kwargs): + super().__init__(**kwargs) + self.keep_spaces = keep_spaces @staticmethod def is_nonfinal_abbrev(token): @@ -44,7 +57,8 @@ def segment_string(self, string): segments = [previous] for token in tokens[1:]: if self.is_boundary(previous, token): - segments[-1] += ' ' + if self.keep_spaces: + segments[-1] += ' ' segments.append(token) else: segments[-1] += ' ' + token From cc43b199140378759f3d9d69661bf6f26af562f5 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 11:48:59 +0100 Subject: [PATCH 0171/1201] preventing the text attribute from being invalid * removing \r and \n from anywhere in the text attribute * stripping all whitespace from the end of the text attribute --- udapi/block/write/conllu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 429c2f42..d8ca1001 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -2,6 +2,7 @@ import json from udapi.core.basewriter import BaseWriter +escape_whitespace_table = str.maketrans({'\r':'', '\n':''}) class Conllu(BaseWriter): """A writer of files in the CoNLL-U format.""" @@ -32,7 +33,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches print('# sent_id = ' + tree.sent_id) if self.print_text: - print('# text = ' + (tree.text or tree.compute_text())) + print('# text = ' + (tree.compute_text() if tree.text is None else tree.text.translate(escape_whitespace_table).rstrip())) if tree.json: for key, value in sorted(tree.json.items()): From 26105f8d8d371437a85307d230c54c687ace354e Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 12:52:06 +0100 Subject: [PATCH 0172/1201] faster --- udapi/block/write/conllu.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/udapi/block/write/conllu.py 
b/udapi/block/write/conllu.py index d8ca1001..df3adce6 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -2,8 +2,6 @@ import json from udapi.core.basewriter import BaseWriter -escape_whitespace_table = str.maketrans({'\r':'', '\n':''}) - class Conllu(BaseWriter): """A writer of files in the CoNLL-U format.""" @@ -33,7 +31,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches print('# sent_id = ' + tree.sent_id) if self.print_text: - print('# text = ' + (tree.compute_text() if tree.text is None else tree.text.translate(escape_whitespace_table).rstrip())) + print('# text = ' + (tree.compute_text() if tree.text is None else tree.text.replace('\n', '').replace('\r', '').rstrip())) if tree.json: for key, value in sorted(tree.json.items()): From 0c7918c70018033351ec220ccde9fbc49859957e Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 16:11:27 +0100 Subject: [PATCH 0173/1201] warning for using read.Sentences rstrip='' --- udapi/block/read/sentences.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 758b4980..356e196f 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -12,7 +12,9 @@ class Sentences(BaseReader): rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, - so that `udpipe.Base resegment=1` keeps these characters in `SpacesAfter`. + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. """ def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): self.ignore_empty_lines = ignore_empty_lines From 6868f58ab7773b1c5fb3815bdfed0af39b3ffab5 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 16:12:51 +0100 Subject: [PATCH 0174/1201] normalize_spaces=False -> keep_spaces=True --- udapi/block/tokenize/onwhitespace.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py index b25005a9..6ca412a8 100644 --- a/udapi/block/tokenize/onwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -6,16 +6,16 @@ class OnWhitespace(Block): """Base tokenizer, splits on whitespaces, fills SpaceAfter=No. - Use the parameter `normalize_spaces=False` to preserve all whitespaces in the sentence + Use the parameter `keep_spaces=True` to preserve all whitespaces in the sentence in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field. It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following whitespace is marked by `SpaceAfter=No` and a single following space results in no whitespace-related markup. 
If loading the text using `read.Sentences` and all whitespaces need to be preserved (in order to be able to reconstruct the original document), the `read.Sentences` block - must be called with `rstrip=\n` or `rstrip=\r\n` to prevent stripping the trailing - whitespace, e.g.:: - $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace normalize_spaces=0 write.Conllu + must be called with `rstrip=''`, `rstrip=\n` or `rstrip=\r\n` to prevent stripping the + trailing whitespace, e.g.:: + $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace keep_spaces=1 write.Conllu # sent_id = 1 # text = Hello world @@ -26,15 +26,15 @@ class OnWhitespace(Block): Parameters ---------- - normalize_spaces : bool - preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default True) + keep_spaces : bool + preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default False) """ escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'}) - def __init__(self, normalize_spaces=True, **kwargs): + def __init__(self, keep_spaces=True, **kwargs): super().__init__(**kwargs) - self.normalize_spaces = normalize_spaces + self.keep_spaces = keep_spaces @staticmethod def tokenize_sentence(string): @@ -80,7 +80,7 @@ def process_tree(self, root): sentence = sentence[len(spaces_after):] # normalize whitespace - if self.normalize_spaces: + if not self.keep_spaces: spaces_before = "" # spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0 spaces_after = "" if not len(spaces_after) and len(sentence) else " " From e2fe7e03eb0dc809a77010be58f10e42c1ce881e Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 25 Feb 2021 18:47:03 +0100 Subject: [PATCH 0175/1201] bugfix --- udapi/block/tokenize/onwhitespace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py index 6ca412a8..913dae61 100644 --- a/udapi/block/tokenize/onwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -32,7 +32,7 @@ class OnWhitespace(Block): escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'}) - def __init__(self, keep_spaces=True, **kwargs): + def __init__(self, keep_spaces=False, **kwargs): super().__init__(**kwargs) self.keep_spaces = keep_spaces From e8e3824b20b96927b1ca0d4272b0afc330a97c23 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Fri, 26 Feb 2021 22:03:21 +0100 Subject: [PATCH 0176/1201] a block for generating latex-formatted statistic of distributions of values of misc attributes --- udapi/block/corefud/miscstats.py | 7 +++-- udapi/block/corefud/miscstatstex.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 udapi/block/corefud/miscstatstex.py diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py index e7aabc03..dee358d6 100644 --- a/udapi/block/corefud/miscstats.py +++ b/udapi/block/corefud/miscstats.py @@ -17,6 +17,7 @@ def __init__(self, maxvalues=10, **kwargs): super().__init__(**kwargs) self.maxvalues = maxvalues self.valuecounter = {} + self.totalcounter = Counter() def process_node(self,node): for attrname in node.misc: @@ -24,9 +25,11 @@ def process_node(self,node): if not shortattrname in self.valuecounter: self.valuecounter[shortattrname] = Counter() self.valuecounter[shortattrname][node.misc[attrname]] += 1 + 
self.totalcounter[shortattrname] += 1 def process_end(self): for attrname in self.valuecounter: - print("MISC attribute: "+attrname) + print() + print(attrname+"\t"+str(self.totalcounter[attrname])) for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): - print(" "+str(value)+" "+str(freq)) + print("\t"+str(value)+"\t"+str(freq)) diff --git a/udapi/block/corefud/miscstatstex.py b/udapi/block/corefud/miscstatstex.py new file mode 100644 index 00000000..25d3751a --- /dev/null +++ b/udapi/block/corefud/miscstatstex.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStatsTex(Block): + """Block corefud.MiscStatsTex prints 10 most frequent values of each attribute stored in the MISC field, formatted as LaTeX""" + + def __init__(self, maxvalues=10, **kwargs): + + """Create the corefud.MiscStatsTex + + Args: + maxvalues: the number of most frequent values + to be printed for each attribute. + + """ + super().__init__(**kwargs) + self.maxvalues = maxvalues + self.valuecounter = {} + self.totalcounter = Counter() + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + self.totalcounter[shortattrname] += 1 + + def process_end(self): + for attrname in self.valuecounter: + + total = self.totalcounter[attrname] + distrvalues = [] + + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + value = re.sub(r'_',r'\\_',value) + distrvalues.append(f'\\attr{{{str(value)}}} {100*freq/total:2.1f}~\\%') + + attrname = re.sub(r'_',r'\\_',attrname) + print(f" \\item attribute \\attr{{{attrname}}}, {total:,} occurrences, values: "+", ".join(distrvalues)) +# print(f" \\item attribute \\attr\{{attrname}\}, {str(total)} occurrences, distribution of values: "+", ".join(distrvalues)) + + From 95b7d1e4dee939bb67744098ec6e5e3377e62bd0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 14:48:59 +0100 Subject: [PATCH 0177/1201] overload Node.__repr__, so that f"{list_of_nodes}" works as expected --- udapi/core/node.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 2640ab91..c49f8128 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -105,8 +105,13 @@ def __init__(self, root, form=None, lemma=None, upos=None, # pylint: disable=to self._mentions = list() def __str__(self): - """Pretty print of the Node object.""" - return "node<%s, %s>" % (self.address(), self.form) + """String representation of the Node object: <address, form>.""" + return f"<{self.address()}, {self.form}>" + + def __repr__(self): + """String representation of the Node object: Node<address, form>.""" + return f"Node<{self.address()}, {self.form}>" + @property def root(self): From 8e3b614709f18424a8cb8b00cc98645b277d895a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 14:50:36 +0100 Subject: [PATCH 0178/1201] allow loading conllu files, where MentionSpan is not sorted --- udapi/core/coref.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3aed44e7..be8f6883 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -69,7 +69,7 @@ def words(self): @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words") + raise ValueError(f"Head
{self.head} not in new_words {new_words}") kept_words = [] for old_word in self._words: if old_word in new_words: @@ -250,6 +250,7 @@ def span_to_nodes(root, span): except ValueError as e: raise ValueError(f"Cannot parse '{span}': {e}") ranges.append((lo, hi)) + ranges.sort() def _num_in_ranges(num): for (lo, hi) in ranges: From c627846c40fc8f99cf68cb91662bd7b512b6dfd2 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Sat, 27 Feb 2021 23:56:55 +0100 Subject: [PATCH 0179/1201] a block for concatenating all MISC attributes named MentionMisc_... into a single value of MentionMisc --- udapi/block/corefud/concatmentionmisc.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 udapi/block/corefud/concatmentionmisc.py diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py new file mode 100644 index 00000000..28be34cf --- /dev/null +++ b/udapi/block/corefud/concatmentionmisc.py @@ -0,0 +1,23 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class ConcatMentionMisc(Block): + """All MISC attributes named MentionMisc_... are concatenated into MentionMisc""" + + def process_node(self,node): + for attrname in list(node.misc): + matchObj = re.match('MentionMisc_([^[]+)((\[\d+\])?)',attrname) + if matchObj: + innerattrib = matchObj.group(1) + index = matchObj.group(2) + + finalattr = 'MentionMisc'+index + value = node.misc[attrname] + + if finalattr not in node.misc: + node.misc[finalattr] = f'{innerattrib}:{value}' + else: + node.misc[finalattr] += f' {innerattrib}:{value}' + del node.misc[attrname] + From 84320e8299b8efabcd58af110d59ecc5509caf07 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Sun, 28 Feb 2021 00:14:33 +0100 Subject: [PATCH 0180/1201] including empty nodes in corefud blocks --- udapi/block/corefud/concatmentionmisc.py | 27 ++++++++++++------------ udapi/block/corefud/removemisc.py | 11 +++++----- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py index 28be34cf..aeb945a8 100644 --- a/udapi/block/corefud/concatmentionmisc.py +++ b/udapi/block/corefud/concatmentionmisc.py @@ -5,19 +5,20 @@ class ConcatMentionMisc(Block): """All MISC attributes named MentionMisc_... 
are concatenated into MentionMisc""" - def process_node(self,node): - for attrname in list(node.misc): - matchObj = re.match('MentionMisc_([^[]+)((\[\d+\])?)',attrname) - if matchObj: - innerattrib = matchObj.group(1) - index = matchObj.group(2) + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + matchObj = re.match('MentionMisc_([^[]+)((\[\d+\])?)',attrname) + if matchObj: + innerattrib = matchObj.group(1) + index = matchObj.group(2) - finalattr = 'MentionMisc'+index - value = node.misc[attrname] + finalattr = 'MentionMisc'+index + value = node.misc[attrname] - if finalattr not in node.misc: - node.misc[finalattr] = f'{innerattrib}:{value}' - else: - node.misc[finalattr] += f' {innerattrib}:{value}' - del node.misc[attrname] + if finalattr not in node.misc: + node.misc[finalattr] = f'{innerattrib}:{value}' + else: + node.misc[finalattr] += f' {innerattrib}:{value}' + del node.misc[attrname] diff --git a/udapi/block/corefud/removemisc.py b/udapi/block/corefud/removemisc.py index 6ca1e87a..f132aaed 100644 --- a/udapi/block/corefud/removemisc.py +++ b/udapi/block/corefud/removemisc.py @@ -9,9 +9,10 @@ def __init__(self, attrnames='', **kwargs): super().__init__(**kwargs) self.attrs4deletion = set(attrnames.split(',')) - def process_node(self,node): - for attrname in list(node.misc): - shortattrname = re.sub(r'\[\d+\]',r'',attrname) - if shortattrname in self.attrs4deletion: - del node.misc[attrname] + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if shortattrname in self.attrs4deletion: + del node.misc[attrname] From a66a530eb07c38d38a4c98a6f0b14ea62dbf6d06 Mon Sep 17 00:00:00 2001 From: Zdenek Zabokrtsky Date: Sun, 28 Feb 2021 00:30:44 +0100 Subject: [PATCH 0181/1201] forgotten commit --- udapi/block/corefud/bridgingclusters.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 udapi/block/corefud/bridgingclusters.py diff --git a/udapi/block/corefud/bridgingclusters.py b/udapi/block/corefud/bridgingclusters.py new file mode 100644 index 00000000..30ac49a7 --- /dev/null +++ b/udapi/block/corefud/bridgingclusters.py @@ -0,0 +1,17 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class BridgingClusters(Block): + + def process_node(self,node): + + if 'Bridging' in node.misc and "+" in node.misc['BridgingAllTargetClusterTexts']: + print("SENTENCE : "+node.root.get_sentence()) + print("SOURCE MENTION: "+node.misc['MentionText']) + print("RELATION: "+node.misc['Bridging']) + print("TARGET MENTION: "+node.misc['BridgingTargetMentionText']) + print("TARGET CLUSTER: "+node.misc['BridgingAllTargetClusterTexts']) + print() + + From 8ea1e4c76f230fba0282f5dcc071d46608b70e16 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 00:15:54 +0100 Subject: [PATCH 0182/1201] If MentionSpan is not provided, suppose only the head is in the span Otherwise, the mention is not reachable via `head.coref_mentions` and `len(mention.words) == 0`, both of which are misleading.
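An illustrative sketch of the intended behaviour, not part of the diff (the input file name is hypothetical and loading via `Document.load_conllu` is assumed):

    from udapi.core.document import Document

    doc = Document()
    doc.load_conllu("example.conllu")  # hypothetical file with a node annotated ClusterId=c1 but no MentionSpan
    for cluster in doc.coref_clusters.values():
        for mention in cluster.mentions:
            # with this commit, a mention without MentionSpan defaults to its head word,
            # so it is reachable from the head and has a non-empty word list
            assert mention.head in mention.words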
--- udapi/core/coref.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index be8f6883..1e2df00b 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -194,6 +194,8 @@ def load_coref_from_misc(doc): mention = CorefMention(node, cluster) if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] + else: + mentions.words = [node] cluster_type = node.misc["ClusterType" + index_str] if cluster_type is not None: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: From ba53c9a8dfea2ce565624ceb4f3d294075e50412 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 00:26:31 +0100 Subject: [PATCH 0183/1201] MentionMisc --- udapi/core/coref.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1e2df00b..793dd4d0 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -6,7 +6,7 @@ @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words'] + __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc'] def __init__(self, head, cluster=None): self._head = head @@ -15,6 +15,7 @@ def __init__(self, head, cluster=None): cluster._mentions.append(self) self._bridging = None self._words = [] + self.misc = None def __lt__(self, other): """Does this mention precedes (word-order wise) the `other` mention? @@ -195,7 +196,7 @@ def load_coref_from_misc(doc): if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] else: - mentions.words = [node] + mention.words = [node] cluster_type = node.misc["ClusterType" + index_str] if cluster_type is not None: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: @@ -204,6 +205,7 @@ def load_coref_from_misc(doc): # TODO deserialize Bridging and SplitAnte mention._bridging = node.misc["Bridging" + index_str] cluster._split_ante = node.misc["SplitAnte" + index_str] + mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] @@ -239,6 +241,8 @@ def store_coref_to_misc(doc): head.misc["ClusterType" + index_str] = cluster.cluster_type head.misc["Bridging" + index_str] = mention.bridging head.misc["SplitAnte" + index_str] = cluster.split_ante + if mention.misc: + head.misc["MentionMisc" + index_str] = mention.misc def span_to_nodes(root, span): From 15aa680241e033b824cafe5a24a7a2d737bfdceb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 18:43:18 +0100 Subject: [PATCH 0184/1201] support for Bridging --- udapi/core/coref.py | 84 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 793dd4d0..46d42e95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,6 +1,7 @@ """Classes for handling coreference.""" import re import functools +import collections import logging @functools.total_ordering @@ -59,6 +60,9 @@ def cluster(self, new_cluster): @property def bridging(self): + if self._bridging: + return self._bridging + self._bridging = BridgingLinks(self) return self._bridging # TODO add/edit bridging @@ -156,7 +160,7 @@ def split_ante(self): # TODO add/edit split_ante - # TODO adapt depending on how mention.bridging is implemented (callable list subclass) + # TODO or should we create a BridgingLinks 
instance with a fake src_mention? def all_bridging(self): for m in self._mentions: if m._bridging: @@ -164,6 +168,75 @@ def all_bridging(self): yield b +BridgingLink = collections.namedtuple('BridgingLink', 'target relation') + + +class BridgingLinks(collections.abc.MutableSequence): + """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods. + + Example usage: + >>> bl = BridgingLinks(src_mention) # empty links + >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples + >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset') # from a string + >>> for cluster, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") + >>> print(str(bl)) # c12:Part,c56:Subset + >>> bl('Part').targets == [c12] + >>> bl('Part|Subset').targets == [c12, c56] + >>> bl.append((c89, 'Funct')) + """ + def __init__(self, src_mention, value=None): + self.src_mention = src_mention + self._data = [] + if value is not None: + if isinstance(value, str): + self._from_string(string) + elif isinstance(value, collections.abc.Sequence): + for v in value: + self._data.append(BridgingLink(v[0], v[1])) + super().__init__() + + def __getitem__(self, key): + return self._data[key] + + def __len__(self): + return len(self._data) + + # TODO delete backlinks of old links + def __setitem__(self, key, new_value): + self._data[key] = BridgingLink(new_value[0], new_value[1]) + + def __delitem__(self, key): + del self._data[key] + + def insert(self, key, new_value): + self._data.insert(key, BridgingLink(new_value[0], new_value[1])) + + def __str__(self): + return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self) + + def _from_string(self, string): + self._data.clear() + clusters = self.src_mention.head.root.coref_clusters + for link_str in string.split(','): + target, relation = link_str.split(':') + self._data.append(BridgingLink(clusters[target], relation)) + + def __call__(self, relations_re=None): + """Return a subset of links contained in this list as specified by the args. 
+ Args: + relations_re: only links with a relation matching this regular expression will be returned + """ + if relations_re is None: + return self + return BridgingLinks(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + + @property + def targets(self): + """Return a list of the target clusters (without relations).""" + return [link.target for link in self._data] + + def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): clusters = head.root.bundle.document.coref_clusters if not cluster_id: @@ -202,8 +275,10 @@ def load_coref_from_misc(doc): if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type - # TODO deserialize Bridging and SplitAnte - mention._bridging = node.misc["Bridging" + index_str] + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention, bridging_str) + # TODO deserialize SplitAnte cluster._split_ante = node.misc["SplitAnte" + index_str] mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] @@ -239,6 +314,7 @@ def store_coref_to_misc(doc): head.misc["ClusterId" + index_str] = cluster.cluster_id head.misc["MentionSpan" + index_str] = mention.span head.misc["ClusterType" + index_str] = cluster.cluster_type - head.misc["Bridging" + index_str] = mention.bridging + if mention._bridging: + head.misc["Bridging" + index_str] = str(mention.bridging) head.misc["SplitAnte" + index_str] = cluster.split_ante if mention.misc: head.misc["MentionMisc" + index_str] = mention.misc From fdf5604428fd87890f8cb22038ccce3c8d645650 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 21:24:18 +0100 Subject: [PATCH 0185/1201] handle SplitAnte --- udapi/core/coref.py | 33 +++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 46d42e95..5c8f5b89 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -102,13 +102,13 @@ def span(self, new_span): class CorefCluster(object): """Class for representing all mentions of a given entity.""" - __slots__ = ['_cluster_id', '_mentions', 'cluster_type', '_split_ante'] + __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): self._cluster_id = cluster_id self._mentions = [] self.cluster_type = cluster_type - self._split_ante = None + self.split_ante = [] @property def cluster_id(self): @@ -154,12 +154,6 @@ def create_mention(self, head=None, mention_words=None, mention_span=None): mention.span = mention_span return mention - @property - def split_ante(self): - return self._split_ante - - # TODO add/edit split_ante - - # TODO or should we create a BridgingLinks
def all_bridging(self): for m in self._mentions: @@ -202,7 +196,7 @@ def __getitem__(self, key): def __len__(self): return len(self._data) - # TODO delete backlinks of old links + # TODO delete backlinks of old links, dtto for SplitAnte def __setitem__(self, key, new_value): self._data[key] = BridgingLink(new_value[0], new_value[1]) @@ -275,11 +269,24 @@ def load_coref_from_misc(doc): if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type + bridging_str = node.misc["Bridging" + index_str] if bridging_str: mention._bridging = BridgingLinks(mention, bridging_str) - # TODO deserialize SplitAnte - cluster._split_ante = node.misc["SplitAnte" + index_str] + + split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + for ante_str in split_ante_str.split('+'): + if ante_str in clusters: + split_antes.append(clusters[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + cluster = CorefCluster(ante_str) + clusters[ante_str] = cluster + split_antes.append(cluster) + cluster.split_ante = split_antes + mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" @@ -316,7 +323,9 @@ def store_coref_to_misc(doc): head.misc["ClusterType" + index_str] = cluster.cluster_type if mention._bridging: head.misc["Bridging" + index_str] = str(mention.bridging) - head.misc["SplitAnte" + index_str] = cluster.split_ante + if cluster.split_ante: + serialized = '+'.join((c.cluster_id for c in cluster.split_ante)) + head.misc["SplitAnte" + index_str] = serialized if mention.misc: head.misc["MentionMisc" + index_str] = mention.misc From cb75f9e9cc04e87229a53e08a3216dd43b620a03 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 13:09:24 +0100 Subject: [PATCH 0186/1201] bugfix: prevent infinite recursion when loading coref_clusters from MISC We cannot access src_mention.head.root.coref_clusters when loading Bridging because these are not ready yet. 
--- udapi/core/coref.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 5c8f5b89..818bb248 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -171,7 +171,7 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples - >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset') # from a string + >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters) # from a string >>> for cluster, relation in bl: >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") >>> print(str(bl)) # c12:Part,c56:Subset @@ -179,12 +179,18 @@ class BridgingLinks(collections.abc.MutableSequence): >>> bl('Part|Subset').targets == [c12, c56] >>> bl.append((c89, 'Funct')) """ - def __init__(self, src_mention, value=None): + def __init__(self, src_mention, value=None, clusters=None): self.src_mention = src_mention self._data = [] if value is not None: if isinstance(value, str): - self._from_string(string) + if clusters is None: + raise ValueError('BridgingClusters: clusters must be provided if initializing with a string') + try: + self._from_string(value, clusters) + except ValueError: + logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") + raise elif isinstance(value, collections.abc.Sequence): for v in value: self._data.append(BridgingLink(v[0], v[1])) @@ -209,9 +215,8 @@ def insert(self, key, new_value): def __str__(self): return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self) - def _from_string(self, string): + def _from_string(self, string, clusters): self._data.clear() - clusters = self.src_mention.head.root.coref_clusters for link_str in string.split(','): target, relation = link_str.split(':') self._data.append(BridgingLink(clusters[target], relation)) @@ -272,7 +277,7 @@ def load_coref_from_misc(doc): bridging_str = node.misc["Bridging" + index_str] if bridging_str: - mention._bridging = BridgingLinks(mention, bridging_str) + mention._bridging = BridgingLinks(mention, bridging_str, clusters) split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: From 27d2ed3ea6d4c3f9fe5a50cce78d5370009aee3e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 15:18:27 +0100 Subject: [PATCH 0187/1201] corefud.PrintCluster for revealing cluster_id used in several documents --- udapi/block/corefud/printcluster.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 udapi/block/corefud/printcluster.py diff --git a/udapi/block/corefud/printcluster.py b/udapi/block/corefud/printcluster.py new file mode 100644 index 00000000..5f44fab2 --- /dev/null +++ b/udapi/block/corefud/printcluster.py @@ -0,0 +1,19 @@ +from udapi.core.block import Block +from collections import Counter + +class PrintCluster(Block): + """Block corefud.PrintCluster prints all mentions of a given cluster.""" + + def __init__(self, cluster_id, **kwargs): + super().__init__(**kwargs) + self.cluster_id = cluster_id + + def process_document(self, doc): + cluster = doc.coref_clusters.get(self.cluster_id) + if cluster and cluster.mentions: + print(f"Coref cluster {self.cluster_id} has {len(cluster.mentions)} mentions in document {doc.meta['docname']}:") + counter = Counter() + for mention in cluster.mentions: + counter[' '.join([w.form for w in mention.words])] += 1 + for form, count in counter.most_common(): + 
print(f"{count:4}: {form}") From fd86474fd17a73fb237e24ac206ef5c4b8eabd2c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 13:50:27 +0100 Subject: [PATCH 0188/1201] fixing typo found in the review --- udapi/core/coref.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 818bb248..41ba5a41 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -60,9 +60,8 @@ def cluster(self, new_cluster): @property def bridging(self): - if self._bridging: - return self._bridging - self._bridging = BridgingLinks(self) + if not self._bridging: + self._bridging = BridgingLinks(self) return self._bridging # TODO add/edit bridging @@ -166,7 +165,7 @@ def all_bridging(self): class BridgingLinks(collections.abc.MutableSequence): - """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods. + """BridgingLinks class serves as a list of BridgingLink tuples with additional methods. Example usage: >>> bl = BridgingLinks(src_mention) # empty links From 0e27217072e62cfc6a16db69e3b8fd6b22104863 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 13:56:43 +0100 Subject: [PATCH 0189/1201] coref must be loaded also from empty nodes An alternative to #77 --- udapi/core/coref.py | 4 ++-- udapi/core/document.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 41ba5a41..143e50ff 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -252,7 +252,7 @@ def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): def load_coref_from_misc(doc): clusters = {} - for node in doc.nodes: + for node in doc.nodes_and_empty: index, index_str = 0, "" cluster_id = node.misc["ClusterId"] if not cluster_id: @@ -302,7 +302,7 @@ def store_coref_to_misc(doc): if not doc._coref_clusters: return attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte") - for node in doc.nodes: + for node in doc.nodes_and_empty: for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] diff --git a/udapi/core/document.py b/udapi/core/document.py index 58b4b34e..f02f831e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -88,12 +88,20 @@ def trees(self): @property def nodes(self): - """An iterator over all nodes in the document.""" + """An iterator over all nodes (excluding empty nodes) in the document.""" for bundle in self: for tree in bundle: for node in tree._descendants: yield node + @property + def nodes_and_empty(self): + """An iterator over all nodes and empty nodes in the document.""" + for bundle in self: + for tree in bundle: + for node in tree.descendants_and_empty: + yield node + def draw(self, **kwargs): """Pretty print the trees using TextModeTrees.""" TextModeTrees(**kwargs).run(self) From 44def37aad48996d8671a1c3b49f371f6edb4ec0 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Fri, 26 Feb 2021 11:38:55 +0100 Subject: [PATCH 0190/1201] a block for cluster re-indexing --- udapi/block/corefud/indexclusters.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 udapi/block/corefud/indexclusters.py diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py new file mode 100644 index 00000000..f602b93a --- /dev/null +++ b/udapi/block/corefud/indexclusters.py @@ -0,0 +1,26 @@ +"""Block corefud.IndexClusters""" +from udapi.core.block import Block + + +class 
IndexClusters(Block): + """Re-index the coreference cluster IDs. The final cluster IDs are of the "c<N>" form, + where <N> are ordinal numbers starting from the one specified by the `start` parameter. + + Parameters: + ----------- + start : int + the starting index (by default 1) + """ + + def __init__(self, start=1): + self.start = start + + def process_document(self, doc): + clusters = doc.coref_clusters + new_clusters = {} + for idx, cid in enumerate(clusters, self.start): + cluster = clusters[cid] + new_cid = "c" + str(idx) + # need to change private variable + cluster._cluster_id = new_cid + new_clusters[new_cid] = cluster From 0760a2b12e92824b8dc5c46ba0c4fc688cf211de Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 16:24:09 +0100 Subject: [PATCH 0191/1201] bugfix: empty_node.ord cannot be printed with %d --- udapi/core/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index c49f8128..7fa4ec0c 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -803,7 +803,7 @@ def address(self): e.g. s123/en_udpipe#4. If zone is empty, the slash is excluded as well, e.g. s123#4. """ - return '%s#%d' % (self._root.address() if self._root else '?', self._ord) + return f"{self._root.address() if self._root else '?'}#{self._ord}" @property def multiword_token(self): From f2bedb6804d24eaaae6b7de0cf19ebae87942e97 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 16:51:59 +0100 Subject: [PATCH 0192/1201] bugfix: Bridging can refer to clusters which are not defined yet --- udapi/core/coref.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 143e50ff..b1348067 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -187,7 +187,7 @@ def __init__(self, src_mention, value=None, clusters=None): raise ValueError('BridgingClusters: clusters must be provided if initializing with a string') try: self._from_string(value, clusters) - except ValueError: + except Exception: logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") raise elif isinstance(value, collections.abc.Sequence): for v in value: self._data.append(BridgingLink(v[0], v[1])) From 5fe2ab7829f8d2b5895ca714fb5312925e0655ab Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 21:41:50 +0100 Subject: [PATCH 0193/1201] `write.Conllu overwrite=1` will overwrite the input files as stored in `doc.meta["loaded_from"]` --- udapi/core/basewriter.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 3f28d155..0db348a8 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -11,7 +11,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', **kwargs): + newline='\n', overwrite=False, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -24,6 +24,11 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' self.docname_as_file = docname_as_file if docname_as_file and files != '-': raise ValueError("docname_as_file=1 is
not compatible with files=" + files) + self.overwrite = overwrite + if overwrite and files != '-': + raise ValueError("overwrite=1 is not compatible with files=" + files) + if overwrite and docname_as_file: + raise ValueError("overwrite=1 is not compatible with docname_as_file=1") @property def filename(self): @@ -54,6 +59,13 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') + elif self.overwrite: + docname = document.meta.get('loaded_from', None) + if docname is not None: + logging.info('Writing to file %s.', docname) + sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + else: + logging.warning('overwrite=1 but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: From 67b303d666f93c87e3bbbbeb491b681f0ed829f9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 21:43:15 +0100 Subject: [PATCH 0194/1201] Make cluster IDs unique across all documents processed --- udapi/block/corefud/indexclusters.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index f602b93a..6c0e9242 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -5,7 +5,11 @@ class IndexClusters(Block): """Re-index the coreference cluster IDs. The final cluster IDs are of the "c<N>" form, where <N> are ordinal numbers starting from the one specified by the `start` parameter. - + This block can be applied on multiple documents within one udapy call. + For example, to re-index ClusterId in all conllu files in the current directory + (keeping the IDs unique across all the files), use: + `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` + Parameters: ----------- start : int the starting index (by default 1) @@ -22,5 +26,6 @@ def process_document(self, doc): cluster = clusters[cid] new_cid = "c" + str(idx) # need to change private variable - cluster._cluster_id = new_cid + cluster._cluster_id = new_cid new_clusters[new_cid] = cluster + self.start = idx + 1 From 49e94345d6ab84d36478d687262906cf0672513c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Mar 2021 18:34:00 +0100 Subject: [PATCH 0195/1201] bugfix: MentionMisc was not properly updated in write.Conllu --- udapi/core/coref.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index b1348067..0f9df0d1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -303,7 +303,7 @@ def load_coref_from_misc(doc): def store_coref_to_misc(doc): if not doc._coref_clusters: return - attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte") + attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte", "MentionMisc") for node in doc.nodes_and_empty: for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] From 66ae04a5aa8c02afbaa54bff7014af007d768e82 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Mar 2021 18:53:14 +0100 Subject: [PATCH 0196/1201] new_clusters not needed, idx not defined if there are no clusters --- udapi/block/corefud/indexclusters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 6c0e9242..2c5d8162 100644 --- a/udapi/block/corefud/indexclusters.py +++
b/udapi/block/corefud/indexclusters.py @@ -21,11 +21,11 @@ def __init__(self, start=1): def process_document(self, doc): clusters = doc.coref_clusters - new_clusters = {} + if not clusters: + return for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] new_cid = "c" + str(idx) # need to change private variable cluster._cluster_id = new_cid - new_clusters[new_cid] = cluster self.start = idx + 1 From 65dd375f527a5062a13e131090f84f85cf6b989f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Mar 2021 19:55:22 +0100 Subject: [PATCH 0197/1201] first version of corefud.MoveHead --- udapi/block/corefud/movehead.py | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 udapi/block/corefud/movehead.py diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py new file mode 100644 index 00000000..f2e0bea2 --- /dev/null +++ b/udapi/block/corefud/movehead.py @@ -0,0 +1,61 @@ +import logging +from collections import Counter +from udapi.core.block import Block +from udapi.core.node import find_minimal_common_treelet + +class MoveHead(Block): + """Block corefud.MoveHead moves the head to the highest node in each mention.""" + + def __init__(self, nontreelet='fix', **kwargs): + self.counter = Counter() + self.nontreelet = nontreelet + super().__init__(**kwargs) + + def find_head(self, mention): + empty_nodes, non_empty = [], [] + for w in mention.words: + (empty_nodes if w.is_empty() else non_empty).append(w) + if empty_nodes: + self.counter['with_empty'] += 1 + for empty_node in empty_nodes: + parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()] + if parents and parents[0] not in non_empty: + non_empty.append(parents[0]) + else: + # TODO we should climb up, but preventing cycles + # We could also introduce empty_node.nonempty_ancestor + logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}") + non_empty.sort() + + (highest, added_nodes) = find_minimal_common_treelet(*non_empty) + if highest in mention.words: + return highest, 'treelet' + + if 'warn' in self.nontreelet: + logging.warning(f"Non-treelet mention in {mention.head} (nearest common antecedent={highest})") + if 'mark' in self.nontreelet: + node.misc['Mark'] = 'non-treelet-mention' + for word in mention.words: + if not word.is_empty() and word.parent not in non_empty: + return word, 'nontreelet' + return mention.head, 'bug' + + def process_document(self, doc): + for cluster in doc.coref_clusters.values(): + for mention in cluster.mentions: + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head + + def process_end(self): + logging.info("corefud.MoveHead overview of mentions:") + total = self.counter['total'] + for key, value in self.counter.most_common(): + logging.info(f"{key:>16} = {value:6} ({100*value/total:5.1f}%)") From 740f9126c0036a0ba808d0559425eca40e2ddc7f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 2 Mar 2021 09:30:23 +0100 Subject: [PATCH 0198/1201] doc.coref_clusters should be updated after corefud.IndexClusters Thanks, @michnov, for reminding me. 
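A sketch of the effect (the input file name is hypothetical; loading via `Document.load_conllu` is assumed and `process_document` is called directly for brevity):

    from udapi.core.document import Document
    from udapi.block.corefud.indexclusters import IndexClusters

    doc = Document()
    doc.load_conllu("example.conllu")  # hypothetical file with cluster IDs like "e7", "e9"
    IndexClusters(start=1).process_document(doc)
    # doc.coref_clusters is now keyed by the new IDs ("c1", "c2", ...),
    # not by the stale pre-renaming IDs:
    assert all(cid == c.cluster_id for cid, c in doc.coref_clusters.items())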
--- udapi/block/corefud/indexclusters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 2c5d8162..dee45544 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -23,9 +23,12 @@ def process_document(self, doc): clusters = doc.coref_clusters if not clusters: return + new_clusters = {} for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] new_cid = "c" + str(idx) # need to change private variable cluster._cluster_id = new_cid + new_clusters[new_cid] = cluster self.start = idx + 1 + doc._coref_clusters = new_clusters From 5e5ef50e261ac791646fcf2f8b36ee489988bd24 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 3 Mar 2021 11:37:26 +0100 Subject: [PATCH 0199/1201] be consistent and require the regex parameter to include () if needed Before this commit, `udapy -TM` also matched nodes with MISC `Bugwhatever=1`. The intent is that only nodes with MISC containing `Bug=` are matched. --- udapi/block/write/textmodetrees.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index c8e619b9..fb38c22a 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -134,7 +134,7 @@ class TextModeTrees(BaseWriter): def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, - mark='ToDo|ToDoOrigText|Bug|Mark', marked_only=False, hints=True, + mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, layout='classic', **kwargs): """Create new TextModeTrees block object. @@ -159,7 +159,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind print_comments: Print comments (other than sent_id and text)? print_empty: Print empty nodes? mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. - If `print_comments and re.search(r'^ (%s) = ' % mark, root.comment, re.M)` + If `print_comments and re.search(r'^ %s = ' % mark, root.comment, re.M)` the comment is highlighted. Empty string means no highlighting. Default = 'ToDo|ToDoOrigText|Bug|Mark'. marked_only: print only trees containing one or more marked nodes/comments. Default=False.
@@ -203,7 +203,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.mark_re, self.comment_mark_re = None, None if mark is not None and mark != '': self.mark_re = re.compile(mark + '=') - self.comment_mark_re = re.compile(r'^ (%s) = ' % mark, re.M) + self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M) self._index_of = [] self._gaps = [] self.lines = [] From 3ea67d43143f29f4ec586fc7b0944a763bad1a2f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 3 Mar 2021 11:44:52 +0100 Subject: [PATCH 0200/1201] tool.morphodita should wrap not only `generate`, but also `analyze` --- udapi/tool/morphodita.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/udapi/tool/morphodita.py b/udapi/tool/morphodita.py index 656c6acb..326bee46 100644 --- a/udapi/tool/morphodita.py +++ b/udapi/tool/morphodita.py @@ -1,7 +1,7 @@ """Wrapper for MorphoDiTa (more pythonic than ufal.morphodita).""" from collections import namedtuple -from ufal.morphodita import Morpho, TaggedLemmasForms # pylint: disable=no-name-in-module +from ufal.morphodita import Morpho, TaggedLemmasForms, TaggedLemmas # pylint: disable=no-name-in-module from udapi.core.resource import require_file FormInfo = namedtuple('FormInfo', 'form lemma tag guesser') @@ -19,8 +19,7 @@ def __init__(self, model): raise IOError("Cannot load model from file '%s'" % path) def forms_of_lemma(self, lemma, tag_wildcard='?', guesser=True): - """Return all forms of a given lemma matching a given tag wildcard.""" - + """Return all forms (a list of FormInfo tuples) of a given lemma matching a given tag wildcard.""" use_guesser = 1 if guesser else 0 lemmas_forms = TaggedLemmasForms() used_guesser = self.tool.generate(lemma, tag_wildcard, use_guesser, lemmas_forms) @@ -29,3 +28,13 @@ def forms_of_lemma(self, lemma, tag_wildcard='?', guesser=True): for form in lemma_forms.forms: forms.append(FormInfo(form.form, lemma_forms.lemma, form.tag, used_guesser)) return forms + + def analyze_form(self, form, guesser=True): + """Return all lemma-tag analyses (a list of FormInfo tuples) of a given form.""" + use_guesser = 1 if guesser else 0 + tagged_lemmas = TaggedLemmas() + used_guesser = self.tool.analyze(form, use_guesser, tagged_lemmas) + result = [] + for tl in tagged_lemmas: + result.append(FormInfo(form, tl.lemma, tl.tag, used_guesser)) + return result From d5fa1b6d21b0d4ba80b55cf932fae3588486c669 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 3 Mar 2021 11:58:11 +0100 Subject: [PATCH 0201/1201] change the SplitAnte separator from plus to comma --- udapi/core/coref.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 0f9df0d1..8d4cd4da 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -283,7 +283,9 @@ def load_coref_from_misc(doc): split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: split_antes = [] - for ante_str in split_ante_str.split('+'): + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. 
+ for ante_str in split_ante_str.replace('+', ',').split(','): if ante_str in clusters: split_antes.append(clusters[ante_str]) else: # split cataphora, e.g. "We, that is you and me..." cluster = CorefCluster(ante_str) clusters[ante_str] = cluster split_antes.append(cluster) cluster.split_ante = split_antes mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] @@ -330,7 +332,7 @@ def store_coref_to_misc(doc): if mention._bridging: head.misc["Bridging" + index_str] = str(mention.bridging) if cluster.split_ante: - serialized = '+'.join((c.cluster_id for c in cluster.split_ante)) + serialized = ','.join((c.cluster_id for c in cluster.split_ante)) head.misc["SplitAnte" + index_str] = serialized if mention.misc: head.misc["MentionMisc" + index_str] = mention.misc From 7912ac805e0d76e85f4672a5087f5b62a4846953 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 3 Mar 2021 17:48:10 +0100 Subject: [PATCH 0202/1201] define ordering of clusters --- udapi/core/coref.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 8d4cd4da..c3c4acf5 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -99,6 +99,7 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering class CorefCluster(object): """Class for representing all mentions of a given entity.""" __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] @@ -109,6 +110,20 @@ def __init__(self, cluster_id, cluster_type=None): self._cluster_id = cluster_id self._mentions = [] self.cluster_type = cluster_type self.split_ante = [] + def __lt__(self, other): + """Does this CorefCluster precede (word-order wise) the `other` cluster? + + This method defines a total ordering of all clusters + by the first mention of each cluster (see `CorefMention.__lt__`). + Only if one of the clusters has no mentions (which should not happen normally), + the ordering is defined by the `cluster_id` (lexicographically). + If cluster IDs are not important, it is recommended to use block + `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering.
+ """ + if not self.mentions or not other.mentions: + return self._cluster_id < other._cluster_id + return self.mentions[0] < other.mentions[0] + @property def cluster_id(self): return self._cluster_id @@ -299,7 +314,7 @@ def load_coref_from_misc(doc): index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] - doc._coref_clusters = clusters + doc._coref_clusters = {k: clusters[k] for k in sorted(clusters)} def store_coref_to_misc(doc): @@ -310,7 +325,7 @@ def store_coref_to_misc(doc): for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] - for cluster in doc._coref_clusters.values(): + for cluster in sorted(doc._coref_clusters.values()): for mention in cluster.mentions: head = mention.head if head.misc["ClusterId"]: From 6ef20d80d10850290a6faf3c8e854304036c833f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Mar 2021 00:33:35 +0100 Subject: [PATCH 0203/1201] improved and renames corefud.PrintClusters --- udapi/block/corefud/printcluster.py | 19 ------------ udapi/block/corefud/printclusters.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 19 deletions(-) delete mode 100644 udapi/block/corefud/printcluster.py create mode 100644 udapi/block/corefud/printclusters.py diff --git a/udapi/block/corefud/printcluster.py b/udapi/block/corefud/printcluster.py deleted file mode 100644 index 5f44fab2..00000000 --- a/udapi/block/corefud/printcluster.py +++ /dev/null @@ -1,19 +0,0 @@ -from udapi.core.block import Block -from collections import Counter - -class PrintCluster(Block): - """Block corefud.PrintCluster prints all mentions of a given cluster.""" - - def __init__(self, cluster_id, **kwargs): - super().__init__(**kwargs) - self.cluster_id = cluster_id - - def process_document(self, doc): - cluster = doc.coref_clusters.get(self.cluster_id) - if cluster and cluster.mentions: - print(f"Coref cluster {self.cluster_id} has {len(cluster.mentions)} mentions in document {doc.meta['docname']}:") - counter = Counter() - for mention in cluster.mentions: - counter[' '.join([w.form for w in mention.words])] += 1 - for form, count in counter.most_common(): - print(f"{count:4}: {form}") diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printclusters.py new file mode 100644 index 00000000..274aebe5 --- /dev/null +++ b/udapi/block/corefud/printclusters.py @@ -0,0 +1,44 @@ +import re +import os.path +from udapi.core.block import Block +from collections import Counter, defaultdict + +class PrintClusters(Block): + """Block corefud.PrintClusters prints all mentions of a given cluster.""" + + def __init__(self, id_re=None, min_mentions=0, print_ranges=True, **kwargs): + """Params: + id_re: regular expression constraining ClusterId of the clusters to be printed + min_mentions: print only clusters with with at least N mentions + print_ranges: print also addressess of all mentions + (compactly, using the longest common prefix of sent_id) + """ + super().__init__(**kwargs) + self.id_re = re.compile(str(id_re)) if id_re else None + self.min_mentions = min_mentions + self.print_ranges = print_ranges + + def process_document(self, doc): + if 'docname' in doc.meta: + print(f"Coref clusters in document {doc.meta['docname']}:") + for cluster in doc.coref_clusters.values(): + if self.id_re and not self.id_re.match(cluster.cluster_id): + continue + if len(cluster.mentions) < self.min_mentions: + continue + print(f"{cluster.cluster_id} has {len(cluster.mentions)} mentions:") + counter = Counter() 
+ ranges = defaultdict(list) + for mention in cluster.mentions: + forms = ' '.join([w.form for w in mention.words]) + counter[forms] += 1 + if self.print_ranges: + ranges[forms].append(mention.head.root.address() + ':' +mention.span) + for form, count in counter.most_common(): + print(f"{count:4}: {form}") + if self.print_ranges: + if count == 1: + print(' ' + ranges[form][0]) + else: + prefix = os.path.commonprefix(ranges[form]) + print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') From 0a107692d6bbf081c943e162abc124a794ca9f88 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Mar 2021 00:36:55 +0100 Subject: [PATCH 0204/1201] cluster+mention ordering bug fixes * If two mentions start at the same word, the shorter one should go first. * `doc.coref_clusters` dict should be sorted by the values (clusters), not keys (cluster_id). --- udapi/core/coref.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index c3c4acf5..13ac3fd4 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -34,7 +34,7 @@ def __lt__(self, other): if node1 is node2: node1 = self._words[-1] if self._words else self._head node2 = other._words[-1] if other._words else other._head - return node1 > node2 + return node1 < node2 return node1 < node2 @property @@ -314,7 +314,11 @@ def load_coref_from_misc(doc): index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] - doc._coref_clusters = {k: clusters[k] for k in sorted(clusters)} + # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), + # not by the keys (cluster_id). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to preserve insertion order.
+ doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())} def store_coref_to_misc(doc): From 5ccb76b819bd9b66a70f5c7d471f06f66ea7772d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Mar 2021 14:03:42 +0100 Subject: [PATCH 0205/1201] missing imports --- udapi/core/basereader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 5d991a29..05f204b9 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -1,5 +1,7 @@ """BaseReader is the base class for all reader blocks.""" import gc +import re +import logging from udapi.core.block import Block from udapi.core.files import Files From a879b2dab675a628e590dce26a8aca636cee67c6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 5 Mar 2021 14:04:28 +0100 Subject: [PATCH 0206/1201] allow corefud.PrintClusters agregate_mentions=0 if you want to inspect the ordering of the mentions --- udapi/block/corefud/printclusters.py | 41 ++++++++++++++++------------ 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printclusters.py index 274aebe5..163fa1a7 100644 --- a/udapi/block/corefud/printclusters.py +++ b/udapi/block/corefud/printclusters.py @@ -6,7 +6,7 @@ class PrintClusters(Block): """Block corefud.PrintClusters prints all mentions of a given cluster.""" - def __init__(self, id_re=None, min_mentions=0, print_ranges=True, **kwargs): + def __init__(self, id_re=None, min_mentions=0, print_ranges=True, agregate_mentions=True, **kwargs): """Params: id_re: regular expression constraining ClusterId of the clusters to be printed min_mentions: print only clusters with at least N mentions print_ranges: print also addresses of all mentions (compactly, using the longest common prefix of sent_id) @@ -17,6 +17,7 @@ def __init__(self, id_re=None, min_mentions=0, print_ranges=True, **kwargs): self.id_re = re.compile(str(id_re)) if id_re else None self.min_mentions = min_mentions self.print_ranges = print_ranges + self.agregate_mentions = agregate_mentions def process_document(self, doc): if 'docname' in doc.meta: print(f"Coref clusters in document {doc.meta['docname']}:") @@ -26,19 +27,25 @@ def process_document(self, doc): continue if len(cluster.mentions) < self.min_mentions: continue - print(f"{cluster.cluster_id} has {len(cluster.mentions)} mentions:") - counter = Counter() - ranges = defaultdict(list) - for mention in cluster.mentions: - forms = ' '.join([w.form for w in mention.words]) - counter[forms] += 1 - if self.print_ranges: - ranges[forms].append(mention.head.root.address() + ':' +mention.span) - for form, count in counter.most_common(): - print(f"{count:4}: {form}") - if self.print_ranges: - if count == 1: - print(' ' + ranges[form][0]) - else: - prefix = os.path.commonprefix(ranges[form]) - print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') + print(f" {cluster.cluster_id} has {len(cluster.mentions)} mentions:") + if self.agregate_mentions: + counter = Counter() + ranges = defaultdict(list) + for mention in cluster.mentions: + forms = ' '.join([w.form for w in mention.words]) + counter[forms] += 1 + if self.print_ranges: + ranges[forms].append(mention.head.root.address() + ':' +mention.span) + for form, count in counter.most_common(): + print(f"{count:4}: {form}") + if self.print_ranges: + if count == 1: + print(' ' + ranges[form][0]) + else: + prefix = os.path.commonprefix(ranges[form]) + print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') + else: + for mention in cluster.mentions: + print(' ' + ' '.join([w.form for w in mention.words])) + if self.print_ranges: + print(f"
From 8dd20467c3bc97052c8c107ea8f2a8769b599f9e Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 6 Mar 2021 12:04:32 +0100
Subject: [PATCH 0207/1201] node.precedes(n) now works across trees and is used
 in mentionA < mentionB

---
 udapi/core/coref.py | 4 ++--
 udapi/core/node.py | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 13ac3fd4..0cb55e65 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -34,8 +34,8 @@ def __lt__(self, other):
 if node1 is node2:
 node1 = self._words[-1] if self._words else self._head
 node2 = other._words[-1] if other._words else other._head
- return node1 < node2
- return node1 < node2
+ return node1.precedes(node2)
+ return node1.precedes(node2)

 @property
 def head(self):
diff --git a/udapi/core/node.py b/udapi/core/node.py
index 7fa4ec0c..1f038087 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -127,6 +127,12 @@ def ord(self, new_ord):
 self._ord = new_ord

 def __lt__(self, other):
+ """Calling `nodeA < nodeB` is equivalent to `nodeA.ord < nodeB.ord`.
+
+ Note that this does not work as expected for nodes from different trees
+ because `ord` is the word order within each sentence.
+ For comparing the word order across trees, use `nodeA.precedes(nodeB)` instead.
+ """
 return self._ord < other._ord

 @property
@@ -660,8 +666,18 @@ def next_node(self):
 return None

 def precedes(self, node):
- """Does this node precedes another `node` in word order (`self.ord < node.ord`)?"""
- return self._ord < node._ord
+ """Does this node precede another `node` in word order?
+
+ This method also handles nodes from different trees correctly (within the same zone).
+ If you have nodes from the same tree, it is faster and more elegant to use just `nodeA < nodeB`,
+ which is equivalent to calling `nodeA.ord < nodeB.ord`.
+ For sorting nodes from the same tree, you can use `nodes.sort()` or `sorted(nodes)`.
+ """
+ if self._root is node._root:
+ return self._ord < node._ord
+ if self._root._zone != node._root._zone:
+ raise ValueError(f"Cannot compare word order across zones: {self} {node}")
+ return self._root._bundle.number < node._root._bundle.number

 def is_leaf(self):
 """Is this node a leaf, ie. a node without any children?"""

From 07ff9d5e3e87d1d0d24fa6e9905e40458871eb76 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 6 Mar 2021 12:58:47 +0100
Subject: [PATCH 0208/1201] ud.Basic2Enhanced for copying basic deps to enhanced
 deps

---
 udapi/block/ud/basic2enhanced.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 udapi/block/ud/basic2enhanced.py

diff --git a/udapi/block/ud/basic2enhanced.py b/udapi/block/ud/basic2enhanced.py
new file mode 100644
index 00000000..bc5c8b25
--- /dev/null
+++ b/udapi/block/ud/basic2enhanced.py
@@ -0,0 +1,23 @@
+"""Block ud.Basic2Enhanced for copying basic dependencies to enhanced where missing.
+
+UD treebanks are not required to have enhanced dependencies (https://universaldependencies.org/u/overview/enhanced-syntax.html).
+However, if such annotation is present (in the DEPS column of CoNLL-U),
+it must be present in all nodes and all nodes must be reachable from the root
+in the enhanced-deps graph (as checked by the validator).
+There may be use cases where enhanced deps are annotated only in some kinds of nodes (e.g. empty nodes)
+and the rest of the nodes is expected to be the same as in the basic dependencies.
+To make such a file valid, one can use this block.
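+
+A minimal usage sketch (hypothetical file names, assuming the standard udapy CLI):
+
+    udapy -s ud.Basic2Enhanced < in.conllu > out.conllu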
+
+This block should not be used on a file with no enhanced dependencies:
+It makes no sense to merely duplicate the HEAD+DEPREL information in the DEPS column.
+"""
+from udapi.core.block import Block
+
+
+class Basic2Enhanced(Block):
+ """Make sure DEPS column is always filled."""
+
+ def process_tree(self, tree):
+ for node in tree.descendants_and_empty:
+ if node.raw_deps == "_":
+ node.raw_deps = f"{node.parent.ord}:{node.deprel}"

From abd220953e088b59f08cc2ce4cf97c3c00b2b68f Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 8 Mar 2021 09:56:03 +0100
Subject: [PATCH 0209/1201] better corefud.MoveHead: fewer warnings, possibility
 to store all bugs in misc['Bug']

---
 udapi/block/corefud/movehead.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py
index f2e0bea2..50ef1480 100644
--- a/udapi/block/corefud/movehead.py
+++ b/udapi/block/corefud/movehead.py
@@ -6,9 +6,9 @@ class MoveHead(Block):
 """Block corefud.MoveHead moves the head to the highest node in each mention."""

- def __init__(self, nontreelet='fix', **kwargs):
+ def __init__(self, bugs='warn', **kwargs):
 self.counter = Counter()
- self.nontreelet = nontreelet
+ self.bugs = bugs
 super().__init__(**kwargs)

 def find_head(self, mention):
@@ -19,25 +19,34 @@ def find_head(self, mention):
 self.counter['with_empty'] += 1
 for empty_node in empty_nodes:
 parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()]
- if parents and parents[0] not in non_empty:
- non_empty.append(parents[0])
+ if parents:
+ if parents[0] not in non_empty:
+ non_empty.append(parents[0])
 else:
 # TODO we should climb up, but preventing cycles
 # We could also introduce empty_node.nonempty_ancestor
- logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
+ if 'warn' in self.bugs:
+ logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
+ if 'mark' in self.bugs:
+ node.misc['Bug'] = 'no-parent-of-empty'
 non_empty.sort()
 (highest, added_nodes) = find_minimal_common_treelet(*non_empty)
 if highest in mention.words:
 return highest, 'treelet'

- if 'warn' in self.nontreelet:
+ if 'warn' in self.bugs:
 logging.warning(f"Non-treelet mention in {mention.head} (nearest common antecedent={highest})")
- if 'mark' in self.nontreelet:
- node.misc['Mark'] = 'non-treelet-mention'
+ if 'mark' in self.bugs:
+ mention.head.misc['Bug'] = 'non-treelet-mention'
 for word in mention.words:
 if not word.is_empty() and word.parent not in non_empty:
 return word, 'nontreelet'
+
+ if 'warn' in self.bugs:
+ logging.warning(f"Strange mention {mention.head} (nearest common antecedent={highest})")
+ if 'mark' in self.bugs:
+ mention.head.misc['Bug'] = 'strange-mention'
 return mention.head, 'bug'

 def process_document(self, doc):

From dfe8343042b689b9b5ade9f856c6977f994d7726 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 9 Mar 2021 14:51:03 +0100
Subject: [PATCH 0210/1201] print also average cluster/mention length

---
 udapi/block/corefud/stats.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py
index 586c0236..8b919005 100644
--- a/udapi/block/corefud/stats.py
+++ b/udapi/block/corefud/stats.py
@@ -35,6 +35,7 @@ def process_document(self, doc):
 elif len_mentions > 1 and self.exclude_nonsingletons:
 continue
 self.longest_cluster = max(len_mentions, self.longest_cluster)
+
self.counter['c_total_len'] += len_mentions self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 self.clusters += 1 @@ -45,6 +46,7 @@ def process_document(self, doc): self.m_words += all_words self.m_empty += all_words - non_empty self.longest_mention = max(non_empty, self.longest_mention) + self.counter['m_total_len'] += non_empty self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 def process_end(self): @@ -56,14 +58,16 @@ def process_end(self): if self.report_clusters: columns += [('clusters', f"{self.clusters:7,}"), ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), - ('longest_cluster', f"{self.longest_cluster:6}")] + ('longest_cluster', f"{self.longest_cluster:6}"), + ('avg_cluster', f"{self.counter['c_total_len'] / self.clusters:5.1f}")] for i in range(1, self.c_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) if self.report_mentions: columns += [('mentions', f"{self.mentions:7,}"), ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), - ('longest_mention', f"{self.longest_mention:6}")] + ('longest_mention', f"{self.longest_mention:6}"), + ('avg_mention', f"{self.counter['m_total_len'] / self.mentions:5.1f}")] for i in range(0, self.m_len_max + 1): percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) From 33065f2f771f37a4495aed1e161f1de10ec158ec Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 9 Mar 2021 16:03:50 +0100 Subject: [PATCH 0211/1201] define ordering of mentions with the same first and last word --- udapi/core/coref.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 0cb55e65..351561ea 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -34,7 +34,8 @@ def __lt__(self, other): if node1 is node2: node1 = self._words[-1] if self._words else self._head node2 = other._words[-1] if other._words else other._head - return node1.precedes(node2) + if node1 is node2: + return len(self._words) < len(other._words) return node1.precedes(node2) @property From a559ea271a18d5fd9f2fb6069eb8180d530cb360 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 9 Mar 2021 16:16:14 +0100 Subject: [PATCH 0212/1201] don't warn about Non-treelet mention --- udapi/block/corefud/movehead.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index 50ef1480..a0326d98 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -35,8 +35,8 @@ def find_head(self, mention): if highest in mention.words: return highest, 'treelet' - if 'warn' in self.bugs: - logging.warning(f"Non-treelet mention in {mention.head} (nearest common antecedent={highest})") + #if 'warn' in self.bugs: + # logging.warning(f"Non-treelet mention in {mention.head} (nearest common antecedent={highest})") if 'mark' in self.bugs: mention.head.misc['Bug'] = 'non-treelet-mention' for word in mention.words: From f0af0dd49dae2bec0f9978e5bffa703977e2bf74 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Mar 2021 12:30:19 +0100 Subject: [PATCH 0213/1201] cluster.mentions should be sorted at least after loading and before storing to CoNLL-U because guaranteeing always-sorted mentions would be too difficult. 
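A hypothetical illustration of the resulting contract (not part of the patch;
the file name is made up):

    import udapi
    doc = udapi.Document("in.conllu")  # mentions get sorted during loading
    for cluster in doc.coref_clusters.values():
        # holds right after loading; in-memory edits may temporarily break it
        assert cluster.mentions == sorted(cluster.mentions)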
---
 udapi/core/coref.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 351561ea..7d9229bf 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -131,7 +131,6 @@ def cluster_id(self):

 @property
 def mentions(self):
- #TODO return sorted(self._mentions, key=lambda x:...
 return self._mentions

 def create_mention(self, head=None, mention_words=None, mention_span=None):
@@ -167,6 +166,7 @@ def create_mention(self, head=None, mention_words=None, mention_span=None):
 mention.words = mention_words
 if mention_span:
 mention.span = mention_span
+ self._mentions.sort()
 return mention

 # TODO or should we create a BridgingLinks instance with a fake src_mention?
@@ -330,6 +330,17 @@ def store_coref_to_misc(doc):
 for key in list(node.misc):
 if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs):
 del node.misc[key]
+ # doc._coref_clusters is a dict, which is insertion-ordered in Python 3.7+.
+ # The insertion order is sorted according to CorefCluster.__lt__ (see a few lines above).
+ # However, new clusters could be added meanwhile or some clusters edited,
+ # so we need to sort the clusters again before storing to MISC.
+ # We also need to make sure cluster.mentions are sorted in each cluster
+ # because the ordering of clusters is defined by the first mention in each cluster.
+ # Ordering of mentions within a cluster can be changed e.g. when changing the span
+ # of a given mention or reordering words within a sentence, and in such cases
+ # Udapi currently does not automatically update the ordering of clusters.
+ for cluster in doc._coref_clusters.values():
+ cluster._mentions.sort()
 for cluster in sorted(doc._coref_clusters.values()):
 for mention in cluster.mentions:
 head = mention.head

From aba224bc4493912966fe228e179e1715feb12abe Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Mar 2021 14:41:20 +0100
Subject: [PATCH 0214/1201] stderr was not flushed if redirected to a file

---
 bin/udapy | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bin/udapy b/bin/udapy
index c7ed8bba..528e3577 100755
--- a/bin/udapy
+++ b/bin/udapy
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os
 import gc
+import sys
 import atexit
 import logging
 import argparse
@@ -86,6 +87,7 @@ if __name__ == "__main__":
 if not args.gc:
 gc.disable()
 atexit.register(os._exit, 0)
+ atexit.register(sys.stderr.flush)
 if args.save:
 args.scenario = args.scenario + ['write.Conllu']
 if args.save_text_mode_trees:

From bfaf062db53b54cf97a3457c5fc52e8737e9ff45 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Mar 2021 16:30:57 +0100
Subject: [PATCH 0215/1201] typo

---
 udapi/block/corefud/printclusters.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printclusters.py
index 163fa1a7..a9a03f5e 100644
--- a/udapi/block/corefud/printclusters.py
+++ b/udapi/block/corefud/printclusters.py
@@ -6,7 +6,7 @@ class PrintClusters(Block):
 """Block corefud.PrintClusters prints all mentions of a given cluster."""

- def __init__(self, id_re=None, min_mentions=0, print_ranges=True, agregate_mentions=True, **kwargs):
+ def __init__(self, id_re=None, min_mentions=0, print_ranges=True, aggregate_mentions=True, **kwargs):
 """Params:
 id_re: regular expression constraining ClusterId of the clusters to be printed
 min_mentions: print only clusters with at least N mentions
@@ -17,7 +17,7 @@ def __init__(self, id_re=None, min_mentions=0, print_ranges=True, agregate_menti
 self.id_re = re.compile(str(id_re)) if id_re else None
 self.min_mentions = min_mentions
 self.print_ranges = print_ranges
- self.agregate_mentions = agregate_mentions
+ self.aggregate_mentions = aggregate_mentions

 def process_document(self, doc):
 if 'docname' in doc.meta:
@@ -28,7 +28,7 @@ def process_document(self, doc):
 if len(cluster.mentions) < self.min_mentions:
 continue
 print(f" {cluster.cluster_id} has {len(cluster.mentions)} mentions:")
- if self.agregate_mentions:
+ if self.aggregate_mentions:
 counter = Counter()
 ranges = defaultdict(list)
 for mention in cluster.mentions:

From ccc47433cc4f22a67a2a082557c482bfd5603f2b Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Mar 2021 18:15:42 +0100
Subject: [PATCH 0216/1201] fail if non-existing ClusterId referenced in
 SplitAnte or Bridging

---
 udapi/core/coref.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 7d9229bf..95260b4b 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -121,9 +121,9 @@ def __lt__(self, other):
 If cluster IDs are not important, it is recommended to use block
 `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering.
 """
- if not self.mentions or not other.mentions:
+ if not self._mentions or not other._mentions:
 return self._cluster_id < other._cluster_id
- return self.mentions[0] < other.mentions[0]
+ return self._mentions[0] < other._mentions[0]

 @property
 def cluster_id(self):
@@ -314,7 +314,11 @@ def load_coref_from_misc(doc):
 index += 1
 index_str = f"[{index}]"
 cluster_id = node.misc["ClusterId" + index_str]
 # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc.
 # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__),
 # not by the keys (cluster_id).
 # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to preserve insertion order.
+ for cluster in clusters.values():
+ if not cluster._mentions:
+ raise ValueError(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId")
+ cluster._mentions.sort()
 doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())}

From 9ce505fe51ea98fc91778ca4ea0890df543a9e5b Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Mar 2021 18:36:40 +0100
Subject: [PATCH 0217/1201] ordering of clusters should be total even if some
 clusters have no mentions

We want to prevent clusters with no mentions, but that's another issue.
When the ordering is not total, Udapi gives unpredictable and hard-to-debug
results. E.g. when c6 has no mentions,
 sorted(doc.coref_clusters.values())
gives
 c22 c6 c7 c17
although c17 < c22; this comparison is never tested within sorted.
We can see that c22 < c6 because c6 has no mentions
and this comparison was thus driven by "c22" < "c6" (lexicographically).
c6 < c7, also lexicographically by the cluster_id.
c7 < c17 by their position in the data.

---
 udapi/core/coref.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 95260b4b..4e246783 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -116,13 +116,17 @@ def __lt__(self, other):

 This method defines a total ordering of all clusters
 by the first mention of each cluster (see `CorefMention.__lt__`).
- Only if one of the clusters has no mentions (which should not happen normally),
- the ordering is defined by the `cluster_id` (lexicographically).
+ If one of the clusters has no mentions (which should not happen normally),
+ there is a backup solution (see the source code).
If cluster IDs are not important, it is recommended to use block `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. """ if not self._mentions or not other._mentions: - return self._cluster_id < other._cluster_id + # Clusters without mentions should go first, so the ordering is total. + # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. + if not self._mentions and not other._mentions: + return self._cluster_id < other._cluster_id + return not self._mentions return self._mentions[0] < other._mentions[0] @property From 6cddc1f3e6e72eaedd48b090d0409375d04f6e51 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Mar 2021 19:14:12 +0100 Subject: [PATCH 0218/1201] bugfix: parsing SplitAnte --- udapi/core/coref.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 4e246783..110b1ef1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -310,9 +310,9 @@ def load_coref_from_misc(doc): split_antes.append(clusters[ante_str]) else: # split cataphora, e.g. "We, that is you and me..." - cluster = CorefCluster(ante_str) - clusters[ante_str] = cluster - split_antes.append(cluster) + ante_cl = CorefCluster(ante_str) + clusters[ante_str] = ante_cl + split_antes.append(ante_cl) cluster.split_ante = split_antes mention.misc = node.misc["MentionMisc" + index_str] From 5bb27398297bfb5b6bc01933d15815410be7ab7b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Mar 2021 19:42:50 +0100 Subject: [PATCH 0219/1201] keep SplitAnte and Bridging sorted both after loading and before saving --- udapi/core/coref.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 110b1ef1..243cac63 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -232,7 +232,7 @@ def insert(self, key, new_value): self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): - return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self) + return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in sorted(self._data)) def _from_string(self, string, clusters): self._data.clear() @@ -241,6 +241,7 @@ def _from_string(self, string, clusters): if target not in clusters: clusters[target] = CorefCluster(target) self._data.append(BridgingLink(clusters[target], relation)) + self._data.sort() def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. 
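An illustrative aside, not part of the patch: with `__str__` sorting the links,
the serialized form no longer depends on the order of appends. Assuming clusters
c12 and c56 as in the BridgingLinks docstring, with c12's first mention preceding
c56's first mention:

    bl.append((c56, 'Subset'))
    bl.append((c12, 'Part'))
    str(bl)  # 'c12:Part,c56:Subset' regardless of the append order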
@@ -313,7 +314,7 @@ def load_coref_from_misc(doc):
 ante_cl = CorefCluster(ante_str)
 clusters[ante_str] = ante_cl
 split_antes.append(ante_cl)
- cluster.split_ante = split_antes
+ cluster.split_ante = sorted(split_antes)

 mention.misc = node.misc["MentionMisc" + index_str]
 index += 1
@@ -371,7 +372,7 @@ def store_coref_to_misc(doc):
 if mention._bridging:
 head.misc["Bridging" + index_str] = str(mention.bridging)
 if cluster.split_ante:
- serialized = ','.join((c.cluster_id for c in cluster.split_ante))
+ serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante)))
 head.misc["SplitAnte" + index_str] = serialized
 if mention.misc:
 head.misc["MentionMisc" + index_str] = mention.misc

From 2cade18ac39f7e3a85d5f8aa94dc573e20841657 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 10 Mar 2021 20:18:17 +0100
Subject: [PATCH 0220/1201] fail if loading self-referencing SplitAnte or
 Bridging

---
 udapi/core/coref.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 243cac63..8118d905 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -212,6 +212,8 @@ def __init__(self, src_mention, value=None, clusters=None):
 raise
 elif isinstance(value, collections.abc.Sequence):
 for v in value:
+ if v[0] is src_mention._cluster:
+ raise ValueError("Bridging cannot self-reference the same cluster: " + v[0].cluster_id)
 self._data.append(BridgingLink(v[0], v[1]))
 super().__init__()

@@ -223,12 +225,16 @@ def __len__(self):

 # TODO delete backlinks of old links, dtto for SplitAnte
 def __setitem__(self, key, new_value):
+ if new_value[0] is self.src_mention._cluster:
+ raise ValueError("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id)
 self._data[key] = BridgingLink(new_value[0], new_value[1])

 def __delitem__(self, key):
 del self._data[key]

 def insert(self, key, new_value):
+ if new_value[0] is self.src_mention._cluster:
+ raise ValueError("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id)
 self._data.insert(key, BridgingLink(new_value[0], new_value[1]))

 def __str__(self):
@@ -238,6 +244,8 @@ def _from_string(self, string, clusters):
 self._data.clear()
 for link_str in string.split(','):
 target, relation = link_str.split(':')
+ if target == self.src_mention._cluster._cluster_id:
+ raise ValueError("Bridging cannot self-reference the same cluster: " + target)
 if target not in clusters:
 clusters[target] = CorefCluster(target)
 self._data.append(BridgingLink(clusters[target], relation))
@@ -308,6 +316,8 @@ def load_coref_from_misc(doc):
 # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator.
 for ante_str in split_ante_str.replace('+', ',').split(','):
 if ante_str in clusters:
+ if ante_str == cluster_id:
+ raise ValueError("SplitAnte cannot self-reference the same cluster: " + cluster_id)
 split_antes.append(clusters[ante_str])
 else:
 # split cataphora, e.g. "We, that is you and me..."
From 3c25693622ae8158aa1299661809e12c9df93255 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 10 Mar 2021 20:34:57 +0100 Subject: [PATCH 0221/1201] more helpful error msg when calling read.Conllu files="" --- udapi/core/files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/files.py b/udapi/core/files.py index 3a13f230..7fcd9149 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -43,6 +43,8 @@ def __init__(self, filenames=None, filehandle=None, encoding='utf-8'): elif isinstance(filenames, list): self.filenames = filenames elif isinstance(filenames, str): + if filenames == '': + raise ValueError('Filenames (files=) cannot be an empty string') self.filenames = self.string_to_filenames(filenames) else: raise ValueError('Parameter "filenames" must be a list or str') From a0568538f435faf1c4924f12766668e0d0a5499a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 11 Mar 2021 04:10:35 +0100 Subject: [PATCH 0222/1201] better corefud.MoveHead implementation (and less warnings) --- udapi/block/corefud/movehead.py | 53 +++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index a0326d98..1cba3e68 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -11,12 +11,36 @@ def __init__(self, bugs='warn', **kwargs): self.bugs = bugs super().__init__(**kwargs) + def _eparents(self, node): + if node.deps: + return [d['parent'] for d in node.deps] + if node.parent: + return [node.parent] + return [] + def find_head(self, mention): + mwords = set(mention.words) + + # First, check the simplest case: no empty words and a treelet in basic dependencies. + basic_heads = [w for w in mention.words if not w.parent or not w.parent in mwords] + assert basic_heads + if len(basic_heads) == 1: + return basic_heads[0], 'treelet' + + # Second, check also enhanced dependencies (but only within basic_heads for simplicity). + enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))] + if not enh_heads: + enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))] + if not enh_heads: + return mention.head, 'cycle' + if len(enh_heads) == 1: + return enh_heads[0], 'treelet' + + # Third, find non-empty parents (ancestors in future) of empty nodes. empty_nodes, non_empty = [], [] - for w in mention.words: + for w in enh_heads: (empty_nodes if w.is_empty() else non_empty).append(w) if empty_nodes: - self.counter['with_empty'] += 1 for empty_node in empty_nodes: parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()] if parents: @@ -31,23 +55,19 @@ def find_head(self, mention): node.misc['Bug'] = 'no-parent-of-empty' non_empty.sort() + # Fourth, check if there is a node within the enh_heads governing all the mention nodes + # and forming thus a "gappy treelet", where the head is clearly the "highest" node. 
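+ # (Illustrative, hypothetical example: for a mention {picture, frame} in
+ # "a picture in the corner of the frame", "frame" depends on "corner", which
+ # is outside the mention, and "corner" depends on "picture"; the call below
+ # then returns "picture", which is one of enh_heads, hence category 'gappy'.)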
 (highest, added_nodes) = find_minimal_common_treelet(*non_empty)
- if highest in mention.words:
- return highest, 'treelet'
+ if highest in enh_heads:
+ return highest, 'gappy'
+ assert highest not in mwords

- #if 'warn' in self.bugs:
- # logging.warning(f"Non-treelet mention in {mention.head} (nearest common antecedent={highest})")
- if 'mark' in self.bugs:
- mention.head.misc['Bug'] = 'non-treelet-mention'
- for word in mention.words:
- if not word.is_empty() and word.parent not in non_empty:
- return word, 'nontreelet'
+ # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
+ if mention.head in enh_heads:
+ return mention.head, 'nontreelet'

- if 'warn' in self.bugs:
- logging.warning(f"Strange mention {mention.head} (nearest common antecedent={highest})")
- if 'mark' in self.bugs:
- mention.head.misc['Bug'] = 'strange-mention'
- return mention.head, 'bug'
+ # Finally, return the word-order-wise first head candidate as the head.
+ return enh_heads[0], 'nontreelet'

 def process_document(self, doc):
 for cluster in doc.coref_clusters.values():
@@ -57,6 +77,7 @@ def process_document(self, doc):
 self.counter['single-word'] += 1
 else:
 new_head, category = self.find_head(mention)
+ self.counter[category] += 1
 if new_head is mention.head:
 self.counter[category + '-kept'] += 1
 else:

From 720f35d1324d91b425dfb55627bc7836f7dfe197 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 11 Mar 2021 04:42:29 +0100
Subject: [PATCH 0223/1201] when accessing node.deps which were empty, _ was
 changed to an empty string

---
 udapi/block/corefud/movehead.py | 2 +-
 udapi/core/node.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py
index 1cba3e68..fc2c8970 100644
--- a/udapi/block/corefud/movehead.py
+++ b/udapi/block/corefud/movehead.py
@@ -12,7 +12,7 @@ def __init__(self, bugs='warn', **kwargs):
 super().__init__(**kwargs)

 def _eparents(self, node):
- if node.deps:
+ if node._raw_deps != '_':
 return [d['parent'] for d in node.deps]
 if node.parent:
 return [node.parent]
diff --git a/udapi/core/node.py b/udapi/core/node.py
index 1f038087..5225724e 100644
--- a/udapi/core/node.py
+++ b/udapi/core/node.py
@@ -243,7 +243,7 @@ def raw_deps(self):
 # Afterwards, we can use the following optimization
 #if self._raw_deps is not None:
 # return self._raw_deps
- if self._deps is not None:
+ if self._deps:
 self._raw_deps = '|'.join(f"{dep['parent']._ord}:{dep['deprel']}" for dep in self._deps)
 return self._raw_deps

From 46ee386b36a86e23c8002c93a15a5d4555788f64 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 11 Mar 2021 12:49:59 +0100
Subject: [PATCH 0224/1201] corefud.MoveHead change assertion to a warning

---
 udapi/block/corefud/movehead.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py
index fc2c8970..e9034a22 100644
--- a/udapi/block/corefud/movehead.py
+++ b/udapi/block/corefud/movehead.py
@@ -60,7 +60,12 @@ def find_head(self, mention):
 (highest, added_nodes) = find_minimal_common_treelet(*non_empty)
 if highest in enh_heads:
 return highest, 'gappy'
- assert highest not in mwords
+ if highest in mwords:
+ if 'warn' in self.bugs:
+ logging.warning(f"Strange mention {mention.head} with highest node {highest}")
+ if 'mark' in self.bugs:
+ highest.misc['Bug'] = 'highest-in-mwords'
+ mention.head.misc['Bug'] = 'highest-head'

 # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
 if mention.head in enh_heads:

From f0efeab769c45ea8966ee3aaad12163a667d4fd4 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 11 Mar 2021 17:45:54 +0100
Subject: [PATCH 0225/1201] allow `udapy corefud.Load strict=0 ...` for not
 failing on some errors

Errors such as

 Cluster {} referenced in SplitAnte or Bridging, but not defined with ClusterId
 SplitAnte cannot self-reference the same cluster

will only be printed to stderr using logging.error, but no exception will be raised.

You can also use

 udapy -s corefud.Load < in.conllu > out.conllu && diff {in,out}.conllu

---
 udapi/block/corefud/load.py | 12 ++++++++++++
 udapi/core/coref.py | 23 ++++++++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)
 create mode 100644 udapi/block/corefud/load.py

diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py
new file mode 100644
index 00000000..3b2534bc
--- /dev/null
+++ b/udapi/block/corefud/load.py
@@ -0,0 +1,12 @@
+from udapi.core.block import Block
+import udapi.core.coref
+
+class Load(Block):
+ """Load coreference-related MISC attributes into memory. Allow lenient mode by strict=0."""
+
+ def __init__(self, strict=True):
+ self.strict = strict
+
+ def process_document(self, doc):
+ if doc._coref_clusters is None:
+ udapi.core.coref.load_coref_from_misc(doc, self.strict)
diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 8118d905..6236e4cf 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -198,9 +198,10 @@ class BridgingLinks(collections.abc.MutableSequence):
 >>> bl('Part|Subset').targets == [c12, c56]
 >>> bl.append((c89, 'Funct'))
 """
- def __init__(self, src_mention, value=None, clusters=None):
+ def __init__(self, src_mention, value=None, clusters=None, strict=True):
 self.src_mention = src_mention
 self._data = []
+ self.strict = strict
 if value is not None:
 if isinstance(value, str):
 if clusters is None:
@@ -213,7 +214,7 @@ def __init__(self, src_mention, value=None, clusters=None):
 elif isinstance(value, collections.abc.Sequence):
 for v in value:
 if v[0] is src_mention._cluster:
- raise ValueError("Bridging cannot self-reference the same cluster: " + v[0].cluster_id)
+ _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict)
 self._data.append(BridgingLink(v[0], v[1]))
 super().__init__()

@@ -226,7 +227,7 @@ def __len__(self):
 # TODO delete backlinks of old links, dtto for SplitAnte
 def __setitem__(self, key, new_value):
 if new_value[0] is self.src_mention._cluster:
- raise ValueError("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id)
+ _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict)
 self._data[key] = BridgingLink(new_value[0], new_value[1])

 def __delitem__(self, key):
 del self._data[key]

 def insert(self, key, new_value):
 if new_value[0] is self.src_mention._cluster:
- raise ValueError("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id)
+ _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict)
 self._data.insert(key, BridgingLink(new_value[0], new_value[1]))

 def __str__(self):
@@ -245,7 +246,7 @@ def _from_string(self, string, clusters):
 for link_str in string.split(','):
 target, relation = link_str.split(':')
 if target == self.src_mention._cluster._cluster_id:
- raise ValueError("Bridging cannot self-reference the same cluster: " + target)
+ _error("Bridging cannot self-reference the same cluster: " + target, self.strict)
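+ # (Illustrative note: with strict=False, the _error call above only logs the
+ # message via logging.error and parsing continues; cf. corefud.Load strict=0.)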
 if target not in clusters:
 clusters[target] = CorefCluster(target)
 self._data.append(BridgingLink(clusters[target], relation))
@@ -280,8 +281,12 @@ def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs):
 clusters[cluster_id] = cluster
 return cluster

+def _error(msg, strict):
+ if strict:
+ raise ValueError(msg)
+ logging.error(msg)

-def load_coref_from_misc(doc):
+def load_coref_from_misc(doc, strict=True):
 clusters = {}
 for node in doc.nodes_and_empty:
 index, index_str = 0, ""
@@ -307,7 +312,7 @@ def load_coref_from_misc(doc):

 bridging_str = node.misc["Bridging" + index_str]
 if bridging_str:
- mention._bridging = BridgingLinks(mention, bridging_str, clusters)
+ mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict)

 split_ante_str = node.misc["SplitAnte" + index_str]
 if split_ante_str:
@@ -317,7 +322,7 @@ def load_coref_from_misc(doc):
 for ante_str in split_ante_str.replace('+', ',').split(','):
 if ante_str in clusters:
 if ante_str == cluster_id:
- raise ValueError("SplitAnte cannot self-reference the same cluster: " + cluster_id)
+ _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict)
 split_antes.append(clusters[ante_str])
 else:
 # split cataphora, e.g. "We, that is you and me..."
@@ -336,7 +341,7 @@ def load_coref_from_misc(doc):
 # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to preserve insertion order.
 for cluster in clusters.values():
 if not cluster._mentions:
- raise ValueError(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId")
+ _error(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict)
 cluster._mentions.sort()
 doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())}

From c7e16c077618c611b1fcdbdf844d57b60e6a1276 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 13 Mar 2021 02:10:50 +0100
Subject: [PATCH 0226/1201] corefud.Stats can now also report discontinuous/
 non-treelet mentions and head UPOS distribution

---
 udapi/block/corefud/stats.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py
index 8b919005..f07c2a27 100644
--- a/udapi/block/corefud/stats.py
+++ b/udapi/block/corefud/stats.py
@@ -5,12 +5,14 @@ class Stats(Block):
 """Block corefud.Stats prints various coreference-related statistics."""

 def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_clusters=True,
+ report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM',
 exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs):
 super().__init__(**kwargs)
 self.m_len_max = m_len_max
 self.c_len_max = c_len_max
 self.report_mentions = report_mentions
 self.report_clusters = report_clusters
+ self.report_details = report_details
 self.exclude_singletons = exclude_singletons
 self.exclude_nonsingletons = exclude_nonsingletons
 self.style = style
@@ -24,7 +26,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_cluste
 self.longest_mention = 0
 self.longest_cluster = 0
 self.m_words = 0
- self.m_empty = 0
+ self.selected_upos = None if selected_upos == 'all' else selected_upos.split()

 def process_document(self, doc):
 self.total_nodes += len(list(doc.nodes))
@@ -39,15 +41,30 @@ def process_document(self, doc):
 self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1
 self.clusters += 1

+ if not self.report_mentions and not
self.report_details: + continue for mention in cluster.mentions: self.mentions += 1 all_words = len(mention.words) non_empty = len([w for w in mention.words if not w.is_empty()]) self.m_words += all_words - self.m_empty += all_words - non_empty self.longest_mention = max(non_empty, self.longest_mention) self.counter['m_total_len'] += non_empty self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 + if self.report_details: + upos = 'other' + if not self.selected_upos or mention.head.upos in self.selected_upos: + upos = mention.head.upos + self.counter['m_head_upos_' + upos] += 1 + self.counter['m_with_empty'] += 1 if all_words > non_empty else 0 + self.counter['m_with_gaps'] += 1 if ',' in mention.span else 0 + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + self.counter['m_nontreelet'] += 1 if heads > 1 else 0 def process_end(self): mentions_nonzero = 1 if self.mentions == 0 else self.mentions @@ -71,6 +88,16 @@ def process_end(self): for i in range(0, self.m_len_max + 1): percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + if self.report_details: + columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"), + ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"), + ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}"),] + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + for upos in upos_list: + columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) if self.style == 'tex': print(" & ".join(c[1] for c in columns)) From e38fdd7773737776b26bc097a88f3f1e3f4e6bb5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 25 Mar 2021 14:07:54 +0100 Subject: [PATCH 0227/1201] keep the order of CoNLL-U comment lines using placeholders --- udapi/block/read/conllu.py | 4 +++ udapi/block/write/conllu.py | 72 ++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 0f750bb7..71886752 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -39,11 +39,13 @@ def parse_comment_line(line, root): sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: root.sent_id = sent_id_match.group(1) + root.comment += '$SENT_ID\n' return text_match = RE_TEXT.match(line) if text_match is not None: root.text = text_match.group(1) + root.comment += '$TEXT\n' return pardoc_match = RE_NEWPARDOC.match(line) @@ -51,8 +53,10 @@ def parse_comment_line(line, root): value = True if pardoc_match.group(2) is None else pardoc_match.group(2) if pardoc_match.group(1) == 'newpar': root.newpar = value + root.comment += '$NEWPAR\n' else: root.newdoc = value + root.comment += '$NEWDOC\n' return json_match = RE_JSON.match(line) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index df3adce6..bcdf5e21 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -23,24 +23,78 @@ def process_tree(self, tree): # pylint: disable=too-many-branches if not nodes and not self.print_empty_trees: return + # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual + 
# value of the attribute and make note on which line (i_*) they were present.
+ comment_lines = tree.comment.splitlines()
+ i_newdoc, i_newpar, i_sent_id, i_text = -1, -1, -1, -1
+ for i, c_line in enumerate(comment_lines):
+ if c_line == '$SENT_ID':
+ i_sent_id = i
+ comment_lines[i] = ' sent_id = ' + tree.sent_id if self.print_sent_id else None
+ elif c_line == '$TEXT':
+ i_text = i
+ if self.print_text:
+ if tree.text is None:
+ comment_lines[i] = ' text = ' + tree.compute_text()
+ else:
+ comment_lines[i] = ' text = ' + tree.text.replace('\n', '').replace('\r', '').rstrip()
+ elif c_line == '$NEWDOC':
+ i_newdoc = i
+ if self.print_sent_id and tree.newdoc:
+ comment_lines[i] = ' newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else '')
+ else:
+ comment_lines[i] = None
+ elif c_line == '$NEWPAR':
+ i_newpar = i
+ if self.print_sent_id and tree.newpar:
+ comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')
+ else:
+ comment_lines[i] = None
+
+ # Now print the special comments: global.columns, newdoc, newpar, sent_id and text.
+ # If these comments were already present in tree.comment (as marked with the placeholders),
+ # keep them at their original position and print also all comment lines preceding them.
+ # If they were missing, try to print them at the correct position.
+ printed_i = -1
+ if comment_lines and comment_lines[0].startswith(' global.columns'):
+ printed_i += 1
+ print('#' + comment_lines[printed_i])
 if self.print_sent_id:
 if tree.newdoc:
- print('# newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else ''))
+ if i_newdoc == -1:
+ print('# newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else ''))
+ else:
+ while printed_i < i_newdoc:
+ printed_i += 1
+ if comment_lines[printed_i]:
+ print('#' + comment_lines[printed_i])
 if tree.newpar:
- print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else ''))
- print('# sent_id = ' + tree.sent_id)
-
- if self.print_text:
+ if i_newpar == -1:
+ print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else ''))
+ else:
+ while printed_i < i_newpar:
+ printed_i += 1
+ if comment_lines[printed_i]:
+ print('#' + comment_lines[printed_i])
+ if i_sent_id == -1:
+ print('# sent_id = ' + tree.sent_id)
+ else:
+ while printed_i < i_sent_id:
+ printed_i += 1
+ if comment_lines[printed_i]:
+ print('#' + comment_lines[printed_i])
+ if self.print_text and i_text == -1:
 print('# text = ' + (tree.compute_text() if tree.text is None
 else tree.text.replace('\n', '').replace('\r', '').rstrip()))
+ for c_line in comment_lines[printed_i + 1:]:
+ if c_line:
+ print('#' + c_line)
+
+ # Special-purpose json_* comments should always be at the end of the comment block.
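+ # (Hypothetical illustration: tree.json == {"mentions": []} is emitted below
+ # as a final comment line "# json_mentions = []", after all other comments.)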
if tree.json: for key, value in sorted(tree.json.items()): print(f"# json_{key} = {json.dumps(value, ensure_ascii=False, sort_keys=True)}") - comment = tree.comment - if comment: - print('#' + comment.rstrip().replace('\n', '\n#')) - last_mwt_id = 0 for node in nodes: mwt = node._mwt From ddf9bc6f9aa59b5d0e46f8fa9835348bc4f96779 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Mar 2021 19:17:31 +0100 Subject: [PATCH 0228/1201] bugfix fixes #82 --- udapi/block/write/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index bcdf5e21..66ae320b 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -102,7 +102,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches print('\t'.join((mwt.ord_range, '_' if mwt.form is None else mwt.form, '_\t_\t_\t_\t_\t_\t_', - '_' if node._misc is None else str(mwt.misc)))) + '_' if mwt._misc is None else str(mwt.misc)))) last_mwt_id = mwt.words[-1]._ord if node._parent is None: From cd7523a794dce3048d3578c8984ccd453246406d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Mar 2021 19:18:42 +0100 Subject: [PATCH 0229/1201] bugfix: read.Conll has to reimplement read_tree_from_lines() because read.Conllu does not call parse_node_line() anymore (for speed reasons) --- udapi/block/read/conll.py | 64 ++++++++++++++++++++++++++++++++++++++- udapi/core/mwt.py | 4 +-- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py index 7591f924..f64cd9ff 100644 --- a/udapi/block/read/conll.py +++ b/udapi/block/read/conll.py @@ -93,7 +93,69 @@ def parse_node_line(self, line, root, nodes, parents, mwts): setattr(node, 'ord', int(fields[n_attribute])) elif attribute_name == 'deps': setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_': + elif attribute_name != '_' and fields[n_attribute] != '_': setattr(node, attribute_name, fields[n_attribute]) nodes.append(node) + + # Acknowledged code duplication with read.Conllu + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + parents = [0] + mwts = [] + for line in lines: + if line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes, parents, mwts) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + # Empty sentences are not allowed in CoNLL-U, + # but if the users want to save just the sentence string and/or sent_id + # they need to create one artificial node and mark it with Empty=Yes. + # In that case, we will delete this node, so the tree will have just the (technical) root. + # See also udapi.block.write.Conllu, which is compatible with this trick. + if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': + nodes.pop() + root._children = [] + root._descendants = [] + + # Set dependency parents (now, all nodes of the tree are created). 
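+ # (Note: `parents` holds the integer HEAD index collected for each node during
+ # parsing, so `nodes[parents[node_ord]]` below resolves it to the parent object.)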
+ for node_ord, node in enumerate(nodes[1:], 1): + try: + parent = nodes[parents[node_ord]] + except IndexError: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) + node._parent = root + root._children.append(node) + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node.children: + climbing = parent._parent + while climbing: + if climbing is node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) + + # Create multi-word tokens. + for fields in mwts: + range_start, range_end = fields[0].split('-') + words = nodes[int(range_start):int(range_end) + 1] + root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + + return root diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 525b9bb0..684adfaf 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -9,7 +9,7 @@ class MWT(object): def __init__(self, words=None, form=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form - self._misc = DualDict(misc) if misc else None + self._misc = DualDict(misc) if misc and misc != '_' else None self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 @@ -49,7 +49,7 @@ def address(self): # TODO: node.remove() should check if the node is not part of any MWT # TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported -# TODO: Make mwt._words privat and provide a setter +# TODO: Make mwt._words private and provide a setter # TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().) 
# TODO: words.setter and node.shift* should check if the MWT does not contain gaps # and is still multi-word From fd11aa4ca18a31e1e9815ad975f1715555f6a888 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 31 Mar 2021 10:42:18 +0200 Subject: [PATCH 0230/1201] try travis-CI with Windows --- .travis.yml | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 92714116..076505dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,13 +4,26 @@ python: - "3.7" - "3.8" - "3.9" -before_install: - - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - - sudo apt-get update -qq - - sudo apt-get install -qq gcc-4.8 g++-4.8 - - CC=g++-4.8 pip install ufal.udpipe +#before_install: +# - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +# - sudo apt-get update -qq +# - sudo apt-get install -qq gcc-4.8 g++-4.8 +# - CC=g++-4.8 pip install ufal.udpipe +#install: +# - python setup.py install install: - - python setup.py install + - pip install ".[test]" script: - python -m pytest - cd udapi/core/tests && ./external_tests.sh +matrix: + include: + - name: "Python 3.7.4 on Windows" + os: windows # Windows 10.0.17134 N/A Build 17134 + language: shell # 'language: python' is an error on Travis CI Windows + before_install: + - choco install python --version 3.7.4 + - python --version + - python -m pip install --upgrade pip + - pip3 install --upgrade pytest + env: PATH=/c/Python37:/c/Python37/Scripts:$PATH From 4d074c4d1ba158f5896006a5895b52e1b8988cb3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 31 Mar 2021 15:39:51 +0200 Subject: [PATCH 0231/1201] Travis-CI for Windows (#84) --- .travis.yml | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 076505dc..417e39fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,18 +12,23 @@ python: #install: # - python setup.py install install: - - pip install ".[test]" + - pip3 install ".[test]" script: - python -m pytest + - udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 - cd udapi/core/tests && ./external_tests.sh -matrix: +jobs: include: - - name: "Python 3.7.4 on Windows" - os: windows # Windows 10.0.17134 N/A Build 17134 - language: shell # 'language: python' is an error on Travis CI Windows + - name: "Python 3.9 on Windows" + os: windows + language: shell before_install: - - choco install python --version 3.7.4 + - choco install python - python --version - python -m pip install --upgrade pip - pip3 install --upgrade pytest - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH + env: PATH=/c/Python39:/c/Python39/Scripts:$PATH + script: + - python -c 'import colorama;print("\033[31m some red text")' + - python -Xutf8 -c 'import udapi;udapi.Document("udapi/core/tests/data/babinsky.conllu").draw(color=1)' + - python -m pytest From af9801167fbf99364b0cd2acb0382137f9c0afc3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 9 Apr 2021 16:08:30 +0200 Subject: [PATCH 0232/1201] let's version corefud.Gum2CorefUD block in the Udapi repo so it can be used also by users without access to the private CorefUD repo --- udapi/block/corefud/gum2corefud.py | 99 ++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 udapi/block/corefud/gum2corefud.py diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py new file mode 100644 index 00000000..95be6ce0 --- /dev/null +++ b/udapi/block/corefud/gum2corefud.py @@ -0,0 +1,99 @@ +import re +import 
logging +from collections import defaultdict +from udapi.core.block import Block + +class Gum2CorefUD(Block): + + def process_tree(self, tree): + docname = tree.bundle.document.meta['docname'] + '_' + + def entity2cluster_id(name): + return docname + name.strip('()').replace(',','').replace('+','') + + clusters = tree.bundle.document.coref_clusters + unfinished_mentions = defaultdict(list) + for node in tree.descendants: + entity = node.misc['Entity'] + if not entity: + continue + parts = [x for x in re.split('(\([^())]+\)?|[^())]+\))', entity) if x] + for part in parts: + # GUM entity name could be e.g. + # abstract-173 or place-1-Coron,_Palawan or place-77-Sub-Saharan_Africa. + # Note that the wikification part of the name may contain commas and dashes. + # Let's take the whole name as cluster_id, which will be normalized later on. + # We just need to remove commas and plus signs which are forbidden in cluster_id + # because they are used as separators in Bridging and SplitAnte, respectively. + # Let's store the type in cluster.cluster_type and Wikification in mention.misc. + name = entity2cluster_id(part) + if part[0] == '(': + cluster = clusters.get(name) + if cluster is None: + chunks = part.strip('()').split('-', maxsplit=2) + if len(chunks) == 3: + ctype, _, wiki = chunks + elif len(chunks) == 2: + ctype, _, wiki = chunks[0], None, None + else: + raise ValueError(f"Unexpected entity {part} at {node}") + cluster = node.create_coref_cluster(cluster_id=name, cluster_type=ctype) + mention = cluster.mentions[0] + if wiki: + mention.misc = 'Wikification:' + wiki.replace(',', '%2C') + else: + mention = cluster.create_mention(head=node) + if part[-1] == ')': + mention.words = [node] + else: + unfinished_mentions[name].append(mention) + elif part[-1] == ')': + if not unfinished_mentions[name]: + logging.warning(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + del node.misc['Entity'] + + misc_bridge = node.misc['Bridge'] + if misc_bridge: + # E.g. 
Entity=event-23|Bridge=time-23 Date: Fri, 16 Apr 2021 18:04:31 +0200 Subject: [PATCH 0233/1201] Portuguese "no" can be a pronoun as well fixes #85 --- udapi/block/ud/pt/addmwt.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/pt/addmwt.py b/udapi/block/ud/pt/addmwt.py index 11ebfbbf..daa605b2 100644 --- a/udapi/block/ud/pt/addmwt.py +++ b/udapi/block/ud/pt/addmwt.py @@ -39,8 +39,8 @@ 'nisso': {'form': 'em isso', 'lemma': 'em este'}, 'nisto': {'form': 'em isto', 'lemma': 'em este', 'upos': 'ADP PRON', 'main': 1, 'shape': 'subtree'}, - 'no': {'form': 'em o', 'lemma': 'em o'}, - 'nos': {'form': 'em os', 'lemma': 'em o'}, + 'no': {'form': 'em o', 'lemma': 'em o'}, # PRON cases are excluded below + 'nos': {'form': 'em os', 'lemma': 'em o'}, # PRON cases are excluded below 'num': {'form': 'em um', 'lemma': 'em um'}, 'numa': {'form': 'em uma', 'lemma': 'em um'}, 'numas': {'form': 'em umas', 'lemma': 'em um'}, @@ -79,6 +79,11 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + + # "no" can be either a contraction of "em o", or a pronoun + if node.form.lower() in ('no', 'nos') and node.upos == 'PRON': + return + analysis = MWTS.get(node.form.lower(), None) # If the input is e.g.: From 68b3eab8d2ad0294cc54f94b3298084105654327 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 24 Apr 2021 11:47:40 +0200 Subject: [PATCH 0234/1201] udapy -TMX attributes=ord,form,misc corefud.MarkCrossing < in.conllu --- udapi/block/corefud/markcrossing.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 udapi/block/corefud/markcrossing.py diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py new file mode 100644 index 00000000..81136ec9 --- /dev/null +++ b/udapi/block/corefud/markcrossing.py @@ -0,0 +1,28 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkCrossing(Block): + """Find mentions with crossing spans.""" + + def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.continuous_only = continuous_only + self.print_form = print_form + + def _print(self, mention): + if self.print_form: + return ' '.join([w.form for w in mention.words]) + else: + return mention.span + + def process_node(self, node): + if len(node.coref_mentions) > 1: + for mA, mB in itertools.combinations(node.coref_mentions, 2): + if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.continuous_only and (',' in mA.span or ',' in mB.span): + continue + node.misc['Mark'] = f'cross:{self._print(mA)}+{self._print(mB)}' From bf866b26a40da205ea71119e7fca130ed965ffe1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 2 May 2021 14:16:51 +0200 Subject: [PATCH 0235/1201] prevent creating a cycle in case of () i.e. paired punctuation with no words between, which is causing a non-projective gap. 
Fix #90 --- udapi/block/ud/fixpunct.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index cc34a0d0..6fa2da8f 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -232,16 +232,19 @@ def _fix_pair(self, root, opening_node, closing_node): # let's break this rule. if len(heads) == 0: heads = punct_heads - if len(heads) == 1: + # If there are no nodes between the opening and closing mark (), + # let's treat the marks as any other (non-pair) punctuation. + if len(heads) == 0: + return + elif len(heads) == 1: opening_node.parent = heads[0] closing_node.parent = heads[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' - elif len(heads) > 1: + else: opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' + + self._punct_type[opening_node.ord] = 'opening' + self._punct_type[closing_node.ord] = 'closing' # In rare cases, non-projective gaps may remain. Let's dirty fix these! # E.g. in "the (lack of) reproducibility", the closing parenthesis From 8327a45c81317d61bae945acfab743b01d5693df Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 09:49:28 +0200 Subject: [PATCH 0236/1201] other characters need to be escaped in TeX --- udapi/block/write/tikz.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 58f53a3d..43417c61 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -91,6 +91,9 @@ def after_process_document(self, doc): logging.info('Use pdflatex to compile the output') super().after_process_document(doc) + def _tex_escape(self, string): + return string.replace('_', r'\_').replace('$', '\$').replace('[', '$[$').replace(']', '$]$') + def process_tree(self, tree): print(r'\begin{dependency}') print(r'\begin{deptext}') @@ -109,7 +112,7 @@ def process_tree(self, tree): lines = ['' for _ in self.node_attributes] for node in nodes: - values = [v.replace('_', r'\_') for v in node.get_attrs(self.node_attributes)] + values = [self._tex_escape(v) for v in node.get_attrs(self.node_attributes)] max_len = max(len(value) for value in values) for index, value in enumerate(values): if node.ord > 1: From 6bbdfc30b5f042f28bbdb19a92713e03707cc26b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 09:51:14 +0200 Subject: [PATCH 0237/1201] document how to use write.TextModeTrees in LaTeX & tiny improvements --- udapi/block/write/textmodetrees.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index fb38c22a..be2f999d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -127,6 +127,14 @@ class TextModeTrees(BaseWriter): which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. 
+    For use in LaTeX, you can insert the output of this block (without colors)
+    into \\begin{verbatim}...\\end{verbatim}, but you need to compile with pdflatex
+    (xelatex is not supported) and you must add the following code to the preamble::
+
+        \\usepackage{pmboxdraw}
+        \\DeclareUnicodeCharacter{256D}{\\textSFi}  %╭
+        \\DeclareUnicodeCharacter{2570}{\\textSFii} %╰
+
     SEE ALSO
     :py:class:`.TextModeTreesHtml`
     """
@@ -205,7 +213,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind
         self.mark_re = re.compile(mark + '=')
         self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M)
         self._index_of = []
-        self._gaps = []
+        self._gaps = collections.Counter()
         self.lines = []
         self.lengths = []
@@ -255,7 +263,6 @@ def process_tree(self, root):
         # Precompute the number of non-projective gaps for each subtree
         if self.minimize_cross:
-            self._gaps = collections.Counter()
             self._compute_gaps(root)

         # Precompute lines for printing
@@ -291,7 +298,7 @@ def process_tree(self, root):
         # sorting the stack to minimize crossings of edges
         if self.minimize_cross:
-            stack = sorted(stack, key=lambda x: -self._gaps[x.ord])
+            stack.sort(key=lambda x: -self._gaps[x.ord])

         if self.layout == 'classic':
             for idx, node in enumerate(allnodes):

From 4a8e21aa7c920f91214010781dafe61e5567d297 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 11 May 2021 12:01:22 +0200
Subject: [PATCH 0238/1201] corefud.PrintMentions for printing mentions with
 various properties, always highlighting just a single mention per tree

---
 udapi/block/corefud/printmentions.py | 55 ++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 udapi/block/corefud/printmentions.py

diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
new file mode 100644
index 00000000..250474ce
--- /dev/null
+++ b/udapi/block/corefud/printmentions.py
@@ -0,0 +1,55 @@
+from udapi.core.block import Block
+import udapi.core.coref
+
+class PrintMentions(Block):
+    """Print mentions with various properties."""
+
+    def __init__(self, continuous='include', treelet='include',
+                 oneword='include', singleton='include', **kwargs):
+        super().__init__(**kwargs)
+        self.continuous = self._convert(continuous)
+        self.treelet = self._convert(treelet)
+        self.oneword = self._convert(oneword)
+        self.singleton = self._convert(singleton)
+
+
+    def _convert(self, value):
+        if value in {'include', 'exclude', 'only'}:
+            return value
+        if value == 1:
+            return 'only'
+        if value == 0:
+            return 'exclude'
+        raise ValueError('unknown value ' + value)
+
+    def _ok(self, condition, value):
+        if value == 'include':
+            return True
+        return (condition and value == 'only') or (not condition and value=='exclude')
+
+    def process_document(self, doc):
+        for cluster in doc.coref_clusters.values():
+            if not self._ok(len(cluster.mentions) == 1, self.singleton):
+                continue
+
+            for mention in cluster.mentions:
+                if not self._ok(len(mention.words) == 1, self.oneword):
+                    continue
+                if not self._ok(',' in mention.span, self.continuous):
+                    continue
+
+                heads, mwords = 0, set(mention.words)
+                for w in mention.words:
+                    if w.parent:
+                        heads += 0 if w.parent in mwords else 1
+                    else:
+                        heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
+                if not self._ok(heads <= 1, self.treelet):
+                    continue
+
+                for w in mention.words:
+                    w.misc['Mark'] = 1
+                mention.head.root.draw()
+                for w in mention.words:
+                    del w.misc['Mark']
+
From 3c995ce236e103d26f6dcadda4535e6d4f758d7b Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 11 May 2021 12:06:44 +0200
Subject: [PATCH 0239/1201] oops, continuous == no comma in span --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 250474ce..b226d909 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -35,7 +35,7 @@ def process_document(self, doc): for mention in cluster.mentions: if not self._ok(len(mention.words) == 1, self.oneword): continue - if not self._ok(',' in mention.span, self.continuous): + if not self._ok(',' not in mention.span, self.continuous): continue heads, mwords = 0, set(mention.words) From 77ed69c83c4af449970c24ddefee1b14b908dca8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 13:25:24 +0200 Subject: [PATCH 0240/1201] html output and other options --- udapi/block/corefud/printmentions.py | 51 +++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index b226d909..3dd0e51d 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -1,17 +1,35 @@ from udapi.core.block import Block import udapi.core.coref +from udapi.block.write.textmodetreeshtml import TextModeTreesHtml +from udapi.block.write.textmodetrees import TextModeTrees class PrintMentions(Block): """Print mentions with various properties.""" - def __init__(self, continuous='include', treelet='include', - oneword='include', singleton='include', **kwargs): + def __init__(self, continuous='include', treelet='include', forest='include', + almost_forest='include', oneword='include', singleton='include', + max_trees=100, html=False, + print_sent_id=True, print_text=True, add_empty_line=True, indent=1, + minimize_cross=True, color=True, attributes='form,upos,deprel', + print_undef_as='_', print_doc_meta=True, print_comments=False, + mark='(Mark)', hints=True, layout='classic', + **kwargs): super().__init__(**kwargs) self.continuous = self._convert(continuous) self.treelet = self._convert(treelet) + self.forest = self._convert(forest) + self.almost_forest = self._convert(almost_forest) self.oneword = self._convert(oneword) self.singleton = self._convert(singleton) + self.max_trees = max_trees + self.html = html + print_class = TextModeTreesHtml if html else TextModeTrees + self.print_block = print_class( + print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, + minimize_cross=minimize_cross, color=color, attributes=attributes, + print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments, + mark=mark, hints=hints, layout=layout) def _convert(self, value): if value in {'include', 'exclude', 'only'}: @@ -22,12 +40,29 @@ def _convert(self, value): return 'exclude' raise ValueError('unknown value ' + value) + def before_process_document(self, document): + self.print_block.before_process_document(document) + + def after_process_document(self, document): + self.print_block.after_process_document(document) + def _ok(self, condition, value): if value == 'include': return True return (condition and value == 'only') or (not condition and value=='exclude') + def _is_forest(self, mention, mwords, almost): + for w in mention.words: + for ch in w.children(): + if ch not in mwords: + if not almost: + return False + if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj'}: + return False + return 
True + def process_document(self, doc): + printed_trees = 0 for cluster in doc.coref_clusters.values(): if not self._ok(len(cluster.mentions) == 1, self.singleton): continue @@ -46,10 +81,18 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 if not self._ok(heads <= 1, self.treelet): continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue for w in mention.words: w.misc['Mark'] = 1 - mention.head.root.draw() + if self.max_trees: + printed_trees += 1 + if printed_trees > self.max_trees: + return + #print(f"{printed_trees}/{self.max_trees}") + self.print_block.process_tree(mention.head.root) for w in mention.words: del w.misc['Mark'] - From c6c57ecfb0ee0009e3ee9dbcce900cd6c4df6ddf Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 13:44:41 +0200 Subject: [PATCH 0241/1201] more exceptions --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 3dd0e51d..5146a34e 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -57,7 +57,7 @@ def _is_forest(self, mention, mwords, almost): if ch not in mwords: if not almost: return False - if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj'}: + if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj', 'appos', 'cop', 'aux'}: return False return True From 45a6fe46f3fb6f9e3bcc9a43f3b2ab3d3b796fb7 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 15:40:12 +0200 Subject: [PATCH 0242/1201] option "empty" --- udapi/block/corefud/printmentions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 5146a34e..08fef23c 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -8,7 +8,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', - max_trees=100, html=False, + empty='include', max_trees=100, html=False, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -21,6 +21,7 @@ def __init__(self, continuous='include', treelet='include', forest='include', self.almost_forest = self._convert(almost_forest) self.oneword = self._convert(oneword) self.singleton = self._convert(singleton) + self.empty = self._convert(empty) self.max_trees = max_trees self.html = html @@ -73,6 +74,10 @@ def process_document(self, doc): if not self._ok(',' not in mention.span, self.continuous): continue + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue + heads, mwords = 0, set(mention.words) for w in mention.words: if w.parent: From 996517090e9c3d2da2757cf76c9e52bd880ec04a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 11 May 2021 17:09:23 +0200 Subject: [PATCH 0243/1201] AnCora in CorefUD has all deprels=dep, so use upos instead --- udapi/block/corefud/printmentions.py | 9 ++++++++- 1 file changed, 8 
insertions(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 08fef23c..521b4cc6 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -52,13 +52,20 @@ def _ok(self, condition, value): return True return (condition and value == 'only') or (not condition and value=='exclude') + def _is_auxiliary(self, node): + if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'cop', 'aux'}: + return True + if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ'}: + return True + return False + def _is_forest(self, mention, mwords, almost): for w in mention.words: for ch in w.children(): if ch not in mwords: if not almost: return False - if not w.parent or w.parent in mwords or ch.udeprel not in {'case', 'cc', 'punct', 'conj', 'appos', 'cop', 'aux'}: + if not (w.parent and w.parent not in mwords and self._is_auxiliary(ch)): return False return True From d4bbdfd232bbf52562f389a8e39c8bd9709834bb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 12:38:22 +0200 Subject: [PATCH 0244/1201] corefud.PrintMentions almost_continuous --- udapi/block/corefud/printmentions.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 521b4cc6..60de30eb 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -6,8 +6,8 @@ class PrintMentions(Block): """Print mentions with various properties.""" - def __init__(self, continuous='include', treelet='include', forest='include', - almost_forest='include', oneword='include', singleton='include', + def __init__(self, continuous='include', almost_continuous='include', treelet='include', + forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=100, html=False, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', @@ -16,6 +16,7 @@ def __init__(self, continuous='include', treelet='include', forest='include', **kwargs): super().__init__(**kwargs) self.continuous = self._convert(continuous) + self.almost_continuous = self._convert(almost_continuous) self.treelet = self._convert(treelet) self.forest = self._convert(forest) self.almost_forest = self._convert(almost_forest) @@ -69,6 +70,19 @@ def _is_forest(self, mention, mwords, almost): return False return True + def _is_almost_continuous(self, mention): + if ',' not in mention.span: + return True + nonempty = [w for w in mention.words if not w.is_empty()] + if not nonempty: + return True + mwords = set(mention.words) + gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords] + for gap_node in gap_nodes: + if not gap_node.is_empty(): + return False + return True + def process_document(self, doc): printed_trees = 0 for cluster in doc.coref_clusters.values(): @@ -80,6 +94,8 @@ def process_document(self, doc): continue if not self._ok(',' not in mention.span, self.continuous): continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): + continue empty_mwords = [w for w in mention.words if w.is_empty()] if not self._ok(len(empty_mwords) > 0, self.empty): From 21c6a7f07c000430d02d8b58b739f53825085a06 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 18:21:01 +0200 Subject: 
[PATCH 0245/1201] TextModeTrees color=0 now highlights marked nodes with **asterisks** --- udapi/block/write/textmodetrees.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index be2f999d..d427098a 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -359,11 +359,14 @@ def add_node(self, idx, node): if not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) + marked = self.is_marked(node) if self.color: - marked = self.is_marked(node) for i, attr in enumerate(self.attrs): values[i] = self.colorize_attr(attr, values[i], marked) - self.lines[idx] += ' ' + ' '.join(values) + if not self.color and marked: + self.lines[idx] += ' **' + ' '.join(values) + '**' + else: + self.lines[idx] += ' ' + ' '.join(values) def is_marked(self, node): """Should a given node be highlighted?""" From ac7360d47428e296d462c89973a9282f2bb1f84e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 13 May 2021 18:59:31 +0200 Subject: [PATCH 0246/1201] PrintMentions shuffle=1 by default (and shuffle=0 sorts mentions) --- udapi/block/corefud/printmentions.py | 82 +++++++++++++++------------- udapi/block/write/textmodetrees.py | 1 + 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 60de30eb..0c1d4e79 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -1,5 +1,5 @@ +import random from udapi.core.block import Block -import udapi.core.coref from udapi.block.write.textmodetreeshtml import TextModeTreesHtml from udapi.block.write.textmodetrees import TextModeTrees @@ -8,7 +8,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', - empty='include', max_trees=100, html=False, + empty='include', max_trees=0, html=False, shuffle=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -26,6 +26,9 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i self.max_trees = max_trees self.html = html + self.shuffle = shuffle + if shuffle: + random.seed(42) print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -84,43 +87,48 @@ def _is_almost_continuous(self, mention): return True def process_document(self, doc): - printed_trees = 0 + mentions = [] for cluster in doc.coref_clusters.values(): - if not self._ok(len(cluster.mentions) == 1, self.singleton): - continue + if self._ok(len(cluster.mentions) == 1, self.singleton): + mentions.extend(cluster.mentions) + if self.shuffle: + random.shuffle(mentions) + else: + mentions.sort() - for mention in cluster.mentions: - if not self._ok(len(mention.words) == 1, self.oneword): - continue - if not self._ok(',' not in mention.span, self.continuous): - continue - if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): - continue + printed_trees = 0 + for mention in mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + 
continue + if not self._ok(',' not in mention.span, self.continuous): + continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): + continue - empty_mwords = [w for w in mention.words if w.is_empty()] - if not self._ok(len(empty_mwords) > 0, self.empty): - continue + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue - heads, mwords = 0, set(mention.words) - for w in mention.words: - if w.parent: - heads += 0 if w.parent in mwords else 1 - else: - heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 - if not self._ok(heads <= 1, self.treelet): - continue - if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): - continue - if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): - continue + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue - for w in mention.words: - w.misc['Mark'] = 1 - if self.max_trees: - printed_trees += 1 - if printed_trees > self.max_trees: - return - #print(f"{printed_trees}/{self.max_trees}") - self.print_block.process_tree(mention.head.root) - for w in mention.words: - del w.misc['Mark'] + for w in mention.words: + w.misc['Mark'] = 1 + if self.max_trees: + printed_trees += 1 + if printed_trees > self.max_trees: + print(f'######## Only first {self.max_trees} trees printed. 
Use max_trees=0 to see all.')
+                        return
+                self.print_block.process_tree(mention.head.root)
+                for w in mention.words:
+                    del w.misc['Mark']
diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py
index d427098a..f3f6e007 100644
--- a/udapi/block/write/textmodetrees.py
+++ b/udapi/block/write/textmodetrees.py
@@ -365,6 +365,7 @@ def add_node(self, idx, node):
                 values[i] = self.colorize_attr(attr, values[i], marked)
             if not self.color and marked:
                 self.lines[idx] += ' **' + ' '.join(values) + '**'
+                self.lengths[idx] += 4
             else:
                 self.lines[idx] += ' ' + ' '.join(values)

From 93269127f43089e79dc250cfd4968bf0d3aff811 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 13 May 2021 19:20:46 +0200
Subject: [PATCH 0247/1201] corefud.PrintMentions print_other_forms=5 by default

---
 udapi/block/corefud/printmentions.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
index 0c1d4e79..4a122ccb 100644
--- a/udapi/block/corefud/printmentions.py
+++ b/udapi/block/corefud/printmentions.py
@@ -1,4 +1,5 @@
 import random
+from collections import Counter
 from udapi.core.block import Block
 from udapi.block.write.textmodetreeshtml import TextModeTreesHtml
 from udapi.block.write.textmodetrees import TextModeTrees
@@ -8,7 +9,7 @@ class PrintMentions(Block):

     def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                  forest='include', almost_forest='include', oneword='include', singleton='include',
-                 empty='include', max_trees=0, html=False, shuffle=True,
+                 empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
                  print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
                  minimize_cross=True, color=True, attributes='form,upos,deprel',
                  print_undef_as='_', print_doc_meta=True, print_comments=False,
                  mark='(Mark)', hints=True, layout='classic',
                  **kwargs):
         super().__init__(**kwargs)
@@ -29,6 +30,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i
         self.shuffle = shuffle
         if shuffle:
             random.seed(42)
+        self.print_other_forms = print_other_forms
         print_class = TextModeTreesHtml if html else TextModeTrees
         self.print_block = print_class(
@@ -129,6 +131,20 @@ def process_document(self, doc):
                     if printed_trees > self.max_trees:
                         print(f'######## Only first {self.max_trees} trees printed. Use max_trees=0 to see all.')
                         return
+
+                this_form = ' '.join([w.form for w in mention.words])
+                print("# Mention = " + this_form)
+                if self.print_other_forms:
+                    counter = Counter()
+                    for m in mention.cluster.mentions:
+                        forms = ' '.join([w.form for w in m.words])
+                        if forms != this_form:
+                            counter[forms] += 1
+                    if counter:
+                        print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
+                        for form, count in counter.most_common(self.print_other_forms):
+                            print(f' "{form}"({count})', end='')
+                        print()
                 self.print_block.process_tree(mention.head.root)
                 for w in mention.words:
                     del w.misc['Mark']

From 61ba542511507546d0a718458ebf3f1ac85eb903 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 14 May 2021 14:03:21 +0200
Subject: [PATCH 0248/1201] better copula handling

---
 udapi/block/corefud/printmentions.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
index 4a122ccb..e4c4cd5d 100644
--- a/udapi/block/corefud/printmentions.py
+++ b/udapi/block/corefud/printmentions.py
@@ -58,20 +58,27 @@ def _ok(self, condition, value):
             return True
         return (condition and value == 'only') or (not condition and value=='exclude')

-    def _is_auxiliary(self, node):
-        if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'cop', 'aux'}:
+    def _is_auxiliary_etc(self, node):
+        if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'aux', 'vocative'}:
             return True
-        if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ'}:
+        if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
             return True
         return False

     def _is_forest(self, mention, mwords, almost):
         for w in mention.words:
-            for ch in w.children():
+            # UD unfortunately does not use the copula-as-head style for copula constructions,
+            # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children.
+            # However, in the cop-as-head style, only "my" would depend on "fault" (and should be part of the mention).
+            # It is difficult to tell apart which w.children are related to w and which to the copula.
+            # We thus ignore these cases completely (we expect any child is potentially related to the copula).
+            if any(ch.udeprel == 'cop' for ch in w.children):
+                continue
+            for ch in w.children:
                 if ch not in mwords:
                     if not almost:
                         return False
-                    if not (w.parent and w.parent not in mwords and self._is_auxiliary(ch)):
+                    if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
                         return False
         return True

From b08bfdfa6c2ec1759d13f49b45e3c59e74fdcfff Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 14 May 2021 14:16:13 +0200
Subject: [PATCH 0249/1201] aux should not be in the list when copula
 constructions are handled elsewhere

---
 udapi/block/corefud/printmentions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
index e4c4cd5d..33efdd66 100644
--- a/udapi/block/corefud/printmentions.py
+++ b/udapi/block/corefud/printmentions.py
@@ -59,7 +59,7 @@ def _ok(self, condition, value):
         return (condition and value == 'only') or (not condition and value=='exclude')

     def _is_auxiliary_etc(self, node):
-        if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'aux', 'vocative'}:
+        if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}:
             return True
         if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
             return True

From 39134031a5c5e7544dab32f7ec83875dfde35f01 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Tue, 18 May 2021 04:07:12 +0200
Subject: [PATCH 0250/1201] corefud.PrintMentions print_total=1

---
 udapi/block/corefud/printmentions.py | 51 ++++++++++++++++------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
index 33efdd66..e26ee6e2 100644
--- a/udapi/block/corefud/printmentions.py
+++ b/udapi/block/corefud/printmentions.py
@@ -10,6 +10,7 @@ class PrintMentions(Block):
     def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                  forest='include', almost_forest='include', oneword='include', singleton='include',
                  empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
+                 print_total=True,
                  print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
                  minimize_cross=True, color=True, attributes='form,upos,deprel',
                  print_undef_as='_', print_doc_meta=True, print_comments=False,
                  mark='(Mark)', hints=True, layout='classic',
                  **kwargs):
         super().__init__(**kwargs)
         self.continuous = self._convert(continuous)
@@ -31,6 +32,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i
         if shuffle:
             random.seed(42)
         self.print_other_forms = print_other_forms
+        self.print_total = print_total
         print_class = TextModeTreesHtml if html else TextModeTrees
         self.print_block = print_class(
@@ -105,7 +107,7 @@ def process_document(self, doc):
         else:
             mentions.sort()

-        printed_trees = 0
+        seen_trees = 0
         for mention in mentions:
             if not self._ok(len(mention.words) == 1, self.oneword):
                 continue
@@ -133,25 +135,32 @@ def process_document(self, doc):
                 for w in mention.words:
                     w.misc['Mark'] = 1
-                if self.max_trees:
-                    printed_trees += 1
-                    if printed_trees > self.max_trees:
+
+                seen_trees += 1
+                if self.max_trees and seen_trees > self.max_trees:
+                    if not self.print_total:
+                        print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
                         return
+                else:
+                    this_form = ' '.join([w.form for w in mention.words])
+                    print("# Mention = " + this_form)
+                    if self.print_other_forms:
+                        counter = Counter()
+                        for m in mention.cluster.mentions:
+                            forms = ' '.join([w.form for w in m.words])
+                            if forms != this_form:
+                                counter[forms] += 1
+                        if counter:
+                            print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
+                            for form, count in counter.most_common(self.print_other_forms):
+                                print(f' "{form}"({count})', end='')
+                            print()
+                    self.print_block.process_tree(mention.head.root)
+                    for w in mention.words:
+                        del w.misc['Mark']
+
+        if self.print_total:
+            if self.max_trees and seen_trees > self.max_trees:
+                print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
+            print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}')
-                this_form = ' '.join([w.form for w in mention.words])
-                print("# Mention = " + this_form)
-                if self.print_other_forms:
-                    counter = Counter()
-                    for m in mention.cluster.mentions:
-                        forms = ' '.join([w.form for w in m.words])
-                        if forms != this_form:
-                            counter[forms] += 1
-                    if counter:
-                        print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
-                        for form, count in counter.most_common(self.print_other_forms):
-                            print(f' "{form}"({count})', end='')
-                        print()
-                self.print_block.process_tree(mention.head.root)
-                for w in mention.words:
-                    del w.misc['Mark']

From 0d98b5fa5ddeefd991f79482cfcf20e1657dab9b Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Fri, 21 May 2021 19:08:14 +0200
Subject: [PATCH 0251/1201] prevent paired punct attached non-projectively

fixes #87
---
 udapi/block/ud/fixpunct.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
index 6fa2da8f..95cb40d0 100644
--- a/udapi/block/ud/fixpunct.py
+++ b/udapi/block/ud/fixpunct.py
@@ -2,7 +2,7 @@
 Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct,
 and is always attached projectively, usually to the head of a neighboring subtree
-to its left or right.
+to its left or right (see https://universaldependencies.org/u/dep/punct.html).
 Punctuation normally does not have children. If it does, we will fix it first.

 This block tries to re-attach punctuation projectively and according to the guidelines.
@@ -236,12 +236,18 @@ def _fix_pair(self, root, opening_node, closing_node):
         # let's treat the marks as any other (non-pair) punctuation.
         if len(heads) == 0:
             return
-        elif len(heads) == 1:
-            opening_node.parent = heads[0]
-            closing_node.parent = heads[0]
         else:
-            opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0]
-            closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0]
+            # Ideally, there should be only a single head.
+            # If not, we could try e.g. to choose the "widest-span head":
+            # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0]
+            # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0]
+            # which often leads to selecting the same head for the opening and closing punctuation,
+            # ignoring single words inside the paired punct which are non-projectively attached outside.
+            # However, this means that the paired punctuation will be attached non-projectively,
+            # which is forbidden by the UD guidelines.
+            # Thus, we will choose the nearest head, which is the only way to prevent non-projectivities.
+            opening_node.parent = heads[0]
+            closing_node.parent = heads[-1]

         self._punct_type[opening_node.ord] = 'opening'
         self._punct_type[closing_node.ord] = 'closing'

         # In rare cases, non-projective gaps may remain. Let's dirty fix these!
         # E.g. in "the (lack of) reproducibility", the closing parenthesis

From 1eee540ad18c00525941cebc814a76c88550c027 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 24 Jun 2021 17:52:46 +0200
Subject: [PATCH 0252/1201] util.MarkDiff print_stats=6 prints top 6 changes

---
 udapi/block/util/markdiff.py | 43 ++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py
index 22a7a03e..3d183f57 100644
--- a/udapi/block/util/markdiff.py
+++ b/udapi/block/util/markdiff.py
@@ -1,5 +1,7 @@
 """util.MarkDiff is a special block for marking differences between parallel trees."""
+import collections
 import difflib
+import pprint
 from udapi.core.block import Block

@@ -7,13 +9,25 @@ class MarkDiff(Block):
     """Mark differences between parallel trees."""

     def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc',
-                 mark=1, add=False, **kwargs):
-        """Create the Mark block object."""
+                 mark=1, add=False, print_stats=0, **kwargs):
+        """Create the Mark block object.
+        Params:
+        gold_zone: Which of the zones should be treated as gold?
+            (The changes are interpreted as going from a "pred"=predicted zone into the gold zone.)
+        attributes: Which node attributes should be considered when searching for diffs?
+            The tree topology, i.e. the node's parent, is always considered.
+        mark: What value should be used in `node.misc['Mark']` of the differing nodes?
+        add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block,
+            so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block.
+        print_stats: How many lines of statistics should be printed? -1 means all.
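+        Example (a sketch; assumes the input document contains a zone named "gold"
+        in addition to the predicted zone)::
+            udapy -TM util.MarkDiff gold_zone=gold print_stats=10 < in.conllu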
+ """ super().__init__(**kwargs) self.gold_zone = gold_zone self.attrs = attributes.split(',') self.mark = mark self.add = add + self.print_stats = print_stats + self.stats = collections.Counter() def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -49,6 +63,31 @@ def process_tree(self, tree): if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: p_node.misc['Mark'] = self.mark g_node.misc['Mark'] = self.mark + self.stats['ONLY-PARENT-CHANGED'] += 1 else: for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: node.misc['Mark'] = self.mark + if self.print_stats: + if edit == 'replace': + # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED + n = min(pred_hi - pred_lo, gold_hi - gold_lo) + for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]): + for attr in self.attrs: + p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) + if p_value != g_value: + self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 + if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['PARENT-CHANGED'] += 1 + pred_lo, gold_lo = pred_lo + n, gold_lo + n + for node in gold_nodes[gold_lo:gold_hi]: + self.stats['ADD-WORD'] += 1 + self.stats['ADD-LEMMA: ' + node.lemma] += 1 + for node in pred_nodes[pred_lo:pred_hi]: + self.stats['DELETE-WORD'] += 1 + self.stats['DELETE-LEMMA: ' + node.lemma] += 1 + + def process_end(self): + if self.print_stats: + how_many = None if self.print_stats in (-1, '-1') else self.print_stats + for edit, count in self.stats.most_common(how_many): + print(f'{count:4} {edit}') From 79f4cb5e1839b8250a6398f5fd00182bddb5417e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 7 Jul 2021 16:58:54 +0200 Subject: [PATCH 0253/1201] prevent fatal errors in eval.F1 `eval.F1` calls `self.before_process_document(None)` --- udapi/core/basewriter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 0db348a8..cc72c6e7 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -45,7 +45,8 @@ def next_filename(self): return self.files.next_filename() def before_process_document(self, document): - udapi.core.coref.store_coref_to_misc(document) + if document: + udapi.core.coref.store_coref_to_misc(document) if self.orig_files == '': logging.info('Writing to filehandle.') sys.stdout = self.files.filehandle From 368349003b4388ac9217ef1f7a4e844469e933fb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 23 Jul 2021 16:57:51 +0200 Subject: [PATCH 0254/1201] eval.F1 should not fail if some node attributes are None --- udapi/block/eval/f1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 982e4190..9f265ac7 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -110,8 +110,8 @@ def process_tree(self, tree): return self.visited_zones[tree.zone] += 1 - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants] # lcs("abc", "acb") can be either "ab" or "ac". # We want to prefer the LCS with the highest number of non-focused tokens. 
From 2fe54cde2d8465527092b8804b624d1e35bdf7cf Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Sep 2021 11:41:42 +0200 Subject: [PATCH 0255/1201] Archiving work I did with German HDT between UD 2.8 and 2.9. I only now realize that I could have used the existing block ud.de.AddMwt. However, there are some differences, so for the time being, I am keeping both blocks. --- udapi/block/ud/de/fixhdt.py | 109 ++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 udapi/block/ud/de/fixhdt.py diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py new file mode 100644 index 00000000..a3792a96 --- /dev/null +++ b/udapi/block/ud/de/fixhdt.py @@ -0,0 +1,109 @@ +""" +Block to fix annotation of UD German-HDT. + +It was created independently of ud.de.AddMwt but it aims to do essentially the +same thing. Future work: make the two blocks converge. + +Currently known differences: +- This block covers a wider range of contractions. +- This block generates morphological features for the syntactic words. +- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT). +- This block overrides the default attachment when the original relation is root, conj, reparandum. +- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures. +""" +from udapi.core.block import Block +import logging +import re + +class FixHDT(Block): + + def process_node(self, node): + # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD. + # The following contractions have been observed: + # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur + if node.upos == 'ADP' and node.feats['PronType'] == 'Art': + if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE): + # We need two nodes instead of one. Create a node. + # The parent should not be the root but unfortunately it is not guaranteed. + node2 = node.create_child() + node2.shift_after_node(node) + if not re.match(r"^(root|conj|reparandum)$", node.udeprel): + node2.parent = node.parent + node.deprel = 'case' + node2.deprel = 'det' + mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc) + node.misc['SpaceAfter'] = '' + # We want to respect the original letter case in the forms of the syntactic words. + # We can use the isupper() method to find out whether all letters are uppercase. + # However, detecting first-letter capitalization requires more work. 
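+                # E.g. "ZUM" yields up=2 (all caps), "Zum" yields up=1 (first letter capitalized)
+                # and "zum" yields up=0; the article (up2) inherits only the all-caps variant,
+                # so "ZUM" is split to "ZU" + "DEM" while "Zum" is split to "Zu" + "dem".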
+ up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0 + up2 = 2 if up == 2 else 0 + if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'an') + node.lemma = 'an' + elif re.match(r"^aufs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'auf') + node.lemma = 'auf' + elif re.match(r"^beim$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'bei') + node.lemma = 'bei' + elif re.match(r"^durchs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'durch') + node.lemma = 'durch' + elif re.match(r"^fürs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'für') + node.lemma = 'für' + elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'hinter') + node.lemma = 'hinter' + elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'in') + node.lemma = 'in' + elif re.match(r"^übers$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'über') + node.lemma = 'über' + elif re.match(r"^ums$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'um') + node.lemma = 'um' + elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'unter') + node.lemma = 'unter' + elif re.match(r"^vom$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'von') + node.lemma = 'von' + elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'vor') + node.lemma = 'vor' + elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'zu') + node.lemma = 'zu' + node.upos = 'ADP' + node.xpos = 'APPR' + node.feats = '_' + node.feats['AdpType'] = 'Prep' + # We must use search() because match() only checks at the beginning of the string. + if re.search("[m\.]$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'dem') + node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + elif re.search("s$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'das') + node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Acc' + node2.lemma = 'der' + elif re.search("r$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'der') + node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + node2.upos = 'DET' + node2.xpos = 'ART' + +def mimic_case(up, x): + if up >= 2: + return x.upper() + elif up == 1: + return x[:1].upper() + x[1:].lower() + else: + return x.lower() From 33d8e1ce46c22f614e7fc451422278039dfafbb0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 20:15:59 +0200 Subject: [PATCH 0256/1201] German preposition-article contractions. 
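The new entry covers "hinters" (a contraction of "hinter" + "das"); the
unterm/unters and vorm/vors pairs are only re-ordered alphabetically.
A sketch of the intended analysis (assuming the usual ADP + DET split
performed by this block, as in the analogous Catalan and Spanish blocks):

    hinters  ->  hinter/ADP  +  das/DET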
--- udapi/block/ud/de/addmwt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py index 23ac54f9..18778a4a 100644 --- a/udapi/block/ud/de/addmwt.py +++ b/udapi/block/ud/de/addmwt.py @@ -16,15 +16,16 @@ 'durchs': {'form': 'durch das', }, 'fürs': {'form': 'fürs das', }, 'hinterm': {'form': 'hinter dem', }, + 'hinters': {'form': 'hinter das', }, 'im': {'form': 'in dem', }, 'ins': {'form': 'in das', }, 'übers': {'form': 'über das', }, 'ums': {'form': 'um das', }, - 'unters': {'form': 'unter das', }, 'unterm': {'form': 'unter dem', }, + 'unters': {'form': 'unter das', }, 'vom': {'form': 'von dem', }, - 'vors': {'form': 'vor das', }, 'vorm': {'form': 'vor dem', }, + 'vors': {'form': 'vor das', }, 'zum': {'form': 'zu dem', }, 'zur': {'form': 'zu der', }, } From 1487de76a72fa42b95274bf4237af6fb7d55b7e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 20:52:32 +0200 Subject: [PATCH 0257/1201] Archiving my fixes for Catalan and Spanish. --- udapi/block/ud/ca/__init__.py | 0 udapi/block/ud/ca/addmwt.py | 194 ++++++++++++++++++++++++++++++++++ udapi/block/ud/es/addmwt.py | 9 +- 3 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 udapi/block/ud/ca/__init__.py create mode 100644 udapi/block/ud/ca/addmwt.py diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py new file mode 100644 index 00000000..49b79da1 --- /dev/null +++ b/udapi/block/ud/ca/addmwt.py @@ -0,0 +1,194 @@ +"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions. + +According to the UD guidelines, contractions such as "del" = "de el" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. +""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. 
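+            # (For these relations the original deprel must stay on a single node,
+            # so the two new words are created as one subtree instead of two siblings.)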
+ if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + return None + + def fix_personal_pronoun(self, node): + # There is a mess in lemmas and features of personal pronouns. + if node.upos == 'PRON': + if re.match("^jo$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Nom|Number=Sing|Person=1|PronType=Prs' + if re.match("^(em|m'|-me|'m|me|m)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs' + if re.match("^mi$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs' + if re.match("^tu$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs' + if re.match("^(et|t'|-te|'t|te|t)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs' + if re.match("^ti$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). + if re.match("^ell$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^ella$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(el|-lo|'l|lo)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(la|-la)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(l')$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(ho|-ho)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs' + if re.match("^(li|-li)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Dat|Number=Sing|Person=3|PronType=Prs' + if re.match("^(es|s'|-se|'s|se|s)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes' + if re.match("^si$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes' + # If nosaltres can be used after a preposition, we should not tag it as nominative. + if re.match("^nosaltres$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Plur|Person=1|PronType=Prs' + # Nós is the majestic first person singular. In accusative and dative, it is identical to first person plural. + if re.match("^nós$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Sing|Person=1|Polite=Form|PronType=Prs' + if re.match("^(ens|-nos|'ns|nos|ns)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs' + if re.match("^vosaltres$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|PronType=Prs' + # Vós is the formal second person singular. 
In accusative and dative, it is identical to second person plural.
+            # Vostè is even more formal than vós. In accusative and dative, it is identical to third person singular.
+            if re.match("^(vós|vostè)$", node.form, re.IGNORECASE):
+                node.lemma = 'tu'
+                node.feats = 'Number=Sing|Person=2|Polite=Form|PronType=Prs'
+            if re.match("^vostès$", node.form, re.IGNORECASE):
+                node.lemma = 'tu'
+                node.feats = 'Number=Plur|Person=2|Polite=Form|PronType=Prs'
+            if re.match("^(us|-vos|-us|vos)$", node.form, re.IGNORECASE):
+                node.lemma = 'tu'
+                node.feats = 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs'
+            # Strong forms of third person pronouns can be used as subjects or after preposition.
+            # Do not mark them as nominative (because of the prepositions).
+            if re.match("^ells$", node.form, re.IGNORECASE):
+                node.lemma = 'ell'
+                node.feats = 'Gender=Masc|Number=Plur|Person=3|PronType=Prs'
+            if re.match("^elles$", node.form, re.IGNORECASE):
+                node.lemma = 'ell'
+                node.feats = 'Gender=Fem|Number=Plur|Person=3|PronType=Prs'
+            # Els is masculine accusative, or dative in any gender.
+            if re.match("^(els|-los|'ls|los|ls)$", node.form, re.IGNORECASE):
+                node.lemma = 'ell'
+                node.feats = 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs'
+            if re.match("^(les|-les)$", node.form, re.IGNORECASE):
+                node.lemma = 'ell'
+                node.feats = 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs'
+            # There are also "adverbial" pronominal clitics that can occur at direct object positions.
+            if re.match("^(en|n'|'n|-ne|n|ne)$", node.form, re.IGNORECASE):
+                node.lemma = 'en'
+                node.feats = 'Case=Gen|Person=3|PronType=Prs'
+            if re.match("^(hi|-hi)$", node.form, re.IGNORECASE):
+                node.lemma = 'hi'
+                node.feats = 'Case=Loc|Person=3|PronType=Prs'
+
+    def report_suspicious_lemmas(self, node):
+        # There are offset issues in split multi-word expressions.
+        # Sometimes a word gets the lemma of the neighboring word.
+        if node.form.lower()[:1] != node.lemma.lower()[:1]:
+            # Exclude legitimate cases where the lemma starts with a different letter.
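+            # (E.g. the clitic form "em" legitimately has the lemma "jo" and "et" has "tu";
+            # such pairs are whitelisted below.)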
+ hit = True + if node.lemma == 'jo' and re.match("(em|ens|m'|me|mi|nos|nosaltres|'ns)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'tu' and re.match("(et|'t|us|vosaltres|vostè)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'el' and re.match("(la|l|l'|les)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ell' and re.match("(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'es' and re.match("(s|se)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'em' and re.match("('m|m|m')", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'en' and re.match("('n|n'|ne|-ne)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'anar' and re.match("(va|van|vàrem)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ser' and re.match("(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'estar' and re.match("(sigut)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'caure' and re.match("(queia|queies|quèiem|quèieu|queien)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ampli' and re.match("(àmplia|àmplies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'indi' and re.match("(índies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'obvi' and re.match("(òbvia)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ossi' and re.match("(òssies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ús' and re.match("(usos)", node.form, re.IGNORECASE): + hit = False + # Form = '2001/37/CE', lemma = 'CE' + # Form = 'nº5', lemma = '5' + # Form = 'kg.', lemma = 'quilogram' + # Form = 'un', lemma = '1' + if node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE) or re.match("^quil[oò]", node.lemma, re.IGNORECASE) or re.match("^[0-9]+$", node.lemma): + hit = False + if hit: + print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address())) diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py index ee85b1d6..92f80160 100644 --- a/udapi/block/ud/es/addmwt.py +++ b/udapi/block/ud/es/addmwt.py @@ -1,6 +1,6 @@ """Block ud.es.AddMwt for heuristic detection of Spanish contractions. -According to the UD guidelines, contractions such as "dele" = "de ele" +According to the UD guidelines, contractions such as "del" = "de el" should be annotated using multi-word tokens. Note that this block should be used only for converting legacy conllu files. @@ -28,7 +28,7 @@ v['lemma'] = v['form'] v['upos'] = 'ADP DET' v['deprel'] = '* det' - v['feats'] = '_ *' + v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art' # The following are the default values # v['main'] = 0 # which of the two words will inherit the original children (if any) # v['shape'] = 'siblings', # the newly created nodes will be siblings @@ -46,6 +46,11 @@ def multiword_analysis(self, node): analysis = MWTS.get(node.form.lower(), None) if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. 
+ analysis = analysis.copy() + analysis['shape'] = 'subtree' return analysis if not self.verbpron or node.upos not in {'VERB', 'AUX'}: From 27bf3a3b82b4a88ecaeee2bc2843efaf383bc249 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Sep 2021 21:02:48 +0200 Subject: [PATCH 0258/1201] Archiving my fixes for Indonesian GSD. --- udapi/block/ud/id/__init__.py | 0 udapi/block/ud/id/fixgsd.py | 67 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 udapi/block/ud/id/__init__.py create mode 100644 udapi/block/ud/id/fixgsd.py diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py new file mode 100644 index 00000000..1ab30dd0 --- /dev/null +++ b/udapi/block/ud/id/fixgsd.py @@ -0,0 +1,67 @@ +"""Block to fix annotation of UD Indonesian-GSD.""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def lemmatize_verb_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. + # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + if node.upos == "VERB": + morphind = node.misc["MorphInd"] + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_VS[AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + if len(morphemes) > 1 and re.match(r"^(kan|i|an)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s'" % (len(morphemes), morphemes, morphind)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>$", "", lemma) + node.lemma = lemma + + def merge_reduplicated_plural(self, node): + # Instead of compound:plur, merge the reduplicated plurals into a single token. + if node.deprel == "compound:plur": + root = node.root + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r"^(-|–|--)$", hyph.form): + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + first.form = first.form + "-" + node.form + first.feats["Number"] = "Plur" + if node.no_space_after: + first.misc["SpaceAfter"] = "No" + else: + first.misc["SpaceAfter"] = "" + hyph.remove() + node.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. 
+                    # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-').
+                    root.text = root.compute_text()
+
+    def process_node(self, node):
+        self.lemmatize_verb_from_morphind(node)

From 9cb18121557f553068fc4c17bd5feea16227ef96 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 21:05:35 +0200
Subject: [PATCH 0259/1201] "per-" is also a prefix that should be removed from
 the lemma (confirmed by Ika).

---
 udapi/block/ud/id/fixgsd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 1ab30dd0..a629d712 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -23,7 +23,7 @@ def lemmatize_verb_from_morphind(self, node):
             del morphemes[-1]
         # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all.
         # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+".
-        while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se)$", morphemes[0]):
+        while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]):
             del morphemes[0]
         # Check that we are left with just one morpheme.
         if len(morphemes) != 1:

From e8194c4dc6726685d26eee5f9761f2ef1335fc31 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 21:10:26 +0200
Subject: [PATCH 0260/1201] Extended debugging message.

---
 udapi/block/ud/id/fixgsd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index a629d712..b3328273 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -27,7 +27,7 @@ def lemmatize_verb_from_morphind(self, node):
             del morphemes[0]
         # Check that we are left with just one morpheme.
         if len(morphemes) != 1:
-            logging.warning("One morpheme expected, found %d %s, morphind = '%s'" % (len(morphemes), morphemes, morphind))
+            logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s'" % (len(morphemes), morphemes, morphind, node.form))
         else:
             lemma = morphemes[0]
             # Remove the stem POS category.

From bc0a1d3e969698be93cd9bf8e2bd5a0aa67cad9c Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 21:59:46 +0200
Subject: [PATCH 0261/1201] Indonesian fixes.

---
 udapi/block/ud/id/addmwt.py | 16 ++++++++++++++++
 udapi/block/ud/id/fixgsd.py |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 udapi/block/ud/id/addmwt.py

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
new file mode 100644
index 00000000..3fe39dd8
--- /dev/null
+++ b/udapi/block/ud/id/addmwt.py
@@ -0,0 +1,16 @@
+"""
+Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with
+MorphInd whose output is stored in MISC attribute MorphInd).
+"""
+import udapi.block.ud.addmwt
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+    """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+    def multiword_analysis(self, node):
+        """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE):
+            splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE)
+            # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'shape': 'subtree', 'deprel': '* obj'}
+        return None

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index b3328273..0ec79f2e 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -27,7 +27,7 @@ def lemmatize_verb_from_morphind(self, node):
             del morphemes[0]
         # Check that we are left with just one morpheme.
         if len(morphemes) != 1:
-            logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s'" % (len(morphemes), morphemes, morphind, node.form))
+            logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats))
         else:
             lemma = morphemes[0]
             # Remove the stem POS category.
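Aside: the analysis dictionary returned by multiword_analysis() uses a compact convention shared by all AddMwt blocks: each value is space-separated, one field per new syntactic word, and '*' copies the corresponding attribute of the original token. A minimal sketch on a made-up example ('melihatnya' = 'to see him/her/it'; the split below is an assumption for illustration, not corpus data):

    # Hypothetical analysis dict in the ud.AddMwt convention.
    analysis = {
        'form': 'melihat nya',   # space-separated: one field per new syntactic word
        'lemma': 'melihat nya',
        'upos': 'VERB PRON',
        'deprel': '* obj',       # '*' means: keep the original token's value
        'shape': 'subtree',      # attach the second word under the first
    }
    words = list(zip(analysis['form'].split(), analysis['upos'].split(), analysis['deprel'].split()))
    print(words)  # [('melihat', 'VERB', '*'), ('nya', 'PRON', 'obj')]
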
From 70361e213d59709d0793992b01476bf5373a6e46 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:07:29 +0200
Subject: [PATCH 0262/1201] Features of -nya.

---
 udapi/block/ud/id/addmwt.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 3fe39dd8..2d46fb67 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -11,6 +11,15 @@ def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
         if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE):
             splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE)
+            # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
+            # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
+            if node.feats["Number[psor]"] != "Sing":
+                logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats["Number[psor]"]))
+            if node.feats["Person[psor]"] != "3":
+                logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats["Person[psor]"]))
+            node.feats["Number[psor]"] = ''
+            node.feats["Person[psor]"] = ''
+            pronfeats = 'Number=Sing|Person=3|PronType=Prs'
             # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
-            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'shape': 'subtree', 'deprel': '* obj'}
+            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'shape': 'subtree', 'deprel': '* obj'}
         return None

From 87909af8a5ff6e4d19fcdd4640babe1f502e260e Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:09:25 +0200
Subject: [PATCH 0263/1201] Bug fix.

---
 udapi/block/ud/id/addmwt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 2d46fb67..4a2158c4 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -3,6 +3,8 @@
 MorphInd whose output is stored in MISC attribute MorphInd).
 """
 import udapi.block.ud.addmwt
+import logging
+import re
 
 class AddMwt(udapi.block.ud.addmwt.AddMwt):
     """Detect and mark MWTs (split them into words and add the words to the tree)."""
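Aside: the two "Bug fix." commits that follow address a classic signature mistake. re.sub() takes the target string as its third positional argument, so a flag passed positionally is misinterpreted. For reference (standard library behavior, not udapi code):

    import re

    # Signature: re.sub(pattern, repl, string, count=0, flags=0)
    # re.sub(r'(nya)$', r' \1', re.IGNORECASE)        # TypeError: the flag is taken as 'string'
    # re.sub(r'(nya)$', r' \1', flags=re.IGNORECASE)  # TypeError: 'string' is still missing
    print(re.sub(r'(nya)$', r' \1', 'melihatNYA', flags=re.IGNORECASE))  # 'melihat NYA'
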
From 0a5148670f4133505d9e0524b41cedcbec923713 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:13:05 +0200
Subject: [PATCH 0264/1201] Bug fix.

---
 udapi/block/ud/id/addmwt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 4a2158c4..da5219f0 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -12,7 +12,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
         if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE):
-            splitform = re.sub(r'(nya)$', r' \1', re.IGNORECASE)
+            splitform = re.sub(r'(nya)$', r' \1', flags=re.IGNORECASE)

From 141f100c71ae5ec33c3e126841d63234e3c16f1a Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:14:09 +0200
Subject: [PATCH 0265/1201] Bug fix.

---
 udapi/block/ud/id/addmwt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index da5219f0..c0c40486 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -12,7 +12,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
         if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE):
-            splitform = re.sub(r'(nya)$', r' \1', flags=re.IGNORECASE)
+            splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)

From da307863288f6e9872b2f6f6ace9555f1763ecf4 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:21:20 +0200
Subject: [PATCH 0266/1201] Narrowing down the conditions for -nya.

---
 udapi/block/ud/id/addmwt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index c0c40486..023f10fa 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -11,7 +11,7 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
 
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
-        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE):
+        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3$', node.misc["MorphInd"]):
             splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
             # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
             # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
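Aside: for orientation, a sketch of the MorphInd condition just added. The analysis string below is a constructed example of the ^prefix+stem<pos>_TAG$ format, not a verified MorphInd output:

    import re

    morphind = '^meN+lihat<v>_VSA+dia<p>_PS3$'  # assumed analysis of 'melihatnya' ("sees it")
    # The token is only split when the analysis ends in the clitic pronoun +dia<p>_PS3
    # (the exact anchoring of the pattern is refined in the next patch).
    print(bool(re.search(r'\+dia<p>_PS3\$$', morphind)))          # True
    print(bool(re.search(r'\+dia<p>_PS3\$$', '^rumah<n>_NSD$')))  # False: no clitic, no MWT
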

From 0d54d60ba290b9bf3cbb0d245fed8eda923ff154 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 11 Sep 2021 22:52:41 +0200
Subject: [PATCH 0267/1201] More refined -nya segmentation.

---
 udapi/block/ud/id/addmwt.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 023f10fa..6231c7e6 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -11,17 +11,26 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
 
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
-        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3$', node.misc["MorphInd"]):
+        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3\$$', node.misc['MorphInd']):
             splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
             # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
             # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
-            if node.feats["Number[psor]"] != "Sing":
-                logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats["Number[psor]"]))
-            if node.feats["Person[psor]"] != "3":
-                logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats["Person[psor]"]))
-            node.feats["Number[psor]"] = ''
-            node.feats["Person[psor]"] = ''
+            if node.feats['Number[psor]'] != 'Sing':
+                logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]']))
+            if node.feats['Person[psor]'] != '3':
+                logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
+            node.feats['Number[psor]'] = ''
+            node.feats['Person[psor]'] = ''
             pronfeats = 'Number=Sing|Person=3|PronType=Prs'
+            xpos = re.sub(r'\+', ' ', node.xpos)
+            deprel = 'obl:agent' if re.match(r'^\^di\+', node.misc['MorphInd']) else 'obj'
             # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
-            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'shape': 'subtree', 'deprel': '* obj'}
+            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'xpos': xpos, 'shape': 'subtree', 'deprel': '* '+deprel}
         return None
+
+    def postprocess_mwt(self, mwt):
+        """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs."""
+        match = re.match(r'^\^(.*)\+(dia<p>_PS3)\$$', mwt.misc['MorphInd'])
+        if match:
+            mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$'
+            mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$'

From 1eaaebdb7311089921161e2045b407a95387d442 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 10:46:41 +0200
Subject: [PATCH 0268/1201] Indonesian proper nouns do not form plural.

---
 udapi/block/ud/id/fixgsd.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 0ec79f2e..552d6743 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -63,5 +63,17 @@ def merge_reduplicated_plural(self, node):
         # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-').
         root.text = root.compute_text()
 
+    def fix_plural_propn(self, node):
+        """
+        It is unlikely that a proper noun will have a plural form in Indonesian.
+        All examples observed in GSD should actually be tagged as common nouns.
+        """
+        if node.upos == 'PROPN' and node.feats['Number'] == 'Plur':
+            node.upos = 'NOUN'
+            node.lemma = node.lemma.lower()
+        if node.upos == 'PROPN':
+            node.feats['Number'] = ''
+
     def process_node(self, node):
-        self.lemmatize_verb_from_morphind(node)
+        self.fix_plural_propn(node)
+        #self.lemmatize_verb_from_morphind(node)
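Aside: a sketch of what postprocess_mwt() above does with the MISC value, using the same assumed analysis string as before (illustrative, not verified MorphInd output):

    import re

    mwt_morphind = '^meN+lihat<v>_VSA+dia<p>_PS3$'
    match = re.match(r'^\^(.*)\+(dia<p>_PS3)\$$', mwt_morphind)
    if match:
        verb_part = '^' + match.group(1) + '$'  # '^meN+lihat<v>_VSA$' -> later lemmatized to 'lihat'
        pron_part = '^' + match.group(2) + '$'  # '^dia<p>_PS3$'
        print(verb_part, pron_part)
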

From 3db7f05bf8075e81e6c274db0d22b8d8de5b4eef Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 12:34:43 +0200
Subject: [PATCH 0269/1201] Modified splitting of VERB+nya after some more
 input from Ika.

---
 udapi/block/ud/id/addmwt.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 6231c7e6..cc3a0fee 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -13,6 +13,13 @@ def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
         if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3\$$', node.misc['MorphInd']):
             splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+            # For transitive verbs with the meN- prefix, -nya is an object clitic.
+            # For passive verbs with the di- prefix, -nya refers to a passive agent.
+            # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization.
+            # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive).
+            menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False
+            diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False
+            nominalization = not menverb and not diverb
             # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
             # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
             if node.feats['Number[psor]'] != 'Sing':
@@ -21,11 +28,19 @@ def multiword_analysis(self, node):
                 logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
             node.feats['Number[psor]'] = ''
             node.feats['Person[psor]'] = ''
-            pronfeats = 'Number=Sing|Person=3|PronType=Prs'
+            if nominalization:
+                lemma = splitform.lower()
+                upos = 'VERB DET'
+                feats = '* Definite=Def|PronType=Art'
+                deprel = '* det'
+            else:
+                lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                upos = 'VERB PRON'
+                feats = '* Number=Sing|Person=3|PronType=Prs'
+                deprel = '* obj:agent' if diverb else '* obj'
             xpos = re.sub(r'\+', ' ', node.xpos)
-            deprel = 'obl:agent' if re.match(r'^\^di\+', node.misc['MorphInd']) else 'obj'
             # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
-            return {'form': splitform, 'lemma': splitform, 'upos': 'VERB PRON', 'feats': '* '+pronfeats, 'xpos': xpos, 'shape': 'subtree', 'deprel': '* '+deprel}
+            return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
         return None

From ad0f1340276237b02ed2e0423147cbdb5b11a9b9 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 14:29:27 +0200
Subject: [PATCH 0270/1201] Indonesian -nya with nouns.

---
 udapi/block/ud/id/addmwt.py | 75 +++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index cc3a0fee..e22f27bd 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -11,36 +11,55 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
 
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
-        if node.upos == 'VERB' and re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3\$$', node.misc['MorphInd']):
+        if re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3\$$', node.misc['MorphInd']):
+            if node.upos == 'VERB':
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                # For transitive verbs with the meN- prefix, -nya is an object clitic.
+                # For passive verbs with the di- prefix, -nya refers to a passive agent.
+                # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization.
+                # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive).
+                menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False
+                diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False
+                nominalization = not menverb and not diverb
+                # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
+                # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
+                if node.feats['Number[psor]'] != 'Sing':
+                    logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]']))
+                if node.feats['Person[psor]'] != '3':
+                    logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
+                node.feats['Number[psor]'] = ''
+                node.feats['Person[psor]'] = ''
+                if nominalization:
+                    lemma = splitform.lower()
+                    upos = 'VERB DET'
+                    feats = '* Definite=Def|PronType=Art'
+                    deprel = '* det'
+                else:
+                    lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                    upos = 'VERB PRON'
+                    feats = '* Number=Sing|Person=3|PronType=Prs'
+                    deprel = '* obj:agent' if diverb else '* obj'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            elif re.match(r'(NOUN|PROPN|X)', node.upos):
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3.
+                # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3.
+                if node.feats['Number[psor]'] != 'Sing':
+                    logging.warning("Noun '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]']))
+                if node.feats['Person[psor]'] != '3':
+                    logging.warning("Noun '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
+                node.feats['Number[psor]'] = ''
+                node.feats['Person[psor]'] = ''
                 lemma = re.sub(r' nya$', ' dia', splitform.lower())
-                upos = 'VERB PRON'
+                upos = '* PRON'
                 feats = '* Number=Sing|Person=3|PronType=Prs'
-                deprel = '* obj:agent' if diverb else '* obj'
-            xpos = re.sub(r'\+', ' ', node.xpos)
-            # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
-            return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                deprel = '* nmod:poss'
+                # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            return None
         return None
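Aside: the branching introduced in the last two patches can be summarized as a small decision table keyed on the MorphInd prefix; the analysis strings below are illustrative assumptions in the same format as before:

    import re

    def nya_analysis(morphind):
        """Sketch of the assumed decision logic: meN- verb -> object clitic,
        di- verb -> passive agent, anything else (ber-, ter-, bare stems) ->
        nominalizing definite article."""
        if re.match(r'^\^meN\+', morphind):
            return 'PRON', 'obj'
        if re.match(r'^\^di\+', morphind):
            return 'PRON', 'obj:agent'  # changed to obl:agent in a later patch below
        return 'DET', 'det'

    print(nya_analysis('^meN+lihat<v>_VSA+dia<p>_PS3$'))  # ('PRON', 'obj')
    print(nya_analysis('^di+lihat<v>_VSP+dia<p>_PS3$'))   # ('PRON', 'obj:agent')
    print(nya_analysis('^ber+main<v>_VSA+dia<p>_PS3$'))   # ('DET', 'det')
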
From 62b8f93d35142e36722c2fc5ed5dca1073357321 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 14:40:31 +0200
Subject: [PATCH 0271/1201] Warn about unhandled instances of -nya.

---
 udapi/block/ud/id/addmwt.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index e22f27bd..40421611 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -59,7 +59,9 @@ def multiword_analysis(self, node):
                 deprel = '* nmod:poss'
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
-            return None
+            else:
+                logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s'" % (node.form, node.upos))
+                return None
         return None

From f5dde932ee8348abd01078123eedf5c46f7cb4dc Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 14:56:03 +0200
Subject: [PATCH 0272/1201] Indonesian verb lemmatization turned on.

---
 udapi/block/ud/id/fixgsd.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 552d6743..458e41db 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -31,7 +31,7 @@ def lemmatize_verb_from_morphind(self, node):
         else:
             lemma = morphemes[0]
             # Remove the stem POS category.
-            lemma = re.sub(r"<[a-z]+>$", "", lemma)
+            lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma)
             node.lemma = lemma
 
@@ -76,4 +76,4 @@ def fix_plural_propn(self, node):
 
     def process_node(self, node):
         self.fix_plural_propn(node)
-        #self.lemmatize_verb_from_morphind(node)
+        self.lemmatize_verb_from_morphind(node)

From c351c00c12e75ce7cfb802599e3c02258ecb2fc0 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 22:21:54 +0200
Subject: [PATCH 0273/1201] Why we attach -nya as obl:agent.

---
 udapi/block/ud/id/addmwt.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 40421611..1270ba77 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -38,7 +38,13 @@ def multiword_analysis(self, node):
                     lemma = re.sub(r' nya$', ' dia', splitform.lower())
                     upos = 'VERB PRON'
                     feats = '* Number=Sing|Person=3|PronType=Prs'
-                    deprel = '* obj:agent' if diverb else '* obj'
+                    # The agent of the passive verb is coded like a direct object of an active verb,
+                    # so we might want to use obj:agent rather than obl:agent. However, full nominals
+                    # as passive agents can be optionally accompanied by the preposition _oleh_ "by",
+                    # which is an argument in favor of saying that they are oblique. So we currently
+                    # mark all passive agents as obliques, although it is disputable in Austronesian
+                    # languages (unlike Indo-European passives).
+                    deprel = '* obl:agent' if diverb else '* obj'
                 xpos = re.sub(r'\+', ' ', node.xpos)
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}

From ee04cdea67d1edb64d0f9573103818557204eb81 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 12 Sep 2021 22:52:40 +0200
Subject: [PATCH 0274/1201] ke-sama-an

---
 udapi/block/ud/id/fixgsd.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 458e41db..481630fa 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -19,7 +19,10 @@ def lemmatize_verb_from_morphind(self, node):
         # Split morphind to prefix, stem, and suffix.
         morphemes = re.split(r"\+", morphind)
         # Expected suffixes are -kan, -i, -an, or no suffix at all.
-        if len(morphemes) > 1 and re.match(r"^(kan|i|an)$", morphemes[-1]):
+        # There is also the circumfix ke-...-an which seems to be nominalized adjective:
+        # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama";
+        # but I am not sure what is the reason that these are tagged VERB.
+        if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]):
             del morphemes[-1]
         # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all.
         # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+".

From 5659d820afe8ff2dfabf1785a4fdd3e94ab58a3a Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Sep 2021 16:34:22 +0200
Subject: [PATCH 0275/1201] New fixes for Indonesian.

---
 udapi/block/ud/id/addmwt.py | 73 ++++++++++++++++++++++++++++++++++---
 udapi/block/ud/id/fixgsd.py | 48 ++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 1270ba77..7e8db2f0 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -52,10 +52,6 @@ def multiword_analysis(self, node):
                 splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
                 # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3.
                 # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3.
-                if node.feats['Number[psor]'] != 'Sing':
-                    logging.warning("Noun '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]']))
-                if node.feats['Person[psor]'] != '3':
-                    logging.warning("Noun '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
                 node.feats['Number[psor]'] = ''
                 node.feats['Person[psor]'] = ''
                 lemma = re.sub(r' nya$', ' dia', splitform.lower())
@@ -65,8 +61,75 @@ def multiword_analysis(self, node):
                 deprel = '* nmod:poss'
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            elif node.upos == 'ADJ':
+                # nominalized adjective
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                lemma = splitform.lower()
+                upos = 'ADJ DET'
+                feats = '* Definite=Def|PronType=Art'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                deprel = '* det'
+                # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE):
+                # semua = all (DET)
+                # semuanya = nominalization of semua, i.e., 'everything' (PRON)
+                # banyak = many, much (DET)
+                # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON)
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                lemma = splitform.lower()
+                upos = 'DET DET'
+                feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                deprel = '* det'
+                # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE):
+                # satu = one (NUM)
+                # satunya = nominalization of satu, meaning 'the only one'
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                lemma = splitform.lower()
+                upos = 'NUM DET'
+                feats = 'NumType=Card Definite=Def|PronType=Art'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                deprel = '* det'
+                # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+            elif node.upos == 'ADP' and node.xpos == 'R--+PS3' or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE):
+                # Fused preposition and pronoun.
+                # Most of them are recognized as R--+PS3 by MorphInd. However, some are different:
+                # bersamanya = 'with him' = VSA+PS3
+                # dibawahnya = 'under it' = VSP+PS3
+                # didalamnya = 'inside it' = VSP+PS3
+                # sekitarnya = 'around it' = D--+PS3
+                # However:
+                # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3)
+                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                upos = 'ADP PRON'
+                feats = '_ Number=Sing|Person=3|PronType=Prs'
+                xpos = 'R-- PS3'
+                if node.udeprel == 'case':
+                    if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos):
+                        deprel = 'nmod'
+                    else:
+                        deprel = 'obl'
+                else:
+                    deprel = '*'
+                deprel = 'case '+deprel
+                return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel}
             else:
-                logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s'" % (node.form, node.upos))
+                # Do not warn about instances that are known exceptions.
+                # akibatnya = as a result (SCONJ); akibat = result
+                # bukannya = instead (PART); bukan = no, not
+                # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features)
+                # layaknya = like (ADP); layak = worthy
+                # sebaiknya = should (AUX)
+                # sesampainya = once in / arriving at (ADP)
+                # tidaknya = whether or not (PART); tidak = no, not
+                # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'.
+                if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE):
+                    logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos))
+                return None
         return None

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 481630fa..fce6e4f9 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -5,6 +5,52 @@
 
 class FixGSD(Block):
 
+    def fix_upos_based_on_morphind(self, node):
+        """
+        Example from data: "kesamaan", whose correct UPOS is NOUN, as
+        suggested by MorphInd.
+        Based on my observation so far, if there is a different UPOS between
+        the original GSD and MorphInd, it's better to trust MorphInd.
+        I found so many incorrect UPOS in GSD, especially when NOUNs become
+        VERBs and VERBs become NOUNs.
+        I suggest adding Voice=Pass when the script decides that ke-xxx-an is a VERB.
+        """
+        if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE):
+            node.upos = 'NOUN'
+            if node.udeprel == 'acl':
+                node.deprel = 'nmod'
+            elif node.udeprel == 'advcl':
+                node.deprel = 'obl'
+
+    def fix_ordinal_numerals(self, node):
+        """
+        Ordinal numerals should be ADJ NumType=Ord in UD. They have many different
+        UPOS tags in Indonesian GSD. This method harmonizes them.
+        pertama = first
+        kedua = second
+        ketiga = third
+        keempat = fourth
+        kelima = fifth
+        keenam = sixth
+        ketujuh = seventh
+        kedelapan = eighth
+        kesembilan = ninth
+        ke48 = 48th
+        """
+        # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos)
+        if re.match(r'^(pertama|kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE):
+            node.upos = 'ADJ'
+            node.feats['NumType'] = 'Ord'
+            if re.match(r'^(det|nummod|nmod)$', node.udeprel):
+                node.deprel = 'amod'
+        # The following is not an ordinal numeral but I am too lazy to create a separate method for that.
+        elif node.form.lower() == 'semua':
+            # It means 'all'. Originally it was DET, PRON, or ADV.
+            node.upos = 'DET'
+            node.feats['PronType'] = 'Tot'
+            if node.udeprel == 'nmod' or node.udeprel == 'advmod':
+                node.deprel = 'det'
+
     def lemmatize_verb_from_morphind(self, node):
         # The MISC column contains the output of MorphInd for the current word.
         # The analysis has been interpreted wrongly for some verbs, so we need
@@ -79,4 +125,6 @@ def fix_plural_propn(self, node):
 
     def process_node(self, node):
         self.fix_plural_propn(node)
+        self.fix_upos_based_on_morphind(node)
+        self.fix_ordinal_numerals(node)
         self.lemmatize_verb_from_morphind(node)
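Aside: a condensed check of the ordinal harmonization above, on assumed forms and relations (at this point all ke-/pertama forms are treated as ordinals; position-based disambiguation comes in a later patch):

    import re

    ORDINAL = r'^(pertama|kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$'
    for form, deprel in [('kedua', 'nummod'), ('ke-48', 'det'), ('pertamanya', 'amod')]:
        if re.match(ORDINAL, form, re.IGNORECASE):
            new_deprel = 'amod' if re.match(r'^(det|nummod|nmod)$', deprel) else deprel
            print(form, 'ADJ', 'NumType=Ord', new_deprel)
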
From f611e3e74192cef526d7ef702faf3f6c50a7838b Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Sep 2021 17:55:40 +0200
Subject: [PATCH 0276/1201] Splitting -kah, -lah, -pun, -tah in Indonesian.

---
 udapi/block/ud/id/addmwt.py | 13 +++++++++-
 udapi/block/ud/id/fixgsd.py | 51 ++++++++++++++++++++-----------------
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 7e8db2f0..65aed6dc 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -131,11 +131,22 @@ def multiword_analysis(self, node):
                 if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE):
                     logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos))
             return None
+        elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']):
+            splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE)
+            lemma = splitform.lower()
+            upos = '* PART'
+            feats = '* _'
+            xpos = re.sub(r'\+', ' ', node.xpos)
+            if len(xpos.split()) < 2:
+                xpos = xpos + ' T--'
+            deprel = '* advmod:emph'
+            # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+            return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
         return None
 
     def postprocess_mwt(self, mwt):
         """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs."""
-        match = re.match(r'^\^(.*)\+(dia<p>_PS3)\$$', mwt.misc['MorphInd'])
+        match = re.match(r'^\^(.*)\+(dia<p>_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd'])
         if match:
             mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$'
             mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$'

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index fce6e4f9..4ea96968 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -57,31 +57,34 @@ def lemmatize_verb_from_morphind(self, node):
         # to re-interpret it and extract the correct lemma.
         if node.upos == "VERB":
             morphind = node.misc["MorphInd"]
-            # Remove the start and end tags from morphind.
-            morphind = re.sub(r"^\^", "", morphind)
-            morphind = re.sub(r"\$$", "", morphind)
-            # Remove the final XPOS tag from morphind.
-            morphind = re.sub(r"_VS[AP]$", "", morphind)
-            # Split morphind to prefix, stem, and suffix.
-            morphemes = re.split(r"\+", morphind)
-            # Expected suffixes are -kan, -i, -an, or no suffix at all.
-            # There is also the circumfix ke-...-an which seems to be nominalized adjective:
-            # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama";
-            # but I am not sure what is the reason that these are tagged VERB.
-            if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]):
-                del morphemes[-1]
-            # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all.
-            # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+".
-            while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]):
-                del morphemes[0]
-            # Check that we are left with just one morpheme.
-            if len(morphemes) != 1:
-                logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats))
+            if morphind:
+                # Remove the start and end tags from morphind.
+                morphind = re.sub(r"^\^", "", morphind)
+                morphind = re.sub(r"\$$", "", morphind)
+                # Remove the final XPOS tag from morphind.
+                morphind = re.sub(r"_VS[AP]$", "", morphind)
+                # Split morphind to prefix, stem, and suffix.
+                morphemes = re.split(r"\+", morphind)
+                # Expected suffixes are -kan, -i, -an, or no suffix at all.
+                # There is also the circumfix ke-...-an which seems to be nominalized adjective:
+                # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama";
+                # but I am not sure what is the reason that these are tagged VERB.
+                if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]):
+                    del morphemes[-1]
+                # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all.
+                # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+".
+                while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]):
+                    del morphemes[0]
+                # Check that we are left with just one morpheme.
+                if len(morphemes) != 1:
+                    logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats))
+                else:
+                    lemma = morphemes[0]
+                    # Remove the stem POS category.
+                    lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma)
+                    node.lemma = lemma
             else:
-                lemma = morphemes[0]
-                # Remove the stem POS category.
-                lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma)
-                node.lemma = lemma
+                logging.warning("No MorphInd analysis found for form '%s'" % (node.form))
 
     def merge_reduplicated_plural(self, node):
         # Instead of compound:plur, merge the reduplicated plurals into a single token.
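Aside: a sketch of the particle condition added above, with an assumed MorphInd analysis for 'pergilah' ('go!'); the analysis string is illustrative:

    import re

    form, morphind = 'pergilah', '^pergi<v>_VSA+lah_T--$'  # assumed analysis
    if re.search(r'(kah|lah|pun|tah)$', form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', morphind):
        splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', form, flags=re.IGNORECASE)
        print(splitform)  # 'pergi lah' -> the second word becomes PART, deprel advmod:emph
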
From 9dc01a1620237e97d446782714be27343129a942 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Sep 2021 21:47:21 +0200
Subject: [PATCH 0277/1201] More Indonesian clitics.

---
 udapi/block/ud/id/addmwt.py | 82 ++++++++++++++++++++++++++++---------
 udapi/block/ud/id/fixgsd.py |  2 +-
 2 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
index 65aed6dc..7f5ab271 100644
--- a/udapi/block/ud/id/addmwt.py
+++ b/udapi/block/ud/id/addmwt.py
@@ -11,9 +11,30 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt):
 
     def multiword_analysis(self, node):
         """Return a dict with MWT info or None if `node` does not represent a multiword token."""
-        if re.search(r'nya$', node.form, re.IGNORECASE) and re.search(r'\+dia<p>_PS3\$$', node.misc['MorphInd']):
+        if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku<p>_PS1|kamu<p>_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB':
+            splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE)
+            # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
+            # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
+            node.feats['Number[psor]'] = ''
+            node.feats['Person[psor]'] = ''
+            upos = 'PRON VERB'
+            if re.search(r'^ku ', splitform.lower()):
+                lemma = re.sub(r'^ku ', 'aku ', splitform.lower())
+                feats = 'Number=Sing|Person=1|PronType=Prs *'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                if len(xpos.split())<2:
+                    xpos = 'PS1 VSA'
+            else:
+                lemma = re.sub(r'^kau ', 'kamu ', splitform.lower())
+                feats = 'Number=Sing|Person=2|PronType=Prs *'
+                xpos = re.sub(r'\+', ' ', node.xpos)
+                if len(xpos.split())<2:
+                    xpos = 'PS2 VSA'
+            deprel = 'nsubj *'
+            return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel}
+        elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia<p>_PS3|aku<p>_PS1|kamu<p>_PS2)\$$', node.misc['MorphInd']):
             if node.upos == 'VERB':
-                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
                 # For transitive verbs with the meN- prefix, -nya is an object clitic.
                 # For passive verbs with the di- prefix, -nya refers to a passive agent.
                 # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization.
@@ -23,10 +44,6 @@ def multiword_analysis(self, node):
                 nominalization = not menverb and not diverb
                 # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
                 # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
-                if node.feats['Number[psor]'] != 'Sing':
-                    logging.warning("Verb '%s' has Number[psor]=='%s'" % (node.form, node.feats['Number[psor]']))
-                if node.feats['Person[psor]'] != '3':
-                    logging.warning("Verb '%s' has Person[psor]=='%s'" % (node.form, node.feats['Person[psor]']))
                 node.feats['Number[psor]'] = ''
                 node.feats['Person[psor]'] = ''
                 if nominalization:
@@ -35,9 +52,16 @@ def multiword_analysis(self, node):
                     feats = '* Definite=Def|PronType=Art'
                     deprel = '* det'
                 else:
-                    lemma = re.sub(r' nya$', ' dia', splitform.lower())
                     upos = 'VERB PRON'
+                    if re.search(r' nya$', splitform.lower()):
+                        lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                        feats = '* Number=Sing|Person=3|PronType=Prs'
+                    elif re.search(r' ku$', splitform.lower()):
+                        lemma = re.sub(r' ku$', ' aku', splitform.lower())
+                        feats = '* Number=Sing|Person=1|PronType=Prs'
+                    else:
+                        lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+                        feats = '* Number=Sing|Person=2|PronType=Prs'
                     # The agent of the passive verb is coded like a direct object of an active verb,
                     # so we might want to use obj:agent rather than obl:agent. However, full nominals
                     # as passive agents can be optionally accompanied by the preposition _oleh_ "by",
@@ -49,19 +73,26 @@ def multiword_analysis(self, node):
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
             elif re.match(r'(NOUN|PROPN|X)', node.upos):
-                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+                splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
                 # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3.
                 # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3.
                 node.feats['Number[psor]'] = ''
                 node.feats['Person[psor]'] = ''
-                lemma = re.sub(r' nya$', ' dia', splitform.lower())
                 upos = '* PRON'
+                if re.search(r' nya$', splitform.lower()):
+                    lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                    feats = '* Number=Sing|Person=3|PronType=Prs'
+                elif re.search(r' ku$', splitform.lower()):
+                    lemma = re.sub(r' ku$', ' aku', splitform.lower())
+                    feats = '* Number=Sing|Person=1|PronType=Prs'
+                else:
+                    lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+                    feats = '* Number=Sing|Person=2|PronType=Prs'
                 xpos = re.sub(r'\+', ' ', node.xpos)
                 deprel = '* nmod:poss'
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
-            elif node.upos == 'ADJ':
+            elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE):
                 # nominalized adjective
                 splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
                 lemma = splitform.lower()
                 upos = 'ADJ DET'
                 feats = '* Definite=Def|PronType=Art'
                 xpos = re.sub(r'\+', ' ', node.xpos)
                 deprel = '* det'
@@ -95,7 +126,7 @@ def multiword_analysis(self, node):
                 # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
                 return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
-            elif node.upos == 'ADP' and node.xpos == 'R--+PS3' or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE):
+            elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE):
                 # Fused preposition and pronoun.
                 # Most of them are recognized as R--+PS3 by MorphInd. However, some are different:
                 # bersamanya = 'with him' = VSA+PS3
@@ -104,11 +135,20 @@ def multiword_analysis(self, node):
                 # sekitarnya = 'around it' = D--+PS3
                 # However:
                 # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3)
-                splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
-                lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
                 upos = 'ADP PRON'
-                feats = '_ Number=Sing|Person=3|PronType=Prs'
-                xpos = 'R-- PS3'
+                if re.search(r' nya$', splitform.lower()):
+                    lemma = re.sub(r' nya$', ' dia', splitform.lower())
+                    feats = '* Number=Sing|Person=3|PronType=Prs'
+                    xpos = 'R-- PS3'
+                elif re.search(r' ku$', splitform.lower()):
+                    lemma = re.sub(r' ku$', ' aku', splitform.lower())
+                    feats = '* Number=Sing|Person=1|PronType=Prs'
+                    xpos = 'R-- PS1'
+                else:
+                    lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+                    feats = '* Number=Sing|Person=2|PronType=Prs'
+                    xpos = 'R-- PS2'
                 if node.udeprel == 'case':
                     if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos):
                         deprel = 'nmod'
@@ -128,8 +168,8 @@ def multiword_analysis(self, node):
                 # sesampainya = once in / arriving at (ADP)
                 # tidaknya = whether or not (PART); tidak = no, not
                 # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'.
-                if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)nya$', node.form, re.IGNORECASE):
-                    logging.warning("Form '%s' analyzed by MorphInd as having the -nya clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos))
+                if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE):
+                    logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos))
                 return None
         elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']):
@@ -146,7 +186,9 @@ def multiword_analysis(self, node):
 
     def postprocess_mwt(self, mwt):
         """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs."""
-        match = re.match(r'^\^(.*)\+(dia<p>_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd'])
+        match = re.match(r'^\^(.*)\+(aku<p>_PS1|kamu<p>_PS2|dia<p>_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd'])
+        if not match:
+            match = re.match(r'^\^(aku<p>_PS1|kamu<p>_PS2)\+(.*)\$$', mwt.misc['MorphInd'])
         if match:
             mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$'
             mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$'

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 4ea96968..926bc346 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -62,7 +62,7 @@ def lemmatize_verb_from_morphind(self, node):
                 morphind = re.sub(r"^\^", "", morphind)
                 morphind = re.sub(r"\$$", "", morphind)
                 # Remove the final XPOS tag from morphind.
-                morphind = re.sub(r"_VS[AP]$", "", morphind)
+                morphind = re.sub(r"_V[SP][AP]$", "", morphind)
                 # Split morphind to prefix, stem, and suffix.
                 morphemes = re.split(r"\+", morphind)

From 1a95f38523b7fa3e30a9c868269f9330a740237a Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Sep 2021 23:12:24 +0200
Subject: [PATCH 0278/1201] More Indonesian fixes.

---
 udapi/block/ud/id/fixgsd.py | 43 +++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 926bc346..d1c735a4 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -126,8 +126,51 @@ def fix_plural_propn(self, node):
         if node.upos == 'PROPN':
             node.feats['Number'] = ''
 
+    def fix_satu_satunya(self, node):
+        """
+        'satu' = 'one' (NUM)
+        'satu-satunya' = 'the only'
+        """
+        root = node.root
+        if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu':
+            satu0 = node.parent.parent
+            satu1 = node.parent
+            nya = node
+            dash = None
+            if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-':
+                dash = satu1.prev_node
+                satu0.misc['SpaceAfter'] = 'No'
+                dash.misc['SpaceAfter'] = 'No'
+                root.text = root.compute_text()
+            satu1.deprel = 'compound:redup'
+            nya.parent = satu0
+        # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian.
+        if node.form == 'nya' and node.parent.form.lower() == 'satu':
+            satu0 = node.parent
+            nya = node
+            if satu0.next_node.form == '-':
+                dash = satu0.next_node
+                if dash.next_node.form.lower() == 'satu':
+                    satu1 = dash.next_node
+                    if satu1.ord == node.ord-1:
+                        # Merge satu0 + dash + satu1 into one node.
+                        satu0.form = satu0.form + dash.form + satu1.form
+                        dash.remove()
+                        satu1.remove()
+                        # There should be a multi-word token comprising satu1 + nya.
+                        mwt = nya.multiword_token
+                        if mwt:
+                            mwtmisc = mwt.misc.copy()
+                            mwt.remove()
+                            mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc)
+                            satu0.misc['SpaceAfter'] = ''
+                        root.text = root.compute_text()
+        if node.multiword_token and node.no_space_after:
+            node.misc['SpaceAfter'] = ''
+
     def process_node(self, node):
         self.fix_plural_propn(node)
         self.fix_upos_based_on_morphind(node)
         self.fix_ordinal_numerals(node)
         self.lemmatize_verb_from_morphind(node)
+        self.fix_satu_satunya(node)

From bfada9deeb004a60438bb0b9562270f051877d3c Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 19 Sep 2021 22:40:57 +0200
Subject: [PATCH 0279/1201] Distinguishing Indonesian ordinal numerals from
 total cardinal numerals.
--- udapi/block/ud/id/fixgsd.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index d1c735a4..6247cba0 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -36,13 +36,32 @@ def fix_ordinal_numerals(self, node): kedelapan = eighth kesembilan = ninth ke48 = 48th + + However! The ke- forms (i.e., not 'pertama') can also function as total + versions of cardinal numbers ('both', 'all three' etc.). If the numeral + precedes the noun, it is a total cardinal; if it follows the noun, it is + an ordinal. An exception is when the modified noun is 'kali' = 'time'. + Then the numeral is ordinal regardless where it occurs, and together + with 'kali' it functions as an adverbial ordinal ('for the second time'). """ # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) - if re.match(r'^(pertama|kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): node.upos = 'ADJ' node.feats['NumType'] = 'Ord' if re.match(r'^(det|nummod|nmod)$', node.udeprel): node.deprel = 'amod' + elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if node.parent.ord < node.ord or node.parent.lemma == 'kali': + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + else: + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['PronType'] = 'Tot' + if re.match(r'^(det|amod|nmod)$', node.udeprel): + node.deprel = 'nummod' # The following is not an ordinal numeral but I am too lazy to create a separate method for that. elif node.form.lower() == 'semua': # It means 'all'. Originally it was DET, PRON, or ADV. From 12aa517448a8b5fa79af1e115f41dc8d867acbe6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 20 Sep 2021 10:53:11 +0200 Subject: [PATCH 0280/1201] Fix tokenization of ordinal numerals in Indonesian. --- udapi/block/ud/id/fixgsd.py | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 6247cba0..2296fc31 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -70,6 +70,56 @@ def fix_ordinal_numerals(self, node): if node.udeprel == 'nmod' or node.udeprel == 'advmod': node.deprel = 'det' + def rejoin_ordinal_numerals(self, node): + """ + If an ordinal numeral is spelled using digits ('ke-18'), it is often + tokenized as multiple tokens, which is wrong. Fix it. + """ + if node.form.lower() == 'ke': + dash = None + number = None + if node.next_node: + if node.next_node.form == '-': + dash = node.next_node + if dash.next_node and re.match(r'^\d+$', dash.next_node.form): + number = dash.next_node + node.form = node.form + dash.form + number.form + node.lemma = node.lemma + dash.lemma + number.lemma + elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): + number = node.next_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = node.form + '-' + number.form + node.form = node.form + number.form + node.lemma = node.lemma + '-' + number.lemma + if number: + # Let us pretend that these forms are always ordinal numerals. 
+            # Situations where they act as total cardinals will be disambiguated
+            # in a subsequent call to fix_ordinal_numerals().
+            node.upos = 'ADJ'
+            node.xpos = 'CO-'
+            node.feats['NumType'] = 'Ord'
+            node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$'
+            # Find the parent node. Assume that the dash, if present, was not the head.
+            if node.parent == number:
+                node.parent = number.parent
+                node.deprel = number.deprel
+            if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel):
+                node.deprel = 'amod'
+            # Adjust SpaceAfter.
+            node.misc['SpaceAfter'] = 'No' if number.no_space_after else ''
+            # Remove the separate node of the dash and the number.
+            if dash:
+                if len(dash.children) > 0:
+                    for c in dash.children:
+                        c.parent = node
+                dash.remove()
+            if len(number.children) > 0:
+                for c in number.children:
+                    c.parent = node
+            number.remove()
+            # There may have been spaces around the dash, which are now gone. Recompute the sentence text.
+            node.root.text = node.root.compute_text()

From f413c812239863b9e216bf5b258496668494fff0 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Mon, 20 Sep 2021 13:39:20 +0200
Subject: [PATCH 0281/1201] "ke48" is a typo (although it occurs in the
 corpus), the correct spelling is "ke-48".

---
 udapi/block/ud/id/fixgsd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
index 2296fc31..edc71142 100644
--- a/udapi/block/ud/id/fixgsd.py
+++ b/udapi/block/ud/id/fixgsd.py
@@ -35,7 +35,7 @@ def fix_ordinal_numerals(self, node):
     ketujuh = seventh
     kedelapan = eighth
     kesembilan = ninth
-    ke48 = 48th
+    ke-48 = 48th
 
     However! The ke- forms (i.e., not 'pertama') can also function as total
     versions of cardinal numbers ('both', 'all three' etc.). If the numeral

From cda5e7ade81b8d00d921e87cd52f02f6edf2c71e Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Wed, 22 Sep 2021 17:42:06 +0200
Subject: [PATCH 0282/1201] after_process_document should print before
 redirecting fixes #94

---
 udapi/block/write/textmodetreeshtml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py
index 75a39a97..9f9f6aa2 100644
--- a/udapi/block/write/textmodetreeshtml.py
+++ b/udapi/block/write/textmodetreeshtml.py
@@ -53,8 +53,8 @@ def before_process_document(self, document):
             print('%s = %s' % (key, value))
 
     def after_process_document(self, document):
-        super().after_process_document(document)
         print("</body>\n</html>\n")
+        super().after_process_document(document)
 
     def add_node(self, idx, node):
         if not node.is_root():
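Aside: the token-merging commits in this stretch all follow the same recipe: rewrite the surviving node, re-attach children, remove the absorbed nodes, and recompute the sentence text. A condensed sketch over an assumed three-token sequence 'ke', '-', '18' (error handling omitted; this is a simplification of rejoin_ordinal_numerals() above, not a drop-in replacement):

    def merge_ke_number(ke, dash, number):
        ke.form = ke.form + dash.form + number.form        # 'ke-18'
        ke.lemma = ke.lemma + dash.lemma + number.lemma
        for child in list(dash.children) + list(number.children):
            child.parent = ke                              # re-attach orphaned children
        if ke.parent == number:
            ke.parent = number.parent                      # keep the tree rooted correctly
        dash.remove()
        number.remove()
        ke.root.text = ke.root.compute_text()              # spaces around '-' are gone now
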
--- udapi/block/ud/id/fixgsd.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index edc71142..9634bc2d 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -120,6 +120,57 @@ def rejoin_ordinal_numerals(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() + def rejoin_decades(self, node): + """ + In Indonesian, the equivalent of English "1990s" is written as "1990-an". + In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. + """ + if node.form.lower() == 'an': + dash = None + number = None + if node.prev_node: + if node.prev_node.form == '-': + dash = node.prev_node + if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): + number = dash.prev_node + node.form = number.form + dash.form + node.form + node.lemma = number.lemma + dash.lemma + node.lemma + elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): + number = node.prev_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = number.form + '-' + node.form + node.form = number.form + node.form + node.lemma = number.lemma + '-' + node.lemma + if number: + # The combined token is no longer a numeral. It cannot quantify an entity. + # Instead, it is itself something like a noun (or perhaps proper noun). + node.upos = 'NOUN' + node.xpos = 'NSD' + node.feats['NumType'] = '' + # In some cases, "-an" is labeled as foreign for no obvious reason. + node.feats['Foreign'] = '' + node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'nmod' + # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. + #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + def lemmatize_verb_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. # The analysis has been interpreted wrongly for some verbs, so we need @@ -242,5 +293,6 @@ def process_node(self, node): self.fix_upos_based_on_morphind(node) self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) + self.rejoin_decades(node) self.lemmatize_verb_from_morphind(node) self.fix_satu_satunya(node) From e7d7502b79a2170ecad339607d8e6b3f44dc5ba9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 2 Oct 2021 23:32:14 +0200 Subject: [PATCH 0284/1201] More fixes for Indonesian UD. 
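Among other things, the diff below teaches ud.id.AddMwt to split reflexives like 'dirinya'. A minimal sketch of the clitic-to-pronoun mapping it hard-codes follows; the function and its return shape are illustrative assumptions, not Udapi's API:

import re

def analyze_diri(form):
    """Map 'dirinya/diriku/dirimu' to (split form, pronoun lemma, person), else None."""
    m = re.match(r'^diri(nya|ku|mu)$', form, re.IGNORECASE)
    if m is None:
        return None
    clitic = m.group(1).lower()
    # -nya is 3rd person (dia), -ku is 1st (aku), -mu is 2nd (kamu).
    lemma = {'nya': 'dia', 'ku': 'aku', 'mu': 'kamu'}[clitic]
    person = {'nya': '3', 'ku': '1', 'mu': '2'}[clitic]
    return ('diri ' + clitic, lemma, person)

assert analyze_diri('dirinya') == ('diri nya', 'dia', '3')
assert analyze_diri('rumah') is None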
--- udapi/block/ud/addmwt.py | 5 ++++ udapi/block/ud/id/addmwt.py | 31 ++++++++++++++++++-- udapi/block/ud/id/fixgsd.py | 57 ++++++++++++++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index ffa78bbb..2d251989 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,5 +1,6 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block +import logging class AddMwt(Block): @@ -40,6 +41,10 @@ def process_node(self, node): if attr in analysis: values = analysis[attr].split() for i, new_node in enumerate(nodes): + if len(values) <= i: + logging.warning("Attribute '%s' not supplied for word no. %d" % (attr, i)) + for a in 'form lemma upos xpos feats deprel misc'.split(): + logging.warning("%s = %s" % (a, analysis.get(a, ''))) if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) elif attr == 'feats' and '*' in values[i]: diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py index 7f5ab271..a8d50748 100644 --- a/udapi/block/ud/id/addmwt.py +++ b/udapi/block/ud/id/addmwt.py @@ -92,13 +92,39 @@ def multiword_analysis(self, node): deprel = '* nmod:poss' # 'main': 0 ... this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' + xpos = 'NSD PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' + xpos = 'NSD PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' + xpos = 'NSD PS2' + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): # nominalized adjective splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) lemma = splitform.lower() upos = 'ADJ DET' feats = '* Definite=Def|PronType=Art' - xpos = re.sub(r'\+', ' ', node.xpos) + if re.search(r'\+', node.xpos): + xpos = re.sub(r'\+', ' ', node.xpos) + else: + xpos = 'ASP PS3' deprel = '* det' # 'main': 0 ...
this is the default value (the first node will be the head and inherit children) return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} @@ -162,13 +188,12 @@ def multiword_analysis(self, node): # Do not warn about instances that are known exceptions. # akibatnya = as a result (SCONJ); akibat = result # bukannya = instead (PART); bukan = no, not - # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) # layaknya = like (ADP); layak = worthy # sebaiknya = should (AUX) # sesampainya = once in / arriving at (ADP) # tidaknya = whether or not (PART); tidak = no, not # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. - if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|diri|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) return None elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 9634bc2d..1e83c6e9 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -171,12 +171,12 @@ def rejoin_decades(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() - def lemmatize_verb_from_morphind(self, node): + def lemmatize_from_morphind(self, node): # The MISC column contains the output of MorphInd for the current word. # The analysis has been interpreted wrongly for some verbs, so we need # to re-interpret it and extract the correct lemma. - if node.upos == "VERB": - morphind = node.misc["MorphInd"] + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': if morphind: # Remove the start and end tags from morphind. morphind = re.sub(r"^\^", "", morphind) @@ -205,6 +205,55 @@ def lemmatize_verb_from_morphind(self, node): node.lemma = lemma else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. 
+ lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) def merge_reduplicated_plural(self, node): # Instead of compound:plur, merge the reduplicated plurals into a single token. @@ -294,5 +343,5 @@ def process_node(self, node): self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) - self.lemmatize_verb_from_morphind(node) + self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) From a8c36e50d6a1877a102da2456126f4d5b67b59de Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Oct 2021 00:00:24 +0200 Subject: [PATCH 0285/1201] prevent errors on a quote surrounded by spaces --- udapi/block/segment/simple.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py index 5f4a8423..58be9b6d 100644 --- a/udapi/block/segment/simple.py +++ b/udapi/block/segment/simple.py @@ -33,8 +33,12 @@ def is_boundary(self, first, second): return False if first[-1] in '"“»›)': first = first[:-1] + if not first: + return False if second[0] in '"„«¿¡‹(': second = second[1:] + if not second: + return False if not second[0].isupper() or second[0].isdigit(): return False if not first[-1] in '.!?': From 3af3eabb17c185bc28d9fc6c4657ef384b384117 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Oct 2021 18:06:52 +0200 Subject: [PATCH 0286/1201] Merge more reduplications in Indonesian. --- udapi/block/ud/id/fixgsd.py | 80 ++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 1e83c6e9..8f7ed20a 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -255,34 +255,65 @@ def lemmatize_from_morphind(self, node): else: logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - def merge_reduplicated_plural(self, node): - # Instead of compound:plur, merge the reduplicated plurals into a single token. - if node.deprel == "compound:plur": - root = node.root - # We assume that the previous token is a hyphen and the token before it is the parent. - first = node.parent - if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): - hyph = node.prev_node - if hyph.is_descendant_of(first) and re.match(r"^(-|–|--)$", hyph.form): - # Neither the hyphen nor the current node should have children. - # If they do, re-attach the children to the first node. 
- for c in hyph.children: - c.parent = first - for c in node.children: - c.parent = first - # Merge the three nodes. - first.form = first.form + "-" + node.form - first.feats["Number"] = "Plur" + def merge_reduplication(self, node): + """ + Reduplication is a common morphological device in Indonesian. Reduplicated + nouns signal plural but some reduplications also encode emphasis, modification + of meaning etc. In the previous annotation of GSD, reduplication was mostly + analyzed as three tokens, e.g., for plurals, the second copy would be attached + to the first one as compound:plur, and the hyphen would be attached to the + second copy as punct. We want to analyze reduplication as a single token. + Fix it. + """ + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + root = node.root + # This is specific to the reduplicated plurals. The rest will be done for any reduplications. + # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. + ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. + ###!!! Some other reduplications have slight modifications on one or the other side. + if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): + first.feats['Number'] = 'Plur' + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = node.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + first.form = first.form + '-' + node.form + hyph.remove() + node.remove() + first.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) + else: + first.form = first.form + '-' + node.form if node.no_space_after: - first.misc["SpaceAfter"] = "No" + first.misc['SpaceAfter'] = 'No' else: - first.misc["SpaceAfter"] = "" + first.misc['SpaceAfter'] = '' hyph.remove() node.remove() - # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. - # If it did not, then we have a mismatch with the sentence text, which we must fix. - # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). - root.text = root.compute_text() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). 
+ root.text = root.compute_text() def fix_plural_propn(self, node): """ @@ -343,5 +374,6 @@ def process_node(self, node): self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) + self.merge_reduplication(node) self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) From 78ffaef93cc8255f48685e47ccf0229a6d4c30c9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Oct 2021 22:36:33 +0200 Subject: [PATCH 0287/1201] More token merging in Indonesian GSD. --- udapi/block/ud/id/fixgsd.py | 59 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 8f7ed20a..12ca9712 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -267,16 +267,29 @@ def merge_reduplication(self, node): """ # We assume that the previous token is a hyphen and the token before it is the parent. first = node.parent - if first.ord == node.ord-2 and first.form.lower() == node.form.lower(): + root = node.root + # Example of identical reduplication: negara-negara = countries + # Example of reduplication with -an: kopi-kopian = various coffee trees + # Example of reduplication with vowel substitution: bolak-balik = alternating + # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) + # Example of reduplication with se-: sehari-hari = daily (hari = day) + # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti)$', first.form.lower())): hyph = node.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): - root = node.root # This is specific to the reduplicated plurals. The rest will be done for any reduplications. # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. ###!!! Some other reduplications have slight modifications on one or the other side. if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): first.feats['Number'] = 'Plur' + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + if re.match(r'^(non|sub|anti)$', first.form.lower()): + first.lemma = first.lemma + '-' + node.lemma + first.upos = node.upos + first.xpos = node.xpos + first.feats = node.feats + first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) # Neither the hyphen nor the current node should have children. # If they do, re-attach the children to the first node. for c in hyph.children: @@ -314,6 +327,48 @@ def merge_reduplication(self, node): # If it did not, then we have a mismatch with the sentence text, which we must fix. # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() + # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. 
+ elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti)$', node.form.lower()): + prefix = node + stem = first # here it is not the first part at all + hyph = stem.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + stem.lemma = prefix.lemma + '-' + stem.lemma + stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) + # Neither the hyphen nor the prefix should have children. + # If they do, re-attach the children to the stem. + for c in hyph.children: + c.parent = stem + for c in prefix.children: + c.parent = stem + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = stem.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + stem.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) + else: + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() def fix_plural_propn(self, node): """ From 48834686d73034293c737598d6bc12ed20907503 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Oct 2021 16:31:48 +0200 Subject: [PATCH 0288/1201] Multi, kontra. --- udapi/block/ud/id/fixgsd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 12ca9712..b5142040 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -274,7 +274,7 @@ def merge_reduplication(self, node): # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) # Example of reduplication with se-: sehari-hari = daily (hari = day) # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. 
- if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti)$', first.form.lower())): + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): hyph = node.prev_node if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): # This is specific to the reduplicated plurals. The rest will be done for any reduplications. @@ -284,7 +284,7 @@ def merge_reduplication(self, node): if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): first.feats['Number'] = 'Plur' # For the non-/sub-/anti- prefix we want to take the morphology from the second word. - if re.match(r'^(non|sub|anti)$', first.form.lower()): + if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): first.lemma = first.lemma + '-' + node.lemma first.upos = node.upos first.xpos = node.xpos @@ -328,7 +328,7 @@ def merge_reduplication(self, node): # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. - elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti)$', node.form.lower()): + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra')$', node.form.lower()): prefix = node stem = first # here it is not the first part at all hyph = stem.prev_node From 75ec0ff74ed2d28a09b514a38b2c25d9cbc779f9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Oct 2021 16:33:40 +0200 Subject: [PATCH 0289/1201] Bug fix. --- udapi/block/ud/id/fixgsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index b5142040..69c785ab 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -328,7 +328,7 @@ def merge_reduplication(self, node): # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). root.text = root.compute_text() # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. - elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra')$', node.form.lower()): + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): prefix = node stem = first # here it is not the first part at all hyph = stem.prev_node From ddbb2b0c459c6595da1ca42bd698e07b7fd245fe Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 23 Oct 2021 22:34:13 +0200 Subject: [PATCH 0290/1201] Context-based tagging of Indonesian "semua". 
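The rule implemented in the diff below, reduced to a standalone sketch (hypothetical helper; plain values stand in for Udapi nodes and the PronType=Tot feature is omitted): 'semua' is tagged DET when it precedes its nominal head, otherwise PRON.

def semua_upos(head_upos, head_ord, semua_ord):
    """Context-based tag for Indonesian 'semua' = 'all'."""
    if head_upos in ('NOUN', 'PROPN') and head_ord > semua_ord:
        return 'DET'
    return 'PRON'

assert semua_upos('NOUN', 2, 1) == 'DET'   # semua orang = all people
assert semua_upos('VERB', 1, 3) == 'PRON'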
--- udapi/block/ud/id/fixgsd.py | 197 +++++++++++++++++++----------------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index 69c785ab..d328212d 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -22,6 +22,25 @@ def fix_upos_based_on_morphind(self, node): elif node.udeprel == 'advcl': node.deprel = 'obl' + def fix_semua(self, node): + """ + Indonesian "semua" means "everything, all". + Originally it was DET, PRON, or ADV. + Ika: I usually only labeled "semua" as DET only if it's followed by a + NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's + not followed by any NOUN/DET, I labeled them as PRON. + """ + if node.form.lower() == 'semua': + if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: + node.upos = 'DET' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + else: + node.upos = 'PRON' + if node.udeprel == 'det' or node.udeprel == 'advmod': + node.deprel = 'nmod' + node.feats['PronType'] = 'Tot' + def fix_ordinal_numerals(self, node): """ Ordinal numerals should be ADJ NumType=Ord in UD. They have many different @@ -62,13 +81,6 @@ def fix_ordinal_numerals(self, node): node.feats['PronType'] = 'Tot' if re.match(r'^(det|amod|nmod)$', node.udeprel): node.deprel = 'nummod' - # The following is not an ordinal numeral but I am too lazy to create a separate method for that. - elif node.form.lower() == 'semua': - # It means 'all'. Originally it was DET, PRON, or ADV. - node.upos = 'DET' - node.feats['PronType'] = 'Tot' - if node.udeprel == 'nmod' or node.udeprel == 'advmod': - node.deprel = 'det' def rejoin_ordinal_numerals(self, node): """ @@ -171,90 +183,6 @@ def rejoin_decades(self, node): # There may have been spaces around the dash, which are now gone. Recompute the sentence text. node.root.text = node.root.compute_text() - def lemmatize_from_morphind(self, node): - # The MISC column contains the output of MorphInd for the current word. - # The analysis has been interpreted wrongly for some verbs, so we need - # to re-interpret it and extract the correct lemma. - morphind = node.misc['MorphInd'] - if node.upos == 'VERB': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r"_V[SP][AP]$", "", morphind) - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r"\+", morphind) - # Expected suffixes are -kan, -i, -an, or no suffix at all. - # There is also the circumfix ke-...-an which seems to be nominalized adjective: - # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; - # but I am not sure what is the reason that these are tagged VERB. - if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): - del morphemes[-1] - # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. - # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". - while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. 
- lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) - node.lemma = lemma - else: - logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - elif node.upos == 'NOUN': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) - # Do not proceed if there is an unexpected final XPOS tag. - if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r'\+', morphind) - # Expected prefixes are peN-, per-, ke-, ber-. - # Expected suffix is -an. - if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): - del morphemes[-1] - if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r'<[a-z]+>', '', lemma) - node.lemma = lemma - elif node.upos == 'ADJ': - if morphind: - # Remove the start and end tags from morphind. - morphind = re.sub(r"^\^", "", morphind) - morphind = re.sub(r"\$$", "", morphind) - # Remove the final XPOS tag from morphind. - morphind = re.sub(r'_ASS$', '', morphind) - # Do not proceed if there is an unexpected final XPOS tag. - if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): - # Split morphind to prefix, stem, and suffix. - morphemes = re.split(r'\+', morphind) - # Expected prefix is ter-. - if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): - del morphemes[0] - # Check that we are left with just one morpheme. - if len(morphemes) != 1: - logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) - else: - lemma = morphemes[0] - # Remove the stem POS category. - lemma = re.sub(r'<[a-z]+>', '', lemma) - node.lemma = lemma - else: - logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) - def merge_reduplication(self, node): """ Reduplication is a common morphological device in Indonesian. Reduplicated @@ -423,12 +351,97 @@ def fix_satu_satunya(self, node): if node.multiword_token and node.no_space_after: node.misc['SpaceAfter'] = '' + def lemmatize_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. + # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_V[SP][AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. 
+ if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + def process_node(self, node): self.fix_plural_propn(node) self.fix_upos_based_on_morphind(node) + self.fix_semua(node) self.rejoin_ordinal_numerals(node) self.fix_ordinal_numerals(node) self.rejoin_decades(node) self.merge_reduplication(node) - self.lemmatize_from_morphind(node) self.fix_satu_satunya(node) + self.lemmatize_from_morphind(node) From fc63be3266eccf252597383baa0687e536b0b3d1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 29 Oct 2021 22:00:20 +0200 Subject: [PATCH 0291/1201] Added a block to fix tokenization in AnCora (but beware of #95 until it's fixed). 
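A runnable sketch of the tokenization the new Spanish block enforces (the function name is an illustration; the real block creates and reattaches Udapi nodes and sets MISC attributes instead of returning strings):

import re

def split_exclamation(form):
    """Split a leading or trailing '¡'/'!' off a word form, returning the token list."""
    tokens = []
    m = re.match(r'^([¡!])(\w.*)$', form)
    if m:
        tokens.append(m.group(1))   # leading punctuation becomes its own token
        form = m.group(2)
    m = re.match(r'^(.*\w)([¡!])$', form)
    if m:
        tokens.extend([m.group(1), m.group(2)])   # trailing punctuation split off
    else:
        tokens.append(form)
    return tokens

assert split_exclamation('¡Hola!') == ['¡', 'Hola', '!']
assert split_exclamation('Yahoo!') == ['Yahoo', '!']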
--- udapi/block/ud/es/fixexclamation.py | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 udapi/block/ud/es/fixexclamation.py diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py new file mode 100644 index 00000000..7dea8e0d --- /dev/null +++ b/udapi/block/ud/es/fixexclamation.py @@ -0,0 +1,47 @@ +"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixExclamation(Block): + + def process_node(self, node): + """ + In Spanish AnCora, there are things like '¡Hola!' as one token. + The punctuation should be separated. One may question whether this + should include names of companies (Yahoo!) or products (la revista + Hello!) but it should, as company and product names often have + multiple tokens (even multiple full words, not just punctuation) + and these are also separated in UD. + """ + if re.search(r'^[¡!]\w', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_before_node(node) + punct.form = node.form[:1] + node.form = node.form[1:] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' + if re.search(r'\w[¡!]$', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_after_node(node) + punct.form = node.form[-1:] + node.form = node.form[:-1] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = node.misc['SpaceAfter'] + node.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' From 49f1f4385e4006895708f4e6f22ea06c3bc3723f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Oct 2021 23:17:19 +0200 Subject: [PATCH 0292/1201] Fix leaf-aux-cop. --- udapi/block/ud/fixleaf.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 udapi/block/ud/fixleaf.py diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py new file mode 100644 index 00000000..4cac1175 --- /dev/null +++ b/udapi/block/ud/fixleaf.py @@ -0,0 +1,35 @@ +""" +Block ud.FixLeaf checks that function word dependents are leaves. +Certain known exceptions are observed (e.g., fixed expressions). +""" +from udapi.core.block import Block +import logging +import re + +class FixLeaf(Block): + """ + Make sure that aux and cop dependents are leaves unless one of the known + exceptions applies. + """ + + def __init__(self, deprels='aux,cop', **kwargs): + """ + Args: + deprels: comma-separated list of deprels to be fixed. Default = aux,cop. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel: + children = node.children + # Every function dependent can have a fixed child. + # We will also allow conj, cc, punct, goeswith, reparandum. 
+ children = [c for c in children if not re.match(r'^(fixed|conj|cc|punct|goeswith|reparandum)$', c.udeprel)] + # Re-attach the remaining children to an acceptable ancestor. + ancestor = node.parent + while ancestor.udeprel in self.deprels: + ancestor = ancestor.parent + for c in children: + c.parent = ancestor From 0c09c4e666b1cc9923411492f7985ffb00fb88d2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 31 Oct 2021 09:30:20 +0100 Subject: [PATCH 0293/1201] FixLeaf must update enhanced relations, too. --- udapi/block/ud/fixleaf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index 4cac1175..345b68f9 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -33,3 +33,8 @@ def process_node(self, node): ancestor = ancestor.parent for c in children: c.parent = ancestor + # If there are enhanced dependencies, check whether we want to redirect them too. + if c.deps: + for edep in c.deps: + if edep['parent'] == node: + edep['parent'] = ancestor From 9528d7cf5d4927c64fba305a0ced8b32449fec4a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 3 Nov 2021 14:41:54 +0100 Subject: [PATCH 0294/1201] FixLeaf cc. --- udapi/block/ud/fixleaf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index 345b68f9..d715ec01 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -12,10 +12,10 @@ class FixLeaf(Block): exceptions applies. """ - def __init__(self, deprels='aux,cop', **kwargs): + def __init__(self, deprels='aux,cop,cc', **kwargs): """ Args: - deprels: comma-separated list of deprels to be fixed. Default = aux,cop. + deprels: comma-separated list of deprels to be fixed. Default = aux,cop,cc. """ super().__init__(**kwargs) self.deprels = deprels.split(',') @@ -23,10 +23,12 @@ def __init__(self, deprels='aux,cop', **kwargs): def process_node(self, node): for deprel in self.deprels: if node.udeprel == deprel: - children = node.children # Every function dependent can have a fixed child. # We will also allow conj, cc, punct, goeswith, reparandum. - children = [c for c in children if not re.match(r'^(fixed|conj|cc|punct|goeswith|reparandum)$', c.udeprel)] + allowed = ['fixed', 'punct', 'goeswith', 'reparandum'] + if deprel != 'cc': + allowed += ['conj', 'cc'] + children = [c for c in node.children if not (c.udeprel in allowed)] # Re-attach the remaining children to an acceptable ancestor. ancestor = node.parent while ancestor.udeprel in self.deprels: ancestor = ancestor.parent for c in children: c.parent = ancestor From f1028cbc26cd627308fc7f7dd14c20ed9246c114 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 28 Nov 2021 11:32:52 +0100 Subject: [PATCH 0295/1201] Javanese does not distinguish VerbForms, either. --- udapi/block/ud/markbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index cbd57eef..5ca0f703 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages.
- if i_feat != "VerbForm" or not node.root.zone.split("_")[0] in {"id", "tl", "hil", "ifb"}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': From d1da0b0a45fe37ce6715acee6a2dfbdb4591b264 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 29 Nov 2021 10:52:45 +0100 Subject: [PATCH 0296/1201] Added a block to fix German GSD. --- udapi/block/ud/de/fixgsd.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/ud/de/fixgsd.py diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py new file mode 100644 index 00000000..d6853330 --- /dev/null +++ b/udapi/block/ud/de/fixgsd.py @@ -0,0 +1,37 @@ +""" +Block to fix annotation of UD German-GSD. +""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def process_node(self, node): + """ + Normalizes tokenization, lemmatization and tagging of ordinal numerals + that are expressed using digits followed by a period. + https://github.com/UniversalDependencies/UD_German-GSD/issues/24 + """ + # Ignore periods that terminate a sentence, although they could belong + # to an ordinal numeral at the same time. + if node.form == '.' and node.next_node: + # Ignore number+period combinations that have an intervening space. + if node.prev_node and re.match('^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + # Merge the number and the period into one token. + number = node.prev_node + period = node + # The period should not have any children but if it does, re-attach them to the number. + for c in period.children: + c.parent = number + # The period should be followed by a space but if it isn't, mark it at the number. + number.misc['SpaceAfter'] = 'No' if period.no_space_after else '' + number.form += '.' + number.lemma = number.form + number.upos = 'ADJ' + number.xpos = 'ADJA' + number.feats = '_' + number.feats['NumType'] = 'Ord' + if number.udeprel == 'nummod': + number.deprel = 'amod' + period.remove() From fa25c0ed4a25026270c5a7f96287804c7a1e89cd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 29 Nov 2021 11:44:36 +0100 Subject: [PATCH 0297/1201] More fixes of ordinals in German. --- udapi/block/ud/de/fixgsd.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py index d6853330..65d12681 100644 --- a/udapi/block/ud/de/fixgsd.py +++ b/udapi/block/ud/de/fixgsd.py @@ -17,7 +17,7 @@ def process_node(self, node): # to an ordinal numeral at the same time. if node.form == '.' and node.next_node: # Ignore number+period combinations that have an intervening space. - if node.prev_node and re.match('^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after: # Merge the number and the period into one token. number = node.prev_node period = node @@ -35,3 +35,24 @@ def process_node(self, node): if number.udeprel == 'nummod': number.deprel = 'amod' period.remove() + # Even if the digits and the period are already in one token, check their annotation. 
+ if re.match(r'^\d+\.$', node.form): + node.lemma = node.form + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats = '_' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' + # Finally, make sure that ordinal numerals expressed verbosely are tagged properly. + # Unlike for digits, do not remove the features for Gender, Number, and Case. + # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'. + if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE): + # Skip 'erst' that is used as an adverb. + if node.lemma != 'erst' or node.upos != 'ADV': + node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma) + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' From d7da77817a3bbda9f1b3d388232cdfcd4a734999 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 29 Nov 2021 16:34:57 +0100 Subject: [PATCH 0298/1201] Block's process_node iterates over a copy of descendants This prevents an infinite loop in tutorial.AddCommas etc. Partial revert of 44c291b930fa591477c87457a17c0e76e6ee22ea The slowdown is acceptable (about 0.05s per iterating over 700k words). That said, there may be usecases where iterating over _descendants is beneficial, e.g. when deleting nodes, so that we don't iterate over an already deleted node. --- udapi/core/block.py | 6 +++++- udapi/core/document.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index 64b8bcc5..32033cde 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -29,7 +29,11 @@ def process_node(self, _): def process_tree(self, tree): """Process a UD tree""" - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), + # but it seems safer to iterate over a copy of the list of nodes. + # If a user calls parent.create_child().shift_before_node(parent) in process_node, + # it may end up in endless cycle (because the same node is processed again - Python for cycle remembers the position). + for node in tree.descendants: self.process_node(node) def process_bundle(self, bundle): diff --git a/udapi/core/document.py b/udapi/core/document.py index f02f831e..8f9ce3ea 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -91,7 +91,9 @@ def nodes(self): """An iterator over all nodes (excluding empty nodes) in the document.""" for bundle in self: for tree in bundle: - for node in tree._descendants: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process.process_tree(). 
+ for node in tree.descendants: yield node @property From 7381264e9aafbd727de03ea25f6e1e862fdd83b9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Dec 2021 23:22:33 +0100 Subject: [PATCH 0299/1201] update the conversion block to UD_English-GUM v2.9 --- udapi/block/corefud/gum2corefud.py | 116 ++++++++++++++++------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py index 95be6ce0..bcd24968 100644 --- a/udapi/block/corefud/gum2corefud.py +++ b/udapi/block/corefud/gum2corefud.py @@ -8,82 +8,94 @@ class Gum2CorefUD(Block): def process_tree(self, tree): docname = tree.bundle.document.meta['docname'] + '_' - def entity2cluster_id(name): - return docname + name.strip('()').replace(',','').replace('+','') - clusters = tree.bundle.document.coref_clusters unfinished_mentions = defaultdict(list) for node in tree.descendants: - entity = node.misc['Entity'] - if not entity: + misc_entity = node.misc['Entity'] + if not misc_entity: continue - parts = [x for x in re.split('(\([^())]+\)?|[^())]+\))', entity) if x] - for part in parts: - # GUM entity name could be e.g. - # abstract-173 or place-1-Coron,_Palawan or place-77-Sub-Saharan_Africa. - # Note that the wikification part of the name may contain commas and dashes. - # Let's take the whole name as cluster_id, which will be normalized later on. - # We just need to remove commas and plus signs which are forbidden in cluster_id - # because they are used as separators in Bridging and SplitAnte, respectively. - # Let's store the type in cluster.cluster_type and Wikification in mention.misc. - name = entity2cluster_id(part) - if part[0] == '(': + # Attribute Entity may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for entity in entities: + # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity + # but the closing tag is shortened just to GRP.
+ opening, closing = (entity[0] == '(', entity[-1] == ')') + entity = entity.strip('()') + if not opening and not closing: + logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.") + elif not opening and closing: + name = docname + entity + if not unfinished_mentions[name]: + raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + else: + attrs = entity.split('-') + if len(attrs) == 6: + etype, grp, infstat, minspan, ctype, wiki = attrs + elif len(attrs) == 5: + wiki = None + etype, grp, infstat, minspan, ctype = attrs + elif len(attrs) > 6: + logging.warning(f"Entity {entity} at {node} has more than 6 attributes.") + etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5) + else: + raise ValueError(f"Less than 5 attributes in {entity} at {node}") + name = docname + grp cluster = clusters.get(name) if cluster is None: - chunks = part.strip('()').split('-', maxsplit=2) - if len(chunks) == 3: - ctype, _, wiki = chunks - elif len(chunks) == 2: - ctype, _, wiki = chunks[0], None, None - else: - raise ValueError(f"Unexpected entity {part} at {node}") - cluster = node.create_coref_cluster(cluster_id=name, cluster_type=ctype) + cluster = node.create_coref_cluster(cluster_id=name, cluster_type=etype) mention = cluster.mentions[0] + mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" if wiki: - mention.misc = 'Wikification:' + wiki.replace(',', '%2C') + mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') else: mention = cluster.create_mention(head=node) - if part[-1] == ')': + if closing: mention.words = [node] else: unfinished_mentions[name].append(mention) - elif part[-1] == ')': - if not unfinished_mentions[name]: - logging.warning(f"Mention {name} closed at {node}, but not opened in the same tree.") - else: - mention = unfinished_mentions[name].pop() - mention.span = f'{mention.head.ord}-{node.ord}' del node.misc['Entity'] - misc_bridge = node.misc['Bridge'] - if misc_bridge: - # E.g. 
Entity=event-23|Bridge=time-23 Date: Mon, 6 Dec 2021 18:23:42 +0100 Subject: [PATCH 0300/1201] Block to fix UD validation of CorefUD 0.2 - this block must be run to fix the trees in CorefUD so they pass the current UD validator - so far it fixes the following issues: - the node with 0 parent must have DEPREL=root - there must be a space before newdoc or newpar --- udapi/block/corefud/fixtovalidate.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 udapi/block/corefud/fixtovalidate.py diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py new file mode 100644 index 00000000..8207835e --- /dev/null +++ b/udapi/block/corefud/fixtovalidate.py @@ -0,0 +1,24 @@ +from udapi.core.block import Block + +class FixToValidate(Block): + """This block fixes the CorefUD data so that the final documents are valid conllu files.""" + + def _set_root_deprel(self, node): + if node.parent == node.root and node.deprel != "root": + node.deprel = "root" + print(node) + + def _space_before_pardoc(self, doc): + last_node = None + for i, tree in enumerate(doc.trees): + if i > 0: + if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: + del last_node.misc["SpaceAfter"] + print(tree) + last_node = tree.descendants[-1] + + def process_node(self, node): + self._set_root_deprel(node) + + def process_document(self, doc): + self._space_before_pardoc(doc) From 1b437618b1a5a40de1da261d8ed56df6a0097d86 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 7 Dec 2021 13:53:41 +0100 Subject: [PATCH 0301/1201] forgotten commit: no debug prints, _set_root_deprel must be called from process_document, otherwise it's not called at all --- udapi/block/corefud/fixtovalidate.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 8207835e..3af37490 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -3,10 +3,10 @@ class FixToValidate(Block): """This block fixes the CorefUD data so that the final documents are valid conllu files.""" - def _set_root_deprel(self, node): - if node.parent == node.root and node.deprel != "root": - node.deprel = "root" - print(node) + def _set_root_deprel(self, doc): + for node in doc.nodes: + if node.parent == node.root and node.deprel != "root": + node.deprel = "root" def _space_before_pardoc(self, doc): last_node = None @@ -14,11 +14,8 @@ def _space_before_pardoc(self, doc): if i > 0: if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: del last_node.misc["SpaceAfter"] - print(tree) last_node = tree.descendants[-1] - def process_node(self, node): - self._set_root_deprel(node) - def process_document(self, doc): + self._set_root_deprel(doc) self._space_before_pardoc(doc) From 2f9c7e480d19b6d0e23013cbbb88b218a5aed154 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 7 Dec 2021 18:43:11 +0100 Subject: [PATCH 0302/1201] good point by @martinpopel: iterating over root's children is more efficient --- udapi/block/corefud/fixtovalidate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 3af37490..87421688 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -4,9 +4,10 @@ class FixToValidate(Block): """This block fixes the CorefUD data so that the final documents are valid conllu 
files.""" def _set_root_deprel(self, doc): - for node in doc.nodes: - if node.parent == node.root and node.deprel != "root": - node.deprel = "root" + for root in doc.trees: + for node in root.children: + if node.deprel != "root": + node.deprel = "root" def _space_before_pardoc(self, doc): last_node = None From c404eb98e7276fce10fa1c0d13569e34e69df5d4 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Thu, 9 Dec 2021 18:35:27 +0100 Subject: [PATCH 0303/1201] fixing the root-is-not-0 UD validation errors for some of the automatically parsed datasets --- udapi/block/corefud/fixtovalidate.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py index 87421688..48a3608d 100644 --- a/udapi/block/corefud/fixtovalidate.py +++ b/udapi/block/corefud/fixtovalidate.py @@ -9,6 +9,22 @@ def _set_root_deprel(self, doc): if node.deprel != "root": node.deprel = "root" + def _unset_root_deprel(self, doc): + for node in doc.nodes: + parent = node.parent + if node.deprel == "root" and parent is not None and not parent.is_root(): + #print("\t".join(['Non-0-root:', node.address(), node.upos, str(node.feats), node.parent.upos, str(node.parent.feats)])) + if parent.upos == "PUNCT" and parent.parent is not None: + node.parent = parent.parent + if node.upos == "CCONJ": + node.deprel = "cc" + elif node.upos == "ADJ" and parent.upos == "PROPN": + node.deprel = "amod" + elif node.upos == "NOUN" and parent.upos == "VERB": + node.deprel = "obl" + else: + node.deprel = "parataxis" + def _space_before_pardoc(self, doc): last_node = None for i, tree in enumerate(doc.trees): @@ -19,4 +35,5 @@ def _space_before_pardoc(self, doc): def process_document(self, doc): self._set_root_deprel(doc) + self._unset_root_deprel(doc) self._space_before_pardoc(doc) From 46022c6509eebfdb71196fc9f56bfe47a2197740 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:40:48 +0100 Subject: [PATCH 0304/1201] Heuristics to fix wrong edeprels in Czech. --- udapi/block/ud/cs/fixedeprels.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 udapi/block/ud/cs/fixedeprels.py diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py new file mode 100644 index 00000000..d4455235 --- /dev/null +++ b/udapi/block/ud/cs/fixedeprels.py @@ -0,0 +1,21 @@ +"""Block to fix case-enhanced dependency relations in Czech.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + if edep['deprel'] eq 'nmod:na': + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] = 'nmod:na:acc' + else + edep['deprel'] = 'nmod:na:loc' From 0d409b6cf86a3cf5881a9dfb37e0f39fa3409cb5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:42:21 +0100 Subject: [PATCH 0305/1201] Bug fix. 
--- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index d4455235..9888e51f 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -12,7 +12,7 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - if edep['deprel'] eq 'nmod:na': + if edep['deprel'] == 'nmod:na': # The case is unknown. We need 'acc' or 'loc'. # The locative is probably more frequent but it is not so likely with every noun. if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): From 71d8a216ba88d3af9fb05c2fe5abd224d561776d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 19 Dec 2021 23:43:24 +0100 Subject: [PATCH 0306/1201] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 9888e51f..5c4be62e 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -17,5 +17,5 @@ def process_node(self, node): # The locative is probably more frequent but it is not so likely with every noun. if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): edep['deprel'] = 'nmod:na:acc' - else + else: edep['deprel'] = 'nmod:na:loc' From 035673ea78eb5c2ec8e6fa92c1a3b8e32ca9247d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 24 Dec 2021 14:57:46 +0100 Subject: [PATCH 0307/1201] Fix Czech edeprels. --- udapi/block/ud/cs/fixedeprels.py | 372 ++++++++++++++++++++++++++++++- 1 file changed, 366 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 5c4be62e..ddcdb6d3 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -12,10 +12,370 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - if edep['deprel'] == 'nmod:na': - # The case is unknown. We need 'acc' or 'loc'. - # The locative is probably more frequent but it is not so likely with every noun. 
- if re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] = 'nmod:na:acc' + if re.match(r'^(acl|advcl):', edep['deprel']): + edep['deprel'] = re.sub(r'^(advcl):a_jestliže$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):a_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jen_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jen_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jestliže_tedy$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):když_už$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^(advcl):například_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pokud_totiž$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pokud_však$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):protože_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) + elif re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' 
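+                # (In English: 'Prices are listed in Rapaport, the price list of the Antwerp exchange, and in Diamantbericht...')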
+ # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): + edep['deprel'] += ':gen' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' + elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. + for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) 
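+                # For example, in 'na pět let' ('for five years') the numeral is attached as nummod:gov, so the accusative is chosen; in 'na Moravě' ('in Moravia') there is no such child and the locative default applies.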
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + ###!!! Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. 
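+                    # For example, 'se ženou' ('with the wife') is instrumental; the genitive reading (as in 'se stolu' = 'down off the table') is archaic and rare.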
+ edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. + edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. + edep['deprel'] += ':acc' else: - edep['deprel'] = 'nmod:na:loc' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):do:nom$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. 
září' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:nom$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:nom$', r'\1:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s:nom$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', 
edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu_s(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', 
r'\1:v_případě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_s(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen)?$', r'\1:v_zájmu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) From a0c56a1307426c46bc5974cca4e3f1341da5232c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 26 Dec 2021 22:17:12 +0100 Subject: [PATCH 0308/1201] Czech enhanced case markers. --- udapi/block/ud/cs/fixedeprels.py | 90 ++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index ddcdb6d3..2628f369 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -13,29 +13,26 @@ def process_node(self, node): """ for edep in node.deps: if re.match(r'^(acl|advcl):', edep['deprel']): - edep['deprel'] = re.sub(r'^(advcl):a_jestliže$', r'\1:jestliže', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):a_pokud$', r'\1:pokud', edep['deprel']) + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
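+                # For example, 'advcl:jen_když' and 'advcl:například_když' collapse to 'advcl:když' in the next substitution, while 'advcl:i_když' survives, since 'i když' ('even though') is a complex conjunction in its own right.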
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|protože|teprve|zejména)_(aby|až|jestliže|když|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|pokud)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jen_když$', r'\1:když', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jen_pokud$', r'\1:pokud', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jestliže_tedy$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):když_už$', r'\1:když', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^(advcl):například_když$', r'\1:když', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pokud_totiž$', r'\1:pokud', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pokud_však$', r'\1:pokud', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):protože_pokud$', r'\1:pokud', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) @@ -89,6 +86,8 @@ def process_node(self, node): node.deprel = 'acl:relcl' elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): + edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): @@ -148,6 +147,16 @@ def process_node(self, node): edep['deprel'] += ':loc' elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. 
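+                # In other words, 's ohledem na X' ('with regard to X') is rebuilt below as a multi-word preposition: 'ohledem' and 'na' become fixed children of 's', 's' becomes the case marker of the following noun, and the noun is attached as obl:s_ohledem_na:acc to the original parent.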
+ if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): @@ -202,6 +211,18 @@ def process_node(self, node): edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + logging.info('I am here.') + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): # ':nom' occurs in 'karneval v Rio de Janeiro' edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) @@ -235,8 +256,11 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) @@ -246,7 +270,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:nom$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) @@ -258,6 +282,7 @@ def process_node(self, node): edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) @@ -269,6 +294,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) @@ -281,26 +308,29 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:nom$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:nom$', r'\1:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s:nom$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) @@ -308,6 +338,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) @@ -315,18 +346,22 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu_s(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', 
r'\1:v_dohodě_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) @@ -342,6 +377,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) @@ -353,7 +389,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen)?$', r'\1:v_zájmu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) @@ -367,11 +403,13 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', 
edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) @@ -379,3 +417,21 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From a9720d16cb283eec919a4cdfa672085268665cb6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 27 Dec 2021 15:52:23 +0100 Subject: [PATCH 0309/1201] Czech enhanced case markers. --- udapi/block/ud/cs/fixedeprels.py | 50 +++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 2628f369..5a2e996d 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -14,28 +14,37 @@ def process_node(self, node): for edep in node.deps: if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
- edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|protože|teprve|zejména)_(aby|až|jestliže|když|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|pokud)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': @@ -257,9 +266,12 @@ def process_node(self, node): edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) @@ -272,6 +284,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France @@ -286,19 +299,25 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. 
září' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) @@ -312,14 +331,17 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:nom$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) @@ -327,6 +349,7 @@ def process_node(self, node): 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! @@ -348,6 +371,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) @@ -359,7 +383,11 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) @@ -376,6 +404,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) @@ -385,7 +415,8 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_s(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) @@ -393,6 +424,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) @@ -401,11 +433,13 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) From ba230a6b92cea415606a6f6acb59fc0f2793100e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 3 Jan 2022 15:15:53 +0100 Subject: [PATCH 0310/1201] adding `node.siblings` Originally, we decided it is not 
worth introducing and bloating the API, but now I saw a use case for `node.siblings(preceding_only=True)` (used in a list comprehension, and I think nested list comprehensions are evil). --- udapi/core/node.py | 19 +++++++++++++++++-- udapi/core/tests/test_node.py | 2 ++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 5225724e..3d120a52 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -357,10 +357,25 @@ def children(self): nodes2 = [n for n in node.children if n.ord > node.ord] nodes3 = [n for n in node.children if n.ord < node.ord] nodes4 = [n for n in node.children if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. """ return ListOfNodes(self._children, origin=self) + @property + def siblings(self): + """Return a list of dependency sibling nodes. + + When used as a property, `node.siblings` is just a shortcut for: + [n for n in node.parent.children if n!=node] + However, it is especially helpful when used as a method, + so e.g. `node.siblings(preceding_only=True)` stands for + [n for n in node.parent.children if n.ord < node.ord] + which is different from + node.parent.children(preceding_only=True). + See the documentation of ListOfNodes for details. + """ + return ListOfNodes([n for n in self._parent._children if n!=self], origin=self) + @property + def descendants(self): """Return a list of all descendants of the current node. @@ -380,7 +395,7 @@ def descendants(self): nodes2 = [n for n in node.descendants if n.ord > node.ord] nodes3 = [n for n in node.descendants if n.ord < node.ord] nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details.
""" # The following code is equivalent to # ListOfNodes(sorted(self.unordered_descendants()), origin=self) diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index f38ca585..28a45d85 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -36,6 +36,8 @@ def test_topology(self): self.assertEqual(len(nodes[1].children), 3) self.assertEqual(len(nodes[1].children(add_self=True)), 4) self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3) + self.assertEqual(nodes[2].siblings, [nodes[0], nodes[3]]) + self.assertEqual(nodes[2].siblings(following_only=True), [nodes[3]]) self.assertEqual(nodes[0].next_node, nodes[1]) self.assertEqual(nodes[2].prev_node, nodes[1]) From d69299ecc6ea0cc8be53fb4ef240665c527caf61 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Jan 2022 18:45:51 +0100 Subject: [PATCH 0311/1201] fix ZeroDivisionError --- udapi/block/corefud/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index f07c2a27..e39195db 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -76,7 +76,7 @@ def process_end(self): columns += [('clusters', f"{self.clusters:7,}"), ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), ('longest_cluster', f"{self.longest_cluster:6}"), - ('avg_cluster', f"{self.counter['c_total_len'] / self.clusters:5.1f}")] + ('avg_cluster', f"{self.counter['c_total_len'] / clusters_nonzero:5.1f}")] for i in range(1, self.c_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) @@ -84,7 +84,7 @@ def process_end(self): columns += [('mentions', f"{self.mentions:7,}"), ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), ('longest_mention', f"{self.longest_mention:6}"), - ('avg_mention', f"{self.counter['m_total_len'] / self.mentions:5.1f}")] + ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] for i in range(0, self.m_len_max + 1): percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) From 3d6a267123de3e505cee6a6821ffd028d8725615 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Jan 2022 19:00:07 +0100 Subject: [PATCH 0312/1201] read.OldCorefUD and write.CorefUD for the old CorefUD 0.1 format --- udapi/block/read/oldcorefud.py | 81 +++++++++++++++++++++++++++++++++ udapi/block/write/oldcorefud.py | 58 +++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 udapi/block/read/oldcorefud.py create mode 100644 udapi/block/write/oldcorefud.py diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py new file mode 100644 index 00000000..a7bc3101 --- /dev/null +++ b/udapi/block/read/oldcorefud.py @@ -0,0 +1,81 @@ +"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.read.conllu +from udapi.core.coref import CorefCluster, CorefMention, BridgingLinks + +class OldCorefUD(udapi.block.read.conllu.Conllu): + + def process_document(self, doc, strict=True): + super().process_document(doc) + + clusters = {} + for node in doc.nodes_and_empty: + index, index_str = 0, "" + cluster_id = node.misc["ClusterId"] + if not cluster_id: + index, index_str = 1, "[1]" + cluster_id = node.misc["ClusterId[1]"] + while cluster_id: + cluster = 
clusters.get(cluster_id) + if cluster is None: + cluster = CorefCluster(cluster_id) + clusters[cluster_id] = cluster + mention = CorefMention(node, cluster) + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + else: + mention.words = [node] + cluster_type = node.misc["ClusterType" + index_str] + if cluster_type is not None: + if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: + logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") + cluster.cluster_type = cluster_type + + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict) + + split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. + for ante_str in split_ante_str.replace('+', ',').split(','): + if ante_str in clusters: + if ante_str == cluster_id: + _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) + split_antes.append(clusters[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + ante_cl = CorefCluster(ante_str) + clusters[ante_str] = ante_cl + split_antes.append(ante_cl) + cluster.split_ante = sorted(split_antes) + + mention.misc = node.misc["MentionMisc" + index_str] + index += 1 + index_str = f"[{index}]" + cluster_id = node.misc["ClusterId" + index_str] + # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), + # not by the keys (cluster_id). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to preserve insertion order. + for cluster in clusters.values(): + if not cluster._mentions: + _error(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + cluster._mentions.sort() + doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())} + + # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). + attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + + +def _error(msg, strict): + if strict: + raise ValueError(msg) + logging.error(msg) diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py new file mode 100644 index 00000000..c6c38cbe --- /dev/null +++ b/udapi/block/write/oldcorefud.py @@ -0,0 +1,58 @@ +"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.write.conllu + +class OldCorefUD(udapi.block.write.conllu.Conllu): + + def process_document(self, doc): + if not doc._coref_clusters: + logging.warning("Using write.OldCorefUD on a document without any coreference annotation") + doc._coref_clusters = {} + + # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC.
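+ # (Schematic examples of keys matched and removed here, including indexed variants: Entity, ClusterId, ClusterId[2], MentionSpan[1].)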
+ attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + + # doc._coref_clusters is a dict, which is insertion-ordered in Python 3.7+. + # The insertion order is sorted according to CorefCluster.__lt__ (see a few lines above). + # However, new clusters could be added meanwhile or some clusters edited, + # so we need to sort the clusters again before storing to MISC. + # We also need to make sure cluster.mentions are sorted in each cluster + # because the ordering of clusters is defined by the first mention in each cluster. + # Ordering of mentions within a cluster can be changed when e.g. changing the span + # of a given mention or reordering words within a sentence, and in such events + # Udapi currently does not automatically update the ordering of clusters. + for cluster in doc._coref_clusters.values(): + cluster._mentions.sort() + for cluster in sorted(doc._coref_clusters.values()): + for mention in cluster.mentions: + head = mention.head + if head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = cluster.cluster_id + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = cluster.cluster_type + if mention._bridging: + head.misc["Bridging" + index_str] = str(mention.bridging) + if cluster.split_ante: + serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) + head.misc["SplitAnte" + index_str] = serialized + if mention.misc: + head.misc["MentionMisc" + index_str] = mention.misc + + super().process_document(doc) From d37dd68feda69ba5d1b52de5e146bb974ee4be95 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Jan 2022 22:17:51 +0100 Subject: [PATCH 0313/1201] util.Eval coref_cluster='...' coref_mention='...' We could use `util.Eval doc='for c in doc.coref_clusters.values():...'`, but it was difficult to fit slightly more complex code into such a one-liner. So I've added these two new parameters and now we can write e.g.
udapy \ util.Eval coref_cluster='print($.cluster_id)' \ coref_mention='print(" ".join(w.form for w in $.words))' --- udapi/block/util/eval.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index b814b80d..07eab681 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,6 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, + coref_mention=None, coref_cluster=None, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -41,6 +42,8 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.after_doc = after_doc self.before_bundle = before_bundle self.after_bundle = after_bundle + self.coref_mention = coref_mention + self.coref_cluster = coref_cluster self.expand_code = expand_code self.count = collections.Counter() @@ -71,6 +74,16 @@ def process_document(self, document): # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) + if self.coref_cluster or self.coref_mention: + for cluster in doc.coref_clusters.values(): + if self.coref_cluster: + this = cluster + exec(self.expand_eval_code(self.coref_cluster)) + if self.coref_mention: + for mention in cluster.mentions: + this = mention + exec(self.expand_eval_code(self.coref_mention)) + def process_bundle(self, bundle): # Extract variables, so they can be used in eval code document = doc = bundle.document From f7f665e6e6868481f4f06d77b08072a134368e30 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 13:50:47 +0100 Subject: [PATCH 0314/1201] Same block as for Czech: fix case-enhanced deprels in Slovak.
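A minimal usage sketch via the Python API (file names are placeholders; the block can equally be used in a udapy scenario):

    from udapi.core.document import Document
    from udapi.block.ud.sk.fixedeprels import FixEdeprels

    doc = Document()
    doc.load_conllu('sk-input.conllu')    # placeholder: CoNLL-U file with enhanced deps
    block = FixEdeprels()
    for node in doc.nodes:                # the block rewrites node.deps in place
        block.process_node(node)
    doc.store_conllu('sk-fixed.conllu')   # placeholder output path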
--- udapi/block/ud/sk/fixedeprels.py | 537 +++++++++++++++++++++++++++++++ 1 file changed, 537 insertions(+) create mode 100644 udapi/block/ud/sk/fixedeprels.py diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py new file mode 100644 index 00000000..6144a29b --- /dev/null +++ b/udapi/block/ud/sk/fixedeprels.py @@ -0,0 +1,537 @@ +"""Block to fix case-enhanced dependency relations in Slovak.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary.
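+ # For example, the table below normalizes obl:v_rámec, obl:v_rámec:loc and obl:v_rámec:gen
+ # all to obl:v_rámci:gen, regardless of the morphological case that was observed.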
+ unambiguous = { + 'do': 'do:gen', + 'k': 'k:dat', + 'mimo': 'mimo:gen', + 'na_rozdiel_od': 'na_rozdiel_od:gen', + 'na_základ': 'na_základe:gen', + 'pomoc': 'pomocou:gen', + 'pre': 'pre:acc', + 'prostredníctvom': 'prostredníctvom:gen', + 's': 's:ins', + 's_dôraz_na': 's_dôrazom_na:acc', + 's_ohľad_na': 's_ohľadom_na:acc', + 's_pomoc': 's_pomocou:gen', + 'smer_k': 'smerom_k:dat', + 'spoločne_s': 'spoločne_s:ins', + 'spolu_s': 'spolu_s:ins', + 'v_dôsledok': 'v_dôsledku:gen', + 'v_meno': 'v_mene:gen', + 'v_oblasť': 'v_oblasti:gen', + 'v_porovnanie_s': 'v_porovnaniu_s:ins', + 'v_priebeh': 'v_priebehu:gen', + 'v_prípad': 'v_prípade:gen', + 'v_prospech': 'v_prospech:gen', + 'v_rámec': 'v_rámci:gen', + 'v_spolupráca_s': 'v_spolupráci_s:ins', + 'v_súlad_s': 'v_súlade_s:ins', + 'v_súvislosť_s': 'v_súvislosti_s:ins', + 'v_ústrety': 'v_ústrety:dat', + 'v_vzťah_k': 'vo_vzťahu_k:dat', + 'v_závislosť_na': 'v_závislosti_na:loc', + 'vzhľad_na': 'vzhľadom_na:acc', + 'z': 'z:gen', + 'z_hľadisko': 'z_hľadiska:gen', + 'začiatkom': 'začiatkom:gen' + } + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Slovak basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + for x, xnorm in self.unambiguous.items(): + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+xnorm + break + if re.match(r'^(acl|advcl):', edep['deprel']): + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations).
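+ # For example, advcl:například_pokud is reduced to advcl:pokud by the first rule below, while advcl:i_když is preserved.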
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) + elif re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun 
modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): + edep['deprel'] += ':gen' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' + elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. + for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): + # The case is unknown. We need 'acc' or 'loc'. 
+ # The locative is probably more frequent but it is not so likely with every noun. + # If there is a nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. + if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') + elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + ###!!! We could also look at the XPOS of the preposition, because the case will be specified there! + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely.
+ edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + logging.info('I am here.') + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. + edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): + edep['deprel'] += ':dat' + elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. 
+ edep['deprel'] += ':acc' + else: + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', 
r'\1:v_kontextu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. 
If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From 830b2fb6a3d2d7e09773842bf4f7d0994bbb5e02 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 13:55:31 +0100 Subject: [PATCH 0315/1201] Removed spurious code. --- udapi/block/ud/sk/fixedeprels.py | 458 +------------------------------ 1 file changed, 1 insertion(+), 457 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index 6144a29b..c235ee78 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -5,24 +5,6 @@ class FixEdeprels(Block): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) - # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological @@ -34,6 +16,7 @@ class FixEdeprels(Block): 'mimo': 'mimo:gen', 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', + 'od': 'od:gen', 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', @@ -78,445 +61,6 @@ def process_node(self, node): if m: edep['deprel'] = m.group(0)+':'+xnorm break - if re.match(r'^(acl|advcl):', edep['deprel']): - # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
- edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' - edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating - edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) - if edep['deprel'] == 'acl:v' and node.form == 'patře': - edep['deprel'] = 'nmod:v:loc' - node.deprel = 'nmod' - node.lemma = 'patro' - node.upos = 'NOUN' - node.xpos = 'NNNS6-----A----' - node.feats['Aspect'] = '' - node.feats['Gender'] = 'Neut' - node.feats['Tense'] = '' - node.feats['VerbForm'] = '' - node.feats['Voice'] = '' - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) - elif re.match(r'^(nmod|obl):', edep['deprel']): - if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': - # This is a same-case noun-noun 
modifier, which just happens to be in the locative. - # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has - # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. - edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' - elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' - elif edep['deprel'] == 'nmod:voc': - # 'v 8. čísle tiskoviny Ty rudá krávo' - edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): - edep['deprel'] += ':gen' - elif edep['deprel'] == 'nmod:co:nom': - # Annotation error: 'kompatibilní znamená tolik co slučitelný' - # 'co' should be relative pronoun rather than subordinating conjunction. - edep['deprel'] = 'acl:relcl' - node.deprel = 'acl:relcl' - elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' - elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): - edep['deprel'] = 'advcl:li' - elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^obl:místo_za:acc$', edep['deprel']): - # 'chytají krávu místo za rohy spíše za ocas' - # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. - for c in node.children: - if c.form == 'místo': - c.upos = 'ADV' - c.deprel = 'cc' - edep['deprel'] = 'obl:za:acc' - elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): - # The case is unknown. We need 'acc' or 'loc'. 
- # The locative is probably more frequent but it is not so likely with every noun. - # If there is an nummod:gov child, it must be accusative and not locative. - # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:arg:na_konec$', edep['deprel']): - # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' - edep['deprel'] = 'obl:arg:na:acc' - elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): - # Annotation error. - if node.form == 's': - ohled = node.next_node - na = ohled.next_node - noun = na.next_node - self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') - self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^nmod:pára:nom$', edep['deprel']): - # Annotation error: 'par excellence'. - edep['deprel'] = 'nmod' - for c in node.children: - if c.udeprel == 'case' and c.form.lower() == 'par': - c.lemma = 'par' - c.upos = 'ADP' - c.xpos = 'RR--X----------' - c.feats['Case'] = '' - c.feats['Gender'] = '' - c.feats['Number'] = '' - c.feats['Polarity'] = '' - c.feats['AdpType'] = 'Prep' - elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - ###!!! Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): - # Accusative would be possible but unlikely. 
- edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): - # Genitive would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': - # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. - # Find the content nominal. - cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] - vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] - if len(cnouns) > 0 and len(vs) > 0: - logging.info('I am here.') - cnoun = cnouns[0] - v = vs[0] - self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') - self.set_basic_and_enhanced(v, cnoun, 'case', 'case') - self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') - elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): - # ':nom' occurs in 'karneval v Rio de Janeiro' - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): - # There is just one occurrence and it is an error: - # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' - # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. - edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): - # Instrumental would be possible but unlikely. 
- edep['deprel'] += ':acc' - else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', 
r'\1:v_kontextu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 46819d52b6949d7ac0ddb4dc5c7dc6f84be7469b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 15:27:10 +0100 Subject: [PATCH 0316/1201] Refined processing of Slovak edeprels. 
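The gist of the refactoring: instead of one hand-maintained re.sub call per
preposition, process_node first walks the `unambiguous` lookup table, and only
the genuinely ambiguous prepositions fall back to the Case feature of the
preposition child. A minimal standalone sketch of the table-driven step
(illustrative only; UNAMBIGUOUS and normalize are names invented for this
sketch, and the real block operates on udapi Node objects and their enhanced
deps, not on bare strings):

    import re

    # Tiny illustrative subset of the lookup table in the diff below.
    UNAMBIGUOUS = {'do': 'do:gen', 'k': 'k:dat', 's_ohľad_na': 's_ohľadom_na:acc'}

    def normalize(edeprel):
        """Rewrite e.g. 'obl:do:dat' to 'obl:do:gen' via the table."""
        for lemma, norm in UNAMBIGUOUS.items():
            m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):' + lemma
                         + r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edeprel)
            if m:
                # Whatever case was annotated, replace it with the one
                # this (secondary) preposition always governs.
                return m.group(1) + ':' + norm
        return edeprel  # left for the Case-feature fallback

    assert normalize('obl:do:dat') == 'obl:do:gen'
    assert normalize('nmod:s_ohľad_na') == 'nmod:s_ohľadom_na:acc'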
--- udapi/block/ud/sk/fixedeprels.py | 56 +++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index c235ee78..d029a031 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -21,6 +21,7 @@ class FixEdeprels(Block): 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', 's': 's:ins', + 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) 's_dôraz_na': 's_dôrazom_na:acc', 's_ohľad_na': 's_ohľadom_na:acc', 's_pomoc': 's_pomocou:gen', @@ -30,7 +31,8 @@ class FixEdeprels(Block): 'v_dôsledok': 'v_dôsledku:gen', 'v_meno': 'v_mene:gen', 'v_oblasť': 'v_oblasti:gen', - 'v_porovnanie_s': 'v_porovnaniu_s:ins', + 'v_porovnanie_s': 'v_porovnaní_s:ins', + 'v_porovnaniu_s': 'v_porovnaní_s:ins', 'v_priebeh': 'v_priebehu:gen', 'v_prípad': 'v_prípade:gen', 'v_prospech': 'v_prospech:gen', @@ -54,13 +56,51 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - for x, xnorm in unambiguous: - # All secondary prepositions have only one fixed morphological case - # they appear with, so we can replace whatever case we encounter with the correct one. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) - if m: - edep['deprel'] = m.group(0)+':'+xnorm - break + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if not solved: + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + if not solved: + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. + m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':o:acc' + solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + solved = True + if not solved: + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. For vocatives, + # it is also possible that they have been confused with nominatives. 
+ m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1) + solved = True def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 05625f09c8a503d805644d1c31f988b6fbdbe81c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 10 Jan 2022 16:43:36 +0100 Subject: [PATCH 0317/1201] Refined processing of Slovak edeprels. --- udapi/block/ud/sk/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index d029a031..4c19be89 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -17,6 +17,7 @@ class FixEdeprels(Block): 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', 'od': 'od:gen', + 'pod_vplyv': 'pod_vplyvom:gen', 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', From 891cf8966a98d4fc3dc781036aa26e7a652fa316 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 11 Jan 2022 11:04:09 +0100 Subject: [PATCH 0318/1201] Slovak cased edeprels. --- udapi/block/ud/sk/fixedeprels.py | 35 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index 4c19be89..7208b6ef 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -11,8 +11,16 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'a_hoci': 'hoci', + 'ako': 'ako', # remove morphological case + 'ako_na': 'ako', + 'akoby_z': 'z:gen', + 'akže': 'ak', + 'ani_keby': 'keby', + 'až_keď': 'keď', 'do': 'do:gen', 'k': 'k:dat', + 'kto': 'kým', ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator. 'mimo': 'mimo:gen', 'na_rozdiel_od': 'na_rozdiel_od:gen', 'na_základ': 'na_základe:gen', @@ -21,6 +29,7 @@ class FixEdeprels(Block): 'pomoc': 'pomocou:gen', 'pre': 'pre:acc', 'prostredníctvom': 'prostredníctvom:gen', + 'prv_ako': 'ako', 's': 's:ins', 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) 's_dôraz_na': 's_dôrazom_na:acc', @@ -69,10 +78,10 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. if not solved: - # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see @@ -81,10 +90,10 @@ def process_node(self, node): if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() solved = True + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. if not solved: - # If we failed to identify the case of the preposition in the - # preceding steps, pick a default. It applies mostly to 'o' - # with wrongly split time values. 
m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':o:acc' @@ -93,15 +102,21 @@ def process_node(self, node): if m: edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' solved = True + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. For vocatives, + # it is also possible that they have been confused with nominatives. if not solved: - # Some cases do not occur with nominal modifiers without preposition. - # If we see them, chances are that it is the same-case modifier, - # and the same case just happens to be the one we see. For vocatives, - # it is also possible that they have been confused with nominatives. m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) if m: edep['deprel'] = m.group(1) solved = True + # Annotation and conversion errors. + if not solved: + # Povedal som jej „na zdorovie“. + if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie': + self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp') + solved = True def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From a2e91a334e98fd4abe2bdc71b7e35ba314008399 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 17 Jan 2022 02:37:44 +0100 Subject: [PATCH 0319/1201] more params for corefud.MarkCrossing --- udapi/block/corefud/markcrossing.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index 81136ec9..f357e7cc 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -1,15 +1,19 @@ from udapi.core.block import Block import udapi.core.coref import itertools +import logging class MarkCrossing(Block): """Find mentions with crossing spans.""" - def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, **kwargs): + def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, + log=True, mark=True, **kwargs): super().__init__(**kwargs) self.same_cluster_only = same_cluster_only self.continuous_only = continuous_only self.print_form = print_form + self.log = log + self.mark = mark def _print(self, mention): if self.print_form: @@ -25,4 +29,8 @@ def process_node(self, node): continue if self.continuous_only and (',' in mA.span or ',' in mB.span): continue - node.misc['Mark'] = f'cross:{self._print(mA)}+{self._print(mB)}' + msg = f'cross:{self._print(mA)}+{self._print(mB)}' + if self.mark: + node.misc['Mark'] = msg + if self.log: + print(msg) From 136ef06885b66eb68ea06d6a14d26e7d428c0354 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 18 Jan 2022 14:31:59 +0100 Subject: [PATCH 0320/1201] log each crossing just once (not for each node in the intersection) --- udapi/block/corefud/markcrossing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index f357e7cc..a6d9346a 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -14,6 +14,7 @@ def __init__(self, same_cluster_only=False, continuous_only=False, print_form=Fa self.print_form = print_form self.log = log self.mark = mark + self._logged = {} def _print(self, mention): if self.print_form: @@ -29,8 +30,10 @@ def process_node(self, node): continue if self.continuous_only and (',' in mA.span or ',' in mB.span): continue - msg 
= f'cross:{self._print(mA)}+{self._print(mB)}' if self.mark: - node.misc['Mark'] = msg + node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" if self.log: - print(msg) + cross_id = node.root.sent_id + mA.span + mB.span + if cross_id not in self._logged: + self._logged[cross_id] = True + print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}") From f0e76516eb0376493486db66e408dc2995f9acbc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 19 Jan 2022 22:05:21 +0100 Subject: [PATCH 0321/1201] util.FindBug can take any params so e.g. `util.FindBug block=eval.F1 focus=NOUN` will result in inspecting `eval.F1 focus=NOUN`. --- udapi/block/util/findbug.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index e05afe76..e1ea838c 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -5,9 +5,12 @@ insert "util.FindBug block=" into the scenario, e.g. to debug ``second.Block``, use -udapy first.Block util.FindBug block=second.Block > bug.conllu + udapy first.Block util.FindBug block=second.Block > bug.conllu This will create the file bug.conllu with the bundle, which caused the bug. + +The second.Block can have any parameters, e.g. + udapy first.Block util.FindBug block=second.Block param1=value1 param2=value2 > bug.conllu """ import copy import logging @@ -20,24 +23,31 @@ class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" - def __init__(self, block, first_error_only=True, **kwargs): - """Args: block, first_error_only""" - super().__init__(**kwargs) + def __init__(self, block, first_error_only=True, + files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, + **kwargs): + """Args: block, first_error_only. + All other parameters (which are not parameters of BaseWriter) + will be passed to the block being inspected. + """ + super().__init__(files, filehandle, docname_as_file, encoding, newline, overwrite) self.block = block self.first_error_only = first_error_only + self._kwargs = kwargs def process_document(self, document): sub_path, class_name = _parse_block_name(self.block) module = "udapi.block." + sub_path + "." + class_name.lower() try: - command = "from " + module + " import " + class_name + " as b" + command = "from " + module + " import " + class_name + " as B" logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "B(**self._kwargs)" logging.debug("Trying to evaluate this: %s", command) new_block = eval(command) # pylint: disable=eval-used From 6e64786578eab324257dae04a3036c29e4d42a7e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 19 Jan 2022 22:37:16 +0100 Subject: [PATCH 0322/1201] bugfix in eval.F1 When a pair of sentences contains no non-focused tokens, `nf_common == []` and we cannot use `while nf_common[c] != pred_tokens[i]`. 
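A hypothetical minimal reproduction (the variable names mirror eval/f1.py;
the list contents are invented for illustration):

    nf_common = []           # no non-focused tokens shared by the two sentences
    pred_tokens = ['dog']    # every remaining token matches the focus regex
    c = i = 0

    # Old order -- indexes nf_common before checking that it is exhausted:
    try:
        if nf_common[c] != pred_tokens[i]:
            pass
    except IndexError:
        print('crash: nf_common is empty')

    # New order -- the exhaustion check guards the indexing, and the
    # remaining focused tokens are handed straight to find_lcs():
    if c == len(nf_common):
        print('exhausted: align the rest with find_lcs')
    elif nf_common[c] != pred_tokens[i]:
        pass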
--- udapi/block/eval/f1.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 9f265ac7..ca5510e4 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -125,6 +125,9 @@ def process_tree(self, tree): nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): + if c == len(nf_common): + common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) i += 1 @@ -135,9 +138,6 @@ def process_tree(self, tree): un_pred, un_gold = [], [] while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: i, j, c = i+1, j+1, c+1 - if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) - break common = [x for x in common if self.focus.fullmatch(x)] pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] From 7bcde4d7cb1adf8a4fc02882504fa0ff7a22654e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 9 Feb 2022 11:18:19 +0100 Subject: [PATCH 0323/1201] Reorganized the Czech block following Slovak, so it is more readable. --- udapi/block/ud/cs/fixedeprels.py | 460 ++++++++++++++++--------------- 1 file changed, 235 insertions(+), 225 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 5a2e996d..ac2653c3 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -5,6 +5,217 @@ class FixEdeprels(Block): + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
+ unambiguous = { + 'abi': 'aby', + 'aby_na': 'na', + 'ačkoliv': 'ačkoli', + 'ať': 'ať', # remove morphological case + 'ať_forma': 'formou:gen', + 'ať_v': 'v:loc', + 'ať_z': 'z:gen', + 'ať_z_strana': 'ze_strany:gen', + 'až_do': 'do:gen', + 'až_o': 'o:acc', + 'během': 'během:gen', + 'bez': 'bez:gen', + 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_zřetel_k': 'bez_zřetele_k:dat', + 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blíž': 'blízko:dat', + 'cesta': 'cestou:gen', + 'daleko': 'nedaleko:gen', + 'daleko_od': 'od:gen', + 'dík': 'díky:dat', + 'díky': 'díky:dat', + 'dle': 'dle:gen', + 'do': 'do:gen', + 'do_k': 'k:dat', + 'do_oblast': 'do_oblasti:gen', + 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_soulad_s': 'do_souladu_s:ins', + 'forma': 'formou:gen', + 'i_když': 'i_když', # remove morphological case + 'jak_aby': 'jak', + 'jak_ad': 'jak', + 'jakkoliv': 'jakkoli', + 'jako': 'jako', # remove morphological case + 'jako_kupříkladu': 'jako', + 'jakoby': 'jako', + 'jakoby_pod': 'pod:ins', + 'jelikož_do': 'jelikož', + 'jestli_že': 'jestliže', + 'k': 'k:dat', + 'k_konec': 'ke_konci:gen', + 'kdykoliv': 'kdykoli', + 'kol': 'kolem:gen', + 'kolem': 'kolem:gen', + 'konec': 'koncem:gen', + 'kromě': 'kromě:gen', + 'liž': 'li', + 'mezi_uvnitř': 'uvnitř:gen', + 'na_báze': 'na_bázi:gen', + 'na_čelo': 'na_čele:gen', + 'na_mimo': 'na:loc', # na kurtě i mimo něj + 'na_než': 'na:acc', # na víc než čtyři a půl kilometru + 'na_od': 'na_rozdíl_od:gen', + 'na_podklad': 'na_podkladě:gen', + 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier + 'na_úroveň': 'na_úrovni:gen', + 'na_úsek': 'na_úseku:gen', + 'na_základ': 'na_základě:gen', + 'na_základna': 'na_základně:gen', + 'na_závěr': 'na_závěr:gen', + 'namísto': 'namísto:gen', + 'namísto_do': 'do:gen', + 'narozdíl_od': 'na_rozdíl_od:gen', + 'následek': 'následkem:gen', + 'navzdory': 'navzdory:dat', + 'nedaleko': 'nedaleko:gen', + 'než': 'než', # remove morphological case + 'nežli': 'nežli', # remove morphological case + 'o_jako': 'jako', + 'o_o': 'o:acc', + 'od': 'od:gen', + 'ohledně': 'ohledně:gen', + 'okolo': 'okolo:gen', + 'oproti': 'oproti:dat', + 'po_v': 'po:loc', + 'po_doba': 'po_dobu:gen', + 'po_vzor': 'po_vzoru:gen', + 'poblíž': 'poblíž:gen', + 'počátek': 'počátkem:gen', + 'počínat': 'počínaje:ins', + 'pod_dojem': 'pod_dojmem:gen', + 'pod_vliv': 'pod_vlivem:gen', + 'podle': 'podle:gen', + 'pomoc': 'pomocí:gen', + 'pomocí': 'pomocí:gen', + 'postup': 'postupem:gen', + 'pouze_v': 'v:loc', + 'pro': 'pro:acc', + 'prostřednictví': 'prostřednictvím:gen', + 'prostřednictvím': 'prostřednictvím:gen', + 'proti': 'proti:dat', + 'protože': 'protože', # remove morphological case + 'před_během': 'během:gen', # před a během utkání + 'před_po': 'po:loc', # před a po vyloučení Schindlera + 'přes': 'přes:acc', + 'přestože': 'přestože', # remove morphological case + 'při': 'při:loc', + 'při_příležitost': 'při_příležitosti:gen', + 's_ohled_k': 's_ohledem_k:dat', + 's_ohled_na': 's_ohledem_na:acc', + 's_pomoc': 's_pomocí:gen', + 's_přihlédnutí_k': 's_přihlédnutím_k:dat', + 's_přihlédnutí_na': 's_přihlédnutím_na:acc', + 's_výjimka': 's_výjimkou:gen', + 's_vyloučení': 's_vyloučením:gen', + 's_zřetel_k': 'se_zřetelem_k:dat', + 's_zřetel_na': 'se_zřetelem_na:acc', + 'severně_od': 'od:gen', + 'skrz': 'skrz:acc', + 'směr_do': 'směrem_do:gen', + 'směr_k': 'směrem_k:dat', + 'směr_na': 'směrem_na:acc', + 'směr_od': 'směrem_od:gen', + 'společně_s': 'společně_s:ins', + 'spolu': 
'spolu_s:ins', + 'spolu_s': 'spolu_s:ins', + 'stranou': 'stranou:gen', + 'takže': 'takže', # remove morphological case + 'takže_a': 'takže', + 'třebaže': 'třebaže', # remove morphological case + 'u': 'u:gen', + 'u_příležitost': 'u_příležitosti:gen', + 'uprostřed': 'uprostřed:gen', + 'uvnitř': 'uvnitř:gen', + 'v_analogie_s': 'v_analogii_s:ins', + 'v_čelo': 'v_čele:gen', + 'v_čelo_s': 'v_čele_s:ins', + 'v_dohoda_s': 'v_dohodě_s:ins', + 'v_duch': 'v_duchu:gen', + 'v_důsledek': 'v_důsledku:gen', + 'v_forma': 've_formě:gen', + 'v_jméno': 've_jménu:gen', + 'v_k': 'k:dat', + 'v_kombinace_s': 'v_kombinaci_s:ins', + 'v_konfrontace_s': 'v_konfrontaci_s:ins', + 'v_kontext_s': 'v_kontextu_s:ins', + 'v_na': 'na:loc', + 'v_oblast': 'v_oblasti:gen', + 'v_oblast_s': 's:ins', + 'v_obor': 'v_oboru:gen', + 'v_otázka': 'v_otázce:gen', + 'v_podoba': 'v_podobě:gen', + 'v_poměr_k': 'v_poměru_k:dat', + 'v_proces': 'v_procesu:gen', + 'v_prospěch': 've_prospěch:gen', + 'v_protiklad_k': 'v_protikladu_k:dat', + 'v_průběh': 'v_průběhu:gen', + 'v_případ': 'v_případě:gen', + 'v_případ_že': 'v_případě_že', + 'v_rámec': 'v_rámci:gen', + 'v_rozpor_s': 'v_rozporu_s:ins', + 'v_řada': 'v_řadě:gen', + 'v_shoda_s': 've_shodě_s:ins', + 'v_služba': 've_službách:gen', + 'v_směr': 've_směru:gen', + 'v_směr_k': 've_směru_k:dat', + 'v_smysl': 've_smyslu:gen', + 'v_součinnost_s': 'v_součinnosti_s:ins', + 'v_souhlas_s': 'v_souhlasu_s:ins', + 'v_soulad_s': 'v_souladu_s:ins', + 'v_souvislost_s': 'v_souvislosti_s:ins', + 'v_spojení_s': 've_spojení_s:ins', + 'v_spojený_s': 've_spojení_s:ins', + 'v_spojitost_s': 've_spojitosti_s:ins', + 'v_spolupráce_s': 've_spolupráci_s:ins', + 'v_s_spolupráce': 've_spolupráci_s:ins', + 'v_srovnání_s': 've_srovnání_s:ins', + 'v_srovnání_se': 've_srovnání_s:ins', + 'v_světlo': 've_světle:gen', + 'v_věc': 've_věci:gen', + 'v_vztah_k': 've_vztahu_k:dat', + 'v_zájem': 'v_zájmu:gen', + 'v_záležitost': 'v_záležitosti:gen', + 'v_závěr': 'v_závěru:gen', + 'v_závislost_na': 'v_závislosti_na:loc', + 'v_závislost_s': 'v_závislosti_s:ins', + 'v_znamení': 've_znamení:gen', + 'včetně': 'včetně:gen', + 'vedle': 'vedle:gen', + 'vina': 'vinou:gen', + 'vliv': 'vlivem:gen', + 'vůči': 'vůči:dat', + 'vzhledem': 'vzhledem_k:dat', + 'vzhledem_k': 'vzhledem_k:dat', + 'z': 'z:gen', + 'z_důvod': 'z_důvodu:gen', + 'z_hledisko': 'z_hlediska:gen', + 'z_oblast': 'z_oblasti:gen', + 'z_řada': 'z_řad:gen', + 'z_strana': 'ze_strany:gen', + 'z_nedostatek': 'z_nedostatku:gen', + 'z_titul': 'z_titulu:gen', + 'za_pomoc': 'za_pomoci:gen', + 'za_účast': 'za_účasti:gen', + 'za_účel': 'za_účelem:gen', + 'začátek': 'začátkem:gen', + 'zásluha': 'zásluhou:gen', + 'zatím_co': 'zatímco', + 'závěr': 'závěrem:gen', + 'závisle_na': 'nezávisle_na:loc', + 'že_ať': 'ať', + 'že_jako': 'že', + 'že_za': 'za:gen' + } + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Czech basic @@ -12,26 +223,39 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):abi$', r'\1:aby', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):ačkoliv$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_aby$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jak_ad$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jakkoliv$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jako_kupříkladu$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):jakoby$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(advcl):jelikož_do$', r'\1:jelikož', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):jestli_že$', r'\1:jestliže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdykoliv$', r'\1:kdykoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):liž$', r'\1:li', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating @@ -39,14 +263,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):takže_a$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):v_případ_že$', r'\1:v_případě_že', edep['deprel']) 
edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v_duch$', r'obl:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):zatím_co$', r'\1:zatímco', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_ať$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):že_jako$', r'\1:že', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' @@ -84,27 +303,11 @@ def process_node(self, node): elif edep['deprel'] == 'nmod:voc': # 'v 8. čísle tiskoviny Ty rudá krávo' edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):během$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):bez$', edep['deprel']): - edep['deprel'] += ':gen' elif edep['deprel'] == 'nmod:co:nom': # Annotation error: 'kompatibilní znamená tolik co slučitelný' # 'co' should be relative pronoun rather than subordinating conjunction. edep['deprel'] = 'acl:relcl' node.deprel = 'acl:relcl' - elif re.match(r'^(nmod|obl(:arg)?):díky$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):dle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):do$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):k(:nom)?$', edep['deprel']): - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + ':dat' - elif re.match(r'^(nmod|obl(:arg)?):kolem$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):kromě$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): edep['deprel'] = 'advcl:li' elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): @@ -145,17 +348,11 @@ def process_node(self, node): edep['deprel'] += ':acc' else: edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):namísto$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):navzdory$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):od$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): # Annotation error. if node.form == 's': @@ -166,10 +363,6 @@ def process_node(self, node): self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^(nmod|obl(:arg)?):okolo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):oproti$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^nmod:pára:nom$', edep['deprel']): # Annotation error: 'par excellence'. edep['deprel'] = 'nmod' @@ -184,42 +377,21 @@ def process_node(self, node): c.feats['Polarity'] = '' c.feats['AdpType'] = 'Prep' elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - ###!!! Taky bychom se mohli dívat do XPOS předložky, protože tam bude pád uveden! 
if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):poblíž$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' else: edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):podle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):pro$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):proti$', edep['deprel']): - edep['deprel'] += ':dat' elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): # Accusative would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):přes$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):při$', edep['deprel']): - edep['deprel'] += ':loc' elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): # Genitive would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):skrz$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):u$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uprostřed$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):uvnitř$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. # Find the content nominal. @@ -244,51 +416,23 @@ def process_node(self, node): # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):včetně$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vedle$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^(nmod|obl(:arg)?):vůči$', edep['deprel']): - edep['deprel'] += ':dat' - elif re.match(r'^(nmod|obl(:arg)?):z$', edep['deprel']): - edep['deprel'] += ':gen' elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): # Instrumental would be possible but unlikely. 
edep['deprel'] += ':acc' else: edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):aby_na:loc$', r'\1:na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať_z(:gen)?$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):až_o(:acc)?$', r'\1:o:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_ohled_na(:acc)?$', r'\1:bez_ohledu_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_k(:dat)?$', r'\1:bez_zřetele_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):bez_zřetel_na(:acc)?$', r'\1:bez_zřetele_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):blíž(:dat)?$', r'\1:blízko:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta:ins$', r'\1:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):cesta(:gen)?$', r'\1:cestou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko(:nom)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):daleko_od(:gen)?$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):dík(:dat)?$', r'\1:díky:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do:(nom|dat)$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_k:dat$', r'\1:k:dat', edep['deprel']) # do maloobchodní sítě (nebo k dalšímu zpracování) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_oblast(:gen)?$', r'\1:do_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_rozpor_s(:ins)?$', r'\1:do_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):do_soulad_s(:ins)?$', r'\1:do_souladu_s:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?forma(:gen)?$', r'\1:formou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL @@ -296,161 +440,27 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby_pod:ins$', r'\1:pod:ins', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k(:gen)?$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):k_konec(:gen)?$', r'\1:ke_konci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):kol(em)?(:gen)?$', r'\1:kolem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):konec(:gen)?$', r'\1:koncem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi_uvnitř:gen$', r'\1:uvnitř:gen', edep['deprel']) # 'nejdou mezi, ale uvnitř odvětví a oborů' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na(:gen|:nom)$', r'\1:na:acc', edep['deprel']) # 'odložit na 1. září' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_báze(:gen)?$', r'\1:na_bázi:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_čelo(:gen)?$', r'\1:na_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_mimo:loc$', r'\1:na:loc', edep['deprel']) # 'na kurtě i mimo něj' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_než:acc$', r'\1:na:acc', edep['deprel']) # 'na víc než čtyři a půl kilometru' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_od:acc$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_podklad(:gen)?$', r'\1:na_podkladě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_?rozdíl_od(:gen)?$', r'\1:na_rozdíl_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_újma(:gen)?$', r'\1:gen', edep['deprel']) # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úroveň(:gen)?$', r'\1:na_úrovni:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_úsek(:gen)?$', r'\1:na_úseku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základ(:gen)?$', r'\1:na_základě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_základna(:gen)?$', r'\1:na_základně:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):na_závěr(:gen)?$', r'\1:na_závěr:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):namísto_do(:gen)?$', r'\1:do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):následek(:gen)?$', r'\1:následkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ne)?daleko(:gen)?$', r'\1:nedaleko:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):nežli[_:].+$', r'\1:nežli', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 
'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o_o:acc$', r'\1:o:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):od:(nom|dat)$', r'\1:od:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ohledně(:gen)?$', r'\1:ohledně:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_v:loc$', r'\1:po:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_doba(:gen)?$', r'\1:po_dobu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po_vzor(:gen)?$', r'\1:po_vzoru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počátek(:gen)?$', r'\1:počátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):počínat(:ins)?$', r'\1:počínaje:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_dojem(:gen)?$', r'\1:pod_dojmem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pod_vliv(:gen)?$', r'\1:pod_vlivem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pomocí?(:gen)?$', r'\1:pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):postup(:gen)?$', r'\1:postupem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pouze_v(:loc)?$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):pro:(nom|dat)$', r'\1:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):prostřednictvím?(:gen|:ins)?$', r'\1:prostřednictvím:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):proti:nom$', r'\1:proti:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_během:gen$', r'\1:během:gen', edep['deprel']) # 'před a během utkání' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před_po:loc$', r'\1:po:loc', edep['deprel']) # 'před a po vyloučení Schindlera' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přes:gen$', r'\1:přes:acc', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):při_příležitost(:gen)?$', r'\1:při_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_k(:dat)?$', r'\1:s_ohledem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_ohled_na(:acc)?$', r'\1:s_ohledem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_pomoc(:gen)?$', r'\1:s_pomocí:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_k(:dat)?$', r'\1:s_přihlédnutím_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_přihlédnutí_na(:acc)?$', r'\1:s_přihlédnutím_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_výjimka(:gen)?$', r'\1:s_výjimkou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_vyloučení(:gen)?$', r'\1:s_vyloučením:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_k(:dat)?$', r'\1:se_zřetelem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):s_zřetel_na(:acc)?$', r'\1:se_zřetelem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):severně_od(:gen)?$', r'\1:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_do(:gen)?$', r'\1:směrem_do:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_k(:dat)?$', r'\1:směrem_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_na(:acc)?$', r'\1:směrem_na:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):směr_od(:gen)?$', r'\1:směrem_od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):společně_s(:ins)?$', r'\1:společně_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):spolu(_s)?(:ins|:dat)?$', r'\1:spolu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):stranou(:gen|:dat)?$', r'\1:stranou:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):u_příležitost(:gen)?$', r'\1:u_příležitosti:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_analogie_s(:ins)?$', r'\1:v_analogii_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo(:gen)?$', r'\1:v_čele:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_čelo_s(:ins)?$', r'\1:v_čele_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_dohoda_s(:ins)?$', r'\1:v_dohodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_duch(:gen)?$', r'\1:v_duchu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_důsledek(:gen)?$', r'\1:v_důsledku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_forma(:gen)?$', r'\1:ve_formě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_jméno(:gen)?$', r'\1:ve_jménu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_k:dat$', r'\1:k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kombinace_s(:ins)?$', r'\1:v_kombinaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_konfrontace_s(:ins)?$', r'\1:v_konfrontaci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_kontext_s(:ins)?$', r'\1:v_kontextu_s:ins', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_na:loc$', r'\1:na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast(:gen)?$', r'\1:v_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_oblast_s(:ins)?$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_obor(:gen)?$', r'\1:v_oboru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_otázka(:gen)?$', r'\1:v_otázce:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_podoba(:gen)?$', r'\1:v_podobě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_poměr_k(:dat)?$', r'\1:v_poměru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_proces(:gen)?$', r'\1:v_procesu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_prospěch(:gen)?$', r'\1:ve_prospěch:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_protiklad_k(:dat)?$', r'\1:v_protikladu_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_průběh(:gen)?$', r'\1:v_průběhu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_případ(:gen)?$', r'\1:v_případě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rámec(:gen)?$', r'\1:v_rámci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_rozpor_s(:ins)?$', r'\1:v_rozporu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_řada(:gen)?$', r'\1:v_řadě:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_shoda_s(:ins)?$', r'\1:ve_shodě_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_služba(:gen)?$', r'\1:ve_službách:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr(:gen)?$', r'\1:ve_směru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_směr_k(:dat)?$', r'\1:ve_směru_k:dat', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_smysl(:gen)?$', r'\1:ve_smyslu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_součinnost_s(:ins|:nom)?$', r'\1:v_součinnosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souhlas_s(:ins|:nom)?$', r'\1:v_souhlasu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_soulad_s(:ins|:nom)?$', r'\1:v_souladu_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_souvislost_s(:ins)?$', r'\1:v_souvislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojení_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojený_s(:ins)?$', r'\1:ve_spojení_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spojitost_s(:ins)?$', r'\1:ve_spojitosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_spolupráce_s(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_s_spolupráce(:ins)?$', r'\1:ve_spolupráci_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_srovnání_se?(:ins)?$', r'\1:ve_srovnání_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_světlo(:gen)?$', r'\1:ve_světle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_věc(:gen)?$', r'\1:ve_věci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_vztah_k(:dat)?$', r'\1:ve_vztahu_k:dat', edep['deprel']) - edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):v_zájem(:gen|:loc)?$', r'\1:v_zájmu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_záležitost(:gen)?$', r'\1:v_záležitosti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závěr(:gen)?$', r'\1:v_závěru:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_na(:loc)?$', r'\1:v_závislosti_na:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_závislost_s(:ins)?$', r'\1:v_závislosti_s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v_znamení(:gen)?$', r'\1:ve_znamení:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vina(:gen)?$', r'\1:vinou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vliv(:gen)?$', r'\1:vlivem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vzhledem(_k)?(:dat)?$', r'\1:vzhledem_k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_důvod(:gen)?$', r'\1:z_důvodu:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_hledisko(:gen|:nom)?$', r'\1:z_hlediska:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_oblast(:gen)?$', r'\1:z_oblasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_řada(:gen)?$', r'\1:z_řad:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(ať_)?z_strana(:gen)?$', r'\1:ze_strany:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_nedostatek(:gen)?$', r'\1:z_nedostatku:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z_titul(:gen)?$', r'\1:z_titulu:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_pomoc(:gen)?$', r'\1:za_pomoci:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účast(:gen)?$', r'\1:za_účasti:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za_účel(:gen)?$', r'\1:za_účelem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):začátek(:gen)?$', r'\1:začátkem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):zásluha(:gen)?$', r'\1:zásluhou:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závěr(:gen)?$', r'\1:závěrem:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):závisle_na(:loc)?$', r'\1:nezávisle_na:loc', edep['deprel']) edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):že_za:gen$', r'\1:za:gen', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 613b26373af4fad9e28cc7186c82f1f171038901 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 9 Feb 2022 20:20:34 +0100 Subject: [PATCH 0324/1201] Manually synchronized fixedeprels.py in master with the changes done in gum-format. 
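
For context, the dict-based lookup that both versions now share can be
sketched in isolation as follows (a minimal standalone example, not part of
this patch; the input deprel is invented, but the dict entry and the regular
expression are real ones from the block):

    import re

    unambiguous = {'během': 'během:gen'}  # 'during' always takes the genitive
    deprel = 'obl:během:acc'  # wrong case, e.g. copied from an abbreviation
    for x, replacement in unambiguous.items():
        m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):' + x
                     + r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', deprel)
        if m:
            deprel = m.group(1) + ':' + replacement  # -> 'obl:během:gen'
            break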
--- udapi/block/ud/cs/fixedeprels.py | 56 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index ac2653c3..b3e551e5 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -55,6 +55,7 @@ class FixEdeprels(Block): 'kol': 'kolem:gen', 'kolem': 'kolem:gen', 'konec': 'koncem:gen', + 'krom': 'kromě:gen', 'kromě': 'kromě:gen', 'liž': 'li', 'mezi_uvnitř': 'uvnitř:gen', @@ -211,8 +212,10 @@ class FixEdeprels(Block): 'zatím_co': 'zatímco', 'závěr': 'závěrem:gen', 'závisle_na': 'nezávisle_na:loc', + 'že': 'že', # remove morphological case 'že_ať': 'ať', 'že_jako': 'že', + 'že_jakoby': 'že', 'že_za': 'za:gen' } @@ -223,14 +226,14 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: bdeprel = m.group(1) solved = False for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True @@ -253,19 +256,19 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:na_způsob:gen$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):od$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):podle$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:podle:gen$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duch$', 
r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu:gen$', r'obl:v_duchu:gen', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' @@ -310,6 +313,8 @@ def process_node(self, node): node.deprel = 'acl:relcl' elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: edep['deprel'] += ':acc' @@ -329,7 +334,8 @@ def process_node(self, node): edep['deprel'] = 'obl:za:acc' elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na$', edep['deprel']): + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) # The case is unknown. We need 'acc' or 'loc'. # The locative is probably more frequent but it is not so likely with every noun. # If there is an nummod:gov child, it must be accusative and not locative. @@ -398,7 +404,6 @@ def process_node(self, node): cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] if len(cnouns) > 0 and len(vs) > 0: - logging.info('I am here.') cnoun = cnouns[0] v = vs[0] self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') @@ -420,26 +425,33 @@ def process_node(self, node): # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + # If one of the following expressions occurs followed by another preposition, + # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) + # + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) @@ -447,12 +459,10 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel'])
-            edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel'])
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel'])
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel'])
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate!
-            edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel'])
-            edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel'])
+            edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel'])
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko
             edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem

From c691804af00366f60af2cac8fd5823404e1f83bc Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 10 Feb 2022 10:22:33 +0100
Subject: [PATCH 0325/1201] GUM format support (#96)

* global.Entity support

* shortcuts: doc.coref_mentions and tree.document

* reading and writing new (CorefUD 1.0) format of coreference

* oops, bug in detecting discontinuous mentions

* fix ordering of brackets in serialization for crossing mention spans

* change CorefMention.__lt__, document rules for serialization

If two mentions start at the same word, the longer must be saved first,
in the new format. However, we cannot cycle through
`reversed(doc.coref_mentions)` because that would break the ordering of
closing brackets. The easiest solution seems to be to redefine
`CorefMention.__lt__`, so that it follows the order in which mentions
must be stored in the new format.

* BridgingLinks string representation now follows the new format

but we can have multiple src mentions in a single `Bridge=` annotation,
e.g.
`Entity=(e5(e6|Bridge=e1
---
 udapi/block/corefud/fixcorefud02.py    |  56 ++
 udapi/block/corefud/fixinterleaved.py  |  82 +++
 udapi/block/corefud/indexclusters.py   |   3 +-
 udapi/block/corefud/markinterleaved.py |  45 ++
 udapi/block/corefud/marksamesubspan.py |  45 ++
 udapi/block/corefud/mergesamespan.py   |  51 ++
 udapi/block/corefud/movehead.py        |   5 +-
 udapi/block/corefud/printclusters.py   |  10 +-
 udapi/block/read/conllu.py             |  14 +-
 udapi/block/read/oldcorefud.py         |  52 +-
 udapi/block/write/conllu.py            |  18 +-
 udapi/block/write/oldcorefud.py        |  10 +-
 udapi/core/basereader.py               |   3 +
 udapi/core/coref.py                    | 796 ++++++++++++++++++++-----
 udapi/core/document.py                 |  10 +
 udapi/core/dualdict.py                 |   2 +-
 udapi/core/root.py                     |   4 +
 udapi/core/run.py                      |  14 +-
 18 files changed, 1046 insertions(+), 174 deletions(-)
 create mode 100644 udapi/block/corefud/fixcorefud02.py
 create mode 100644 udapi/block/corefud/fixinterleaved.py
 create mode 100644 udapi/block/corefud/markinterleaved.py
 create mode 100644 udapi/block/corefud/marksamesubspan.py
 create mode 100644 udapi/block/corefud/mergesamespan.py

diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py
new file mode 100644
index 00000000..b8fe44f7
--- /dev/null
+++ b/udapi/block/corefud/fixcorefud02.py
@@ -0,0 +1,56 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+
+NEW_ETYPE = {
+    "misc": "other",
+    "date": "time",
+    "loc": "place",
+    "location": "place",
+    "per": "person",
+    "org": "organization",
+    "_": "",
+    }
+
+class FixCorefUD02(Block):
+    """Fix errors in CorefUD 0.2 for release of CorefUD 1.0."""
+
+    def process_document(self, doc):
+        # For GUM
+        if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity':
+            doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity'
+
+        for cluster in doc.coref_clusters.values():
+            if cluster.cluster_type:
+                # Harmonize etype.
+                # If gen/spec is distinguished, store it in all mentions' other['gstype'].
+                etype = cluster.cluster_type.lower()
+                if etype.startswith('spec') or etype.startswith('gen'):
+                    gstype = 'gen' if etype.startswith('gen') else 'spec'
+                    for m in cluster.mentions:
+                        m.other['gstype'] = gstype
+                    if etype == 'spec':
+                        etype = 'other'
+                etype = etype.replace('gen', '').replace('spec', '').replace('.', '')
+                etype = NEW_ETYPE.get(etype, etype)
+
+                # cluster_type="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes.
+                # Apposition is a mention-based rather than cluster-based attribute.
+                # We don't know which of the mentions it should be assigned to, but let's assume all non-first mentions.
+                # UD marks appositions with deprel appos, so once someone checks it is really redundant,
+                # TODO we can delete the appos mention attribute.
+                if etype == 'appos':
+                    etype = ''
+                    for mention in cluster.mentions[1:]:
+                        mention.other['appos'] = '1'
+                cluster.cluster_type = etype
+
+            for mention in cluster.mentions:
+                # Harmonize bridge relation labels
+                for bridge in mention.bridging:
+                    rel = bridge.relation.lower()
+                    if rel.endswith('-inv'):
+                        rel = 'i' + rel.replace('-inv', '')
+                    rel = rel.replace('-', '')
+                    rel = rel.replace('indirect_', '')
+                    bridge.relation = rel
diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py
new file mode 100644
index 00000000..6921c680
--- /dev/null
+++ b/udapi/block/corefud/fixinterleaved.py
@@ -0,0 +1,82 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+
+class FixInterleaved(Block):
+    """Fix mentions with interleaved or crossing spans."""
+
+    def __init__(self, same_cluster_only=True, both_discontinuous=False,
+                 crossing_only=False, nested_same_subspan=True, **kwargs):
+        super().__init__(**kwargs)
+        self.same_cluster_only = same_cluster_only
+        self.both_discontinuous = both_discontinuous
+        self.crossing_only = crossing_only
+        self.nested_same_subspan = nested_same_subspan
+
+    def process_tree(self, tree):
+        mentions, deleted = set(), set()
+        for node in tree.descendants_and_empty:
+            for m in node.coref_mentions:
+                mentions.add(m)
+
+        for mA, mB in itertools.combinations(mentions, 2):
+            if mA in deleted or mB in deleted:
+                continue
+            if self.same_cluster_only and mA.cluster != mB.cluster:
+                continue
+
+            # Fully nested spans are OK, except for same-subspan
+            sA, sB = set(mA.words), set(mB.words)
+            if (sA <= sB) or (sB <= sA):
+                if not self.nested_same_subspan:
+                    continue
+                elif not set(mA.span.split(',')).intersection(set(mB.span.split(','))):
+                    continue
+
+            # Crossing or interleaved+crossing?
+            elif self.crossing_only:
+                if not sA.intersection(sB):
+                    continue
+            else:
+                if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]:
+                    continue
+                if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]:
+                    continue
+
+            if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span):
+                continue
+
+            mA.words = list(sA.union(sB))
+            for wb in sB:
+                try:
+                    wb._mentions.remove(mB)
+                except ValueError:
+                    pass
+            try:
+                mB.cluster.mentions.remove(mB)
+            except ValueError:
+                pass
+            deleted.add(mB)
+
+            # By changing mA.words, we could have created another error:
+            # making the span the same as another mention's. Let's fix it.
+            sA = set(mA.words)
+            for mC in mentions:
+                if mC in deleted or mC is mA or mC is mB:
+                    continue
+                if sA != set(mC.words):
+                    continue
+                # So mA and mC have the same span and we need to delete one of them to fix it.
+                # We will delete mA because it has the artificially enlarged span,
+                # while mC is from the original annotation.
+ for wa in sA: + try: + wa._mentions.remove(mA) + except ValueError: + pass + try: + mA.cluster.mentions.remove(mA) + except ValueError: + pass + break + deleted.add(mA) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index dee45544..1496c11c 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -27,8 +27,7 @@ def process_document(self, doc): for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] new_cid = "c" + str(idx) - # need to change private variable - cluster._cluster_id = new_cid + cluster.cluster_id = new_cid new_clusters[new_cid] = cluster self.start = idx + 1 doc._coref_clusters = new_clusters diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py new file mode 100644 index 00000000..ac4d9438 --- /dev/null +++ b/udapi/block/corefud/markinterleaved.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkInterleaved(Block): + """Find mentions with interleaved spans.""" + + def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if set(mA.words).intersection(set(mB.words)): + continue + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py new file mode 100644 index 00000000..f99e0e13 --- /dev/null +++ b/udapi/block/corefud/marksamesubspan.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkSameSubSpan(Block): + """Find mentions with the same subspan.""" + + def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, nested_only=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.nested_only = nested_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, 
mB in itertools.combinations(mentions, 2):
+                if self.same_cluster_only and mA.cluster != mB.cluster:
+                    continue
+                if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span):
+                    continue
+                sA, sB = set(mA.words), set(mB.words)
+                if self.nested_only and not (sA <= sB) and not (sB <= sA):
+                    continue
+                if not set(mA.span.split(',')).intersection(set(mB.span.split(','))):
+                    continue
+                if self.mark:
+                    for w in mA.words + mB.words:
+                        w.misc['Mark'] = 1
+                    mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+                if self.log:
+                    print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}")
diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py
new file mode 100644
index 00000000..d5a46d25
--- /dev/null
+++ b/udapi/block/corefud/mergesamespan.py
@@ -0,0 +1,51 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+import logging
+
+class MergeSameSpan(Block):
+    """
+    Multiple same-span mentions are considered invalid in CoNLL-U, whether they
+    belong to the same entity or not. If they occur, merge them into one.
+    Note: We currently do not have mentions across sentence boundaries in the
+    CorefUD data, so this block processes one sentence at a time.
+    """
+
+    def __init__(self, same_cluster_only=False, **kwargs):
+        super().__init__(**kwargs)
+        self.same_cluster_only = same_cluster_only
+
+    def process_tree(self, tree):
+        mentions = set()
+        for node in tree.descendants_and_empty:
+            for m in node.coref_mentions:
+                mentions.add(m)
+
+        for mA, mB in itertools.combinations(mentions, 2):
+            if self.same_cluster_only and mA.cluster != mB.cluster:
+                continue
+
+            sA, sB = set(mA.words), set(mB.words)
+            if sA != sB:
+                continue
+
+            # If the mentions belong to different clusters, we should merge the
+            # clusters first, i.e., pick one cluster as the survivor, move the
+            # mentions from the other cluster to this cluster, and remove the
+            # other cluster.
+            if mA.cluster != mB.cluster:
+                logging.warning("Merging same-span mentions that belong to different entities: '%s' vs. '%s'." % (mA.cluster.cluster_id, mB.cluster.cluster_id))
+                ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API.
+                #for m in mB.cluster.mentions:
+                #    m.cluster = mA.cluster
+            # Remove mention B. It may have been removed earlier because of
+            # another duplicate, that is the purpose of try-except.
+            for wb in sB:
+                try:
+                    wb._mentions.remove(mB)
+                except ValueError:
+                    pass
+            try:
+                mB.cluster.mentions.remove(mB)
+            except ValueError:
+                pass
diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py
index e9034a22..2a38bd82 100644
--- a/udapi/block/corefud/movehead.py
+++ b/udapi/block/corefud/movehead.py
@@ -6,9 +6,10 @@ class MoveHead(Block):
     """Block corefud.MoveHead moves the head to the highest node in each mention."""
 
-    def __init__(self, bugs='warn', **kwargs):
+    def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs):
         self.counter = Counter()
         self.bugs = bugs
+        self.keep_head_if_possible = keep_head_if_possible
         super().__init__(**kwargs)
 
     def _eparents(self, node):
@@ -68,7 +69,7 @@ def find_head(self, mention):
             mention.head.misc['Bug'] = 'highest-head'
 
         # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
-        if mention.head in enh_heads:
+        if self.keep_head_if_possible and mention.head in enh_heads:
            return mention.head, 'nontreelet'
 
         # Finally, return the word-order-wise first head candidate as the head.
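For orientation, here is a minimal sketch of applying one of the corefud.Mark* blocks above through the udapi Python API; the file names are illustrative, not part of the patch, and the `udapy` one-liner in the comment is the shell equivalent:
```
# A minimal sketch, assuming example.conllu contains CorefUD-style coreference annotation.
# Shell equivalent: udapy corefud.MarkInterleaved log=1 mark=1 < example.conllu
from udapi.core.document import Document
from udapi.block.corefud.markinterleaved import MarkInterleaved

doc = Document()
doc.load_conllu('example.conllu')
block = MarkInterleaved(log=True, mark=True)
block.process_start()
block.apply_on_document(doc)       # prints "interleaved mentions at ..." for each pair found
block.process_end()
doc.store_conllu('marked.conllu')  # the Mark=... attributes are now stored in MISC
```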
diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printclusters.py
index a9a03f5e..7271ae78 100644
--- a/udapi/block/corefud/printclusters.py
+++ b/udapi/block/corefud/printclusters.py
@@ -6,17 +6,20 @@ class PrintClusters(Block):
     """Block corefud.PrintClusters prints all mentions of a given cluster."""
 
-    def __init__(self, id_re=None, min_mentions=0, print_ranges=True, aggregate_mentions=True, **kwargs):
+    def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True,
+                 aggregate_mentions=True, **kwargs):
         """Params:
         id_re: regular expression constraining ClusterId of the clusters to be printed
         min_mentions: print only clusters with at least N mentions
         print_ranges: print also addresses of all mentions
             (compactly, using the longest common prefix of sent_id)
+        mark_head: mark the head (e.g. as "red **car**")
         """
         super().__init__(**kwargs)
         self.id_re = re.compile(str(id_re)) if id_re else None
         self.min_mentions = min_mentions
         self.print_ranges = print_ranges
+        self.mark_head = mark_head
         self.aggregate_mentions = aggregate_mentions
 
     def process_document(self, doc):
@@ -32,7 +35,7 @@ def process_document(self, doc):
             counter = Counter()
             ranges = defaultdict(list)
             for mention in cluster.mentions:
-                forms = ' '.join([w.form for w in mention.words])
+                forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words])
                 counter[forms] += 1
                 if self.print_ranges:
                     ranges[forms].append(mention.head.root.address() + ':' + mention.span)
@@ -46,6 +49,7 @@ def process_document(self, doc):
                     print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})')
             else:
                 for mention in cluster.mentions:
-                    print(' ' + ' '.join([w.form for w in mention.words]))
+                    forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words])
+                    print(' ' + forms)
                     if self.print_ranges:
                         print(f" {mention.head.root.address()}:{mention.span}")
diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index 71886752..97e39970 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -14,6 +14,7 @@
 RE_TEXT = re.compile(r'^# text\s*=\s*(.*)')
 RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?')
 RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)')
+RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)')
 
 
 class Conllu(BaseReader):
@@ -33,8 +34,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs
         self.empty_parent = empty_parent
         self.fix_cycles = fix_cycles
 
-    @staticmethod
-    def parse_comment_line(line, root):
+    def parse_comment_line(self, line, root):
         """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root."""
         sent_id_match = RE_SENT_ID.match(line)
         if sent_id_match is not None:
@@ -68,6 +68,16 @@ def parse_comment_line(line, root):
             container = root.json['__doc__']
             container[json_match.group(2)] = json.loads(json_match.group(3))
             return
+
+        entity_match = RE_GLOBAL_ENTITY.match(line)
+        if entity_match is not None:
+            global_entity = entity_match.group(1)
+            if self._global_entity and self._global_entity != global_entity:
+                logging.warning("Mismatch in global.Entity: %s != %s", self._global_entity, global_entity)
+            self._global_entity = global_entity
+            root.comment += '$GLOBAL.ENTITY\n'
+            return
+
         root.comment += line[1:] + "\n"
 
     def read_trees(self):
diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py
index a7bc3101..539d5036 100644
--- a/udapi/block/read/oldcorefud.py
+++
b/udapi/block/read/oldcorefud.py
@@ -6,6 +6,33 @@
 class OldCorefUD(udapi.block.read.conllu.Conllu):
 
+    def __init__(self, replace_hyphen_in_id_with='', **kwargs):
+        """Create the read.OldCorefUD reader object.
+
+        Args:
+        replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId.
+            The new format does not allow hyphens in eid (IDs of entity clusters),
+            so we need to replace them.
+        """
+        super().__init__(**kwargs)
+        self.replace_hyphen_in_id_with = replace_hyphen_in_id_with
+        self.orig2new = {}
+        self.new2orig = {}
+
+    def _fix_id(self, cid):
+        if not cid or '-' not in cid:
+            return cid
+        new_cid = self.orig2new.get(cid)
+        if new_cid is None:
+            new_cid = cid.replace('-', self.replace_hyphen_in_id_with)
+            base, counter = new_cid, 1
+            while new_cid in self.new2orig:
+                counter += 1
+                new_cid = f"{base}{counter}"
+            self.new2orig[new_cid] = cid
+            self.orig2new[cid] = new_cid
+        return new_cid
+
     def process_document(self, doc, strict=True):
         super().process_document(doc)
 
@@ -16,25 +43,32 @@ def process_document(self, doc, strict=True):
             if not cluster_id:
                 index, index_str = 1, "[1]"
                 cluster_id = node.misc["ClusterId[1]"]
+            cluster_id = self._fix_id(cluster_id)
             while cluster_id:
                 cluster = clusters.get(cluster_id)
                 if cluster is None:
                     cluster = CorefCluster(cluster_id)
                     clusters[cluster_id] = cluster
-                mention = CorefMention(node, cluster)
+                mention = CorefMention(words=[node], cluster=cluster)
                 if node.misc["MentionSpan" + index_str]:
                     mention.span = node.misc["MentionSpan" + index_str]
-                else:
-                    mention.words = [node]
                 cluster_type = node.misc["ClusterType" + index_str]
-                if cluster_type is not None:
+                if cluster_type:
                     if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:
                         logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}")
                     cluster.cluster_type = cluster_type
 
                 bridging_str = node.misc["Bridging" + index_str]
                 if bridging_str:
-                    mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict)
+                    mention._bridging = BridgingLinks(mention)
+                    for link_str in bridging_str.split(','):
+                        target, relation = link_str.split(':')
+                        target = self._fix_id(target)
+                        if target == cluster_id:
+                            _error("Bridging cannot self-reference the same cluster: " + target, strict)
+                        if target not in clusters:
+                            clusters[target] = CorefCluster(target)
+                        mention._bridging.append((clusters[target], relation))
 
                 split_ante_str = node.misc["SplitAnte" + index_str]
                 if split_ante_str:
@@ -42,6 +76,7 @@ def process_document(self, doc, strict=True):
                     # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma.
                     # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator.
                     for ante_str in split_ante_str.replace('+', ',').split(','):
+                        ante_str = self._fix_id(ante_str)
                         if ante_str in clusters:
                             if ante_str == cluster_id:
                                 _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict)
@@ -53,10 +88,13 @@ def process_document(self, doc, strict=True):
                             split_antes.append(ante_cl)
                     cluster.split_ante = sorted(split_antes)
 
-                mention.misc = node.misc["MentionMisc" + index_str]
+                # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas.
+                # We also need to escape forbidden characters.
+ mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',') + mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') index += 1 index_str = f"[{index}]" - cluster_id = node.misc["ClusterId" + index_str] + cluster_id = self._fix_id(node.misc["ClusterId" + index_str]) # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), # not by the keys (cluster_id). diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 66ae320b..abe20963 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -26,7 +26,7 @@ def process_tree(self, tree): # pylint: disable=too-many-branches # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual # value of the attribute and make note on which line (i_*) they were present. comment_lines = tree.comment.splitlines() - i_newdoc, i_newpar, i_sent_id, i_text = -1, -1, -1, -1 + i_newdoc, i_newpar, i_sent_id, i_text, i_global_entity = -1, -1, -1, -1, -1 for i, c_line in enumerate(comment_lines): if c_line == '$SENT_ID': i_sent_id = i @@ -50,6 +50,13 @@ def process_tree(self, tree): # pylint: disable=too-many-branches comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '') else: comment_lines[i] = None + elif c_line == '$GLOBAL.ENTITY': + i_global_entity = i + ge = tree.document.meta.get('global.Entity') + if ge: + comment_lines[i] = ' global.Entity = ' + ge + else: + comment_lines[i] = None # Now print the special comments: global.columns, newdoc, newpar, sent_id and text. # If these comments were already present in tree.comment (as marked with the placeholders), @@ -68,6 +75,15 @@ def process_tree(self, tree): # pylint: disable=too-many-branches printed_i += 1 if comment_lines[printed_i]: print('#' + comment_lines[printed_i]) + ge = tree.document.meta.get('global.Entity') + if ge: + if i_global_entity == -1: + print('# global.Entity = ' + ge) + else: + while printed_i < i_global_entity: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) if tree.newpar: if i_newpar == -1: print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')) diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py index c6c38cbe..4eb316bb 100644 --- a/udapi/block/write/oldcorefud.py +++ b/udapi/block/write/oldcorefud.py @@ -6,9 +6,8 @@ class OldCorefUD(udapi.block.write.conllu.Conllu): def process_document(self, doc): - if not doc._coref_clusters: + if not doc.coref_clusters: logging.warning("Using write.OldCorefUD on a document without any coreference annotation") - doc._coref_clusters = {} # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() @@ -16,6 +15,7 @@ def process_document(self, doc): for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] + del doc.meta['global.Entity'] # doc._coref_clusters is a dict, which is insertion ordered in Python 3.7+. # The insertion order is sorted according to CorefCluster.__lt__ (see few lines above). 
@@ -48,11 +48,11 @@ def process_document(self, doc): head.misc["MentionSpan" + index_str] = mention.span head.misc["ClusterType" + index_str] = cluster.cluster_type if mention._bridging: - head.misc["Bridging" + index_str] = str(mention.bridging) + head.misc["Bridging" + index_str] = ','.join(f'{l.target.cluster_id}:{l.relation}' for l in sorted(mention.bridging)) if cluster.split_ante: serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) head.misc["SplitAnte" + index_str] = serialized - if mention.misc: - head.misc["MentionMisc" + index_str] = mention.misc + if mention.other: + head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') super().process_document(doc) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 05f204b9..fee9da4c 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -28,6 +28,7 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self._global_entity = None @staticmethod def is_multizone_reader(): @@ -110,6 +111,7 @@ def try_fast_load(self, document): return False document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity if trees and trees[0].newdoc and trees[0].newdoc is not True: document.meta["docname"] = trees[0].newdoc @@ -187,6 +189,7 @@ def process_document(self, document): break if trees_loaded == 0: document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity add_to_the_last_bundle = False trees_loaded += 1 diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 6236e4cf..aaaa07f0 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,42 +1,172 @@ -"""Classes for handling coreference.""" +"""Classes for handling coreference. + +# CorefUD 1.0 format implementation details + +## Rules for ordering "chunks" within `node.misc['Entity']` +Entity mentions are annotated using "chunks" stored in `misc['Entity']`. +Chunks are of three types: +1. opening bracket, e.g. `(e1-person` +2. closing bracket, e.g. `e1-person)` +3. single-word span (both opening and closing), e.g. `(e1-person)` + +The `Entity` MISC attribute contains a sequence of chunks +without any separators, e.g. `Entity=(e1-person(e2-place)` +means opening `e1` mention and single-word `e2` mention +starting on a given node. + +### Crossing mentions +Two mentions are crossing iff their spans have non-empty intersection, +but neither is a subset of the other, e.g. `e1` spanning nodes 1-3 +and `e2` spanning 2-4 would be represented as: +``` +1 ... Entity=(e1 +2 ... Entity=(e2 +3 ... Entity=e1) +4 ... Entity=e2) +``` +This may be an annotation error and we may forbid such cases in future annotation guidelines, +but in CorefUD 0.2, there are thousands of such cases (see https://github.com/ufal/corefUD/issues/23). + +It can even happen that one entity ends and another starts at the same node: `Entity=e1)(e2` +For this reason, we need + +**Rule1**: closing brackets MUST always precede opening brackets. +Otherwise, we would get `Entity=(e2e1)`, which could not be parsed. + +Note that we cannot have same-entity crossing mentions in the CorefUD 1.0 format, +so e.g. if we substitute `e2` with `e1` in the example above, we'll get +`(e1`, `e1)`, `(e1`, `e1)`, which will be interpreted as two non-overlapping mentions of the same entity. 
+
+### Nested mentions
+One mention (span) can often be embedded within another mention (span).
+It can happen that both these mentions correspond to the same entity (i.e. are in the same cluster),
+for example, "`<the man <who> sold the world>`".
+It can even happen that both mentions start at the same node, e.g. "`<<w1 w2> w3>`" (TODO: find nice real-world examples).
+In such cases, we need to make sure the brackets are well-nested:
+
+**Rule2**: when opening multiple brackets at the same node, longer mentions MUST be opened first.
+
+This is important because
+- The closing bracket has the same form for both mentions of the same entity - it includes just the entity ID (`eid`).
+- The opening-bracket annotation contains other mention attributes, e.g. head index.
+- The two mentions may differ in these attributes, e.g. the "`<w1 w2 w3>`" mention's head may be w3.
+- When breaking Rule2, we would get
+```
+1 w1 ... Entity=(e1-person-1(e1-person-3
+2 w2 ... Entity=e1)
+3 w3 ... Entity=e1)
+```
+which would be interpreted as if the head of the "`<w1 w2>`" mention is its third word, which is invalid.
+
+### Other rules
+
+**Rule3**: when closing multiple brackets at the same node, shorter mentions SHOULD be closed first.
+See Rule4 for a single exception to this rule regarding crossing mentions.
+I'm not aware of any problems when breaking this rule, but it seems intuitive
+(to make the annotation well-nested if possible) and we want to define some canonical ordering anyway.
+The API should be able to load even files breaking Rule3.
+
+**Rule4**: single-word chunks SHOULD follow all opening brackets and precede all closing brackets if possible.
+When considering single-word chunks as a subtype of both opening and closing brackets,
+this rule follows from the well-nestedness (and Rule2).
+So we should have `Entity=(e1(e2)` and `Entity=(e3)e1)`,
+but the API should be able to load even `Entity=(e2)(e1` and `Entity=e1)(e3)`.
+
+In case of crossing mentions (annotated following Rule1), we cannot follow Rule4.
+If we want to add a single-word mention `e2` to a node with `Entity=e1)(e3`,
+it seems intuitive to prefer Rule2 over Rule3, which results in `Entity=e1)(e3(e2)`.
+So the canonical ordering will be achieved by placing single-word chunks after all opening brackets.
+The API should be able to load even `Entity=(e2)e1)(e3` and `Entity=e1)(e2)(e3`.
+
+**Rule5**: ordering of same-span single-word mentions
+TODO: I am not sure here. We may want to forbid such cases or define canonical ordering even for them.
+E.g. `Entity=(e1)(e2)` vs. `Entity=(e2)(e1)`.
+
+**Rule6**: ordering of same-start same-end multiword mentions
+TODO: I am not sure here.
+These can be either same-span multiword mentions (which may be forbidden)
+or something like
+```
+1 w1 ... Entity=(e1(e2[1/2])
+2 w2 ...
+3 w3 ... Entity=(e2[2/2])e1)
+```
+where both `e1` and `e2` start at w1 and end at w3, but `e2` is discontinuous and does not contain w2.
+If we interpret "shorter" and "longer" in Rule2 and Rule3 as `len(mention.words)`
+(and not as `mention.words[-1].ord - mention.words[0].ord`),
+we get the canonical ordering as in the example above.
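+
+### Example: canonical chunk ordering at one node
+A constructed example (added for illustration; not taken from the annotation guidelines) putting Rules 1-4 together:
+if at one node the mention `e3` ends, two mentions `e1` (longer) and `e2` (shorter) open,
+and a single-word mention `e4` both opens and closes, the canonical annotation is
+`Entity=e3)(e1(e2(e4)`
+-- closing brackets first (Rule1), longer openings before shorter ones (Rule2),
+and the single-word chunk after all opening brackets (Rule4).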
+ +""" import re import functools import collections +import collections.abc +import copy import logging @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc'] + __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] - def __init__(self, head, cluster=None): - self._head = head + def __init__(self, words, head=None, cluster=None): + if not words: + raise ValueError("mention.words must be non-empty") + self._words = words + self._head = head if head else words[0] self._cluster = cluster if cluster is not None: cluster._mentions.append(self) self._bridging = None - self._words = [] - self.misc = None + self._other = None - def __lt__(self, other): - """Does this mention precedes (word-order wise) the `other` mention? + def __lt__(self, another): + """Does this mention precedes (word-order wise) `another` mention? This method defines a total ordering of all mentions - (within one cluster or across different clusters). - The position is primarily defined by the first word in each mention - (or by the head if mention.words are missing). + (within one entity or across different entities). + The position is primarily defined by the first word in each mention. If two mentions start at the same word, - their order is defined by the last word in their span - -- the shorter mention precedes the longer one. + their order is defined by their length (i.e. number of words) + -- the shorter mention follows the longer one. + + In the rare case of two same-length mentions starting at the same word, but having different spans, + their order is defined by the order of the last word in their span. + For example precedes . + + The order of two same-span mentions is currently defined by their cluster_id. + There should be no same-span (or same-subspan) same-cluster mentions. 
""" - node1 = self._words[0] if self._words else self._head - node2 = other._words[0] if other._words else other._head - if node1 is node2: - node1 = self._words[-1] if self._words else self._head - node2 = other._words[-1] if other._words else other._head - if node1 is node2: - return len(self._words) < len(other._words) - return node1.precedes(node2) + #TODO: no mention.words should be handled already when loading + if not self._words: + self._words = [self._head] + if not another._words: + another._words = [another._head] + + if self._words[0] is another._words[0]: + if len(self._words) > len(another._words): + return True + if len(self._words) < len(another._words): + return False + if self._words[-1].precedes(another._words[-1]): + return True + if another._words[-1].precedes(self._words[-1]): + return False + return self._cluster.cluster_id < another._cluster.cluster_id + return self._words[0].precedes(another._words[0]) + + @property + def other(self): + if self._other is None: + self._other = OtherDualDict() + return self._other + + @other.setter + def other(self, value): + if self._other is None: + self._other = OtherDualDict(value) + else: + self._other.set_mapping(value) @property def head(self): @@ -74,7 +204,7 @@ def words(self): @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words}") + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") kept_words = [] for old_word in self._words: if old_word in new_words: @@ -100,19 +230,24 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +CHARS_FORBIDDEN_IN_ID = "-=| \t()" + + @functools.total_ordering class CorefCluster(object): """Class for representing all mentions of a given entity.""" __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): + if any(x in cluster_id for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = cluster_id self._mentions = [] self.cluster_type = cluster_type self.split_ante = [] - def __lt__(self, other): - """Does this CorefCluster precedes (word-order wise) the `other` cluster? + def __lt__(self, another): + """Does this CorefCluster precedes (word-order wise) `another` cluster? This method defines a total ordering of all clusters by the first mention of each cluster (see `CorefMention.__lt__`). @@ -121,18 +256,24 @@ def __lt__(self, other): If cluster IDs are not important, it is recommended to use block `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. """ - if not self._mentions or not other._mentions: + if not self._mentions or not another._mentions: # Clusters without mentions should go first, so the ordering is total. # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. 
-            if not self._mentions and not other._mentions:
-                return self._cluster_id < other._cluster_id
+            if not self._mentions and not another._mentions:
+                return self._cluster_id < another._cluster_id
             return not self._mentions
-        return self._mentions[0] < other._mentions[0]
+        return self._mentions[0] < another._mentions[0]
 
     @property
     def cluster_id(self):
         return self._cluster_id
 
+    @cluster_id.setter
+    def cluster_id(self, new_cluster_id):
+        if any(x in new_cluster_id for x in "-=| \t"):
+            raise ValueError(f"{new_cluster_id} contains forbidden characters [-=| \\t]")
+        self._cluster_id = new_cluster_id
+
     @property
     def mentions(self):
         return self._mentions
@@ -181,7 +322,26 @@ def all_bridging(self):
                     yield b
 
 
-BridgingLink = collections.namedtuple('BridgingLink', 'target relation')
+# BridgingLink
+# Especially the relation should be mutable, so we cannot use
+# BridgingLink = collections.namedtuple('BridgingLink', 'target relation')
+# TODO once dropping support for Python 3.6, we could use
+# from dataclasses import dataclass
+# @dataclass
+# class DataClassCard:
+#     target: CorefCluster
+#     relation: str
+class BridgingLink:
+    __slots__ = ['target', 'relation']
+
+    def __init__(self, target, relation=''):
+        self.target = target
+        self.relation = '' if relation is None else relation
+
+    def __lt__(self, another):
+        if self.target == another.target:
+            return self.relation < another.relation
+        return self.target < another.target
 
 
 class BridgingLinks(collections.abc.MutableSequence):
@@ -189,33 +349,52 @@ class BridgingLinks(collections.abc.MutableSequence):
 
     Example usage:
     >>> bl = BridgingLinks(src_mention)           # empty links
-    >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')])  # from a list of tuples
-    >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters)   # from a string
+    >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')])  # from a list of tuples
+    >>> (bl8, bl9) = BridgingLinks.from_string('c12<c8:part,c56<c8:subset,c5<c9', clusters)
     >>> for cluster, relation in bl:
    >>>     print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}")
-    >>> print(str(bl))  # c12:Part,c56:Subset
-    >>> bl('Part').targets == [c12]
-    >>> bl('Part|Subset').targets == [c12, c56]
-    >>> bl.append((c89, 'Funct'))
+    >>> print(str(bl))  # c12<c8:part,c56<c8:subset
+    >>> bl('part').targets == [c12]
+    >>> bl('part|subset').targets == [c12, c56]
+    >>> bl.append((c57, 'funct'))
     """
+
+    @classmethod
+    def from_string(cls, string, clusters, strict=True):
+        src_str2bl = {}
+        for link_str in string.split(','):
+            try:
+                trg_str, src_str = link_str.split('<')
+            except ValueError as err:
+                _error(f"invalid Bridge {link_str} {err} at {node}", strict)
+                continue
+            relation = ''
+            if ':' in src_str:
+                src_str, relation = src_str.split(':', 1)
+            if trg_str == src_str:
+                _error("Bridge cannot self-reference the same cluster: " + trg_str, strict)
+            bl = src_str2bl.get(src_str)
+            if not bl:
+                bl = clusters[src_str].mentions[-1].bridging
                src_str2bl[src_str] = bl
+            if trg_str not in clusters:
+                clusters[trg_str] = CorefCluster(trg_str)
+            bl._data.append(BridgingLink(clusters[trg_str], relation))
+        return src_str2bl.values()
+
+    def __init__(self, src_mention, value=None, strict=True):
         self.src_mention = src_mention
         self._data = []
         self.strict = strict
         if value is not None:
-            if isinstance(value, str):
-                if clusters is None:
-                    raise ValueError('BridgingClusters: clusters must be provided if initializing with a string')
-                try:
-                    self._from_string(value, clusters)
-                except Exception:
-
logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") - raise - elif isinstance(value, collections.abc.Sequence): + if isinstance(value, collections.abc.Sequence): for v in value: if v[0] is src_mention._cluster: _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict) self._data.append(BridgingLink(v[0], v[1])) + else: + raise ValueError(f"Unknown value type: {type(value)}") + self.src_mention._bridging = self super().__init__() def __getitem__(self, key): @@ -239,18 +418,8 @@ def insert(self, key, new_value): self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): - return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in sorted(self._data)) - - def _from_string(self, string, clusters): - self._data.clear() - for link_str in string.split(','): - target, relation = link_str.split(':') - if target == self.src_mention._cluster._cluster_id: - _error("Bridging cannot self-reference the same cluster: " + target, self.strict) - if target not in clusters: - clusters[target] = CorefCluster(target) - self._data.append(BridgingLink(clusters[target], relation)) - self._data.sort() + # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. + return ','.join(f'{l.target._cluster_id}<{self.src_mention.cluster.cluster_id}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. @@ -259,7 +428,7 @@ def __call__(self, relations_re=None): """ if relations_re is None: return self - return Links(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + return BridgingLinks(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) @property def targets(self): @@ -286,112 +455,332 @@ def _error(msg, strict): raise ValueError(msg) logging.error(msg) + +RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') + def load_coref_from_misc(doc, strict=True): clusters = {} + unfinished_mentions = collections.defaultdict(list) + discontinuous_mentions = collections.defaultdict(list) + global_entity = doc.meta.get('global.Entity') + was_global_entity = True + if not global_entity: + was_global_entity = False + global_entity = 'eid-etype-head-other' + doc.meta['global.Entity'] = global_entity + # backward compatibility + if global_entity == 'entity-GRP-infstat-MIN-coref_type-identity': + global_entity = 'etype-eid-infstat-minspan-link-identity' + # Which global.Entity should be used for serialization? 
+    doc.meta['global.Entity'] = global_entity
+    #doc.meta['global.Entity'] = 'eid-etype-head-other'
+    if 'eid' not in global_entity:
+        raise ValueError("No eid in global.Entity = " + global_entity)
+    fields = global_entity.split('-')
+
     for node in doc.nodes_and_empty:
-        index, index_str = 0, ""
-        cluster_id = node.misc["ClusterId"]
-        if not cluster_id:
-            index, index_str = 1, "[1]"
-            cluster_id = node.misc["ClusterId[1]"]
-        while cluster_id:
-            cluster = clusters.get(cluster_id)
-            if cluster is None:
-                cluster = CorefCluster(cluster_id)
-                clusters[cluster_id] = cluster
-            mention = CorefMention(node, cluster)
-            if node.misc["MentionSpan" + index_str]:
-                mention.span = node.misc["MentionSpan" + index_str]
+        misc_entity = node.misc["Entity"]
+        if not misc_entity:
+            continue
+
+        if not was_global_entity:
+            raise ValueError(f"No global.Entity header found, but Entity= annotations are present")
+
+        # The Entity attribute may contain multiple entities, e.g.
+        # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref)
+        # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3.
+        # The following re.split line splits this into
+        # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"]
+        chunks = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x]
+        for chunk in chunks:
+            opening, closing = (chunk[0] == '(', chunk[-1] == ')')
+            chunk = chunk.strip('()')
+            # 1. invalid
+            if not opening and not closing:
+                logging.warning(f"Entity {chunk} at {node} has neither an opening nor a closing bracket.")
+            # 2. closing bracket
+            elif not opening and closing:
+                # closing brackets should include just the ID,
+                # but older GUM versions repeated all the fields
+                if '-' in chunk:
+                    # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore
+                    if not strict and global_entity.startswith('etype-eid'):
+                        chunk = chunk.split('-')[1]
+                    else:
+                        _error("Unexpected closing eid " + chunk, strict)
+
+                # closing discontinuous mentions
+                eid, subspan_idx = chunk, None
+                if chunk not in unfinished_mentions:
+                    m = RE_DISCONTINUOUS.match(chunk)
+                    if not m:
+                        raise ValueError(f"Mention {chunk} closed at {node}, but not opened.")
+                    eid, subspan_idx, total_subspans = m.group(1, 2, 3)
+
+                mention, head_idx = unfinished_mentions[eid].pop()
+                last_word = mention.words[-1]
+                if node.root is not last_word.root:
+                    # TODO cross-sentence mentions
+                    raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}")
+                for w in node.root.descendants_and_empty:
+                    if last_word.precedes(w):
+                        mention._words.append(w)
+                        w._mentions.append(mention)
+                    if w is node:
+                        break
+                if head_idx and (subspan_idx is None or subspan_idx == total_subspans):
+                    try:
+                        mention.head = mention.words[head_idx - 1]
+                    except IndexError as err:
+                        _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} "
+                               f"closed at {node} with words={mention.words}", 1)
+                if subspan_idx and subspan_idx == total_subspans:
+                    m = discontinuous_mentions[eid].pop()
+                    if m is not mention:
+                        _error(f"Closing mention {mention.cluster.cluster_id} at {node}, but it has unfinished nested mentions ({m.words})", 1)
+
+            # 3. opening or single-word
+            else:
-            else:
-                mention.words = [node]
-            cluster_type = node.misc["ClusterType" + index_str]
-            if cluster_type is not None:
-                if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:
-                    logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}")
-                cluster.cluster_type = cluster_type
-
-            bridging_str = node.misc["Bridging" + index_str]
-            if bridging_str:
-                mention._bridging = BridgingLinks(mention, bridging_str, clusters, strict)
-
-            split_ante_str = node.misc["SplitAnte" + index_str]
-            if split_ante_str:
-                split_antes = []
-                # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma.
-                # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator.
-                for ante_str in split_ante_str.replace('+', ',').split(','):
-                    if ante_str in clusters:
-                        if ante_str == cluster_id:
-                            _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict)
-                        split_antes.append(clusters[ante_str])
+                eid, etype, head_idx, other = None, None, None, OtherDualDict()
+                for name, value in zip(fields, chunk.split('-')):
+                    if name == 'eid':
+                        eid = value
+                    elif name == 'etype':
+                        etype = value
+                    elif name == 'head':
+                        try:
+                            head_idx = int(value)
+                        except ValueError as err:
+                            raise ValueError(f"Non-integer {value} as head index in {chunk} in {node}: {err}")
+                    elif name == 'other':
+                        if other:
+                            new_other = OtherDualDict(value)
+                            for k, v in other.items():
+                                new_other[k] = v
+                            other = new_other
+                        else:
+                            other = OtherDualDict(value)
+                    else:
-                    # split cataphora, e.g. "We, that is you and me..."
-                    ante_cl = CorefCluster(ante_str)
-                    clusters[ante_str] = ante_cl
-                    split_antes.append(ante_cl)
-                cluster.split_ante = sorted(split_antes)
-
-            mention.misc = node.misc["MentionMisc" + index_str]
-            index += 1
-            index_str = f"[{index}]"
-            cluster_id = node.misc["ClusterId" + index_str]
+                        other[name] = value
+                if eid is None:
+                    raise ValueError("No eid in " + chunk)
+                subspan_idx, total_subspans = None, '0'
+                if eid[-1] == ']':
+                    m = RE_DISCONTINUOUS.match(eid)
+                    if not m:
+                        _error(f"eid={eid} ending with ], but not valid discontinuous mention ID ", strict)
+                    else:
+                        eid, subspan_idx, total_subspans = m.group(1, 2, 3)
+
+                cluster = clusters.get(eid)
+                if cluster is None:
+                    if subspan_idx and subspan_idx != '1':
+                        _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1)
+                    cluster = CorefCluster(eid)
+                    clusters[eid] = cluster
+                    cluster.cluster_type = etype
+                elif etype and cluster.cluster_type and cluster.cluster_type != etype:
+                    logging.warning(f"etype mismatch in {node}: {cluster.cluster_type} != {etype}")
+                # CorefCluster could be created first with "Bridge=" without any type
+                elif etype and cluster.cluster_type is None:
+                    cluster.cluster_type = etype
+
+                if subspan_idx and subspan_idx != '1':
+                    opened = [pair[0] for pair in unfinished_mentions[eid]]
+                    mention = next(m for m in discontinuous_mentions[eid] if m not in opened)
+                    mention._words.append(node)
+                    if closing and subspan_idx == total_subspans:
+                        m = discontinuous_mentions[eid].pop()
+                        if m is not mention:
+                            _error(f"{node}: closing mention {mention.cluster.cluster_id} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1)
+                        try:
+                            mention.head = mention._words[head_idx - 1]
+                        except IndexError as err:
+                            _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} "
+                                   f"closed at {node} with words={mention._words}", 1)
+                else:
+                    mention =
CorefMention(words=[node], cluster=cluster) + if other: + mention._other = other + if subspan_idx: + discontinuous_mentions[eid].append(mention) + node._mentions.append(mention) + + if not closing: + unfinished_mentions[eid].append((mention, head_idx)) + + + # Bridge, e.g. Entity=(e12-event|Bridge=e12 (e10) + # (e1(e2 --> (e1(e2(e10) + # e3)(e1(e2 --> e3)(e1(e2(e10) + if not orig_entity or orig_entity[-1] != ')': + firstword.misc['Entity'] += mention_str + ')' + # e4)e3) --> (e10)e4)e3) + elif '(' not in orig_entity: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # (e9)e4)e3) --> (e10)(e9)e4)e3) + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split('(\([^()]+\)?|[^()]+\))', orig_entity)): + firstword.misc['Entity'] += mention_str + ')' + # (e1(e2(e9) --> (e1(e2(e9)(e10) + # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) + else: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # Second, multi-word mentions. Opening brackets should follow closing brackets. + else: + firstword.misc['Entity'] += mention_str + mention.words[-1].misc['Entity'] = cluster.cluster_id + ')' + mention.words[-1].misc['Entity'] + + # Bridge=e1 lo else f"{lo}") return ','.join(ranges) + + +# TODO fix code duplication with udapi.core.dualdict after making sure benchmarks are not slower +class OtherDualDict(collections.abc.MutableMapping): + """OtherDualDict class serves as dict with lazily synchronized string representation. + + >>> ddict = OtherDualDict('anacata:anaphoric,antetype:entity,nptype:np') + >>> ddict['mention'] = 'np' + >>> str(ddict) + 'anacata:anaphoric,antetype:entity,mention:np,nptype:np' + >>> ddict['NonExistent'] + '' + + This class provides access to both + * a structured (dict-based, deserialized) representation, + e.g. {'anacata': 'anaphoric', 'antetype': 'entity'}, and + * a string (serialized) representation of the mapping, e.g. `anacata:anaphoric,antetype:entity`. + There is a clever mechanism that makes sure that users can read and write + both of the representations which are always kept synchronized. + Moreover, the synchronization is lazy, so the serialization and deserialization + is done only when needed. This speeds up scenarios where access to dict is not needed. + + A value can be deleted with any of the following three ways: + >>> del ddict['nptype'] + >>> ddict['nptype'] = None + >>> ddict['nptype'] = '' + and it works even if the value was already missing. 
+ """ + __slots__ = ['_string', '_dict'] + + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) + + def __str__(self): + if self._string is None: + serialized = [] + for name, value in sorted(self._dict.items(), key=lambda s: s[0].lower()): + if value is True: + serialized.append(name) + else: + serialized.append(f"{name}:{value}") + self._string = ','.join(serialized) if serialized else '' + return self._string + + def _deserialize_if_empty(self): + if not self._dict and self._string is not None and self._string != '': + for raw_feature in self._string.split(','): + namevalue = raw_feature.split(':', 1) + if len(namevalue) == 2: + name, value = namevalue + else: + name, value = namevalue[0], True + self._dict[name] = value + + def __getitem__(self, key): + self._deserialize_if_empty() + return self._dict.get(key, '') + + def __setitem__(self, key, value): + self._deserialize_if_empty() + self._string = None + if value is None or value == '': + self.__delitem__(key) + else: + value = value.replace(',', '%2C') # TODO report a warning? Escape also '|' and '-'? + self._dict[key] = value + + def __delitem__(self, key): + self._deserialize_if_empty() + try: + del self._dict[key] + self._string = None + except KeyError: + pass + + def __iter__(self): + self._deserialize_if_empty() + return self._dict.__iter__() + + def __len__(self): + self._deserialize_if_empty() + return len(self._dict) + + def __contains__(self, key): + self._deserialize_if_empty() + return self._dict.__contains__(key) + + def clear(self): + self._string = '_' + self._dict.clear() + + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + + def set_mapping(self, value): + """Set the mapping from a dict or string. + + If the `value` is None, it is converted to storing an empty string. + If the `value` is a string, it is stored as is. + If the `value` is a dict (or any instance of `collections.abc.Mapping`), + its copy is stored. + Other types of `value` raise an `ValueError` exception. 
+ """ + if value is None: + self.clear() + elif isinstance(value, str): + self._dict.clear() + self._string = value + elif isinstance(value, collections.abc.Mapping): + self._string = None + self._dict = dict(value) + else: + raise ValueError("Unsupported value type " + str(value)) diff --git a/udapi/core/document.py b/udapi/core/document.py index 8f9ce3ea..aceeafdf 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -123,3 +123,13 @@ def coref_clusters(self): """A dict mapping ClusterId to a CorefCluster object.""" self._load_coref() return self._coref_clusters + + @property + def coref_mentions(self): + """A sorted list of all CorefMention objects in the document.""" + self._load_coref() + all_mentions = [] + for cluster in self._coref_clusters.values(): + all_mentions.extend(cluster.mentions) + all_mentions.sort() + return all_mentions diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a79c0610..540006ea 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): if value is True: serialized.append(name) else: - serialized.append('%s=%s' % (name, value)) + serialized.append(f"{name}:{value}") self._string = '|'.join(serialized) if serialized else '_' return self._string diff --git a/udapi/core/root.py b/udapi/core/root.py index 3fbe5fca..0132566a 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -71,6 +71,10 @@ def address(self): """ return self.sent_id + @property + def document(self): + return self._bundle._document + @property def bundle(self): """Return the bundle which this tree belongs to.""" diff --git a/udapi/core/run.py b/udapi/core/run.py index 0a08504c..c3a4ca6f 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -101,7 +101,7 @@ def _import_blocks(block_names, block_args): command = "b%s(**kwargs)" % block_id logging.debug("Trying to evaluate this: %s", command) new_block_instance = eval(command) # pylint: disable=eval-used - blocks.append(new_block_instance) + blocks.append((block_name, new_block_instance)) return blocks @@ -133,11 +133,11 @@ def execute(self): blocks = _import_blocks(block_names, block_args) # Initialize blocks (process_start). - for block in blocks: + for bname, block in blocks: block.process_start() readers = [] - for block in blocks: + for bname, block in blocks: try: block.finished # pylint: disable=pointless-statement readers.append(block) @@ -147,15 +147,15 @@ def execute(self): logging.info('No reader specified, using read.Conllu') conllu_reader = Conllu() readers = [conllu_reader] - blocks = readers + blocks + blocks = [('read.Conllu', conllu_reader)] + blocks # Apply blocks on the data. finished = False while not finished: document = Document() logging.info(" ---- ROUND ----") - for block in blocks: - logging.info("Executing block " + block.__class__.__name__) + for bname, block in blocks: + logging.info(f"Executing block {bname}") block.apply_on_document(document) finished = True @@ -164,7 +164,7 @@ def execute(self): finished = finished and reader.finished # 6. 
close blocks (process_end) - for block in blocks: + for bname, block in blocks: block.process_end() # TODO: better implementation, included Scen From 276529ca0daa4dbbfd8e74f87592028b8ecb88bc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 11:00:18 +0100 Subject: [PATCH 0326/1201] oops, partial revert of the last commit --- udapi/core/dualdict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index 540006ea..ba0129ed 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): if value is True: serialized.append(name) else: - serialized.append(f"{name}:{value}") + serialized.append(f"{name}={value}") self._string = '|'.join(serialized) if serialized else '_' return self._string From 81a65bf8a192aaf2bfdb9fa233b71b5a1c268d8f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 12:24:15 +0100 Subject: [PATCH 0327/1201] corefud.MarkNested in future, I would like to merge all the corefud.Mark* blocks into one universal block, but for now, let's archive this one --- udapi/block/corefud/marknested.py | 44 +++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 udapi/block/corefud/marknested.py diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py new file mode 100644 index 00000000..656111c6 --- /dev/null +++ b/udapi/block/corefud/marknested.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkNested(Block): + """Find nested mentions.""" + + def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_only=False, + print_form=False, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_cluster_only = same_cluster_only + self.both_discontinuous = both_discontinuous + self.multiword_only = multiword_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.cluster.cluster_id + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + for mA, mB in itertools.combinations(mentions, 2): + if self.same_cluster_only and mA.cluster != mB.cluster: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if not (sA <= sB) and not (sB <= sA): + continue + if self.multiword_only and (len(sA) == 1 or len(sB) == 1): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") From b38e7e4312d373fc28995ca37ca7ce2d25363f8c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 10 Feb 2022 14:54:07 +0100 Subject: [PATCH 0328/1201] CorefMention(words=[w1,w2]) should create backlinks from w1 and w2 The only case when we don't want these backlinks is "fake mentions" needed for serialization of discontinuous mentions, but that should be solved with a special parameter in `__init__`. 
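A sketch of the behaviour this commit establishes (hypothetical nodes `w1`, `w2` and clusters `c`, `c2`; not code from the diff below):
```
# After this change, constructing a mention registers it on each of its words:
m = CorefMention(words=[w1, w2], cluster=c)
assert m in w1.coref_mentions and m in w2.coref_mentions   # backlinks created
# The "fake" mentions used when serializing discontinuous spans opt out:
fake = CorefMention(words=[w1], cluster=c2, add_word_backlinks=False)
assert fake not in w1.coref_mentions                       # no backlink
```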
Fixes #101 --- udapi/core/coref.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index aaaa07f0..3f54d9a9 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -109,16 +109,23 @@ class CorefMention(object): """Class for representing a mention (instance of an entity).""" __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] - def __init__(self, words, head=None, cluster=None): + def __init__(self, words, head=None, cluster=None, add_word_backlinks=True): if not words: raise ValueError("mention.words must be non-empty") - self._words = words self._head = head if head else words[0] self._cluster = cluster if cluster is not None: cluster._mentions.append(self) self._bridging = None self._other = None + self._words = words + if add_word_backlinks: + for new_word in words: + if not new_word._mentions or not cluster or self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions.append(self) + new_word._mentions.sort() def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? @@ -692,7 +699,7 @@ def store_coref_to_misc(doc): subspan_eid = f'{cluster.cluster_id}[{idx}/{len(subspans)}]' subspan_words = span_to_nodes(root, subspan) fake_cluster = CorefCluster(subspan_eid, cluster.cluster_type) - fake_mention = CorefMention(subspan_words, head_str, fake_cluster) + fake_mention = CorefMention(subspan_words, head_str, fake_cluster, add_word_backlinks=False) if mention._other: fake_mention._other = mention._other if mention._bridging and idx == 1: From fe4dfcf110dac83608af2d3f8ad944840a0aee1c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 01:30:46 +0100 Subject: [PATCH 0329/1201] corefud.IndexClusters will use prefix=e by default --- udapi/block/corefud/indexclusters.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 1496c11c..14cf778d 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -3,7 +3,7 @@ class IndexClusters(Block): - """Re-index the coreference cluster IDs. The final cluster IDs are of the "c" form, + """Re-index the coreference cluster IDs. The final cluster IDs are of the "e" form, where are ordinal numbers starting from the one specified by the `start` parameter. This block can be applied on multiple documents within one udapy call. 
For example, to re-index ClusterId in all conllu files in the current directory @@ -13,11 +13,14 @@ class IndexClusters(Block): Parameters: ----------- start : int - the starting index (by default 1) + the starting index (default=1) + prefix : str + prefix of the IDs before the number (default="e") """ - def __init__(self, start=1): + def __init__(self, start=1, prefix='e'): self.start = start + self.prefix = prefix def process_document(self, doc): clusters = doc.coref_clusters @@ -26,7 +29,7 @@ def process_document(self, doc): new_clusters = {} for idx, cid in enumerate(clusters, self.start): cluster = clusters[cid] - new_cid = "c" + str(idx) + new_cid = self.prefix + str(idx) cluster.cluster_id = new_cid new_clusters[new_cid] = cluster self.start = idx + 1 From ecee32956bc33cdbf20944768f9f9b059340edd8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 10:15:37 +0100 Subject: [PATCH 0330/1201] prevent code duplication when setting cluster IDs --- udapi/core/coref.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3f54d9a9..022953a4 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -246,9 +246,8 @@ class CorefCluster(object): __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): - if any(x in cluster_id for x in CHARS_FORBIDDEN_IN_ID): - raise ValueError(f"{cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") - self._cluster_id = cluster_id + self._cluster_id = None # prepare the _cluster_id slot + self.cluster_id = cluster_id # call the setter and check the ID is valid self._mentions = [] self.cluster_type = cluster_type self.split_ante = [] @@ -277,8 +276,8 @@ def cluster_id(self): @cluster_id.setter def cluster_id(self, new_cluster_id): - if any(x in new_cluster_id for x in "-=| \t"): - raise ValueError(f"{new_cluster_id} contains forbidden characters [-=| \\t]") + if any(x in new_cluster_id for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = new_cluster_id @property From ee89d8fbd37dee421ebbbc1d64ecfd7bbc604eab Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 16:27:33 +0100 Subject: [PATCH 0331/1201] newpar_block (used in the newest GUM) should not be treated as newpar --- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 97e39970..d703fb26 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,7 +12,7 @@ # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$') RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') From eab9a7890e37924d1a402eac0ddb99562f143f35 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Feb 2022 20:16:15 +0100 Subject: [PATCH 0332/1201] convert (doc-level) GRP to (corpus-level) eid transparently So GRP can be in global.Entity, both for reading and writing. 
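In other words, a sketch of the intended mapping with illustrative IDs (`d2.` and `12` are made up for the example):

```
# Illustrative only: docid prefixes come from the tree2docid mapping built below.
docid = "d2."        # assigned when the second document (newdoc) starts
grp = "12"           # document-level ID as found in Entity= MISC chunks
eid = docid + grp    # corpus-wide unique ID: "d2.12"
assert eid == "d2.12" and eid.replace(docid, "", 1) == grp   # eid_or_grp strips it on output
```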
--- udapi/core/coref.py | 100 +++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 24 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 022953a4..7b205d3c 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -280,6 +280,18 @@ def cluster_id(self, new_cluster_id): raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") self._cluster_id = new_cluster_id + @property + def eid_or_grp(self): + root = self._mentions[0].head.root + meta = root.document.meta + if 'GRP' in meta['global.Entity'] and meta['tree2docid']: + docid = meta['tree2docid'][root] + if self._cluster_id.startswith(docid): + return self._cluster_id.replace(docid, '', 1) + else: + logging.warning(f"GRP in global.Entity, but eid={self._cluster_id} does not start with docid={docid}") + return self._cluster_id + @property def mentions(self): return self._mentions @@ -366,7 +378,14 @@ class BridgingLinks(collections.abc.MutableSequence): """ @classmethod - def from_string(cls, string, clusters, strict=True): + def from_string(cls, string, clusters, node, strict=True, tree2docid=None): + """Return a sequence of BridgingLink objects representing a given string serialization. + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `clusters`, + so the returned sequence can be usually ignored. + If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), + the entity IDs in the provided string are interpreted as "GRP", i.e. as document-wide IDs, + which need to be prefixed by the document IDs, to get corpus-wide unique "eid". + """ src_str2bl = {} for link_str in string.split(','): try: @@ -378,7 +397,10 @@ def from_string(cls, string, clusters, strict=True): if ':' in src_str: src_str, relation = src_str.split(':', 1) if trg_str == src_str: - _error("Bridge cannot self-reference the same cluster: " + trg_str, strict) + _error(f"Bridge cannot self-reference the same cluster {trg_str} at {node}", strict) + if tree2docid: + src_str = tree2docid[node.root] + src_str + trg_str = tree2docid[node.root] + trg_str bl = src_str2bl.get(src_str) if not bl: bl = clusters[src_str].mentions[-1].bridging @@ -425,7 +447,7 @@ def insert(self, key, new_value): def __str__(self): # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. - return ','.join(f'{l.target._cluster_id}<{self.src_mention.cluster.cluster_id}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.cluster.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. @@ -463,8 +485,14 @@ def _error(msg, strict): RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') +# When converting doc-level GRP IDs to corpus-level eid IDs, +# we need to assign each document a short ID/number (document names are too long). +# These document numbers must be unique even when loading multiple files, +# so we need to store the highest number generated so far here, at the Python module level. 
+highest_doc_n = 0 def load_coref_from_misc(doc, strict=True): + global highest_doc_n clusters = {} unfinished_mentions = collections.defaultdict(list) discontinuous_mentions = collections.defaultdict(list) @@ -474,13 +502,17 @@ def load_coref_from_misc(doc, strict=True): was_global_entity = False global_entity = 'eid-etype-head-other' doc.meta['global.Entity'] = global_entity - # backward compatibility - if global_entity == 'entity-GRP-infstat-MIN-coref_type-identity': - global_entity = 'etype-eid-infstat-minspan-link-identity' - # Which global.Entity should be used for serialization? - doc.meta['global.Entity'] = global_entity - #doc.meta['global.Entity'] = 'eid-etype-head-other' - if 'eid' not in global_entity: + tree2docid = None + if 'GRP' in global_entity: + tree2docid, docid = {}, "" + for bundle in doc: + for tree in bundle: + if tree.newdoc or docid == "": + highest_doc_n += 1 + docid = f"d{highest_doc_n}." + tree2docid[tree] = docid + doc.meta['tree2docid'] = tree2docid + elif 'eid' not in global_entity: raise ValueError("No eid in global.Entity = " + global_entity) fields = global_entity.split('-') @@ -506,14 +538,15 @@ def load_coref_from_misc(doc, strict=True): logging.warning(f"Entity {chunk} at {node} has no opening nor closing bracket.") # 2. closing bracket elif not opening and closing: - # closing brackets should include just the ID, - # but older GUM versions repeated all the fields - if '-' in chunk: + # closing brackets should include just the ID, but GRP needs to be converted to eid + if tree2docid: # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore - if not strict and global_entity.startswith('etype-eid'): - chunk = chunk.split('-')[1] - else: - _error("Unexpected closing eid " + chunk, strict) + if '-' in chunk: + if not strict and global_entity.startswith('entity-GRP'): + chunk = chunk.split('-')[1] + else: + _error("Unexpected closing eid " + chunk, strict) + chunk = tree2docid[node.root] + chunk # closing discontinuous mentions eid, subspan_idx = chunk, None @@ -551,7 +584,9 @@ def load_coref_from_misc(doc, strict=True): for name, value in zip(fields, chunk.split('-')): if name == 'eid': eid = value - elif name == 'etype': + elif name == 'GRP': + eid = tree2docid[node.root] + value + elif name == 'etype' or name == 'entity': # entity is an old name for etype used in UD GUM 2.8 and 2.9 etype = value elif name == 'head': try: @@ -617,10 +652,10 @@ def load_coref_from_misc(doc, strict=True): # Bridge, e.g. Entity=(e12-event|Bridge=e12 Date: Tue, 15 Feb 2022 13:40:09 +0100 Subject: [PATCH 0333/1201] make sure mention.words are sorted even when reordering/deleting nodes --- udapi/core/coref.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 7b205d3c..cb865a31 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -206,6 +206,11 @@ def bridging(self): @property def words(self): + # Words in a sentence could have been reordered, so we cannot rely on sorting self._words in the setter. + # The serialization relies on storing the opening bracket in the first word (and closing in the last), + # so we need to make sure the words are always returned sorted. + # TODO: benchmark updating the order of mention._words in node.shift_*() and node.remove(). 
+ self._words.sort() return self._words @words.setter @@ -213,12 +218,13 @@ def words(self, new_words): if new_words and self.head not in new_words: raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") kept_words = [] + # Make sure each word is included just once and they are in the correct order. + new_words = sorted(list(set(new_words))) for old_word in self._words: if old_word in new_words: kept_words.append(old_word) else: old_word._mentions.remove(self) - new_words.sort() self._words = new_words for new_word in new_words: if new_word not in kept_words: @@ -556,7 +562,10 @@ def load_coref_from_misc(doc, strict=True): raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") eid, subspan_idx, total_subspans = m.group(1, 2, 3) - mention, head_idx = unfinished_mentions[eid].pop() + try: + mention, head_idx = unfinished_mentions[eid].pop() + except IndexError as err: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") last_word = mention.words[-1] if node.root is not last_word.root: # TODO cross-sentence mentions From 7772c398998af5c25e89c12f11da26099d38d1bb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 15 Feb 2022 21:57:47 +0100 Subject: [PATCH 0334/1201] Debugging corefud.MergeSameSpan. --- udapi/block/corefud/mergesamespan.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index d5a46d25..802285af 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -24,6 +24,12 @@ def process_tree(self, tree): for mA, mB in itertools.combinations(mentions, 2): if self.same_cluster_only and mA.cluster != mB.cluster: continue + # Reduce non-determinism in which mention is removed: + # If the mentions belong to different entities, sort them by entity (cluster) ids. + if mA.cluster.cluster_id > mB.cluster.cluster_id: + mX = mA + mA = mB + mB = mX sA, sB = set(mA.words), set(mB.words) if sA != sB: @@ -40,6 +46,7 @@ def process_tree(self, tree): # m.cluster = mA.cluster # Remove mention B. It may have been removed earlier because of # another duplicate, that is the purpose of try-except. + ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! for wb in sB: try: wb._mentions.remove(mB) From 90ac47a28aae22ba5ae946698b0ca5e1a0ba3e53 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 16 Feb 2022 02:59:03 +0100 Subject: [PATCH 0335/1201] don't store Bridge or SplitAnte links to already deleted clusters fixes #102 A better solution would be to delete links to deleted clusters immediately, but it is tricky to make this fast (we would need backlinks) and in some scenarios we need links to clusters without any mentions (e.g. when loading a file with cataphora SplitAnte/Bridge). 
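The policy can be mimicked with stand-in classes (a hedged sketch; the real udapi BridgingLinks and CorefCluster differ): at serialization time, links whose target cluster has lost all of its mentions are reported and skipped rather than written out.

    import logging

    class FakeCluster:  # stand-in for udapi's CorefCluster
        def __init__(self, cluster_id, mentions=()):
            self.cluster_id = cluster_id
            self.mentions = list(mentions)

    class FakeLink:  # stand-in for a bridging/split-antecedent link
        def __init__(self, target):
            self.target = target

    def links_to_serialize(links):
        """Keep links whose target cluster still has mentions; warn about the rest.
        Iterating over a copy keeps the loop safe while entries are dropped."""
        kept = []
        for link in list(links):
            if link.target.mentions:
                kept.append(link)
            else:
                logging.warning("Cluster %s has no mentions, dropping a link to it",
                                link.target.cluster_id)
        return kept

    links = [FakeLink(FakeCluster("e1", ["m1"])), FakeLink(FakeCluster("e2"))]
    assert [l.target.cluster_id for l in links_to_serialize(links)] == ["e1"]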
--- udapi/core/coref.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index cb865a31..9e3e97ee 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -469,6 +469,13 @@ def targets(self): """Return a list of the target clusters (without relations).""" return [link.target for link in self._data] + def _delete_targets_without_mentions(self, warn=True): + for link in self._data: + if not link.target.mentions: + if warn: + logging.warning(f"Cluster {link.target.cluster_id} has no mentions, but is referred to in bridging of {self.src_mention.cluster.cluster_id}") + self._data.remove(link) + def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): clusters = head.root.bundle.document.coref_clusters @@ -832,6 +839,7 @@ def store_coref_to_misc(doc): # Bridge=e1 Date: Wed, 16 Feb 2022 03:02:11 +0100 Subject: [PATCH 0336/1201] `mention.words = []` takes care about deleting backlinks from words --- udapi/block/corefud/mergesamespan.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index 802285af..bdeefd7c 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -27,9 +27,7 @@ def process_tree(self, tree): # Reduce non-determinism in which mention is removed: # If the mentions belong to different entities, sort them by entity (cluster) ids. if mA.cluster.cluster_id > mB.cluster.cluster_id: - mX = mA - mA = mB - mB = mX + mA, mB = mB, mA sA, sB = set(mA.words), set(mB.words) if sA != sB: @@ -40,18 +38,14 @@ def process_tree(self, tree): # mentions from the other cluster to this cluster, and remove the # other cluster. if mA.cluster != mB.cluster: - logging.warning("Merging same-span mentions that belong to different entities: '%s' vs. '%s'." % (mA.cluster.cluster_id, mB.cluster.cluster_id)) + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.cluster.cluster_id} vs. {mB.cluster.cluster_id}") ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API. #for m in mB.cluster.mentions: # m.cluster = mA.cluster # Remove mention B. It may have been removed earlier because of # another duplicate, that is the purpose of try-except. ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! - for wb in sB: - try: - wb._mentions.remove(mB) - except ValueError: - pass + mB.words = [] try: mB.cluster.mentions.remove(mB) except ValueError: From f7e82090b443c7fd59557255e30e6fd5a6fc30e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 22 Feb 2022 16:10:25 +0100 Subject: [PATCH 0337/1201] Russian block cloned from Czech. --- udapi/block/ud/ru/fixedeprels.py | 113 +++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 udapi/block/ud/ru/fixedeprels.py diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py new file mode 100644 index 00000000..54076b68 --- /dev/null +++ b/udapi/block/ud/ru/fixedeprels.py @@ -0,0 +1,113 @@ +"""Block to fix case-enhanced dependency relations in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. 
List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'как': 'как' # remove morphological case + } + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Russian basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + if re.match(r'^(acl|advcl):', edep['deprel']): + edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) + elif re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. + edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. 
+ edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + else: + # If one of the following expressions occurs followed by another preposition, + # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From c673bd5e90bc64f0c4f3db68e2e424ddcdb94567 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 22 Feb 2022 16:20:37 +0100 Subject: [PATCH 0338/1201] Removed some Czech-specific rules. 
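Out of diff context, the core transformation of the ru/fixedeprels block is easier to follow as a standalone example (a trimmed sketch mirroring the patched code; the dict here holds a single entry for illustration):

    import re

    # One entry of the `unambiguous` mapping: 'как' keeps no morphological case.
    unambiguous = {'как': 'как'}

    edeprel = 'advcl:как:nom'  # hypothetical input enhanced relation
    for lemma, fixed in unambiguous.items():
        m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):' + lemma
                     + r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edeprel)
        if m:
            edeprel = m.group(1) + ':' + fixed
            break
    print(edeprel)  # advcl:как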
--- udapi/block/ud/ru/fixedeprels.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 54076b68..48febee5 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -49,39 +49,16 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) elif re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' elif edep['deprel'] == 'nmod:voc': - # 'v 8. čísle tiskoviny Ty rudá krávo' edep['deprel'] = 'nmod:nom' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' else: # If one of the following expressions occurs followed by another preposition, # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. From ff0d5c915064a5b07977fe15cdc1858bf49fe786 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 22 Feb 2022 17:35:09 +0100 Subject: [PATCH 0339/1201] Circleci project setup (#103) * Add .circleci/config.yml * CircleCI debugging * CircleCI debug * regexes need \r"" * allow len(document) Users may expect this to work, when document[i] works. 
* reader.read_documents()
* add a comment explaining the hack from #96
* add a first test for coreference API
* fix the bug revealed in test_coref.py thanks to @ondfa
* switch from TravisCI to CircleCI
---
 .circleci/config.yml                          | 57 ++++++++++++++++++
 .travis.yml                                   | 34 -----------
 README.md                                     |  2 +-
 test-requirements.txt                         |  1 +
 udapi/core/basereader.py                      | 20 +++++++
 udapi/core/coref.py                           | 10 ++--
 udapi/core/document.py                        |  3 +
 udapi/core/tests/__init__.py                  |  0
 .../tests/data/fr-democrat-dev-sample.conllu  | 60 +++++++++++++++++++
 udapi/core/tests/test_coref.py                | 23 +++++++
 10 files changed, 170 insertions(+), 40 deletions(-)
 create mode 100644 .circleci/config.yml
 delete mode 100644 .travis.yml
 create mode 100644 test-requirements.txt
 create mode 100644 udapi/core/tests/__init__.py
 create mode 100644 udapi/core/tests/data/fr-democrat-dev-sample.conllu
 create mode 100755 udapi/core/tests/test_coref.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 00000000..7be539d2
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,57 @@
+# Use the latest 2.1 version of CircleCI pipeline process engine.
+# See: https://circleci.com/docs/2.0/configuration-reference
+version: 2.1
+
+# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects.
+# See: https://circleci.com/docs/2.0/orb-intro/
+orbs:
+  # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files
+  # Orb commands and jobs help you with common scripting around a language/tool
+  # so you don't have to copy and paste it everywhere.
+  # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
+  python: circleci/python@1.5.0
+
+# Define a job to be invoked later in a workflow.
+# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
+jobs:
+  build-and-test: # This is the name of the job; feel free to change it to better match what you're trying to do!
+    # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/
+    # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub
+    # A list of available CircleCI Docker convenience images is available here: https://circleci.com/developer/images/image/cimg/python
+    # The executor is the environment in which the steps below will be executed - below we use a Python 3.9 container
+    # Change the version below to your required version of python
+    docker:
+      - image: cimg/python:3.9
+    # Checkout the code as the first step. This is a dedicated CircleCI step.
+    # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
+    # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt.
+    # Then run your tests!
+    # CircleCI will report the results back to your VCS provider.
+    steps:
+      - checkout
+      - python/install-packages:
+          pkg-manager: pip
+          # app-dir: ~/project/package-directory/ # If your requirements.txt isn't in the root directory.
+          # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
+ - run: + name: Install Udapi + command: pip install --use-feature=in-tree-build ".[test]" + - run: + name: Run pytest tests + # This assumes pytest is installed via the install-package step above + command: pytest + - run: + name: Color TextModeTrees + command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 + - run: + name: External tests + command: cd udapi/core/tests && ./external_tests.sh + + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows +workflows: + sample: # This is the name of the workflow, feel free to change it to better match your workflow. + # Inside the workflow, you define the jobs you want to run. + jobs: + - build-and-test diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 417e39fb..00000000 --- a/.travis.yml +++ /dev/null @@ -1,34 +0,0 @@ -language: python -python: - - "3.6" - - "3.7" - - "3.8" - - "3.9" -#before_install: -# - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -# - sudo apt-get update -qq -# - sudo apt-get install -qq gcc-4.8 g++-4.8 -# - CC=g++-4.8 pip install ufal.udpipe -#install: -# - python setup.py install -install: - - pip3 install ".[test]" -script: - - python -m pytest - - udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 - - cd udapi/core/tests && ./external_tests.sh -jobs: - include: - - name: "Python 3.9 on Windows" - os: windows - language: shell - before_install: - - choco install python - - python --version - - python -m pip install --upgrade pip - - pip3 install --upgrade pytest - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - script: - - python -c 'import colorama;print("\033[31m some red text")' - - python -Xutf8 -c 'import udapi;udapi.Document("udapi/core/tests/data/babinsky.conllu").draw(color=1)' - - python -m pytest diff --git a/README.md b/README.md index 3bf52eec..11b689dc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=svg)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..e079f8a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index fee9da4c..9210b910 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -28,6 +28,12 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. + # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
+ # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, + # which reads all the trees in a file at once, but it does not have access to the document instance, + # it just returns a sequence of trees (which may be split into multiple documents if `bundles_per_doc` is set). + # So `read.Conllu` cannot store the `global.Entity` in `document.meta['global.Entity']` where it belongs. self._global_entity = None @staticmethod @@ -170,6 +176,7 @@ def process_document(self, document): bundle.add_tree(root) if root.newdoc and root.newdoc is not True: document.meta["docname"] = root.newdoc + document.meta['global.Entity'] = self._global_entity filehandle = self.filehandle if filehandle is None: @@ -259,3 +266,16 @@ def process_document(self, document): if gc_was_enabled: gc.enable() gc.collect() + + def read_documents(self): + """Load all documents of this reader and return them as a list.""" + # udapi.core.document imports udapi.block.read.conllu because of doc.load_conllu(filename) + # and udapi.block.read.conllu loads this module (udapi.core.basereader), + # so we cannot load udapi.core.document at the beginning of this module. + from udapi.core.document import Document + docs = [] + while not self.finished: + doc = Document() + self.process_document(doc) + docs.append(doc) + return docs diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 9e3e97ee..9eedeeb6 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -542,7 +542,7 @@ def load_coref_from_misc(doc, strict=True): # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. # The following re.split line splits this into # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] - chunks = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', misc_entity) if x] for chunk in chunks: opening, closing = (chunk[0] == '(', chunk[-1] == ')') chunk = chunk.strip('()') @@ -752,7 +752,7 @@ def store_coref_to_misc(doc): for idx,subspan in enumerate(subspans, 1): eid = cluster.cluster_id if tree2docid and 'GRP' in fields: - eid = re.sub('^d\d+\.', '', eid) # TODO or "eid = cluster.eid_or_grp"? + eid = re.sub(r'^d\d+\.', '', eid) # TODO or "eid = cluster.eid_or_grp"? 
subspan_eid = f'{eid}[{idx}/{len(subspans)}]' subspan_words = span_to_nodes(root, subspan) fake_cluster = CorefCluster(subspan_eid, cluster.cluster_type) @@ -771,7 +771,7 @@ def store_coref_to_misc(doc): if field == 'eid' or field == 'GRP': eid = cluster.cluster_id if field == 'GRP': - eid = re.sub('^d\d+\.', '', eid) + eid = re.sub(r'^d\d+\.', '', eid) if any(x in eid for x in CHARS_FORBIDDEN_IN_ID): _error(f"{eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]", strict) for c in CHARS_FORBIDDEN_IN_ID: @@ -823,7 +823,7 @@ def store_coref_to_misc(doc): elif '(' not in orig_entity: firstword.misc['Entity'] = mention_str + ')' + orig_entity # (e9)e4)e3) --> (e10)(e9)e4)e3) - elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split('(\([^()]+\)?|[^()]+\))', orig_entity)): + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split(r'(\([^()]+\)?|[^()]+\))', orig_entity)): firstword.misc['Entity'] += mention_str + ')' # (e1(e2(e9) --> (e1(e2(e9)(e10) # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) @@ -834,7 +834,7 @@ def store_coref_to_misc(doc): firstword.misc['Entity'] += mention_str eid = cluster.cluster_id if tree2docid and 'GRP' in fields: - eid = re.sub('^d\d+\.', '', eid) + eid = re.sub(r'^d\d+\.', '', eid) mention.words[-1].misc['Entity'] = eid + ')' + mention.words[-1].misc['Entity'] # Bridge=e1 Date: Fri, 25 Feb 2022 23:16:53 +0100 Subject: [PATCH 0340/1201] =?UTF-8?q?'=D1=81=5F=D0=BF=D0=BE=D0=BC=D0=BE?= =?UTF-8?q?=D1=89=D1=8C=D1=8E:gen'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 48febee5..675e9eac 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -11,7 +11,8 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. 
unambiguous = { - 'как': 'как' # remove morphological case + 'как': 'как', # remove morphological case + 'с_помощь': 'с_помощью:gen' } def process_node(self, node): From e3dbb514d734e96000d10e23a402a6aabd982e8d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 11:13:50 +0100 Subject: [PATCH 0341/1201] CircleCI icon matching the style of other icons --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11b689dc..0b41297f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=svg)](https://circleci.com/gh/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=shield)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) From d1db476c8dbd2d52a89c42da0f4175e03baf528b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 15:14:46 +0100 Subject: [PATCH 0342/1201] fix `entity.create_mention()` and add a test --- udapi/core/coref.py | 2 +- udapi/core/tests/test_coref.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 9eedeeb6..35c66d83 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -330,7 +330,7 @@ def create_mention(self, head=None, mention_words=None, mention_span=None): if head is None: head = mention_words[0] - mention = CorefMention(head, self) + mention = CorefMention(words=[head], head=head, cluster=self) if mention_words: mention.words = mention_words if mention_span: diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 855a338e..6a77e886 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -18,6 +18,31 @@ def test_load(self): self.assertEqual(len(coref_entities), 1) self.assertEqual(coref_entities[0].cluster_id, 'e36781') + def test_edits(self): + data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') + doc = udapi.Document(data_filename) + first_node = next(doc.nodes) + second_node = first_node.next_node + new_entity = first_node.create_coref_cluster(cluster_type='person') + self.assertEqual(new_entity.cluster_type, 'person') + self.assertEqual(len(new_entity.mentions), 1) + m1 = new_entity.mentions[0] + self.assertEqual(m1.cluster, new_entity) + self.assertEqual(m1.head, first_node) + self.assertEqual(m1.words, [first_node]) + self.assertEqual(m1.span, '1') + m1.words = [second_node, first_node, first_node] # intentional duplicates and wrong order + self.assertEqual(m1.words, [first_node, second_node]) + self.assertEqual(m1.span, '1-2') + m1.head = second_node + self.assertEqual(m1.head, second_node) + m2 = new_entity.create_mention(head=second_node, mention_span='1-3') + self.assertEqual(len(new_entity.mentions), 2) + self.assertEqual(new_entity.mentions[0], m2) # 1-3 should go before 1-2 + self.assertEqual(new_entity.mentions[1], m1) + self.assertTrue(m2 < m1) + self.assertEqual(m2.words, [first_node, second_node, second_node.next_node]) + if __name__ == "__main__": unittest.main() From 1ffb06789175aa5bf9ee546df7ae2624bb42acde Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 3 Mar 2022 19:51:23 +0100 Subject: 
[PATCH 0343/1201] add `doc.create_coref_cluster()`, rename params of
 `cluster.create_mention()`

`doc.create_coref_cluster()` allows creating a new cluster without any
mentions, which seems to be practical in real use cases.

`m = cluster.create_mention(mention_words=[w1, w2])` seems redundant,
`m = cluster.create_mention(words=[w1, w2])` looks better.
Similarly with `mention_span` -> `span`.
It will be consistent with `m.words` and `m.span`.

TODO: consider removing `new_cluster = node.create_coref_cluster()`
which creates both a cluster and a new mention,
but does not return the mention
(it can be accessed with `new_cluster.mentions[0]`, of course).
So far, I've just removed it from the tests.
---
 udapi/core/coref.py            | 48 ++++++++++++----------------------
 udapi/core/document.py         | 13 +++++++++
 udapi/core/node.py             |  8 ++++--
 udapi/core/tests/test_coref.py |  8 +++---
 4 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 35c66d83..eef25dd2 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -302,7 +302,7 @@ def eid_or_grp(self):
     def mentions(self):
         return self._mentions

-    def create_mention(self, head=None, mention_words=None, mention_span=None):
+    def create_mention(self, head=None, words=None, span=None):
         """Create a new CoreferenceMention object within this CorefCluster.

         Args:
@@ -310,31 +310,31 @@
             head: a node which will serve as the head of the mention.
             The head is supposed to be the linguistic head of the mention,
             i.e. the highest node in the dependency tree,
             but if such information is not available (yet),
-            it can be any node within the mention_words.
-            If no head is specified, the first word from mention_words will be used instead.
-            mention_words: a list of nodes of the mention.
+            it can be any node within the `words`.
+            If no head is specified, the first word from `words` will be used instead.
+            words: a list of nodes of the mention.
             This argument is optional, but if provided, it must contain the head.
             The nodes can be either normal nodes or empty nodes.
-            mention_span: an alternative way how to specify mention_words
+            span: an alternative way to specify `words`
             using a string such as "3-5,6,7.1-7.2".
             (which means there is an empty node 5.1 and a normal node 7,
             which are not part of the mention).
-        At most one of the args mention_words and mention_span can be specified.
+        At most one of the args `words` and `span` can be specified.
""" - if mention_words and mention_span: - raise ValueError("Cannot specify both mention_words and mention_span") - if head and mention_words and head not in mention_words: - raise ValueError(f"Head {head} is not among the specified mention_words") - if head is None and mention_words is None: - raise ValueError("Either head or mention_words must be specified") + if words and span: + raise ValueError("Cannot specify both words and span") + if head and words and head not in words: + raise ValueError(f"Head {head} is not among the specified words") + if head is None and words is None: + raise ValueError("Either head or words must be specified") if head is None: - head = mention_words[0] + head = words[0] mention = CorefMention(words=[head], head=head, cluster=self) - if mention_words: - mention.words = mention_words - if mention_span: - mention.span = mention_span + if words: + mention.words = words + if span: + mention.span = span self._mentions.sort() return mention @@ -477,20 +477,6 @@ def _delete_targets_without_mentions(self, warn=True): self._data.remove(link) -def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): - clusters = head.root.bundle.document.coref_clusters - if not cluster_id: - counter = 1 - while clusters.get('c%d' % counter): - counter += 1 - cluster_id = 'c%d' % counter - elif clusters.get(cluster_id): - raise ValueError("Cluster with a id %s already exists", cluster_id) - cluster = CorefCluster(cluster_id, cluster_type) - cluster.create_mention(head, **kwargs) - clusters[cluster_id] = cluster - return cluster - def _error(msg, strict): if strict: raise ValueError(msg) diff --git a/udapi/core/document.py b/udapi/core/document.py index c50f8e43..8e33c8d6 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -136,3 +136,16 @@ def coref_mentions(self): all_mentions.extend(cluster.mentions) all_mentions.sort() return all_mentions + + def create_coref_cluster(self, cluster_id=None, cluster_type=None): + self._load_coref() + if not cluster_id: + counter = 1 + while self._coref_clusters.get(f'c{counter}'): + counter += 1 + cluster_id = f'c{counter}' + elif clusters.get(cluster_id): + raise ValueError("Cluster with a id %s already exists", cluster_id) + cluster = udapi.core.coref.CorefCluster(cluster_id, cluster_type) + self._coref_clusters[cluster_id] = cluster + return cluster diff --git a/udapi/core/node.py b/udapi/core/node.py index 3d120a52..4524c119 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -921,8 +921,12 @@ def coref_clusters(self): self._root.bundle.document._load_coref() return [m.cluster for m in self._mentions if m.cluster is not None] - def create_coref_cluster(self, **kwargs): - return udapi.core.coref.create_coref_cluster(head=self, **kwargs) + # TODO: is this method useful? 
+ def create_coref_cluster(self, cluster_id=None, cluster_type=None, **kwargs): + doc = self._root.bundle.document + cluster = doc.create_coref_cluster(cluster_id, cluster_type) + cluster.create_mention(head=self, **kwargs) + return cluster class CycleError(Exception): diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 6a77e886..8eab1436 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -23,10 +23,12 @@ def test_edits(self): doc = udapi.Document(data_filename) first_node = next(doc.nodes) second_node = first_node.next_node - new_entity = first_node.create_coref_cluster(cluster_type='person') + new_entity = doc.create_coref_cluster(cluster_type='person') self.assertEqual(new_entity.cluster_type, 'person') + self.assertEqual(len(new_entity.mentions), 0) + m1 = new_entity.create_mention(words=[first_node]) # head will be automatically set to words[0] self.assertEqual(len(new_entity.mentions), 1) - m1 = new_entity.mentions[0] + self.assertEqual(m1, new_entity.mentions[0]) self.assertEqual(m1.cluster, new_entity) self.assertEqual(m1.head, first_node) self.assertEqual(m1.words, [first_node]) @@ -36,7 +38,7 @@ def test_edits(self): self.assertEqual(m1.span, '1-2') m1.head = second_node self.assertEqual(m1.head, second_node) - m2 = new_entity.create_mention(head=second_node, mention_span='1-3') + m2 = new_entity.create_mention(head=second_node, span='1-3') # mention.words will be filled according to the span self.assertEqual(len(new_entity.mentions), 2) self.assertEqual(new_entity.mentions[0], m2) # 1-3 should go before 1-2 self.assertEqual(new_entity.mentions[1], m1) From d025550b1102fa0fa2546e9d0e8cc9de0165498a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 9 Mar 2022 09:48:16 +0100 Subject: [PATCH 0344/1201] Some more Russian prepositions. --- udapi/block/ud/ru/fixedeprels.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 675e9eac..440cfd0b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -11,8 +11,12 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'в_качество': 'в_качестве:gen', + 'в_течение': 'в_течение:gen', 'как': 'как', # remove morphological case - 'с_помощь': 'с_помощью:gen' + 'несмотря_на': 'несмотря_на:acc', + 'с_помощь': 'с_помощью:gen', + 'чем': 'чем' # remove morphological case } def process_node(self, node): @@ -38,7 +42,7 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. @@ -46,6 +50,9 @@ def process_node(self, node): if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() solved = True + else: + # Accusative or locative are possible. Pick locative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) From c6893e678e637121b5413b5322e41eb6fca868e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 10 Mar 2022 17:25:34 +0100 Subject: [PATCH 0345/1201] Russian prepositions with morphological case. --- udapi/block/ud/ru/fixedeprels.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 440cfd0b..e96fd8d1 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -42,7 +42,9 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + ###!!! Both "на" and "в" seem to also occur with genitive. + ###!!! I don't think it is valid but let's see some examples before we ban it. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. From 51cd1b134bac365139ca0841d01aebd22794b247 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 11 Mar 2022 11:06:32 +0100 Subject: [PATCH 0346/1201] Russian prepositions with morphological case. --- udapi/block/ud/ru/fixedeprels.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index e96fd8d1..d9c539d5 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -13,6 +13,7 @@ class FixEdeprels(Block): unambiguous = { 'в_качество': 'в_качестве:gen', 'в_течение': 'в_течение:gen', + 'в_ход': 'в_ходе:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'с_помощь': 'с_помощью:gen', @@ -42,9 +43,10 @@ def process_node(self, node): # available. Thanks to the Case feature on prepositions, we can # identify the correct one. if not solved: - ###!!! Both "на" and "в" seem to also occur with genitive. - ###!!! I don't think it is valid but let's see some examples before we ban it. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|dat|voc))?$', edep['deprel']) + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: # The following is only partial solution. We will not see # some children because they may be shared children of coordination. From 71899a4bfddeafe870d4a18c403042c2f980c678 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 12 Mar 2022 13:42:27 +0100 Subject: [PATCH 0347/1201] Russian prepositions with morphological cases. 
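The в/на disambiguation in the hunks above is the central trick of these commits; a standalone sketch of the same decision (assuming a hypothetical node object with udapi-like `children`, `lemma` and `feats` attributes):

    def case_after_v_na(node, prep):
        """Trust the Case feature of the preposition child; default to locative.
        Only a partial solution: shared children of coordination are not seen here."""
        prepchildren = [x for x in node.children if x.lemma == prep]
        if prepchildren and prepchildren[0].feats['Case'] != '':
            return prepchildren[0].feats['Case'].lower()
        return 'loc'  # accusative or locative are possible; pick locative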
--- udapi/block/ud/ru/fixedeprels.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index d9c539d5..cc21ec33 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,10 +12,13 @@ class FixEdeprels(Block): # case, even if they are not secondary. unambiguous = { 'в_качество': 'в_качестве:gen', + 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', + 'помимо': 'помимо:gen', + 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', 'чем': 'чем' # remove morphological case } From 094a2b982cc7575ab7b3c5d22d103b08e6343ece Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 13 Mar 2022 11:05:26 +0100 Subject: [PATCH 0348/1201] Russian prepositions. --- udapi/block/ud/ru/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index cc21ec33..eb8292ae 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,11 +15,13 @@ class FixEdeprels(Block): 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case } From 3b6748be36653e917420d44a6c8d7d8a65782aa0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Mar 2022 15:04:17 +0100 Subject: [PATCH 0349/1201] =?UTF-8?q?Rusk=C3=A9=20p=C5=99edlo=C5=BEky.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index eb8292ae..c4906053 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,15 +12,19 @@ class FixEdeprels(Block): # case, even if they are not secondary. unambiguous = { 'в_качество': 'в_качестве:gen', + 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'ведь': 'ведь', # remove morphological case 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', + 'согласно': 'согласно:dat', 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'спустя': 'спустя:acc', 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case } From c513f676363cf0b901af67f1840f184f7361a9b7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Mar 2022 16:15:41 +0100 Subject: [PATCH 0350/1201] =?UTF-8?q?=D0=B7=D0=B0:gen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c4906053..7a5a0dc1 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -66,6 +66,20 @@ def process_node(self, node): else: # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + # Both "за" and "" also occur with instrumental. 
However, this + # is only because there are numerals in the phrase ("за последние 20 лет") + # and the whole phrase should be usually analyzed as accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or instrumental are possible. Pick accusative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) From 5f7d17f6db8263c062b5f8902c35e7a3baa7ce9f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 12:46:30 +0100 Subject: [PATCH 0351/1201] =?UTF-8?q?=D1=81=D0=BB=D0=BE=D0=B2=D0=BD=D0=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 82 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 7a5a0dc1..5aaf6308 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -21,9 +21,10 @@ class FixEdeprels(Block): 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'помимо': 'помимо:gen', - 'согласно': 'согласно:dat', - 'со_сторона': 'со_стороны:gen', 'с_помощь': 'с_помощью:gen', + 'словно': 'словно', # remove morphological case + 'со_сторона': 'со_стороны:gen', + 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'так_что': 'так_что', # remove morphological case 'чем': 'чем' # remove morphological case @@ -48,38 +49,46 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + if solved: + break + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):словно([_:].+)?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':словно' + break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. - if not solved: - # Both "на" and "в" also occur with genitive. However, this - # is only because there are numerals in the phrase ("в 9 случаев из 10") - # and the whole phrase should not be analyzed as genitive. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True - else: - # Accusative or locative are possible. Pick locative. 
- edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' - # Both "за" and "" also occur with instrumental. However, this - # is only because there are numerals in the phrase ("за последние 20 лет") - # and the whole phrase should be usually analyzed as accusative. - m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True - else: - # Accusative or instrumental are possible. Pick accusative. - edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or locative are possible. Pick locative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + # Both "за" and "" also occur with instrumental. However, this + # is only because there are numerals in the phrase ("за последние 20 лет") + # and the whole phrase should be usually analyzed as accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + else: + # Accusative or instrumental are possible. Pick accusative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' if re.match(r'^(acl|advcl):', edep['deprel']): edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) @@ -94,17 +103,6 @@ def process_node(self, node): edep['deprel'] = 'nmod:nom' elif edep['deprel'] == 'nmod:voc': edep['deprel'] = 'nmod:nom' - else: - # If one of the following expressions occurs followed by another preposition, - # remove the additional preposition. For example, 'i_když_s' becomes just 'i_když'. 
- edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' From 0cc70536aff35dc62fa3a22a904b6a3016caa3ae Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 15:20:05 +0100 Subject: [PATCH 0352/1201] =?UTF-8?q?=D0=BF=D1=80=D0=B8=5F=D0=BF=D0=BE?= =?UTF-8?q?=D0=BC=D0=BE=D1=89=D0=B8:gen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 5aaf6308..74b919fb 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -20,7 +20,9 @@ class FixEdeprels(Block): 'до': 'до:gen', 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', + 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', + 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'словно': 'словно', # remove morphological case 'со_сторона': 'со_стороны:gen', From dc0cd7d432af92e86c476d44c46e510a07a5aae5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 16:40:00 +0100 Subject: [PATCH 0353/1201] More systematic processing of outermost case markers. --- udapi/block/ud/ru/fixedeprels.py | 39 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 74b919fb..c357fa49 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -5,31 +5,40 @@ class FixEdeprels(Block): + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'как' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('как_в:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + outermost = [ + 'ведь', + 'как', + 'словно', + 'так_что', + 'чем' + ] + # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. 
unambiguous = { + 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', - 'ведь': 'ведь', # remove morphological case 'до': 'до:gen', - 'как': 'как', # remove morphological case 'несмотря_на': 'несмотря_на:acc', 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', - 'словно': 'словно', # remove morphological case 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', - 'спустя': 'спустя:acc', - 'так_что': 'так_что', # remove morphological case - 'чем': 'чем' # remove morphological case + 'спустя': 'спустя:acc' } def process_node(self, node): @@ -43,6 +52,17 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + for x in self.outermost: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + break for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -53,13 +73,6 @@ def process_node(self, node): break if solved: break - # If one of the following expressions occurs followed by another preposition - # or by morphological case, remove the additional case marking. For example, - # 'словно_у' becomes just 'словно'. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):словно([_:].+)?$', edep['deprel']) - if m: - edep['deprel'] = m.group(1)+':словно' - break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. From b3e97c3b107e66d6f3c62549e3e000b8cfa2890e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 17:51:37 +0100 Subject: [PATCH 0354/1201] It is now possible to define exceptions to the rule. --- udapi/block/ud/ru/fixedeprels.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c357fa49..6a1b001e 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -10,13 +10,14 @@ class FixEdeprels(Block): # is used with the same case (preposition + morphology) as the nominal that # is being compared ('как_в:loc' etc.) We do not want to multiply the relations # by all the inner cases. - outermost = [ - 'ведь', - 'как', - 'словно', - 'так_что', - 'чем' - ] + # The list in the value contains exceptions that should be left intact. + outermost = { + 'ведь': [], + 'как': ['как_только'], + 'словно': [], + 'так_что': [], + 'чем': [] + } # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that @@ -56,8 +57,9 @@ def process_node(self, node): # or by morphological case, remove the additional case marking. For example, # 'словно_у' becomes just 'словно'. 
for x in self.outermost: + exceptions = self.outermost[x] m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m: + if m and not x+m.group(2) in exceptions: edep['deprel'] = m.group(1)+':'+x solved = True break From 052ab6c68235cb15fcc4c7f69718bb03e290a7ab Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 17:54:37 +0100 Subject: [PATCH 0355/1201] Fix? --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 6a1b001e..ab706346 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -59,7 +59,7 @@ def process_node(self, node): for x in self.outermost: exceptions = self.outermost[x] m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and not x+m.group(2) in exceptions: + if m and m.group(2) and not x+m.group(2) in exceptions: edep['deprel'] = m.group(1)+':'+x solved = True break From 61845025b063ce51a96384a59b8caaf4c67fd26f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 18:16:25 +0100 Subject: [PATCH 0356/1201] =?UTF-8?q?=D0=BA:dat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index ab706346..43ad5f3a 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,6 +12,7 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { + 'будто': [], 'ведь': [], 'как': ['как_только'], 'словно': [], @@ -32,6 +33,7 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'до': 'до:gen', + 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', 'по_повод': 'по_поводу:gen', 'помимо': 'помимо:gen', @@ -106,11 +108,7 @@ def process_node(self, node): else: # Accusative or instrumental are possible. Pick accusative. edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' - if re.match(r'^(acl|advcl):', edep['deprel']): - edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - elif re.match(r'^(nmod|obl):', edep['deprel']): + if re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has From 9c003aa06b289a21db51d612e4298b2d64632773 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 18:56:34 +0100 Subject: [PATCH 0357/1201] Fix advcl:(od|do):gen. 
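
An advcl edeprel should be marked by a subordinator, not by a bare preposition
with a nominal case, so 'advcl:do:gen' and 'advcl:od:gen' (e.g. in the idiom
'od nevidím do nevidím') signal that the dependent is really an oblique nominal,
and the relation is retagged as obl. A minimal sketch of what the two rules
added below do, runnable on bare deprel strings (the driver loop is
illustrative only, not udapi code):

    import re

    # The two rewrite rules from this patch, applied to plain strings.
    rules = [(r'^advcl:do:gen$', r'obl:do:gen'),
             (r'^advcl:od:gen$', r'obl:od:gen')]
    for edeprel in ['advcl:do:gen', 'advcl:od:gen', 'advcl:aby']:
        for pattern, replacement in rules:
            edeprel = re.sub(pattern, replacement, edeprel)
        print(edeprel)  # obl:do:gen, obl:od:gen, advcl:aby (left alone)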
--- udapi/block/ud/cs/fixedeprels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index b3e551e5..feed707d 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -256,6 +256,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) @@ -264,6 +265,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:od:gen$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^advcl:podle:gen$', r'obl:podle:gen', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) From f66340342ee7443e85956d35e4302164fa154c9b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 20:57:56 +0100 Subject: [PATCH 0358/1201] Minor fixes in Czech. --- udapi/block/ud/cs/fixedeprels.py | 73 ++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index feed707d..871939a8 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -5,6 +5,27 @@ class FixEdeprels(Block): + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'ač': [], + 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'byť': [], + 'i_když': [], + 'jak': [], + 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole + 'jako': [], + 'jakoby': [], # these instances in FicTree should be spelled 'jako by' + 'než': [], + 'protože': [], + 'takže': [], + 'třebaže': [] + } + # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological @@ -230,6 +251,21 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. 
+ edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + break for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -238,18 +274,19 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True break + if solved: + break # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. - if not solved: - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) - if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) @@ -261,7 +298,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob:gen$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) @@ -427,26 +464,10 @@ def process_node(self, node): # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - # If one of the following expressions occurs followed by another preposition, - # remove the additional preposition. 
For example, 'i_když_s' becomes just 'i_když'. - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ač([_:].+)?$', r'\1:ač', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ačkoliv?([_:].+)?$', r'\1:ačkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):byť[_:].+$', r'\1:byť', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):i_když[_:].+$', r'\1:i_když', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jak[_:].+$', r'\1:jak', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakkoliv?[_:].+$', r'\1:jakkoli', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jako[_:].+$', r'\1:jako', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jakoby[_:].+$', r'\1:jako', edep['deprel']) # these instances in FicTree should be spelled 'jako by' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):než[_:].+$', r'\1:než', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):protože[_:].+$', r'\1:protože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):takže[_:].+$', r'\1:takže', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):třebaže[_:].+$', r'\1:třebaže', edep['deprel']) - # edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto From 5adadf475c66727f67b6bc866e7f9f18a8ef4984 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 21:41:29 +0100 Subject: [PATCH 0359/1201] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 5 ++--- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 871939a8..019dd35b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -265,7 +265,7 @@ def process_node(self, node): solved = True break if solved: - break + continue for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -275,7 +275,7 @@ def process_node(self, node): solved = True break if solved: - break + continue # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. @@ -319,7 +319,6 @@ def process_node(self, node): node.feats['Tense'] = '' node.feats['VerbForm'] = '' node.feats['Voice'] = '' - edep['deprel'] = re.sub(r'^advcl:(od|do)$', r'obl:\1:gen', edep['deprel']) elif re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. 
diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 43ad5f3a..fba30571 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -66,7 +66,7 @@ def process_node(self, node): solved = True break if solved: - break + continue for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. @@ -76,7 +76,7 @@ def process_node(self, node): solved = True break if solved: - break + continue # The following prepositions have more than one morphological case # available. Thanks to the Case feature on prepositions, we can # identify the correct one. From 75d5ea22d23fdf2cadb16991a0b44ee83e5fddc3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 22:03:40 +0100 Subject: [PATCH 0360/1201] advcl:k:dat --- udapi/block/ud/cs/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 019dd35b..53337763 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -295,7 +295,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k:dat$', r'obl:k:dat', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' From fea338fec4872aa320dadb29313bb8d9392f6477 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 22:31:41 +0100 Subject: [PATCH 0361/1201] Another attempt to fix all Czech edeprels. --- udapi/block/ud/cs/fixedeprels.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 53337763..5be99867 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -251,6 +251,22 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! 
+ edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' + edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' + edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) # Removing 'až' must be done early. The remainder may be 'počátek' # and we will want to convert it to 'počátkem:gen'. edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) @@ -293,21 +309,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:do:gen$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^(acl):k:dat$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:místo$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:od:gen$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:od:gen$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! 
- edep['deprel'] = re.sub(r'^advcl:podle:gen$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro:acc$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu:gen$', r'obl:v_duchu:gen', edep['deprel']) if edep['deprel'] == 'acl:v' and node.form == 'patře': edep['deprel'] = 'nmod:v:loc' node.deprel = 'nmod' From f2c64f8083dc278ba338115625d6d422a954f1e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Mar 2022 23:18:35 +0100 Subject: [PATCH 0362/1201] =?UTF-8?q?59=20obl:=D0=B2=5F=D1=81=D0=BE=D0=BE?= =?UTF-8?q?=D1=82=D0=B2=D0=B5=D1=82=D1=81=D1=82=D0=B2=D0=B8=D0=B5=5F=D1=81?= =?UTF-8?q?:ins=20=20=20=20=20=2058=20obl:=D1=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 46 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index fba30571..91046131 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -30,6 +30,7 @@ class FixEdeprels(Block): 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', + 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'до': 'до:gen', @@ -44,6 +45,19 @@ class FixEdeprels(Block): 'спустя': 'спустя:acc' } + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Russian basic @@ -85,29 +99,31 @@ def process_node(self, node): # and the whole phrase should not be analyzed as genitive. m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + adpcase = copy_case_from_adposition(self, node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase else: # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' - # Both "за" and "" also occur with instrumental. However, this - # is only because there are numerals in the phrase ("за последние 20 лет") - # and the whole phrase should be usually analyzed as accusative. + continue m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. 
-                prepchildren = [x for x in node.children if x.lemma == m.group(2)]
-                if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
-                    edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower()
-                    solved = True
+                adpcase = copy_case_from_adposition(self, node, m.group(2))
+                if adpcase:
+                    edep['deprel'] = m.group(1)+':'+adpcase
                 else:
                     # Accusative or instrumental are possible. Pick accusative.
                     edep['deprel'] = m.group(1)+':'+m.group(2)+':acc'
+                continue
+            m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel'])
+            if m:
+                adpcase = copy_case_from_adposition(self, node, m.group(2))
+                if adpcase:
+                    edep['deprel'] = m.group(1)+':'+adpcase
+                else:
+                    # Genitive or instrumental are possible. Pick instrumental.
+                    edep['deprel'] = m.group(1)+':'+m.group(2)+':ins'
+                continue
             if re.match(r'^(nmod|obl):', edep['deprel']):
                 if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
                     # This is a same-case noun-noun modifier, which just happens to be in the locative.

From 8a998468cf7524e267d95b5e987a06477bc14436 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 25 Mar 2022 23:21:28 +0100
Subject: [PATCH 0363/1201] Bug fix.
---
 udapi/block/ud/ru/fixedeprels.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py
index 91046131..96831746 100644
--- a/udapi/block/ud/ru/fixedeprels.py
+++ b/udapi/block/ud/ru/fixedeprels.py
@@ -99,7 +99,7 @@ def process_node(self, node):
             # and the whole phrase should not be analyzed as genitive.
             m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel'])
             if m:
-                adpcase = copy_case_from_adposition(self, node, m.group(2))
+                adpcase = self.copy_case_from_adposition(node, m.group(2))
                 if adpcase:
                     edep['deprel'] = m.group(1)+':'+adpcase
                 else:
@@ -108,7 +108,7 @@ def process_node(self, node):
             m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel'])
             if m:
-                adpcase = copy_case_from_adposition(self, node, m.group(2))
+                adpcase = self.copy_case_from_adposition(node, m.group(2))
                 if adpcase:
                     edep['deprel'] = m.group(1)+':'+adpcase
                 else:
@@ -117,7 +117,7 @@ def process_node(self, node):
             m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel'])
             if m:
-                adpcase = copy_case_from_adposition(self, node, m.group(2))
+                adpcase = self.copy_case_from_adposition(node, m.group(2))
                 if adpcase:
                     edep['deprel'] = m.group(1)+':'+adpcase
                 else:

From 1131de3720d176eac151afd4139ae822088542f7 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 26 Mar 2022 10:46:02 +0100
Subject: [PATCH 0364/1201] Bug fix.
---
 udapi/block/ud/cs/fixedeprels.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py
index 5be99867..57e2bfb0 100644
--- a/udapi/block/ud/cs/fixedeprels.py
+++ b/udapi/block/ud/cs/fixedeprels.py
@@ -19,7 +19,7 @@ class FixEdeprels(Block):
         'jak': [],
         'jakkoli': [], # 'jakkoliv' is converted to 'jakkoli' below
         'jako': [],
-        'jakoby': [], # these instances in FicTree should be spelled 'jako by'
+        'jakoby': ['jakoby_pod'], # these instances in FicTree should be spelled 'jako by'
         'než': [],
         'protože': [],
         'takže': [],
@@ -33,7 +33,7 @@ class FixEdeprels(Block):
unambiguous = { 'abi': 'aby', - 'aby_na': 'na', + 'aby_na': 'na:loc', 'ačkoliv': 'ačkoli', 'ať': 'ať', # remove morphological case 'ať_forma': 'formou:gen', @@ -240,6 +240,19 @@ class FixEdeprels(Block): 'že_za': 'za:gen' } + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + def process_node(self, node): """ Occasionally the edeprels automatically derived from the Czech basic @@ -266,7 +279,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) # Removing 'až' must be done early. The remainder may be 'počátek' # and we will want to convert it to 'počátkem:gen'. edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) @@ -297,12 +310,10 @@ def process_node(self, node): # identify the correct one. m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: - # The following is only partial solution. We will not see - # some children because they may be shared children of coordination. - prepchildren = [x for x in node.children if x.lemma == m.group(2)] - if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': - edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() - solved = True + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + continue if re.match(r'^(acl|advcl):', edep['deprel']): # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) @@ -323,7 +334,7 @@ def process_node(self, node): node.feats['Tense'] = '' node.feats['VerbForm'] = '' node.feats['Voice'] = '' - elif re.match(r'^(nmod|obl):', edep['deprel']): + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': # This is a same-case noun-noun modifier, which just happens to be in the locative. # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has From 8d4a6e8800dc593a0d30558f794f650ff88b84e8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Mar 2022 11:46:58 +0100 Subject: [PATCH 0365/1201] More fixes to edeprels. 
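
One of the fixes below tightens an exception string: exceptions in the
'outermost' table must spell out the entire remainder captured by the matching
regex, including the morphological case, because the guard compares
x+m.group(2) against the exception list. A minimal sketch on a plain string
(illustrative only, not udapi code):

    import re

    x, exceptions = 'jakoby', ['jakoby_pod:ins']
    edeprel = 'obl:jakoby_pod:ins'
    m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):' + x + r'([_:].+)?$', edeprel)
    assert m.group(2) == '_pod:ins'      # the case suffix is part of the capture,
    assert x + m.group(2) in exceptions  # so 'jakoby_pod' alone would never match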
--- udapi/block/ud/cs/fixedeprels.py | 2 +- udapi/block/ud/ru/fixedeprels.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 57e2bfb0..7a49bb87 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -19,7 +19,7 @@ class FixEdeprels(Block): 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], - 'jakoby': ['jakoby_pod'], # these instances in FicTree should be spelled 'jako by' + 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' 'než': [], 'protože': [], 'takže': [], diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 96831746..a5560121 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,6 +15,7 @@ class FixEdeprels(Block): 'будто': [], 'ведь': [], 'как': ['как_только'], + 'раз': [], 'словно': [], 'так_что': [], 'чем': [] @@ -33,10 +34,15 @@ class FixEdeprels(Block): 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'возле': 'возле:gen', 'до': 'до:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', + 'относительно': 'относительно:gen', + 'по_мера': 'по_мере:gen', + 'по_отношение_ко?': 'по_отношению_к:dat', 'по_повод': 'по_поводу:gen', + 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', From dffc958c983bd1563ce421822a6f900511aed7b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Mar 2022 18:40:36 +0200 Subject: [PATCH 0366/1201] Fix edeprels. --- udapi/block/ud/cs/fixedeprels.py | 2 +- udapi/block/ud/ru/fixedeprels.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 7a49bb87..6f0258ed 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -20,7 +20,7 @@ class FixEdeprels(Block): 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' - 'než': [], + 'než': ['než_aby'], 'protože': [], 'takže': [], 'třebaže': [] diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index a5560121..79669d63 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -14,7 +14,9 @@ class FixEdeprels(Block): outermost = { 'будто': [], 'ведь': [], + 'если': [], 'как': ['как_только'], + 'нежели': [], 'раз': [], 'словно': [], 'так_что': [], @@ -35,6 +37,7 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'возле': 'возле:gen', + 'вплоть_до': 'вплоть_до:gen', 'до': 'до:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', @@ -48,7 +51,8 @@ class FixEdeprels(Block): 'с_помощь': 'с_помощью:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', - 'спустя': 'спустя:acc' + 'спустя': 'спустя:acc', + 'через': 'через:acc' } def copy_case_from_adposition(self, node, adposition): From 283fd9e7d0ef98f68626ca59b0a36e185539fca4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 4 Apr 2022 14:53:40 +0200 Subject: [PATCH 0367/1201] Czech o:gen is wrong. 
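
Background: in CAC, some occurrences of the preposition 'o' carry Case=Gen,
although 'o' can only govern the accusative or the locative, so a case copied
from the adposition cannot be trusted blindly here. Note that
copy_case_from_adposition() returns the whole 'adposition:case' string, e.g.
'o:gen'. A hedged sketch of the intended guard (illustrative only; it assumes
the helper introduced earlier in this series):

    adpcase = self.copy_case_from_adposition(node, m.group(2))  # e.g. 'o:gen'
    if adpcase == 'o:gen':
        adpcase = 'o:acc'  # genitive is an annotation error for 'o' in CAC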
--- udapi/block/ud/cs/fixedeprels.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index 6f0258ed..a7158d6b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -312,6 +312,9 @@ def process_node(self, node): if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: + ###!!! CAC contains 'o' with genitive, which is wrong! + if m.group(1) == 'o' and adpcase == 'gen': + adpcase = 'acc' edep['deprel'] = m.group(1)+':'+adpcase continue if re.match(r'^(acl|advcl):', edep['deprel']): From f1a1d537e8637a51dc00a6d06e01cccf6898556b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 4 Apr 2022 15:24:37 +0200 Subject: [PATCH 0368/1201] Bug fix. --- udapi/block/ud/cs/fixedeprels.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index a7158d6b..f2f76b4b 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -311,10 +311,7 @@ def process_node(self, node): m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) - if adpcase: - ###!!! CAC contains 'o' with genitive, which is wrong! - if m.group(1) == 'o' and adpcase == 'gen': - adpcase = 'acc' + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): edep['deprel'] = m.group(1)+':'+adpcase continue if re.match(r'^(acl|advcl):', edep['deprel']): From 702e9b138de59d3d8cc7267d205361cad5b34d65 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Apr 2022 05:27:31 +0200 Subject: [PATCH 0369/1201] Rename cluster to entity (#104) * `CorefCluster` -> `CorefEntity` * `mention.cluster` -> `mention.entity` * `cluster.cluster_id` -> `entity.eid` * `cluster.cluster_type` -> `entity.etype` * `for cluster in doc.coref_clusters.values():` -> `for entity in doc.coref_entities:` * `for cluster_id, cluster in data.coref_clusters.items():` -> `for eid, entity in doc.eid_to_entity:` * `doc.coref_clusters` kept, but deprecated * rename also clusters/cluster -> entities/entity almost everywhere else (variable and parameter names, comments,...) 
* new udapi.core.block methods `process_coref_mention` and `process_coref_entity` --- udapi/block/corefud/fixcorefud02.py | 18 +- udapi/block/corefud/fixinterleaved.py | 10 +- udapi/block/corefud/gum2corefud.py | 38 +-- udapi/block/corefud/indexclusters.py | 21 +- udapi/block/corefud/load.py | 2 +- udapi/block/corefud/markcrossing.py | 6 +- udapi/block/corefud/markinterleaved.py | 10 +- udapi/block/corefud/marknested.py | 10 +- udapi/block/corefud/marksamesubspan.py | 10 +- udapi/block/corefud/mergesamespan.py | 32 +-- udapi/block/corefud/movehead.py | 26 +- .../{printclusters.py => printentities.py} | 26 +- udapi/block/corefud/printmentions.py | 8 +- udapi/block/corefud/stats.py | 32 +-- udapi/block/read/oldcorefud.py | 78 +++--- udapi/block/util/eval.py | 16 +- udapi/block/write/oldcorefud.py | 36 +-- udapi/core/block.py | 60 ++++- udapi/core/coref.py | 248 +++++++++--------- udapi/core/document.py | 52 ++-- udapi/core/node.py | 12 +- udapi/core/tests/test_coref.py | 10 +- 22 files changed, 413 insertions(+), 348 deletions(-) rename udapi/block/corefud/{printclusters.py => printentities.py} (69%) diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py index b8fe44f7..1575cea6 100644 --- a/udapi/block/corefud/fixcorefud02.py +++ b/udapi/block/corefud/fixcorefud02.py @@ -20,32 +20,32 @@ def process_document(self, doc): if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' - for cluster in doc.coref_clusters.values(): - if cluster.cluster_type: + for entity in doc.coref_entities: + if entity.etype: # Harmonize etype. # If gen/spec is distinguished, store it in all mentions' other['gstype']. - etype = cluster.cluster_type.lower() + etype = entity.etype.lower() if etype.startswith('spec') or etype.startswith('gen'): gstype = 'gen' if etype.startswith('gen') else 'spec' - for m in cluster.mentions: + for m in entity.mentions: m.other['gstype'] = gstype if etype == 'spec': etype = 'other' etype = etype.replace('gen', '').replace('spec', '').replace('.', '') etype = NEW_ETYPE.get(etype, etype) - # cluster_type="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. - # Apposition is a mention-based rather than cluster-based attribute. + # etype="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than entity-based attribute. # We don't know which of the mentions it should be assigned, but let's expect all non-first. # UD marks appositions with deprel appos, so once someone checks it is really redunant, # TODO we can delete the appos mention attribute. 
if etype == 'appos': etype = '' - for mention in cluster.mentions[1:]: + for mention in entity.mentions[1:]: mention.other['appos'] = '1' - cluster.cluster_type = etype + entity.etype = etype - for mention in cluster.mentions: + for mention in entity.mentions: # Harmonize bridge relation labels for bridge in mention.bridging: rel = bridge.relation.lower() diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index 6921c680..c5a1b3ed 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -5,10 +5,10 @@ class FixInterleaved(Block): """Fix mentions with interleaved or crossing spans.""" - def __init__(self, same_cluster_only=True, both_discontinuous=False, + def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.crossing_only = crossing_only self.nested_same_subspan = nested_same_subspan @@ -22,7 +22,7 @@ def process_tree(self, tree): for mA, mB in itertools.combinations(mentions, 2): if mA in deleted or mB in deleted: continue - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue # Fully nested spans are OK, expect for same-subspan @@ -53,7 +53,7 @@ def process_tree(self, tree): except ValueError: pass try: - mB.cluster.mentions.remove(mB) + mB.entity.mentions.remove(mB) except ValueError: pass deleted.add(mB) @@ -75,7 +75,7 @@ def process_tree(self, tree): except ValueError: pass try: - mA.cluster.mentions.remove(mA) + mA.entity.mentions.remove(mA) except ValueError: pass break diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py index bcd24968..bf6d798d 100644 --- a/udapi/block/corefud/gum2corefud.py +++ b/udapi/block/corefud/gum2corefud.py @@ -8,7 +8,7 @@ class Gum2CorefUD(Block): def process_tree(self, tree): docname = tree.bundle.document.meta['docname'] + '_' - clusters = tree.bundle.document.coref_clusters + eid_to_entity = tree.bundle.document._eid_to_entity unfinished_mentions = defaultdict(list) for node in tree.descendants: misc_entity = node.misc['Entity'] @@ -47,15 +47,15 @@ def process_tree(self, tree): else: raise ValueError(f"Less than 5 attributes in {entity} at {node}") name = docname + grp - cluster = clusters.get(name) - if cluster is None: - cluster = node.create_coref_cluster(cluster_id=name, cluster_type=etype) - mention = cluster.mentions[0] + entity = eid_to_entity.get(name) + if entity is None: + entity = node.create_coref_entity(eid=name, etype=etype) + mention = entity.mentions[0] mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" if wiki: mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') else: - mention = cluster.create_mention(head=node) + mention = entity.create_mention(head=node) if closing: mention.words = [node] else: @@ -71,23 +71,23 @@ def process_tree(self, tree): except ValueError as err: raise ValueError(f"{node}: {misc_bridge} {err}") try: - trg_cluster = clusters[trg_str] - src_cluster = clusters[src_str] + trg_entity = eid_to_entity[trg_str] + src_entity = eid_to_entity[src_str] except KeyError as err: - logging.warning(f"{node}: Cannot find cluster {err}") + logging.warning(f"{node}: Cannot find entity {err}") else: - mention = src_cluster.mentions[-1] + mention = src_entity.mentions[-1] # TODO: what relation 
should we choose for Bridging? # relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}" relation = '_' - mention.bridging.append((trg_cluster, relation)) + mention.bridging.append((trg_entity, relation)) del node.misc['Bridge'] misc_split = node.misc['Split'] if misc_split: # E.g. Entity=(person-54)|Split=4<54,9<54 src_str = docname + misc_split.split('<')[-1] - ante_clusters = [] + ante_entities = [] for x in misc_split.split(','): ante_str, this_str = [docname + grp for grp in x.split('<')] if this_str != src_str: @@ -96,16 +96,16 @@ def process_tree(self, tree): # There are just three such cases in GUM and all are bugs, # so let's ignore them entirely (the `else` clause will be skipped if exiting `for` w/ `break`). # break - ante_clusters.append(clusters[ante_str]) + ante_entities.append(eid_to_entity[ante_str]) else: - clusters[src_str].split_ante = ante_clusters + eid_to_entity[src_str].split_ante = ante_entities del node.misc['Split'] - for cluster_name, mentions in unfinished_mentions.items(): + for entity_name, mentions in unfinished_mentions.items(): for mention in mentions: logging.warning(f"Mention {name} opened at {mention.head}, but not closed in the same tree. Deleting.") - cluster = mention.cluster + entity = mention.entity mention.words = [] - cluster._mentions.remove(mention) - if not cluster._mentions: - del clusters[name] + entity._mentions.remove(mention) + if not entity._mentions: + del eid_to_entity[name] diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py index 14cf778d..3f5d74d8 100644 --- a/udapi/block/corefud/indexclusters.py +++ b/udapi/block/corefud/indexclusters.py @@ -3,10 +3,10 @@ class IndexClusters(Block): - """Re-index the coreference cluster IDs. The final cluster IDs are of the "e" form, + """Re-index the coreference entity IDs (eid). The final entity IDs are of the "e" form, where are ordinal numbers starting from the one specified by the `start` parameter. This block can be applied on multiple documents within one udapy call. 
- For example, to re-index ClusterId in all conllu files in the current directory + For example, to re-index eid in all conllu files in the current directory (keeping the IDs unique across all the files), use: `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` @@ -23,14 +23,13 @@ def __init__(self, start=1, prefix='e'): self.prefix = prefix def process_document(self, doc): - clusters = doc.coref_clusters - if not clusters: + entities = doc.coref_entities + if not entities: return - new_clusters = {} - for idx, cid in enumerate(clusters, self.start): - cluster = clusters[cid] - new_cid = self.prefix + str(idx) - cluster.cluster_id = new_cid - new_clusters[new_cid] = cluster + new_eid_to_entity = {} + for idx, entity in enumerate(entities, self.start): + new_eid = self.prefix + str(idx) + entity.eid = new_eid + new_eid_to_entity[new_eid] = entity self.start = idx + 1 - doc._coref_clusters = new_clusters + doc._eid_to_entity = new_eid_to_entity diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py index 3b2534bc..92773dc2 100644 --- a/udapi/block/corefud/load.py +++ b/udapi/block/corefud/load.py @@ -8,5 +8,5 @@ def __init__(self, strict=True): self.strict = strict def process_document(self, doc): - if doc._coref_clusters is None: + if doc._eid_to_entity is None: udapi.core.coref.load_coref_from_misc(doc, self.strict) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py index a6d9346a..8064e67f 100644 --- a/udapi/block/corefud/markcrossing.py +++ b/udapi/block/corefud/markcrossing.py @@ -6,10 +6,10 @@ class MarkCrossing(Block): """Find mentions with crossing spans.""" - def __init__(self, same_cluster_only=False, continuous_only=False, print_form=False, + def __init__(self, same_entity_only=False, continuous_only=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.continuous_only = continuous_only self.print_form = print_form self.log = log @@ -26,7 +26,7 @@ def process_node(self, node): if len(node.coref_mentions) > 1: for mA, mB in itertools.combinations(node.coref_mentions, 2): if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.continuous_only and (',' in mA.span or ',' in mB.span): continue diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py index ac4d9438..c00f73b1 100644 --- a/udapi/block/corefud/markinterleaved.py +++ b/udapi/block/corefud/markinterleaved.py @@ -5,10 +5,10 @@ class MarkInterleaved(Block): """Find mentions with interleaved spans.""" - def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.print_form = print_form self.log = log @@ -16,9 +16,9 @@ def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return 
mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -33,7 +33,7 @@ def process_tree(self, tree): continue if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: continue - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): continue diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py index 656111c6..8db8a657 100644 --- a/udapi/block/corefud/marknested.py +++ b/udapi/block/corefud/marknested.py @@ -5,10 +5,10 @@ class MarkNested(Block): """Find nested mentions.""" - def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_only=False, + def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False, print_form=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.multiword_only = multiword_only self.print_form = print_form @@ -17,9 +17,9 @@ def __init__(self, same_cluster_only=True, both_discontinuous=False, multiword_o def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -27,7 +27,7 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): continue diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py index f99e0e13..f3cfd7b3 100644 --- a/udapi/block/corefud/marksamesubspan.py +++ b/udapi/block/corefud/marksamesubspan.py @@ -5,10 +5,10 @@ class MarkSameSubSpan(Block): """Find mentions with the same subspan.""" - def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form=False, nested_only=False, + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False, log=True, mark=True, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only self.both_discontinuous = both_discontinuous self.nested_only = nested_only self.print_form = print_form @@ -17,9 +17,9 @@ def __init__(self, same_cluster_only=False, both_discontinuous=False, print_form def _print(self, mention): if self.print_form: - return mention.cluster.cluster_id + ':' + ' '.join([w.form for w in mention.words]) + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) else: - return mention.cluster.cluster_id + ':' + mention.span + return mention.entity.eid + ':' + mention.span def process_tree(self, tree): mentions = set() @@ -28,7 +28,7 @@ def process_tree(self, tree): mentions.add(m) if len(mentions) > 1: for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue if self.both_discontinuous 
and (',' not in mA.span or ',' not in mB.span): continue diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py index bdeefd7c..61b613cb 100644 --- a/udapi/block/corefud/mergesamespan.py +++ b/udapi/block/corefud/mergesamespan.py @@ -11,9 +11,9 @@ class MergeSameSpan(Block): CorefUD data, so this block processes one sentence at a time. """ - def __init__(self, same_cluster_only=False, **kwargs): + def __init__(self, same_entity_only=False, **kwargs): super().__init__(**kwargs) - self.same_cluster_only = same_cluster_only + self.same_entity_only = same_entity_only def process_tree(self, tree): mentions = set() @@ -22,31 +22,31 @@ def process_tree(self, tree): mentions.add(m) for mA, mB in itertools.combinations(mentions, 2): - if self.same_cluster_only and mA.cluster != mB.cluster: + if self.same_entity_only and mA.entity != mB.entity: continue # Reduce non-determinism in which mention is removed: - # If the mentions belong to different entities, sort them by entity (cluster) ids. - if mA.cluster.cluster_id > mB.cluster.cluster_id: + # If the mentions belong to different entities, sort them by entity (entity) ids. + if mA.entity.eid > mB.entity.eid: mA, mB = mB, mA sA, sB = set(mA.words), set(mB.words) if sA != sB: continue - # If the mentions belong to different clusters, we should merge the - # clusters first, i.e., pick one cluster as the survivor, move the - # mentions from the other cluster to this cluster, and remove the - # other cluster. - if mA.cluster != mB.cluster: - logging.warning(f"Merging same-span mentions that belong to different entities: {mA.cluster.cluster_id} vs. {mB.cluster.cluster_id}") - ###!!! TODO: As of now, changing the cluster of a mention is not supported in the API. - #for m in mB.cluster.mentions: - # m.cluster = mA.cluster + # If the mentions belong to different entities, we should merge the + # entities first, i.e., pick one entity as the survivor, move the + # mentions from the other entity to this entity, and remove the + # other entity. + if mA.entity != mB.entity: + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}") + ###!!! TODO: As of now, changing the entity of a mention is not supported in the API. + #for m in mB.entity.mentions: + # m.entity = mA.entity # Remove mention B. It may have been removed earlier because of # another duplicate, that is the purpose of try-except. - ###!!! TODO: If we remove a singleton, we are destroying the cluster. Then we must also handle possible bridging and split antecedents pointing to that cluster! + ###!!! TODO: If we remove a singleton, we are destroying the entity. Then we must also handle possible bridging and split antecedents pointing to that entity! mB.words = [] try: - mB.cluster.mentions.remove(mB) + mB.entity.mentions.remove(mB) except ValueError: pass diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py index 2a38bd82..00a32e9f 100644 --- a/udapi/block/corefud/movehead.py +++ b/udapi/block/corefud/movehead.py @@ -75,20 +75,18 @@ def find_head(self, mention): # Finally, return the word-order-wise first head candidate as the head. 
return enh_heads[0], 'nontreelet' - def process_document(self, doc): - for cluster in doc.coref_clusters.values(): - for mention in cluster.mentions: - self.counter['total'] += 1 - if len(mention.words) < 2: - self.counter['single-word'] += 1 - else: - new_head, category = self.find_head(mention) - self.counter[category] += 1 - if new_head is mention.head: - self.counter[category + '-kept'] += 1 - else: - self.counter[category + '-moved'] += 1 - mention.head = new_head + def process_coref_mention(self, mention): + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + self.counter[category] += 1 + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head def process_end(self): logging.info("corefud.MoveHead overview of mentions:") diff --git a/udapi/block/corefud/printclusters.py b/udapi/block/corefud/printentities.py similarity index 69% rename from udapi/block/corefud/printclusters.py rename to udapi/block/corefud/printentities.py index 7271ae78..7230c6a5 100644 --- a/udapi/block/corefud/printclusters.py +++ b/udapi/block/corefud/printentities.py @@ -3,20 +3,20 @@ from udapi.core.block import Block from collections import Counter, defaultdict -class PrintClusters(Block): - """Block corefud.PrintClusters prints all mentions of a given cluster.""" +class PrintEntities(Block): + """Block corefud.PrintEntities prints all mentions of a given entity.""" - def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True, + def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True, aggregate_mentions=True, **kwargs): """Params: - id_re: regular expression constraining ClusterId of the clusters to be printed - min_mentions: print only clusters with with at least N mentions + eid_re: regular expression constraining ID of the entities to be printed + min_mentions: print only entities with at least N mentions print_ranges: print also addresses of all mentions (compactly, using the longest common prefix of sent_id) mark_head: mark the head (e.g.
as "red **car**") """ super().__init__(**kwargs) - self.id_re = re.compile(str(id_re)) if id_re else None + self.eid_re = re.compile(str(eid_re)) if eid_re else None self.min_mentions = min_mentions self.print_ranges = print_ranges self.mark_head = mark_head @@ -24,17 +24,17 @@ def __init__(self, id_re=None, min_mentions=0, print_ranges=True, mark_head=True def process_document(self, doc): if 'docname' in doc.meta: - print(f"Coref clusters in document {doc.meta['docname']}:") - for cluster in doc.coref_clusters.values(): - if self.id_re and not self.id_re.match(cluster.cluster_id): + print(f"Coref entities in document {doc.meta['docname']}:") + for entity in doc.coref_entities: + if self.eid_re and not self.eid_re.match(entity.eid): continue - if len(cluster.mentions) < self.min_mentions: + if len(entity.mentions) < self.min_mentions: continue - print(f" {cluster.cluster_id} has {len(cluster.mentions)} mentions:") + print(f" {entity.eid} has {len(entity.mentions)} mentions:") if self.aggregate_mentions: counter = Counter() ranges = defaultdict(list) - for mention in cluster.mentions: + for mention in entity.mentions: forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) counter[forms] += 1 if self.print_ranges: @@ -48,7 +48,7 @@ def process_document(self, doc): prefix = os.path.commonprefix(ranges[form]) print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') else: - for mention in cluster.mentions: + for mention in entity.mentions: forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) print(' ' + forms) if self.print_ranges: diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index e26ee6e2..7ed31b0d 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -99,9 +99,9 @@ def _is_almost_continuous(self, mention): def process_document(self, doc): mentions = [] - for cluster in doc.coref_clusters.values(): - if self._ok(len(cluster.mentions) == 1, self.singleton): - mentions.extend(cluster.mentions) + for entity in doc.coref_entities: + if self._ok(len(entity.mentions) == 1, self.singleton): + mentions.extend(entity.mentions) if self.shuffle: random.shuffle(mentions) else: @@ -146,7 +146,7 @@ def process_document(self, doc): print("# Mention = " + this_form) if self.print_other_forms: counter = Counter() - for m in mention.cluster.mentions: + for m in mention.entity.mentions: forms = ' '.join([w.form for w in m.words]) if forms != this_form: counter[forms] += 1 diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index e39195db..cdd84e7a 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,14 +4,14 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_clusters=True, + def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entities=True, report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max self.c_len_max = c_len_max self.report_mentions = report_mentions - self.report_clusters = report_clusters + self.report_entities = report_entities self.report_details = report_details self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = 
exclude_nonsingletons @@ -21,29 +21,29 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_cluste self.counter = Counter() self.mentions = 0 - self.clusters = 0 + self.entities = 0 self.total_nodes = 0 self.longest_mention = 0 - self.longest_cluster = 0 + self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) - for cluster in doc.coref_clusters.values(): - len_mentions = len(cluster.mentions) + for entity in doc.coref_entities: + len_mentions = len(entity.mentions) if len_mentions == 1 and self.exclude_singletons: continue elif len_mentions > 1 and self.exclude_nonsingletons: continue - self.longest_cluster = max(len_mentions, self.longest_cluster) + self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 - self.clusters += 1 + self.entities += 1 if not self.report_mentions and not self.report_details: continue - for mention in cluster.mentions: + for mention in entity.mentions: self.mentions += 1 all_words = len(mention.words) non_empty = len([w for w in mention.words if not w.is_empty()]) @@ -68,17 +68,17 @@ def process_document(self, doc): def process_end(self): mentions_nonzero = 1 if self.mentions == 0 else self.mentions - clusters_nonzero = 1 if self.clusters == 0 else self.clusters + entities_nonzero = 1 if self.entities == 0 else self.entities total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns = [] - if self.report_clusters: - columns += [('clusters', f"{self.clusters:7,}"), - ('clusters_per1k', f"{1000 * self.clusters / total_nodes_nonzero:6.0f}"), - ('longest_cluster', f"{self.longest_cluster:6}"), - ('avg_cluster', f"{self.counter['c_total_len'] / clusters_nonzero:5.1f}")] + if self.report_entities: + columns += [('entities', f"{self.entities:7,}"), + ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), + ('longest_entity', f"{self.longest_entity:6}"), + ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] for i in range(1, self.c_len_max + 1): - percent = 100 * self.counter[f"c_len_{i}"] / clusters_nonzero + percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) if self.report_mentions: columns += [('mentions', f"{self.mentions:7,}"), diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py index 539d5036..73e05f3b 100644 --- a/udapi/block/read/oldcorefud.py +++ b/udapi/block/read/oldcorefud.py @@ -2,7 +2,7 @@ import re import logging import udapi.block.read.conllu -from udapi.core.coref import CorefCluster, CorefMention, BridgingLinks +from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks class OldCorefUD(udapi.block.read.conllu.Conllu): @@ -11,7 +11,7 @@ def __init__(self, replace_hyphen_in_id_with='', **kwargs): Args: replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId - The new format does not allow hyphens in eid (IDs of entity clusters), + The new format does not allow hyphens in eid (IDs of entities), so we need to replace them.
""" super().__init__(**kwargs) @@ -36,27 +36,27 @@ def _fix_id(self, cid): def process_document(self, doc, strict=True): super().process_document(doc) - clusters = {} + eid_to_entity = {} for node in doc.nodes_and_empty: index, index_str = 0, "" - cluster_id = node.misc["ClusterId"] - if not cluster_id: + eid = node.misc["ClusterId"] + if not eid: index, index_str = 1, "[1]" - cluster_id = node.misc["ClusterId[1]"] - cluster_id = self._fix_id(cluster_id) - while cluster_id: - cluster = clusters.get(cluster_id) - if cluster is None: - cluster = CorefCluster(cluster_id) - clusters[cluster_id] = cluster - mention = CorefMention(words=[node], cluster=cluster) + eid = node.misc["ClusterId[1]"] + eid = self._fix_id(eid) + while eid: + entity = eid_to_entity.get(eid) + if entity is None: + entity = CorefEntity(eid) + eid_to_entity[eid] = entity + mention = CorefMention(words=[node], entity=entity) if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] - cluster_type = node.misc["ClusterType" + index_str] - if cluster_type: - if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: - logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") - cluster.cluster_type = cluster_type + etype = node.misc["ClusterType" + index_str] + if etype: + if entity.etype is not None and etype != entity.etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + entity.etype = etype bridging_str = node.misc["Bridging" + index_str] if bridging_str: @@ -64,11 +64,11 @@ def process_document(self, doc, strict=True): for link_str in bridging_str.split(','): target, relation = link_str.split(':') target = self._fix_id(target) - if target == cluster_id: - _error("Bridging cannot self-reference the same cluster: " + target, strict) - if target not in clusters: - clusters[target] = CorefCluster(target) - mention._bridging.append((clusters[target], relation)) + if target == eid: + _error("Bridging cannot self-reference the same entity: " + target, strict) + if target not in eid_to_entity: + eid_to_entity[target] = CorefEntity(target) + mention._bridging.append((eid_to_entity[target], relation)) split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: @@ -77,16 +77,16 @@ def process_document(self, doc, strict=True): # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. for ante_str in split_ante_str.replace('+', ',').split(','): ante_str = self._fix_id(ante_str) - if ante_str in clusters: - if ante_str == cluster_id: - _error("SplitAnte cannot self-reference the same cluster: " + cluster_id, strict) - split_antes.append(clusters[ante_str]) + if ante_str in eid_to_entity: + if ante_str == eid: + _error("SplitAnte cannot self-reference the same entity: " + eid, strict) + split_antes.append(eid_to_entity[ante_str]) else: # split cataphora, e.g. "We, that is you and me..." - ante_cl = CorefCluster(ante_str) - clusters[ante_str] = ante_cl + ante_cl = CorefEntity(ante_str) + eid_to_entity[ante_str] = ante_cl split_antes.append(ante_cl) - cluster.split_ante = sorted(split_antes) + entity.split_ante = sorted(split_antes) # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. # We also need to escape forbidden characters. 
@@ -94,16 +94,16 @@ def process_document(self, doc, strict=True): mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') index += 1 index_str = f"[{index}]" - cluster_id = self._fix_id(node.misc["ClusterId" + index_str]) - # c=doc.coref_clusters should be sorted, so that c[0] < c[1] etc. - # In other words, the dict should be sorted by the values (according to CorefCluster.__lt__), - # not by the keys (cluster_id). + eid = self._fix_id(node.misc["ClusterId" + index_str]) + # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), + # not by the keys (eid). # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. - for cluster in clusters.values(): - if not cluster._mentions: - _error(f"Cluster {cluster.cluster_id} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) - cluster._mentions.sort() - doc._coref_clusters = {c._cluster_id: c for c in sorted(clusters.values())} + for entity in eid_to_entity.values(): + if not entity._mentions: + _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + entity._mentions.sort() + doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())} # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index 07eab681..0f80d018 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - coref_mention=None, coref_cluster=None, + coref_mention=None, coref_entity=None, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -43,7 +43,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.before_bundle = before_bundle self.after_bundle = after_bundle self.coref_mention = coref_mention - self.coref_cluster = coref_cluster + self.coref_entity = coref_entity self.expand_code = expand_code self.count = collections.Counter() @@ -74,13 +74,13 @@ def process_document(self, document): # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) - if self.coref_cluster or self.coref_mention: - for cluster in doc.coref_clusters.values(): - if self.coref_cluster: - this = cluster - exec(self.expand_eval_code(self.coref_cluster)) + if self.coref_entity or self.coref_mention: + for entity in doc.coref_entities: + if self.coref_entity: + this = entity + exec(self.expand_eval_code(self.coref_entity)) if self.coref_mention: - for mention in cluster.mentions: + for mention in entity.mentions: this = mention exec(self.expand_eval_code(self.coref_mention)) diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py index 4eb316bb..49f9beb0 100644 --- a/udapi/block/write/oldcorefud.py +++ b/udapi/block/write/oldcorefud.py @@ -6,7 +6,7 @@ class OldCorefUD(udapi.block.write.conllu.Conllu): def process_document(self, doc): - if not doc.coref_clusters: + if not doc.coref_entities: logging.warning("Using write.OldCorefUD on a document without any coreference annotation") # Delete both new-style 
(GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. @@ -17,19 +17,19 @@ def process_document(self, doc): del node.misc[key] del doc.meta['global.Entity'] - # doc._coref_clusters is a dict, which is insertion ordered in Python 3.7+. - # The insertion order is sorted according to CorefCluster.__lt__ (see few lines above). - # However, new clusters could be added meanwhile or some clusters edited, - # so we need to sort the clusters again before storing to MISC. - # We also need to mare sure cluster.mentions are sorted in each cluster - # because the ordering of clusters is defined by the first mention in each cluster. - # Ordering of mentions within a cluster can be changed when e.g. changing the span + # doc._eid_to_entity is a dict, which is insertion-ordered in Python 3.7+. + # The insertion order is sorted according to CorefEntity.__lt__ (see few lines above). + # However, new entities could be added meanwhile or some entities edited, + # so we need to sort the entities again before storing to MISC. + # We also need to make sure entity.mentions are sorted in each entity + # because the ordering of entities is defined by the first mention in each entity. + # Ordering of mentions within an entity can be changed when e.g. changing the span of a given mention or reordering words within a sentence and in such events - # Udapi currently does not automatically update the ordering of clusters. - for cluster in doc._coref_clusters.values(): - cluster._mentions.sort() - for cluster in sorted(doc._coref_clusters.values()): - for mention in cluster.mentions: + # Udapi currently does not automatically update the ordering of entities. + for entity in doc.coref_entities: + entity._mentions.sort() + for entity in sorted(doc.coref_entities): + for mention in entity.mentions: head = mention.head if head.misc["ClusterId"]: for a in attrs: @@ -44,13 +44,13 @@ def process_document(self, doc): index_str = f"[{index}]" if index == 1: index_str = "" - head.misc["ClusterId" + index_str] = cluster.cluster_id + head.misc["ClusterId" + index_str] = entity.eid head.misc["MentionSpan" + index_str] = mention.span - head.misc["ClusterType" + index_str] = cluster.cluster_type + head.misc["ClusterType" + index_str] = entity.etype if mention._bridging: - head.misc["Bridging" + index_str] = ','.join(f'{l.target.cluster_id}:{l.relation}' for l in sorted(mention.bridging)) - if cluster.split_ante: - serialized = ','.join((c.cluster_id for c in sorted(cluster.split_ante))) + head.misc["Bridging" + index_str] = ','.join(f'{l.target.eid}:{l.relation}' for l in sorted(mention.bridging)) + if entity.split_ante: + serialized = ','.join((c.eid for c in sorted(entity.split_ante))) head.misc["SplitAnte" + index_str] = serialized if mention.other: head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') diff --git a/udapi/core/block.py b/udapi/core/block.py index 32033cde..f039abce 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,6 +1,9 @@ """Block class represents the basic Udapi processing unit.""" import logging +def not_overridden(method): + method.is_not_overridden = True + return method class Block(object): """The smallest processing unit for processing Universal Dependencies data.
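An aside on the not_overridden helper just added: stamping a marker attribute on the base-class method lets process_document detect overrides with a cheap hasattr test, without calling any hook. A minimal self-contained sketch of the pattern (class names are illustrative, not udapi API):

    def not_overridden(method):
        method.is_not_overridden = True
        return method

    class Base:
        @not_overridden
        def hook(self):
            pass

    class WithHook(Base):
        def hook(self):  # redefining the method loses the marker attribute
            print("doing work")

    # Attribute lookup on a bound method falls through to the function object:
    print(hasattr(Base().hook, 'is_not_overridden'))      # True  -> not overridden
    print(hasattr(WithHook().hook, 'is_not_overridden'))  # False -> overridden

This is exactly the test that the rewritten process_document below performs for process_coref_entity, process_coref_mention, process_bundle, process_tree and process_node before deciding what to iterate over.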
@@ -23,10 +26,12 @@ def process_end(self): """A hook method that is executed after processing all UD data""" pass + @not_overridden def process_node(self, _): """Process a UD node""" - raise Exception("No processing activity defined in block " + str(self)) + pass + @not_overridden def process_tree(self, tree): """Process a UD tree""" # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), @@ -36,6 +41,7 @@ def process_tree(self, tree): for node in tree.descendants: self.process_node(node) + @not_overridden def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: @@ -54,10 +60,54 @@ def apply_on_document(self, document): def process_document(self, document): """Process a UD document""" - for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) - self.process_bundle(bundle) + # Calling document.coref_entities is expensive because + # it needs to deserialize coref_entities from the MISC attributes. + # If no block in a scenario needs to process coreference entities/mentions, + # the deserialization does not need to be done. + # So we need to detect if any of the methods process_coref_entity and process_coref_mention + # has been overridden (without calling them, which could have adverse side effects). + # Let's use method annotations for this. + p_entity = not hasattr(self.process_coref_entity, 'is_not_overridden') + p_mention = not hasattr(self.process_coref_mention, 'is_not_overridden') + p_bundle = not hasattr(self.process_bundle, 'is_not_overridden') + p_tree = not hasattr(self.process_tree, 'is_not_overridden') + p_node = not hasattr(self.process_node, 'is_not_overridden') + if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): + raise Exception("No processing activity defined in block " + str(self)) + + if p_entity or p_mention: + for entity in document.coref_entities: + if p_entity: + self.process_coref_entity(entity) + else: + for mention in entity.mentions: + self.process_coref_mention(mention) + + if p_bundle or p_tree or p_node: + for bundle_no, bundle in enumerate(document.bundles, 1): + logging.debug('Block %s processing bundle #%d (id=%s)', + self.__class__.__name__, bundle_no, bundle.bundle_id) + if p_bundle: + self.process_bundle(bundle) + else: + for tree in bundle: + if self._should_process_tree(tree): + if p_tree: + self.process_tree(tree) + else: + for node in tree.descendants: + self.process_node(node) + + @not_overridden + def process_coref_entity(self, entity): + """This method is called on each coreference entity in the document.""" + for mention in entity.mentions: + self.process_coref_mention(mention) + + @not_overridden + def process_coref_mention(self, mention): + """This method is called on each coreference mention in the document.""" + pass def before_process_document(self, document): """This method is called before each process_document.""" diff --git a/udapi/core/coref.py b/udapi/core/coref.py index eef25dd2..ff66c77f 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -107,21 +107,21 @@ @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words', '_other'] + __slots__ = ['_head', '_entity', '_bridging', '_words', '_other'] - def __init__(self, words, head=None, cluster=None, add_word_backlinks=True): + def __init__(self, words, head=None, entity=None,
add_word_backlinks=True): if not words: raise ValueError("mention.words must be non-empty") self._head = head if head else words[0] - self._cluster = cluster - if cluster is not None: - cluster._mentions.append(self) + self._entity = entity + if entity is not None: + entity._mentions.append(self) self._bridging = None self._other = None self._words = words if add_word_backlinks: for new_word in words: - if not new_word._mentions or not cluster or self > new_word._mentions[-1]: + if not new_word._mentions or not entity or self > new_word._mentions[-1]: new_word._mentions.append(self) else: new_word._mentions.append(self) @@ -141,8 +141,8 @@ def __lt__(self, another): their order is defined by the order of the last word in their span. For example <1-2> precedes <1-3>. - The order of two same-span mentions is currently defined by their cluster_id. - There should be no same-span (or same-subspan) same-cluster mentions. + The order of two same-span mentions is currently defined by their eid. + There should be no same-span (or same-subspan) same-entity mentions. """ #TODO: no mention.words should be handled already when loading if not self._words: @@ -159,7 +159,7 @@ def __lt__(self, another): return True if another._words[-1].precedes(self._words[-1]): return False - return self._cluster.cluster_id < another._cluster.cluster_id + return self._entity.eid < another._entity.eid return self._words[0].precedes(another._words[0]) @property @@ -186,15 +186,15 @@ def head(self, new_head): self._head = new_head @property - def cluster(self): - return self._cluster + def entity(self): + return self._entity - @cluster.setter - def cluster(self, new_cluster): - if self._cluster is not None: - raise NotImplementedError('changing the cluster of a mention not supported yet') - self._cluster = new_cluster - new_cluster._mentions.append(new_cluster) + @entity.setter + def entity(self, new_entity): + if self._entity is not None: + raise NotImplementedError('changing the entity of a mention not supported yet') + self._entity = new_entity + new_entity._mentions.append(self) @property def bridging(self): @@ -216,7 +216,7 @@ def words(self): @words.setter def words(self, new_words): if new_words and self.head not in new_words: - raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._cluster.cluster_id}") + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._entity.eid}") kept_words = [] # Make sure each word is included just once and they are in the correct order. new_words = sorted(list(set(new_words))) @@ -247,44 +247,44 @@ def span(self, new_span): @functools.total_ordering -class CorefCluster(object): +class CorefEntity(object): """Class for representing all mentions of a given entity.""" - __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] + __slots__ = ['_eid', '_mentions', 'etype', 'split_ante'] - def __init__(self, cluster_id, cluster_type=None): - self._cluster_id = None # prepare the _cluster_id slot - self.cluster_id = cluster_id # call the setter and check the ID is valid + def __init__(self, eid, etype=None): + self._eid = None # prepare the _eid slot + self.eid = eid # call the setter and check the ID is valid self._mentions = [] - self.cluster_type = cluster_type + self.etype = etype self.split_ante = [] def __lt__(self, another): - """Does this CorefCluster precedes (word-order wise) `another` cluster? + """Does this CorefEntity precede (word-order wise) `another` entity?
- This method defines a total ordering of all clusters - by the first mention of each cluster (see `CorefMention.__lt__`). - If one of the clusters has no mentions (which should not happen normally), + This method defines a total ordering of all entities + by the first mention of each entity (see `CorefMention.__lt__`). + If one of the entities has no mentions (which should not happen normally), there is a backup solution (see the source code). - If cluster IDs are not important, it is recommended to use block - `corefud.IndexClusters` to re-name cluster IDs in accordance with this cluster ordering. + If entity IDs are not important, it is recommended to use block + `corefud.IndexClusters` to re-name entity IDs in accordance with this entity ordering. """ if not self._mentions or not another._mentions: - # Clusters without mentions should go first, so the ordering is total. - # If both clusters are missing mentions, let's use cluster_id, so the ordering is stable. + # Entities without mentions should go first, so the ordering is total. + # If both entities are missing mentions, let's use eid, so the ordering is stable. if not self._mentions and not another._mentions: - return self._cluster_id < another._cluster_id + return self._eid < another._eid return not self._mentions return self._mentions[0] < another._mentions[0] @property - def cluster_id(self): - return self._cluster_id + def eid(self): + return self._eid - @cluster_id.setter - def cluster_id(self, new_cluster_id): - if any(x in new_cluster_id for x in CHARS_FORBIDDEN_IN_ID): - raise ValueError(f"{new_cluster_id} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") - self._cluster_id = new_cluster_id + @eid.setter + def eid(self, new_eid): + if any(x in new_eid for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") + self._eid = new_eid @property def eid_or_grp(self): @@ -292,18 +292,18 @@ def eid_or_grp(self): meta = root.document.meta if 'GRP' in meta['global.Entity'] and meta['tree2docid']: docid = meta['tree2docid'][root] - if self._cluster_id.startswith(docid): - return self._cluster_id.replace(docid, '', 1) + if self._eid.startswith(docid): + return self._eid.replace(docid, '', 1) else: - logging.warning(f"GRP in global.Entity, but eid={self._cluster_id} does not start with docid={docid}") - return self._cluster_id + logging.warning(f"GRP in global.Entity, but eid={self._eid} does not start with docid={docid}") + return self._eid @property def mentions(self): return self._mentions def create_mention(self, head=None, words=None, span=None): - """Create a new CoreferenceMention object within this CorefCluster. + """Create a new CoreferenceMention object within this CorefEntity. Args: head: a node where the annotation about this CorefMention will be stored in MISC. 
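To see the renamed API end to end, a hedged usage sketch (assuming n1 and n2 are udapi nodes of one sentence; registering the entity in a document-level dict is the readers' job, as shown earlier):

    entity = CorefEntity('e5', etype='person')
    mention = entity.create_mention(words=[n1, n2])  # head defaults to the first word
    assert mention.entity is entity
    print(entity.eid, entity.etype, mention.span)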
@@ -330,7 +330,7 @@ def create_mention(self, head=None, words=None, span=None): if head is None: head = words[0] - mention = CorefMention(words=[head], head=head, cluster=self) + mention = CorefMention(words=[head], head=head, entity=self) if words: mention.words = words if span: @@ -353,7 +353,7 @@ def all_bridging(self): # from dataclasses import dataclass # @dataclass # class DataClassCard: -# target: CorefCluster +# target: CorefEntity # relation: str class BridgingLink: __slots__ = ['target', 'relation'] @@ -374,9 +374,9 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')]) # from a list of tuples - >>> (bl8, bl9) = BridgingLinks.from_string('c12<c8:part,c56<c8:subset|c5<c9', clusters, node) - >>> for cluster, relation in bl: - >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") + >>> (bl8, bl9) = BridgingLinks.from_string('c12<c8:part,c56<c8:subset|c5<c9', entities, node) + >>> for entity, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {entity.eid}") >>> print(str(bl)) # c12<c8:part,c56<c8:subset >>> bl('part').targets == [c12] >>> bl('part|subset').targets == [c12, c56] @@ -384,9 +384,9 @@ class BridgingLinks(collections.abc.MutableSequence): """ @classmethod - def from_string(cls, string, clusters, node, strict=True, tree2docid=None): + def from_string(cls, string, entities, node, strict=True, tree2docid=None): """Return a sequence of BridgingLink objects representing a given string serialization. - The bridging links are also added to the mentions (`mention.bridging`) in the supplied `clusters`, + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `entities`, so the returned sequence can be usually ignored. If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), the entity IDs in the provided string are interpreted as "GRP", i.e.
as document-wide IDs, @@ -403,17 +403,17 @@ def from_string(cls, string, clusters, node, strict=True, tree2docid=None): if ':' in src_str: src_str, relation = src_str.split(':', 1) if trg_str == src_str: - _error(f"Bridge cannot self-reference the same cluster {trg_str} at {node}", strict) + _error(f"Bridge cannot self-reference the same entity {trg_str} at {node}", strict) if tree2docid: src_str = tree2docid[node.root] + src_str trg_str = tree2docid[node.root] + trg_str bl = src_str2bl.get(src_str) if not bl: - bl = clusters[src_str].mentions[-1].bridging + bl = entities[src_str].mentions[-1].bridging src_str2bl[src_str] = bl - if trg_str not in clusters: - clusters[trg_str] = CorefCluster(trg_str) - bl._data.append(BridgingLink(clusters[trg_str], relation)) + if trg_str not in entities: + entities[trg_str] = CorefEntity(trg_str) + bl._data.append(BridgingLink(entities[trg_str], relation)) return src_str2bl.values() def __init__(self, src_mention, value=None, strict=True): @@ -423,8 +423,8 @@ def __init__(self, src_mention, value=None, strict=True): if value is not None: if isinstance(value, collections.abc.Sequence): for v in value: - if v[0] is src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + v[0].cluster_id, strict) + if v[0] is src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + v[0].eid, strict) self._data.append(BridgingLink(v[0], v[1])) else: raise ValueError(f"Unknown value type: {type(value)}") @@ -439,21 +439,21 @@ def __len__(self): # TODO delete backlinks of old links, dtto for SplitAnte def __setitem__(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data[key] = BridgingLink(new_value[0], new_value[1]) def __delitem__(self, key): del self._data[key] def insert(self, key, new_value): - if new_value[0] is self.src_mention._cluster: - _error("Bridging cannot self-reference the same cluster: " + new_value[0].cluster_id, self.strict) + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) self._data.insert(key, BridgingLink(new_value[0], new_value[1])) def __str__(self): # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. - return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.cluster.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.entity.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) def __call__(self, relations_re=None): """Return a subset of links contained in this list as specified by the args. 
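Note the design here: BridgingLinks implements only the five abstract methods of collections.abc.MutableSequence (__getitem__, __setitem__, __delitem__, __len__, insert), so append, extend and iteration are inherited, and the self-reference checks in __setitem__ and insert guard every write path. A toy illustration of the same pattern (names invented):

    import collections.abc

    class CheckedLinks(collections.abc.MutableSequence):
        """List-like container; every write funnels through insert/__setitem__."""
        def __init__(self):
            self._data = []
        def __getitem__(self, key):
            return self._data[key]
        def __setitem__(self, key, value):
            self._validate(value)
            self._data[key] = value
        def __delitem__(self, key):
            del self._data[key]
        def __len__(self):
            return len(self._data)
        def insert(self, key, value):
            self._validate(value)
            self._data.insert(key, value)
        def _validate(self, value):
            if value is None:
                raise ValueError("invalid link")

    links = CheckedLinks()
    links.append(('e12', 'part'))     # append() is inherited and calls insert()
    links.extend([('e56', 'subset')])
    print(len(links), links[0])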
@@ -466,14 +466,14 @@ def __call__(self, relations_re=None): @property def targets(self): - """Return a list of the target clusters (without relations).""" + """Return a list of the target entities (without relations).""" return [link.target for link in self._data] def _delete_targets_without_mentions(self, warn=True): for link in self._data: if not link.target.mentions: if warn: - logging.warning(f"Cluster {link.target.cluster_id} has no mentions, but is referred to in bridging of {self.src_mention.cluster.cluster_id}") + logging.warning(f"Entity {link.target.eid} has no mentions, but is referred to in bridging of {self.src_mention.entity.eid}") self._data.remove(link) @@ -492,7 +492,7 @@ def _error(msg, strict): def load_coref_from_misc(doc, strict=True): global highest_doc_n - clusters = {} + entities = {} unfinished_mentions = collections.defaultdict(list) discontinuous_mentions = collections.defaultdict(list) global_entity = doc.meta.get('global.Entity') @@ -573,12 +573,12 @@ def load_coref_from_misc(doc, strict=True): try: mention.head = mention.words[head_idx - 1] except IndexError as err: - _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed at {node} with words={mention.words}", 1) if subspan_idx and subspan_idx == total_subspans: m = discontinuous_mentions[eid].pop() if m is not mention: - _error(f"Closing mention {mention.cluster.cluster_id} at {node}, but it has unfinished nested mentions ({m.words})", 1) + _error(f"Closing mention {mention.entity.eid} at {node}, but it has unfinished nested mentions ({m.words})", 1) # 3. opening or single-word else: @@ -615,18 +615,18 @@ def load_coref_from_misc(doc, strict=True): else: eid, subspan_idx, total_subspans = m.group(1, 2, 3) - cluster = clusters.get(eid) - if cluster is None: + entity = entities.get(eid) + if entity is None: if subspan_idx and subspan_idx != '1': _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1) - cluster = CorefCluster(eid) - clusters[eid] = cluster - cluster.cluster_type = etype - elif etype and cluster.cluster_type and cluster.cluster_type != etype: - logging.warning(f"etype mismatch in {node}: {cluster.cluster_type} != {etype}") - # CorefCluster could be created first with "Bridge=" without any type - elif etype and cluster.cluster_type is None: - cluster.cluster_type = etype + entity = CorefEntity(eid) + entities[eid] = entity + entity.etype = etype + elif etype and entity.etype and entity.etype != etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + # CorefEntity could be created first with "Bridge=" without any type + elif etype and entity.etype is None: + entity.etype = etype if subspan_idx and subspan_idx != '1': opened = [pair[0] for pair in unfinished_mentions[eid]] @@ -635,14 +635,14 @@ def load_coref_from_misc(doc, strict=True): if closing and subspan_idx == total_subspans: m = discontinuous_mentions[eid].pop() if m is not mention: - _error(f"{node}: closing mention {mention.cluster.cluster_id} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) + _error(f"{node}: closing mention {mention.entity.eid} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) try: mention.head = mention._words[head_idx - 1] except IndexError as err: - _error(f"Invalid head_idx={head_idx} for {mention.cluster.cluster_id} " + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed 
at {node} with words={mention._words}", 1) else: - mention = CorefMention(words=[node], cluster=cluster) + mention = CorefMention(words=[node], entity=entity) if other: mention._other = other if subspan_idx: @@ -657,7 +657,7 @@ def load_coref_from_misc(doc, strict=True): # or with relations Bridge=e173<... From: Martin Popel Date: Wed, 6 Apr 2022 05:50:19 +0200 Subject: [PATCH 0370/1201] preparing PyPI release 0.3.0 --- CHANGES.txt | 5 +++++ setup.cfg | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 77d72548..67ced748 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,11 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. +0.3.0 2022-04-06 + - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) + - edits by Dan Zeman in block.ud.* + - Circle-CI (instead of Travis-CI) + 0.2.3 2021-02-23 - support for enhanced dependencies and coreference - requires Python 3.6+ due to f-strings diff --git a/setup.cfg b/setup.cfg index 4e96f81a..a14145ab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = udapi -version = 0.2.3 +version = 0.3.0 author = Martin Popel author_email = popel@ufal.mff.cuni.cz description = Python framework for processing Universal Dependencies data From 36aae5882fe29a435721ea261f1156437c8e0e0c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 6 Apr 2022 10:22:32 +0200 Subject: [PATCH 0371/1201] MentionMisc in the OldCorefUD format should have been comma-separated, not space-separated We still use the old format during the CorefUD 1.0 conversion. Ideally, we should rewrite all the import scripts and get rid of the intermediate old-format step. --- udapi/block/corefud/concatmentionmisc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py index aeb945a8..74483368 100644 --- a/udapi/block/corefud/concatmentionmisc.py +++ b/udapi/block/corefud/concatmentionmisc.py @@ -14,11 +14,11 @@ def process_tree(self,root): index = matchObj.group(2) finalattr = 'MentionMisc'+index - value = node.misc[attrname] - + value = node.misc[attrname].replace(",", "%2C") + if finalattr not in node.misc: node.misc[finalattr] = f'{innerattrib}:{value}' else: - node.misc[finalattr] += f' {innerattrib}:{value}' + node.misc[finalattr] += f',{innerattrib}:{value}' del node.misc[attrname] From d57eb3b8b45b1e89a4b9b95ab264ce917357b5f6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 7 Apr 2022 16:13:35 +0200 Subject: [PATCH 0372/1201] "плюс". --- udapi/block/ud/ru/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 79669d63..469c9173 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -17,6 +17,7 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'нежели': [], + 'плюс': [], 'раз': [], 'словно': [], 'так_что': [], From 73b9f54b4deb39945ebb04c9b9d83dc677fea3f9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 7 Apr 2022 21:10:50 +0200 Subject: [PATCH 0373/1201] Russian case markers.
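To illustrate the kind of rewriting these fixedeprels patches perform, a runnable miniature with hypothetical values (mirroring the 'по' handler added below, which falls back to dative when the case cannot be copied from the adposition):

    import re

    edeprel = 'obl:по'   # enhanced deprel lacking a usable case suffix
    m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edeprel)
    adpcase = None       # pretend copy_case_from_adposition() found nothing
    if m:
        edeprel = m.group(1) + ':' + (adpcase if adpcase else m.group(2) + ':dat')
    print(edeprel)       # obl:по:dat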
--- udapi/block/ud/ru/fixedeprels.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 469c9173..e5bab63b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -21,6 +21,7 @@ class FixEdeprels(Block): 'раз': [], 'словно': [], 'так_что': [], + 'хоть': [], 'чем': [] } @@ -40,19 +41,25 @@ class FixEdeprels(Block): 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'до': 'до:gen', + 'из': 'из:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', + 'около': 'около:gen', 'относительно': 'относительно:gen', 'по_мера': 'по_мере:gen', + 'по_мера_то_как': 'по_мере_того_как', 'по_отношение_ко?': 'по_отношению_к:dat', 'по_повод': 'по_поводу:gen', 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', + 'порядка': 'порядка:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', + 'с_тот_пора_как': 'с_тех_пор_как', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', + 'у': 'у:gen', 'через': 'через:acc' } @@ -126,6 +133,15 @@ def process_node(self, node): # Accusative or instrumental are possible. Pick accusative. edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' continue + m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Dative, accusative or locative are possible. Pick dative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':dat' + continue m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) From a9b3a82baaf6283c4d47a13d4326c9f7807bccc6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 9 Apr 2022 21:35:57 +0200 Subject: [PATCH 0374/1201] Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index e5bab63b..c7293d69 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -22,7 +22,8 @@ class FixEdeprels(Block): 'словно': [], 'так_что': [], 'хоть': [], - 'чем': [] + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -35,16 +36,21 @@ class FixEdeprels(Block): 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_связь_с': 'в_связи_с:ins', + 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', + 'вроде': 'вроде:gen', + 'для': 'для:gen', 'до': 'до:gen', + 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', 'несмотря_на': 'несмотря_на:acc', 'около': 'около:gen', + 'от': 'от:gen', 'относительно': 'относительно:gen', 'по_мера': 'по_мере:gen', 'по_мера_то_как': 'по_мере_того_как', @@ -87,6 +93,10 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If the case marker starts with 'столько', remove this part. + # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. + # Similarly, 'то' occurs in 'то...то' and should be removed. 
+ edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel']) # If one of the following expressions occurs followed by another preposition # or by morphological case, remove the additional case marking. For example, # 'словно_у' becomes just 'словно'. @@ -124,7 +134,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|между|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 4f2eb09306121425ece1dc0717f92f1f8a5d4a74 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 10 Apr 2022 09:06:21 +0200 Subject: [PATCH 0375/1201] Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index c7293d69..0fc90641 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -40,11 +40,14 @@ class FixEdeprels(Block): 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_ход': 'в_ходе:gen', + 'во_глава': 'во_главе_с:ins', + 'во_глава_с': 'во_главе_с:ins', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', 'для': 'для:gen', 'до': 'до:gen', + 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', @@ -62,6 +65,7 @@ class FixEdeprels(Block): 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', + 'свыше': 'свыше:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', @@ -125,7 +129,7 @@ def process_node(self, node): # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(в|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 3f399af0e1b88b3eb3e63f1496f6c52e478cf15f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 11 Apr 2022 10:29:14 +0200 Subject: [PATCH 0376/1201] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 0fc90641..fd24be5a 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,6 +18,7 @@ class FixEdeprels(Block): 'как': ['как_только'], 'нежели': [], 'плюс': [], + 'пусть': [], 'раз': [], 'словно': [], 'так_что': [], @@ -62,6 +63,7 @@ class FixEdeprels(Block): 'по_сравнение_с': 'по_сравнению_с:ins', 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', + 'после': 'после:gen', 'при_помощь': 'при_помощи:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', @@ -70,7 +72,8 @@ class FixEdeprels(Block): 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'у': 'у:gen', - 'через': 'через:acc' + 'через': 'через:acc', + 'чтоб': 'чтобы' } def copy_case_from_adposition(self, node, adposition): @@ -138,7 +141,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|между|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: @@ -147,6 +150,15 @@ def process_node(self, node): # Accusative or instrumental are possible. Pick accusative. edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' continue + m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) From 8d0caaea2a7c491ee5787d090e612822da0d2295 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 11 Apr 2022 22:57:47 +0200 Subject: [PATCH 0377/1201] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index fd24be5a..4e4892e4 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,19 +12,20 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { - 'будто': [], - 'ведь': [], - 'если': [], - 'как': ['как_только'], - 'нежели': [], - 'плюс': [], - 'пусть': [], - 'раз': [], - 'словно': [], - 'так_что': [], - 'хоть': [], - 'чем': [], - 'что': [] + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'если': [], + 'как': ['как_только'], + 'нежели': [], + 'плюс': [], + 'пусть': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -33,6 +34,7 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'loc': 'в:loc', 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', @@ -40,9 +42,11 @@ class FixEdeprels(Block): 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', + 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', + 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', @@ -65,6 +69,8 @@ class FixEdeprels(Block): 'порядка': 'порядка:gen', 'после': 'после:gen', 'при_помощь': 'при_помощи:gen', + 'при_условие_что': 'при_условии_что', + 'против': 'против:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', 'свыше': 'свыше:gen', From fdaa4480ac29eba170ba66f8bc6916f1805044bc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 12:25:38 +0200 Subject: [PATCH 0378/1201] Fixed Russian edeprels. 
--- udapi/block/ud/ru/fixedeprels.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 4e4892e4..b243ed0b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -24,6 +24,7 @@ class FixEdeprels(Block): 'словно': [], 'так_что': [], 'хоть': [], + 'хотя': [], 'чем': [], 'что': [] } @@ -44,19 +45,23 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', + 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', + 'выше': 'выше:gen', 'для': 'для:gen', 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', 'из': 'из:gen', 'к': 'к:dat', + 'ко': 'ко:dat', 'несмотря_на': 'несмотря_на:acc', + 'ниже': 'ниже:gen', 'около': 'около:gen', 'от': 'от:gen', 'относительно': 'относительно:gen', 'по_мера': 'по_мере:gen', @@ -106,6 +111,10 @@ def process_node(self, node): if m: bdeprel = m.group(1) solved = False + # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. + edep['deprel'] = re.sub(r':быть.*', '', edep['deprel']) + # Some markers should be discarded only if they occur as clause markers (acl, advcl). + edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. From 0bc4776db98d580fe41e8a01d167726018761510 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 15:02:00 +0200 Subject: [PATCH 0379/1201] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index b243ed0b..f3d9c4ea 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -190,7 +190,7 @@ def process_node(self, node): edep['deprel'] = m.group(1)+':'+adpcase else: # Genitive or instrumental are possible. Pick instrumental. - edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + edep['deprel'] = m.group(1)+':'+m.group(2)+':ins' continue if re.match(r'^(nmod|obl):', edep['deprel']): if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': From 96e7265547073a4f2bce83699afd1b45ebab45de Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 12 Apr 2022 21:16:22 +0200 Subject: [PATCH 0380/1201] "столько" --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index f3d9c4ea..3ed1d91e 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -112,7 +112,7 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause.
- edep['deprel'] = re.sub(r':быть.*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. From 5c0271d2e0cd204e6a3ac1a45eef30c2c82a7a31 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 13 Apr 2022 08:31:25 +0200 Subject: [PATCH 0381/1201] Fixed Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 3ed1d91e..bdd4aa9b 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -12,21 +12,23 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { - 'более_чем': [], - 'будто': [], - 'ведь': [], - 'если': [], - 'как': ['как_только'], - 'нежели': [], - 'плюс': [], - 'пусть': [], - 'раз': [], - 'словно': [], - 'так_что': [], - 'хоть': [], - 'хотя': [], - 'чем': [], - 'что': [] + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'если': [], + 'как': ['как_только'], + 'когда': [], + 'нежели': [], + 'плюс': [], + 'потому_что': [], + 'пусть': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'хотя': [], + 'чем': [], + 'что': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -45,11 +47,13 @@ class FixEdeprels(Block): 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', 'в_ход': 'в_ходе:gen', + 'вблизи': 'вблизи:gen', 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', 'во_избежание': 'во_избежание:gen', 'возле': 'возле:gen', + 'вокруг': 'вокруг:gen', 'вплоть_до': 'вплоть_до:gen', 'вроде': 'вроде:gen', 'выше': 'выше:gen', @@ -60,6 +64,7 @@ class FixEdeprels(Block): 'из': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', + 'кроме': 'кроме:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -73,8 +78,10 @@ class FixEdeprels(Block): 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', 'после': 'после:gen', + 'при': 'при:loc', 'при_помощь': 'при_помощи:gen', 'при_условие_что': 'при_условии_что', + 'про': 'про:acc', 'против': 'против:gen', 'с_помощь': 'с_помощью:gen', 'с_тот_пора_как': 'с_тех_пор_как', @@ -147,7 +154,7 @@ def process_node(self, node): # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. - m = re.match(r'^(obl(?::arg)?|nmod):(в|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: @@ -156,7 +163,7 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. 
edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod):(за|над|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From bde48872deeeae30b9f797974e30eb67211f9dbc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 13 Apr 2022 08:45:05 +0200 Subject: [PATCH 0382/1201] =?UTF-8?q?Russian=20"=D0=BD=D0=B0=D0=B4"=20does?= =?UTF-8?q?=20not=20seem=20to=20allow=20accusative.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index bdd4aa9b..887b5e58 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -65,6 +65,7 @@ class FixEdeprels(Block): 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', + 'над': 'над:ins', # at least I have not encountered any genuine example of accusative 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -163,7 +164,8 @@ def process_node(self, node): # Accusative or locative are possible. Pick locative. edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' continue - m = re.match(r'^(obl(?::arg)?|nmod):(за|над|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + # Unlike in Czech, 'над' seems to allow only instrumental and not accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) if m: adpcase = self.copy_case_from_adposition(node, m.group(2)) if adpcase: From 322695329bd8426045bae49298a00b9e0179fe66 Mon Sep 17 00:00:00 2001 From: Mehmet Oguz Derin Date: Tue, 19 Apr 2022 14:24:28 +0300 Subject: [PATCH 0383/1201] Fix a tiny typo (#105) fix documentation of shift_before_node --- udapi/core/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 732ef7f4..ad36aa0a 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -622,7 +622,7 @@ def shift_after_node(self, reference_node, without_children=False, skip_if_desce self._shift_before_ord(reference_node._ord + 1, without_children=without_children) def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False): - """Shift this node after the reference_node.""" + """Shift this node before the reference_node.""" if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return From 87003818c9f906415a402d1f67caf601f6c5380c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:19:32 +0200 Subject: [PATCH 0384/1201] Added a block to fix spurious Spanish auxiliary "tener que". --- udapi/block/ud/es/fixtenerque.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 udapi/block/ud/es/fixtenerque.py diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py new file mode 100644 index 00000000..bce0b731 --- /dev/null +++ b/udapi/block/ud/es/fixtenerque.py @@ -0,0 +1,33 @@ +"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixTenerQue(Block): + + def process_node(self, node): + """ + Some Spanish treebanks treat the verb 'tener' in constructions such as + 'tener que comer' as auxiliary. 
This is wrong and the validator will + flag it as an error. This block fixes such annotations. + """ + if node.lemma == 'tener' and node.upos == 'AUX': + node.upos = 'VERB' + # In rare cases the auxiliary may have been promoted due to ellipsis. + # Most of the time however, it is attached as 'aux' to the main verb. + if node.udeprel == 'aux': + mainverb = node.parent + node.parent = mainverb.parent + node.deprel = mainverb.deprel + mainverb.parent = node + mainverb.deprel = 'xcomp' + # Some children of the former main verb should be reattached to 'tener'. + # Others (especially a direct object) should stay with the former main verb. + for c in mainverb.children: + if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): + c.parent = node + # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. + for c in node.children: + if c.form.lower() eq 'que' and c.ord > node.ord and c.ord < mainverb.ord: + c.parent = mainverb + c.deprel = 'mark' From 41e22c90061454851e8170b7f2b7217dda1b95aa Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:22:21 +0200 Subject: [PATCH 0385/1201] Bug fix. --- udapi/block/ud/es/fixtenerque.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index bce0b731..ba6691c2 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -28,6 +28,6 @@ def process_node(self, node): c.parent = node # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: - if c.form.lower() eq 'que' and c.ord > node.ord and c.ord < mainverb.ord: + if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: c.parent = mainverb c.deprel = 'mark' From 738980ec8820a6429235243eed2266cf27e17089 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 21:38:13 +0200 Subject: [PATCH 0386/1201] Refined the reattachment so that it is mirrored in the enhanced graph. --- udapi/block/ud/es/fixtenerque.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index ba6691c2..f287051c 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -17,17 +17,29 @@ def process_node(self, node): # Most of the time however, it is attached as 'aux' to the main verb. if node.udeprel == 'aux': mainverb = node.parent - node.parent = mainverb.parent - node.deprel = mainverb.deprel - mainverb.parent = node - mainverb.deprel = 'xcomp' + self.reattach(node, mainverb.parent, mainverb.deprel) + self.reattach(mainverb, node, 'xcomp') # Some children of the former main verb should be reattached to 'tener'. # Others (especially a direct object) should stay with the former main verb. for c in mainverb.children: if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): - c.parent = node + self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: - c.parent = mainverb - c.deprel = 'mark' + self.reattach(c, mainverb, 'mark') + + def reattach(self, node, parent, deprel): + """ + Changes the incoming dependency relation to a node. Makes sure that the + same change is done in the basic tree and in the enhanced graph. 
+ """ + if node.deps: + # If the enhanced graph contains the current basic relation, remove it. + orig_n_deps = len(node.deps) + node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel] + # Add the new basic relation to the enhanced graph only if the original one was there. + if len(node.deps) < orig_n_deps: + node.deps.append({'parent': parent, 'deprel': deprel}) + node.parent = parent + node.deprel = deprel From d2212f7854efad2337d11a94019bb97cd627d73c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 22:06:32 +0200 Subject: [PATCH 0387/1201] Fix: "tener que" and "ir a" both analyzed as xcomp rather than aux. --- udapi/block/ud/es/fixtenerque.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index f287051c..5fc4c11e 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -10,8 +10,10 @@ def process_node(self, node): Some Spanish treebanks treat the verb 'tener' in constructions such as 'tener que comer' as auxiliary. This is wrong and the validator will flag it as an error. This block fixes such annotations. + + EDIT: 'ir a comer' is processed the same way. """ - if node.lemma == 'tener' and node.upos == 'AUX': + if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX': node.upos = 'VERB' # In rare cases the auxiliary may have been promoted due to ellipsis. # Most of the time however, it is attached as 'aux' to the main verb. @@ -26,7 +28,7 @@ def process_node(self, node): self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: - if c.form.lower() == 'que' and c.ord > node.ord and c.ord < mainverb.ord: + if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord: self.reattach(c, mainverb, 'mark') def reattach(self, node, parent, deprel): From 3ed1fecf434a0dc19c0777195d9b2b020d1d8d13 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 20 Apr 2022 23:16:42 +0200 Subject: [PATCH 0388/1201] Refined list of deprels of children that should not be re-attached. --- udapi/block/ud/es/fixtenerque.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py index 5fc4c11e..62fa0f4d 100644 --- a/udapi/block/ud/es/fixtenerque.py +++ b/udapi/block/ud/es/fixtenerque.py @@ -24,7 +24,7 @@ def process_node(self, node): # Some children of the former main verb should be reattached to 'tener'. # Others (especially a direct object) should stay with the former main verb. for c in mainverb.children: - if not re.match(r'^(obj|iobj|obl|conj|list|flat|fixed|goeswith|reparandum)$', c.udeprel): + if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel): self.reattach(c, node, c.deprel) # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. for c in node.children: From 74c0a91d1f2a3b3a7d56f54a33b98658831eff70 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Apr 2022 14:11:09 +0200 Subject: [PATCH 0389/1201] A new block to fill out obvious lemma candidates, based on features! 
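A minimal standalone sketch (not part of this patch) of the guard pattern used below: a regex such as '^(Sing)?$' matches both an empty feature value and the base-form value, so the form is copied into the lemma only when no feature contradicts the base form. The helper name and the sample feature dictionaries are hypothetical.

    import re

    def looks_like_base_form(feats):
        # True if Number and Case are either unset or carry the base-form values.
        return (re.match(r'^(Sing)?$', feats.get('Number', '')) is not None
                and re.match(r'^(Nom)?$', feats.get('Case', '')) is not None)

    assert looks_like_base_form({})                                 # unknown -> allowed
    assert looks_like_base_form({'Number': 'Sing', 'Case': 'Nom'})
    assert not looks_like_base_form({'Number': 'Plur'})             # clearly inflected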
--- udapi/block/ud/lemmatize.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 udapi/block/ud/lemmatize.py diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py new file mode 100644 index 00000000..49aa5fbf --- /dev/null +++ b/udapi/block/ud/lemmatize.py @@ -0,0 +1,36 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def process_node(self, node): + """ + Some treebanks lack lemmas for some or all words. Occasionally we may be + able to guess that the lemma is identical to the word form. This block + will then fill out the lemma. + + For some parts of speech, we can only say that the form is the lemma if + we have morphological features that will confirm it is the right form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + # Many closed classes do not inflect and have the same lemma as the form (just lowercased). + if re.match(r'^(PUNCT|SYM|ADV|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX + # VERB and AUX: use the infinitive + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf': + node.lemma = node.form.lower() + # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) + # Note: This rule is wrong in German, where no nouns should be lowercased. + elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form + # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma) + elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() + # NUM: use masculine nominative (number, if present at all, is lexical) + elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() From b0b483d7fb699b6f1049ae796e697142288dbff5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 21 Apr 2022 14:18:06 +0200 Subject: [PATCH 0390/1201] Polarity and degree. --- udapi/block/ud/lemmatize.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py index 49aa5fbf..a234256f 100644 --- a/udapi/block/ud/lemmatize.py +++ b/udapi/block/ud/lemmatize.py @@ -16,18 +16,24 @@ def process_node(self, node): """ if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': # Many closed classes do not inflect and have the same lemma as the form (just lowercased). 
- if re.match(r'^(PUNCT|SYM|ADV|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV + # ADV: use positive affirmative + elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() - # NOUN PROPN ADJ PRON DET NUM VERB AUX # VERB and AUX: use the infinitive - elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf': + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) # Note: This rule is wrong in German, where no nouns should be lowercased. - elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form.lower() - elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): node.lemma = node.form + # ADJ: use masculine singular nominative positive affirmative + elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma) elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']): node.lemma = node.form.lower() From 6ee2c9b98a11785fe3c0ef9ab5c074cab4f4dfb6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 09:33:27 +0200 Subject: [PATCH 0391/1201] Added a block to fix features of infinitives in Spanish PUD. --- udapi/block/ud/es/fixverbfeats.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 udapi/block/ud/es/fixverbfeats.py diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py new file mode 100644 index 00000000..3972273a --- /dev/null +++ b/udapi/block/ud/es/fixverbfeats.py @@ -0,0 +1,19 @@ +"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD.""" +from udapi.core.block import Block +import logging +import re + +class FixVerbFeats(Block): + + def process_node(self, node): + """ + The features assigned to verbs in Spanish PUD are often wrong, although + the annotation was (reportedly) done manually. For example, infinitives + are tagged with VerbForm=Fin instead of VerbForm=Inf. + """ + if re.match(r'^(VERB|AUX)$', node.upos): + if re.search(r'[aei]r$', node.form, re.IGNORECASE): + # The infinitive has no features other than VerbForm. 
+ node.feats = {} + node.feats['VerbForm'] = 'Inf' + node.lemma = node.form.lower() From 67334da56f852d1c15c03bad7ab679d3eb63879e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 11:38:34 +0200 Subject: [PATCH 0392/1201] Fix features of gerunds in Spanish. --- udapi/block/ud/es/fixverbfeats.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 3972273a..6c924319 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -17,3 +17,9 @@ def process_node(self, node): node.feats = {} node.feats['VerbForm'] = 'Inf' node.lemma = node.form.lower() + elif re.search(r'ndo$', node.form, re.IGNORECASE): + if node.form.lower() != 'entiendo': + # The gerund has no features other than VerbForm. + # The lemma is not always straightforward but we have fixed it manually. + node.feats = {} + node.feats['VerbForm'] = 'Ger' From fd0f74382c7779d0f22d5ad8b5aef341d91eb0dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:02:23 +0200 Subject: [PATCH 0393/1201] Fix features of participles in Spanish. --- udapi/block/ud/es/fixverbfeats.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 6c924319..146105a2 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -23,3 +23,16 @@ def process_node(self, node): # The lemma is not always straightforward but we have fixed it manually. node.feats = {} node.feats['VerbForm'] = 'Ger' + elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) + gender = node.feats['Gender'] + number = node.feats['Number'] + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[os]s?$', 'r', node.form.lower()) From 6259fd07eff9ceb90126ab2d3d6f57c0804b6f3a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:05:55 +0200 Subject: [PATCH 0394/1201] Guess gender and number if unknown. --- udapi/block/ud/es/fixverbfeats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 146105a2..3282a6eb 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -27,8 +27,8 @@ def process_node(self, node): # The (past) participle has always Gender and Number. # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] - number = node.feats['Number'] + gender = node.feats['Gender'] ? node.feats['Gender'] : re.search(r'os?$', node.form, re.IGNORECASE) ? 'Masc' : 'Fem' + number = node.feats['Number'] ? node.feats['Number'] : re.search(r's$', node.form, re.IGNORECASE) ? 
'Plur' : 'Sing' node.feats = {} node.feats['VerbForm'] = 'Part' node.feats['Tense'] = 'Past' From c7ee815eace81fa82c5204023a5105485c847cd3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:08:41 +0200 Subject: [PATCH 0395/1201] Fixed: Python syntax instead of Perl. --- udapi/block/ud/es/fixverbfeats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 3282a6eb..56d6587c 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -27,8 +27,8 @@ def process_node(self, node): # The (past) participle has always Gender and Number. # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] ? node.feats['Gender'] : re.search(r'os?$', node.form, re.IGNORECASE) ? 'Masc' : 'Fem' - number = node.feats['Number'] ? node.feats['Number'] : re.search(r's$', node.form, re.IGNORECASE) ? 'Plur' : 'Sing' + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') node.feats = {} node.feats['VerbForm'] = 'Part' node.feats['Tense'] = 'Past' From 0644a490d3e749553855d1b0e6fe60199a33d78c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:10:11 +0200 Subject: [PATCH 0396/1201] Bug fix. --- udapi/block/ud/es/fixverbfeats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 56d6587c..6784afde 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -35,4 +35,4 @@ def process_node(self, node): node.feats['Gender'] = gender node.feats['Number'] = number if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[os]s?$', 'r', node.form.lower()) + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From e80b81131c7e3264f0c972bdf4428a3247b2e5e0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:12:48 +0200 Subject: [PATCH 0397/1201] Fixed: "da" is not a participle. --- udapi/block/ud/es/fixverbfeats.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index 6784afde..d6e99aa7 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -24,15 +24,16 @@ def process_node(self, node): node.feats = {} node.feats['VerbForm'] = 'Ger' elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): - # The (past) participle has always Gender and Number. - # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). - # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
- gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') - number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') - node.feats = {} - node.feats['VerbForm'] = 'Part' - node.feats['Tense'] = 'Past' - node.feats['Gender'] = gender - node.feats['Number'] = number - if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) + if node.form.lower() != 'da': + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From e7dbd6439d765e86322fe106d22d63ee1c3527b8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 22 Apr 2022 12:18:44 +0200 Subject: [PATCH 0398/1201] It must be -ado/-ido (to prevent catching puedo, pudo, ayuda, inunda...) --- udapi/block/ud/es/fixverbfeats.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py index d6e99aa7..643ecd7c 100644 --- a/udapi/block/ud/es/fixverbfeats.py +++ b/udapi/block/ud/es/fixverbfeats.py @@ -23,17 +23,16 @@ def process_node(self, node): # The lemma is not always straightforward but we have fixed it manually. node.feats = {} node.feats['VerbForm'] = 'Ger' - elif re.search(r'(d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): - if node.form.lower() != 'da': - # The (past) participle has always Gender and Number. - # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). - # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) - gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') - number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') - node.feats = {} - node.feats['VerbForm'] = 'Part' - node.feats['Tense'] = 'Past' - node.feats['Gender'] = gender - node.feats['Number'] = number - if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): - node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) + elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
+ gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) From de8021fe2616b0eed5f3a425fc51eae57da34483 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 24 Apr 2022 14:26:58 +0200 Subject: [PATCH 0399/1201] =?UTF-8?q?"=D1=80=D0=B0=D0=B2=D0=BD=D0=BE=5F?= =?UTF-8?q?=D0=BA=D0=B0=D0=BA"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ru/fixedeprels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 887b5e58..bff1a677 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -22,6 +22,7 @@ class FixEdeprels(Block): 'плюс': [], 'потому_что': [], 'пусть': [], + 'равно_как': [], 'раз': [], 'словно': [], 'так_что': [], From 7a2d08f84b8af05b9b40ad92d8b70ca9bd885411 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 24 Apr 2022 17:04:33 +0200 Subject: [PATCH 0400/1201] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index bff1a677..ebf8d213 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,6 +18,7 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'минус': [], 'нежели': [], 'плюс': [], 'потому_что': [], @@ -42,11 +43,13 @@ class FixEdeprels(Block): 'в_вид': 'в_виде:gen', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', + 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level 'в_связь_с': 'в_связи_с:ins', 'в_случай_если': 'в_случае_если', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', 'в_тот_время_как': 'в_то_время_как', + 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', 'вблизи': 'вблизи:gen', 'вместо': 'вместо:gen', @@ -72,6 +75,7 @@ class FixEdeprels(Block): 'около': 'около:gen', 'от': 'от:gen', 'относительно': 'относительно:gen', + 'перед': 'перед:ins', 'по_мера': 'по_мере:gen', 'по_мера_то_как': 'по_мере_того_как', 'по_отношение_ко?': 'по_отношению_к:dat', @@ -124,6 +128,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). + edep['deprel'] = re.sub(r'^advcl:перед', r'obl:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. 
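As a minimal standalone sketch (not part of the series) of how the 'unambiguous' table in the patches above is consumed, mirroring the loop over self.unambiguous visible in later hunks; the table excerpt and the helper name are illustrative only:

    import re

    unambiguous = {'к': 'к:dat', 'около': 'около:gen'}  # excerpt, hypothetical

    def normalize(deprel):
        for marker, target in unambiguous.items():
            m = re.match(r'^(obl(?::arg)?|nmod):' + marker
                         + r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', deprel)
            if m:
                return m.group(1) + ':' + target
        return deprel

    assert normalize('obl:к:gen') == 'obl:к:dat'        # wrong case corrected
    assert normalize('nmod:около') == 'nmod:около:gen'  # missing case filled in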
From b164b5dc65d2c8d71ed1a59867b5033574a99cfe Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 21:37:22 +0200 Subject: [PATCH 0401/1201] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index ebf8d213..1946faf0 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -18,9 +18,11 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'менее_чем': [], 'минус': [], 'нежели': [], 'плюс': [], + 'пока': [], 'потому_что': [], 'пусть': [], 'равно_как': [], @@ -39,8 +41,10 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'versus': 'версус:nom', 'loc': 'в:loc', 'в_вид': 'в_виде:gen', + 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level @@ -52,6 +56,7 @@ class FixEdeprels(Block): 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', 'вблизи': 'вблизи:gen', + 'взамен': 'взамен:gen', 'вместо': 'вместо:gen', 'во_глава': 'во_главе_с:ins', 'во_глава_с': 'во_главе_с:ins', @@ -69,7 +74,9 @@ class FixEdeprels(Block): 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', + 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' 'над': 'над:ins', # at least I have not encountered any genuine example of accusative + 'насчет': 'насчет:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', @@ -84,6 +91,7 @@ class FixEdeprels(Block): 'помимо': 'помимо:gen', 'порядка': 'порядка:gen', 'после': 'после:gen', + 'посредством_как': 'посредством:gen', 'при': 'при:loc', 'при_помощь': 'при_помощи:gen', 'при_условие_что': 'при_условии_что', @@ -125,11 +133,11 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. - edep['deprel'] = re.sub(r':(быть|столько).*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|сколь|столько).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:перед', r'obl:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)', r'obl:\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. From f3b8689bffdccd0cf608423b8f50deaee0419207 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 21:41:06 +0200 Subject: [PATCH 0402/1201] Bug fix. 
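A plausible reading of this fix (hypothetical example, not part of the patch): without the added (:|$) anchor, a short marker also fires as a mere prefix of a longer one, e.g. 'по' would also rewrite relations marked by 'после':

    import re
    # Reduced marker list for illustration.
    assert re.sub(r'^advcl:(по|с)', r'obl:\1', 'advcl:после') == 'obl:после'         # unanchored: fires
    assert re.sub(r'^advcl:(по|с)(:|$)', r'obl:\1', 'advcl:после') == 'advcl:после'  # anchored: left intact

Note that the anchored pattern still swallows a captured ':' when a case suffix follows; that interaction is addressed a few patches later by restoring it with \1\2.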
--- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 1946faf0..a6c702f7 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -137,7 +137,7 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)', r'obl:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. From 2001bc041a17b5730f18c422eefe1fa85edf46c2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Apr 2022 22:30:45 +0200 Subject: [PATCH 0403/1201] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index a6c702f7..1fd649d3 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -15,6 +15,7 @@ class FixEdeprels(Block): 'более_чем': [], 'будто': [], 'ведь': [], + 'ежели': [], 'если': [], 'как': ['как_только'], 'когда': [], @@ -45,13 +46,16 @@ class FixEdeprels(Block): 'loc': 'в:loc', 'в_вид': 'в_виде:gen', 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' + 'в_для': 'в:acc', 'в_качество': 'в_качестве:gen', 'в_отношение': 'в_отношении:gen', 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level 'в_связь_с': 'в_связи_с:ins', 'в_случай_если': 'в_случае_если', + 'в_случай_когда': 'в_случае_когда', 'в_соответствие_с': 'в_соответствии_с:ins', 'в_течение': 'в_течение:gen', + 'в_то_быть': 'в:loc', 'в_тот_время_как': 'в_то_время_как', 'в_угода': 'в_угоду:dat', 'в_ход': 'в_ходе:gen', @@ -67,10 +71,12 @@ class FixEdeprels(Block): 'вроде': 'вроде:gen', 'выше': 'выше:gen', 'для': 'для:gen', + 'для_в': 'для:gen', 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 
'за_исключение': 'за_исключением:gen', 'из': 'из:gen', + 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', 'кроме': 'кроме:gen', @@ -103,6 +109,7 @@ class FixEdeprels(Block): 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', + 'среди': 'среди:gen', 'у': 'у:gen', 'через': 'через:acc', 'чтоб': 'чтобы' From fc687fcbc394be78aeb15186402f967b68c10b9e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 27 Apr 2022 22:33:54 +0200 Subject: [PATCH 0404/1201] prevent duplicates in `node.coref_mentions` fix #106 --- udapi/core/coref.py | 2 +- udapi/core/tests/test_coref.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index ff66c77f..edd297b4 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -642,7 +642,7 @@ def load_coref_from_misc(doc, strict=True): _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " f"closed at {node} with words={mention._words}", 1) else: - mention = CorefMention(words=[node], entity=entity) + mention = CorefMention(words=[node], entity=entity, add_word_backlinks=False) if other: mention._other = other if subspan_idx: diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 6142d1f8..369e8caf 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -17,6 +17,10 @@ def test_load(self): coref_entities = docs[-1].coref_entities self.assertEqual(len(coref_entities), 1) self.assertEqual(coref_entities[0].eid, 'e36781') + node = next(docs[-1].nodes) + self.assertEqual(len(node.coref_entities), 1) + self.assertEqual(len(node.coref_mentions), 1) + self.assertEqual(node.coref_entities[0], coref_entities[0]) def test_edits(self): data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') From 641a8d48526e6e0c7623e78b643e56e3df268efa Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:08:34 +0200 Subject: [PATCH 0405/1201] Fixed some Russian edeprels. 
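Among other changes, this commit widens the set of pseudo-markers that are stripped outright. A hypothetical example (not part of the patch) of the widened discard rule:

    import re
    # 'сколь' and 'типа' join 'быть' and 'столько' as markers to be discarded entirely:
    assert re.sub(r':(быть|сколь|столько|типа).*', '', 'advcl:типа:nom') == 'advcl'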
--- udapi/block/ud/ru/fixedeprels.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 1fd649d3..7a1d36b2 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -19,11 +19,13 @@ class FixEdeprels(Block): 'если': [], 'как': ['как_только'], 'когда': [], + 'кроме_как': [], 'менее_чем': [], 'минус': [], 'нежели': [], 'плюс': [], 'пока': [], + 'поскольку': [], 'потому_что': [], 'пусть': [], 'равно_как': [], @@ -33,7 +35,9 @@ class FixEdeprels(Block): 'хоть': [], 'хотя': [], 'чем': [], - 'что': [] + 'что': [], + 'чтобы': [], + 'яко': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -79,14 +83,19 @@ class FixEdeprels(Block): 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', + 'коли_скоро': 'коль_скоро', 'кроме': 'кроме:gen', 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' + 'на_вперед': 'на:acc', 'над': 'над:ins', # at least I have not encountered any genuine example of accusative 'насчет': 'насчет:gen', 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', 'от': 'от:gen', + 'от_до': 'от:gen', + 'от_от': 'от:gen', + 'от_с': 'от:gen', 'относительно': 'относительно:gen', 'перед': 'перед:ins', 'по_мера': 'по_мере:gen', @@ -103,14 +112,23 @@ class FixEdeprels(Block): 'при_условие_что': 'при_условии_что', 'про': 'про:acc', 'против': 'против:gen', + 'с_более_чем': 'с:gen', + 'с_во_глава': 'с:ins', + 'с_на': 'с:par', 'с_помощь': 'с_помощью:gen', + 'с_тем': 'с:ins', 'с_тот_пора_как': 'с_тех_пор_как', + 'с_что': 'с:ins', 'свыше': 'свыше:gen', 'со_сторона': 'со_стороны:gen', 'согласно': 'согласно:dat', 'спустя': 'спустя:acc', 'среди': 'среди:gen', + 'среди_в': 'среди:gen', + 'так_чтобы': 'чтобы', + 'тем_между': 'между:ins', 'у': 'у:gen', + 'у_без': 'у:gen', 'через': 'через:acc', 'чтоб': 'чтобы' } @@ -140,7 +158,7 @@ def process_node(self, node): bdeprel = m.group(1) solved = False # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. - edep['deprel'] = re.sub(r':(быть|сколь|столько).*', '', edep['deprel']) + edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). From a8fea33f756cc83eec4f0ce71d5e1ee912456cbc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:18:14 +0200 Subject: [PATCH 0406/1201] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 7a1d36b2..383cdfdd 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -162,8 +162,8 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). 
- edep['deprel'] = re.sub(r'^advcl:(взамен|на|насчет|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):перед', r'nmod:перед', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)', r'nmod:\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From e938f67739a6633db8624aa110bf386840fcbb7b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:21:41 +0200 Subject: [PATCH 0407/1201] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 383cdfdd..f7701b0f 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -163,7 +163,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)', r'nmod:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 9da0f2d79e96f20efc9e4980ea66cb05bf4c5ea4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:24:52 +0200 Subject: [PATCH 0408/1201] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index f7701b0f..77813cb2 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -162,8 +162,8 @@ def process_node(self, node): # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. 
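For clarity, a hypothetical illustration (not part of the series) of the behavior these consecutive bug fixes address: the group (:|$) consumes the delimiter, so a replacement restoring only \1 glues the case suffix onto the marker, while \1\2 keeps the label well formed:

    import re
    assert re.sub(r'^advcl:(с)(:|$)', r'obl:\1',   'advcl:с:ins') == 'obl:сins'   # delimiter lost
    assert re.sub(r'^advcl:(с)(:|$)', r'obl:\1\2', 'advcl:с:ins') == 'obl:с:ins'  # delimiter restored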
From 8aa69eb0b22941fa1ef52278cdadcaaee35a7823 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:29:21 +0200 Subject: [PATCH 0409/1201] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 77813cb2..0cb84264 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -163,7 +163,7 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl(?::relcl)?):(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 5f81f7ca1c55f44ad1649bfe0ddb660a5402a88c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 09:55:50 +0200 Subject: [PATCH 0410/1201] Fixed some Russian edeprels. --- udapi/block/ud/ru/fixedeprels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 0cb84264..977805c4 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -161,9 +161,10 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). - edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'obl:\1\2', edep['deprel']) - edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|на|насчет|от|перед|по|с|среди)(:|$)', r'nmod:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) # If the case marker starts with 'столько', remove this part. # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. # Similarly, 'то' occurs in 'то...то' and should be removed. From 84e050203be61981fb4f482b32956d2545aa7859 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 10:21:32 +0200 Subject: [PATCH 0411/1201] Bug fix. 
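A plausible reading of this fix (the patch itself does not say): inside the 'if m:' branch the relation has already matched obl/nmod/advcl/acl, so a rule targeting 'ccomp:чтобы' could never fire there; hoisting it before the match makes it apply to every enhanced relation. The second hunk adds 'par' so that an already assigned partitive suffix is still recognized by the table lookup. A hypothetical check of the first point:

    import re
    deprel = 'ccomp:чтобы'
    # 'ccomp' never matches the obl/nmod/advcl/acl prefix, so the old placement was unreachable:
    assert re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', deprel) is None
    assert re.sub(r'^ccomp:чтобы$', r'ccomp', deprel) == 'ccomp'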
--- udapi/block/ud/ru/fixedeprels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index 977805c4..d6e99eed 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -153,6 +153,8 @@ def process_node(self, node): abbreviation and its morphological case is unknown. """ for edep in node.deps: + # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers. + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: bdeprel = m.group(1) @@ -161,7 +163,6 @@ def process_node(self, node): edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) # Some markers should be discarded only if they occur as clause markers (acl, advcl). edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) @@ -184,7 +185,7 @@ def process_node(self, node): for x in self.unambiguous: # All secondary prepositions have only one fixed morphological case # they appear with, so we can replace whatever case we encounter with the correct one. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel']) if m: edep['deprel'] = m.group(1)+':'+self.unambiguous[x] solved = True From 2220946afac8a01650e6e0292356fa3e6ff5463a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 28 Apr 2022 10:34:36 +0200 Subject: [PATCH 0412/1201] Bug fix. --- udapi/block/ud/ru/fixedeprels.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py index d6e99eed..6fa73460 100644 --- a/udapi/block/ud/ru/fixedeprels.py +++ b/udapi/block/ud/ru/fixedeprels.py @@ -76,10 +76,8 @@ class FixEdeprels(Block): 'выше': 'выше:gen', 'для': 'для:gen', 'для_в': 'для:gen', - 'до': 'до:gen', 'до_то_как': 'до:gen', # до того, как ... 'за_исключение': 'за_исключением:gen', - 'из': 'из:gen', 'из_более_чем': 'из:gen', 'к': 'к:dat', 'ко': 'ко:dat', @@ -92,7 +90,6 @@ class FixEdeprels(Block): 'несмотря_на': 'несмотря_на:acc', 'ниже': 'ниже:gen', 'около': 'около:gen', - 'от': 'от:gen', 'от_до': 'от:gen', 'от_от': 'от:gen', 'от_с': 'от:gen', @@ -193,8 +190,16 @@ def process_node(self, node): if solved: continue # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. + # available. + m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or partitive are possible. Pick genitive. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue # Both "на" and "в" also occur with genitive. However, this # is only because there are numerals in the phrase ("в 9 случаев из 10") # and the whole phrase should not be analyzed as genitive. From b6a799c4e0bf82c8add59aa4cf3c64cfd6bedf5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 13 May 2022 11:03:04 +0200 Subject: [PATCH 0413/1201] A block to fix Russian "to est" from mark to cc. --- udapi/block/ud/ru/fixtoest.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 udapi/block/ud/ru/fixtoest.py diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py new file mode 100644 index 00000000..1b603e96 --- /dev/null +++ b/udapi/block/ud/ru/fixtoest.py @@ -0,0 +1,35 @@ +"""Block to fix annotation of то есть in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixToEst(Block): + + def process_node(self, node): + """ + In the converted data from Kira, the fixed expression "то есть" ("that is") + is treated as a subordinator and attached as "mark", which later makes it + part of complex enhanced relation labels. I believe that this analysis is + wrong and that it will be better to label these expressions as "cc". + """ + if node.udeprel == 'mark' and node.lemma == 'то': + if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0: + self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc') + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) From 4e97deca1849ab559ad3a2e12ca091fac56213df Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Wed, 18 May 2022 14:27:57 +0200 Subject: [PATCH 0414/1201] Support printing enhanced graphs in Tikz (#107) --- udapi/block/write/tikz.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 43417c61..40071739 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -39,7 +39,8 @@ class Tikz(BaseWriter): """ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, - attributes=None, as_tree=False, comment_attribute=None, **kwargs): + attributes=None, as_tree=False, comment_attribute=None, + enhanced=False, **kwargs): """Create the Tikz block object. Args: @@ -50,6 +51,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, attributes: comma-separated list of node attributes to print (each on a separate line). as_tree: boolean - should print it as a 2D tree? comment_attribute: which attribute to print as a string under each graph (e.g. text_en) + enhanced: boolean - print the enhanced graph below the sentence, too? 
""" super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -63,6 +65,9 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, self.node_attributes = 'form,upos'.split(',') self.as_tree = as_tree self.comment_attribute = comment_attribute + if as_tree and enhanced: + raise ValueError("The enhanced graph cannot be printed as a tree") + self.enhanced = enhanced def before_process_document(self, doc): super().before_process_document(doc) @@ -140,6 +145,12 @@ def process_tree(self, tree): print(r'\deproot{%d}{root}' % node.ord) else: print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.enhanced: + for dep in node.deps: + if dep['parent'].is_root(): + print(r'\deproot[edge below]{%d}{root}' % node.ord) + else: + print(r'\depedge[edge below]{%d}{%d}{%s}' % (dep['parent'].ord, node.ord, dep['deprel'])) if self.comment_attribute and tree.comment: start_pos = tree.comment.find(self.comment_attribute + ' = ') if start_pos != -1: From 3688e4cc0be8e324c6ba87c1b96ca2b9e5bd8b16 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 10:09:26 +0200 Subject: [PATCH 0415/1201] A block to check for Czech-specific bugs in feature values. --- udapi/block/ud/cs/markfeatsbugs.py | 217 +++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 udapi/block/ud/cs/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py new file mode 100644 index 00000000..b4b6ccfa --- /dev/null +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -0,0 +1,217 @@ +""" +Block to identify missing or ill-valued features in Czech. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. +""" +from udapi.core.block import Block +import logging +import re + +class MarkFeatsBugs(Block): + + allowed = { + 'NOUN': {'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}, + 'ADJ': {'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'NumType': ['Ord'], + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Tense': ['Pres', 'Past'], + 'Voice': ['Act', 'Pass'], + 'Foreign': ['Yes']} + } + + required = { + 'NOUN': ['Gender', 'Number', 'Case', 'Polarity'], + 'ADJ': ['Gender', 'Number', 'Case', 'Degree', 'Polarity'] + } + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. 
+ for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. + """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 
'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act', 'Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act', 'Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: # regular adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) From 0116b9c71d603aa7e9111a598dc1fe8f0865a550 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 16:41:46 +0200 Subject: [PATCH 0416/1201] Rules for features in Czech. 
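
The rules below all follow the same two-step pattern: each POS branch first
calls check_required_features with the features that must be present, then
check_allowed_features with the full inventory of permitted values, so that
anything missing, foreign to the POS, or ill-valued ends up in MISC Bug.
A minimal self-contained sketch of that pattern (the helper names and the
toy feature dictionaries here are illustrative, not part of the block):

    def check_required(feats, required, bugs):
        for f in required:
            if f not in feats:
                bugs.append('Feat' + f + 'Missing')

    def check_allowed(feats, allowed, bugs):
        for f, value in feats.items():
            if f not in allowed:
                bugs.append('Feat' + f + 'NotAllowed')
            elif value not in allowed[f]:
                bugs.append('Feat' + f + 'Value' + value + 'NotAllowed')

    bugs = []
    feats = {'Gender': 'Masc', 'Number': 'Sing', 'Case': 'Dat', 'Mood': 'Ind'}
    check_required(feats, ['Gender', 'Number', 'Case', 'Polarity'], bugs)
    check_allowed(feats, {'Gender': ['Masc', 'Fem', 'Neut'],
                          'Number': ['Sing', 'Dual', 'Plur'],
                          'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']},
                  bugs)
    print('+'.join(bugs))  # FeatPolarityMissing+FeatMoodNotAllowed
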
--- udapi/block/ud/cs/markfeatsbugs.py | 303 ++++++++++++++++++++++++++--- 1 file changed, 279 insertions(+), 24 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index b4b6ccfa..c71ccd0f 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -70,6 +70,7 @@ def check_required_features(self, node, required): self.bug(node, 'Feat' + f + 'Missing') def process_node(self, node): + # NOUNS ################################################################ if node.upos == 'NOUN': self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) if node.feats['Gender'] == 'Masc': @@ -88,6 +89,7 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) + # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) if node.feats['Gender'] == 'Masc': @@ -108,6 +110,7 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'], 'NameType': ['Giv', 'Sur', 'Geo'], 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### elif node.upos == 'ADJ': if node.feats['Poss'] == 'Yes': # possessive adjectives if node.feats['Gender'] == 'Masc': @@ -150,31 +153,61 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Foreign': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act', 'Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': 
['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act', 'Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) @@ -215,3 +248,225 @@ def process_node(self, node): 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + if node.feats['Person'] == '3': + if re.match(r'^(Nom|Voc)$', node.feats['Case']): + self.check_required_features(node, ['Gender']) + # In PDT, animacy of personal pronouns is distinguished only for Person=3 Case=Nom Gender=Masc Number=Plur ('oni' vs. 'ony'). + # So we will neither require nor allow it in singular and dual. + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Voc'] + }) + else: # on, ona, ono, ony (Fem Plur) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Voc'] + }) + else: # non-nominatives also have PrepCase + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jemu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... 
je + self.check_required_features(node, ['PrepCase']) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['Gender']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc,Neut', 'Fem', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + # No gender in dual and plural: + # Plur ... jich, jim, je, nich, jimi + else: + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # PronType není Prs + # Int,Rel ... kdo, co + # Rel ... kdož, což, jenž, ješto, jenžto, an + # Ind ... něco + # Neg ... nic, nicož + # kdo, kdož, někdo, nikdo ... Gender=Masc, Animacy=Anim, Case (but no Number; it could be used in the context of any number) + # jehožto, něhožto, jemužto, němužto ... Gender=Masc,Neut (similarly to non-nominative forms of personal pronoun 'on') + ###!!! We could make the requirements more precise if we look at the lemma. + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Neut'], + 'Animacy': ['Anim'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(jeho|jejich|jich|jehož|jejichž|jichž|jehožto|jejichžto|jichžto)$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut'] + }) + else: + # Gender is annotated in all cases in singular, but only in + # nominative, accusative (and theoretically vocative) in plural. + # Other cases (Gen, Dat, Loc, Ins) are gender-less: 'těch', 'svým', ... + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + # Same for animacy (which implies masculine gender). + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2', '3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut', 'Fem'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word', 'Digit', 'Roman'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + if node.feats['Mood'] == 'Cnd': + self.check_required_features(node, ['Mood', 'Person']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 
'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From a7c7b145b432a86cd12ac1303cc6daf69fa54a13 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 21:42:13 +0200 Subject: [PATCH 0417/1201] More precise requirements on features of Czech pronouns and determiners. --- udapi/block/ud/cs/markfeatsbugs.py | 170 ++++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 25 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index c71ccd0f..a2c2bb7b 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -319,28 +319,118 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] }) - else: # PronType není Prs - # Int,Rel ... kdo, co - # Rel ... kdož, což, jenž, ješto, jenžto, an - # Ind ... něco - # Neg ... nic, nicož - # kdo, kdož, někdo, nikdo ... Gender=Masc, Animacy=Anim, Case (but no Number; it could be used in the context of any number) - # jehožto, něhožto, jemužto, němužto ... Gender=Masc,Neut (similarly to non-nominative forms of personal pronoun 'on') - ###!!! We could make the requirements more precise if we look at the lemma. - self.check_required_features(node, ['PronType', 'Case']) + elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as a subject + # of plural verbs. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) self.check_allowed_features(node, { 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Neut'], + 'Gender': ['Masc'], 'Animacy': ['Anim'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). 
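+                # For example, 'něco' typically arrives annotated as
+                # PronType=Ind|Animacy=Inan|Case=Nom with no Gender or Number,
+                # so Animacy is allowed below but not required.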
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + if node.feats['Case'] == 'Nom': + if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # not Masc Plur + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # not Case=Nom + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # non-nominative dual or plural: jichž, nichž, jimž, nimž, jež, něž, jimiž, nimiž + self.check_required_features(node, ['PronType', 'Number', 'Case', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. 
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) # DETERMINERS ########################################################## elif node.upos == 'DET': # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. - if re.match(r'^(jeho|jejich|jich|jehož|jejichž|jichž|jehožto|jejichžto|jichžto)$', node.form.lower()): + if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) self.check_allowed_features(node, { 'PronType': ['Prs', 'Rel'], @@ -349,21 +439,50 @@ def process_node(self, node): 'Number[psor]': ['Sing', 'Dual', 'Plur'], 'Gender[psor]': ['Masc,Neut'] }) - else: - # Gender is annotated in all cases in singular, but only in - # nominative, accusative (and theoretically vocative) in plural. - # Other cases (Gen, Dat, Loc, Ins) are gender-less: 'těch', 'svým', ... + elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + # Gender is annotated in all cases in singular (můj, má, mé) + # but only in nominative (and vocative) in plural (mí, mé, má); + # neuter is also different in accusative (mé, má). + # Animacy is distinguished only in nom/voc plural masculine (mí, mé). + # Other cases in plural are gender-less (mých, mým, mé, mými). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. - # Same for animacy (which implies masculine gender). 
- self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Reflex': ['Yes'], - 'Person': ['1', '2', '3'], - 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut', 'Fem'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative (and vocative) in plural (ti, ty, ta); + # neuter is also different in accusative (ty, ta). + # Animacy is distinguished only in nom/voc plural masculine (ti, ty). + # Other cases in plural are gender-less (těch, těm, ty, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], @@ -457,12 +576,13 @@ def process_node(self, node): # ADVERBS ############################################################## elif node.upos == 'ADV': self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg'], + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'] }) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) self.check_allowed_features(node, { 'AdpType': ['Prep', 'Voc'], 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] From 08311e094b7443b7719924530f25e7c7fcc849ef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 May 2022 23:43:30 +0200 Subject: [PATCH 0418/1201] More detailed feature conditions based on PDT. --- udapi/block/ud/cs/markfeatsbugs.py | 232 ++++++++++++++++++++++------- 1 file changed, 179 insertions(+), 53 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index a2c2bb7b..e027d1cb 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -441,65 +441,180 @@ def process_node(self, node): }) elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + # Congruent gender is annotated only in singular. Masculine and + # neuter are merged even in nominative. 
Feminine singular does + # not distinguish case in PDT but we need it in Old Czech at + # least for 'jejiej'. + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' # Gender is annotated in all cases in singular (můj, má, mé) - # but only in nominative (and vocative) in plural (mí, mé, má); - # neuter is also different in accusative (mé, má). - # Animacy is distinguished only in nom/voc plural masculine (mí, mé). - # Other cases in plural are gender-less (mých, mým, mé, mými). + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc mí, mé, má; Acc mé, má). Animacy is distinguished + # in plural if gender is distinguished and masculine; in + # singular it is distinguished only in accusative (mého, můj). + # Other cases in plural are gender-less (mých, mým, mými). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. - self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + }) + else: + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 
'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + }) else: # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative (and vocative) in plural (ti, ty, ta); - # neuter is also different in accusative (ty, ta). - # Animacy is distinguished only in nom/voc plural masculine (ti, ty). - # Other cases in plural are gender-less (těch, těm, ty, těmi). + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). # Note that this is not consistent with adjectives, where we # disambiguate gender in all cases in plural. - self.check_required_features(node, ['PronType', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + }) + else: + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + }) + else: + self.check_required_features(node, ['PronType', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + }) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word', 'Digit', 'Roman'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + ###!!! Somehow the NumValue feature from PDT via Interset is useless. + # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. 
+ if node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): self.check_required_features(node, ['Aspect', 'VerbForm']) @@ -575,11 +690,22 @@ def process_node(self, node): }) # ADVERBS ############################################################## elif node.upos == 'ADV': - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] - }) + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': self.check_required_features(node, ['AdpType', 'Case']) From 9d8dc3b0568596a1d8ff16dc9f54504ea38a583a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:08:32 +0200 Subject: [PATCH 0419/1201] Fix spurious auxiliaries in Kazakh. --- udapi/block/ud/kk/fixspuriousaux.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 udapi/block/ud/kk/fixspuriousaux.py diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py new file mode 100644 index 00000000..8a1e06c8 --- /dev/null +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -0,0 +1,24 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Kazakh.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. 
+ """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # баста = start + if re.match(r'^(баста)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node From f6cd84c05eb10f2acf47022afb6eaa2ff4195c10 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:11:34 +0200 Subject: [PATCH 0420/1201] Bug fix. --- udapi/block/ud/kk/fixspuriousaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py index 8a1e06c8..a2ba777c 100644 --- a/udapi/block/ud/kk/fixspuriousaux.py +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -22,3 +22,6 @@ def process_node(self, node): for c in lexverb.children: if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' From 0dbb2cf5d8897f5463dcc712049491d94206eddc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 13:20:04 +0200 Subject: [PATCH 0421/1201] =?UTF-8?q?=D0=BA=D0=B5=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/kk/fixspuriousaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py index a2ba777c..044ff178 100644 --- a/udapi/block/ud/kk/fixspuriousaux.py +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -12,7 +12,7 @@ def process_node(self, node): """ if node.upos == 'AUX' and node.udeprel == 'aux': # баста = start - if re.match(r'^(баста)$', node.lemma): + if re.match(r'^(баста|кет)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 59a1f6c57fc6d2913fc06e4f6495f105004c2bdc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 May 2022 18:09:03 +0200 Subject: [PATCH 0422/1201] A block to remove multi-word token if it contains spaces and if its words match the space-delimited segments. --- udapi/block/ud/fixmwtspace.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 udapi/block/ud/fixmwtspace.py diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py new file mode 100644 index 00000000..a2b7b875 --- /dev/null +++ b/udapi/block/ud/fixmwtspace.py @@ -0,0 +1,22 @@ +""" +Block ud.FixMwtSpace looks for multiword tokens whose form contains a space, +which should be avoided. If found, the block checks whether it can remove +the multiword token seamlessly, that is, whether the syntactic words correspond +to the space-delimited parts of the multiword token. If possible, the MWT +line will be removed. 
+""" +from udapi.core.block import Block +import re + + +class FixMwtSpace(Block): + """Try to remove multiword tokens with spaces.""" + + def process_node(self, node): + if node.multiword_token: + mwt = node.multiword_token + if re.search(r' ', mwt.form): + if node == mwt.words[0]: + wordforms = [x.form for x in mwt.words] + if ' '.join(wordforms) == mwt.form: + mwt.remove() From 29c48fa5a9002e37d14fe610d942586a06c9d328 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 16:51:48 +0200 Subject: [PATCH 0423/1201] Restore forms of words within multiword tokens. --- udapi/block/ud/mr/addformsinmwt.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 udapi/block/ud/mr/addformsinmwt.py diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py new file mode 100644 index 00000000..b468fb04 --- /dev/null +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -0,0 +1,27 @@ +""" +Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms. +Based on the form of the surface token and on the information provided in +the lemmas and UPOS, tries to reconstruct the forms of individual words. +""" +from udapi.core.block import Block +import re +import logging + + +class AddFormsInMwt(Block): + """Guess forms of syntactic worms within a multiword token.""" + + def process_node(self, node): + if node.form == '_' and node.multiword_token: + mwt = node.multiword_token + # Many multiword tokens consist of NOUN + ADP. Beware: The adposition + # may have a form different from its lemma. It happens with possessive + # postpositions चा, चे, which distinguish the gender and number of + # the possessed entity. + if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': + if mwt.form == mwt.words[0].lemma + mwt.words[1].lemma: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma for x in mwt.words]))) From d0e75c1f8fc89337b6ca87eb4b41b675a2eec3fd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 16:59:38 +0200 Subject: [PATCH 0424/1201] Improved decomposition of X+ADP. --- udapi/block/ud/mr/addformsinmwt.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index b468fb04..26110fea 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,8 +19,12 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': - if mwt.form == mwt.words[0].lemma + mwt.words[1].lemma: - node.form = node.lemma + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: From 788830b3fe3d1cd963498080b019bb7c675a6bef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:07:02 +0200 Subject: [PATCH 0425/1201] =?UTF-8?q?=E0=A4=9A=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 26110fea..be290c50 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,13 +19,23 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': - m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) - if m: - if node == mwt.words[0]: - node.form = m.group(1) + if mwt.words[1].lemma == 'चा': + m = re.match(r'^(.+)(चा|चे)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) else: - node.form = node.lemma - else: - logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'ca' + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: - logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma for x in mwt.words]))) + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From d5495edaf163f6e258147fee49f9f4bc8bc18fa0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:09:19 +0200 Subject: [PATCH 0426/1201] =?UTF-8?q?=E0=A4=9A=E0=A5=8D=E0=A4=AF=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index be290c50..5fcb9866 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,7 +20,7 @@ def process_node(self, node): # the possessed entity. 
if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे)$', mwt.form) + m = re.match(r'^(.+)(चा|चे|च्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 20e33c965cc18a1ca4f6664323ef0e167978e5f8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:11:39 +0200 Subject: [PATCH 0427/1201] =?UTF-8?q?=E0=A4=9A=E0=A5=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 5fcb9866..903409f0 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,7 +20,7 @@ def process_node(self, node): # the possessed entity. if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे|च्या)$', mwt.form) + m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From fc5b262e7b65d8911a80f3409e28badd714f9b03 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:23:28 +0200 Subject: [PATCH 0428/1201] Particle "ca". --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 903409f0..44802762 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -18,7 +18,7 @@ def process_node(self, node): # may have a form different from its lemma. It happens with possessive # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. - if len(mwt.words) == 2 and mwt.words[1].upos == 'ADP': + if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) if m: From 31e99b45980823b9bb3f0a2a82b6bd40872b0801 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:45:32 +0200 Subject: [PATCH 0429/1201] Possessive pronouns. --- udapi/block/ud/mr/addformsinmwt.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 44802762..7efe1e72 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -20,12 +20,25 @@ def process_node(self, node): # the possessed entity. if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): if mwt.words[1].lemma == 'चा': - m = re.match(r'^(.+)(चा|चे|च्या|ची)$', mwt.form) + # चा (cā) ... Masc Sing + # ची (cī) ... Fem Sing, Neut Plur + # चे (ce) ... Neut Sing, Masc Plur + # च्या (cyā) ... Fem Plur + m = re.match(r'^(.+)(चा|ची|चे|च्या)$', mwt.form) + # The resulting form is different with personal pronouns. + # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) + # तुझी (tujhī), तुझे (tujhe) + m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) else: node.form = m.group(2) + elif m2: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = 'च' + m.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'ca' From 3a33d279bd2e2b3a99c325d4921a314941217c98 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 17:46:23 +0200 Subject: [PATCH 0430/1201] Bug fix. --- udapi/block/ud/mr/addformsinmwt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7efe1e72..e3fcafe5 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -36,9 +36,9 @@ def process_node(self, node): node.form = m.group(2) elif m2: if node == mwt.words[0]: - node.form = m.group(1) + node.form = m2.group(1) else: - node.form = 'च' + m.group(2) + node.form = 'च' + m2.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'ca' From a454721a0efbf19adfa2e973aaa03608052fc665 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:13:28 +0200 Subject: [PATCH 0431/1201] =?UTF-8?q?=E0=A4=9A=E0=A4=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index e3fcafe5..abf538d8 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -24,10 +24,12 @@ def process_node(self, node): # ची (cī) ... Fem Sing, Neut Plur # चे (ce) ... Neut Sing, Masc Plur # च्या (cyā) ... Fem Plur - m = re.match(r'^(.+)(चा|ची|चे|च्या)$', mwt.form) + # चं (caṁ) ... ? + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form) # The resulting form is different with personal pronouns. # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) # तुझी (tujhī), तुझे (tujhe) + # त्याचं (tyācaṁ) m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: From 59fb2e823e9b34679781422ef9f8a73329d97c54 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:19:31 +0200 Subject: [PATCH 0432/1201] Fix wrong lemma. --- udapi/block/ud/mr/addformsinmwt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index abf538d8..7ecb64b4 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -19,7 +19,9 @@ def process_node(self, node): # postpositions चा, चे, which distinguish the gender and number of # the possessed entity. if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): - if mwt.words[1].lemma == 'चा': + # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'. + if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची': + mwt.words[1].lemma = 'चा' # चा (cā) ... Masc Sing # ची (cī) ... Fem Sing, Neut Plur # चे (ce) ... Neut Sing, Masc Plur From a2077ea36ff859ecee9f89445e000c933969d50c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:30:12 +0200 Subject: [PATCH 0433/1201] Three-word tokens. 
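
Marathi compound postpositions can surface as a single token of the shape
host + possessive + postposition, so the hunk below extends the block from
two-word to three-word tokens: one three-group regex carves the surface
form into the three word forms (with a second regex for the pronominal
possessive stems, as in the two-word case). Schematically, as a simplified
sketch (the function name is illustrative, not part of the block):

    import re

    def split_compound_postposition(mwt_form):
        # group(1) = host word, group(2) = a form of the possessive 'चा',
        # group(3) = the trailing postposition
        m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt_form)
        return m.groups() if m else None
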
--- udapi/block/ud/mr/addformsinmwt.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7ecb64b4..b899b55c 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -54,5 +54,26 @@ def process_node(self, node): node.form = node.lemma else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): + # Compound postpositions where the middle word is the possessive 'चा'. + if mwt.words[1].lemma == 'चा': + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)(.+)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + elif node == mwt.words[1]: + node.form = m.group(2) + else: + node.form = m.group(3) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + elif node == mwt.words[1]: + node.form = 'च' + m2.group(2) + else: + node.form = m2.group(3) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) else: logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From 3441331c8a4fa39c64651a6d8409007d61847aa8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:55:19 +0200 Subject: [PATCH 0434/1201] The honorific pronoun. --- udapi/block/ud/mr/addformsinmwt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index b899b55c..0ad7fded 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -31,8 +31,9 @@ def process_node(self, node): # The resulting form is different with personal pronouns. # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) # तुझी (tujhī), तुझे (tujhe) + # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā) # त्याचं (tyācaṁ) - m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) @@ -58,7 +59,7 @@ def process_node(self, node): # Compound postpositions where the middle word is the possessive 'चा'. if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) - m2 = re.match(r'^(माझ|तुझ)(ा|ी|े|्या)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 15cf8914cfd75a14eb88bf2ff6278ba7741637a7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 18:57:15 +0200 Subject: [PATCH 0435/1201] Bug fix. --- udapi/block/ud/mr/addformsinmwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 0ad7fded..f508076c 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -59,7 +59,7 @@ def process_node(self, node): # Compound postpositions where the middle word is the possessive 'चा'. 
if mwt.words[1].lemma == 'चा': m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) - m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) From 940a2c6dff3895c5cad7473dcaa566a47c157390 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 19:53:17 +0200 Subject: [PATCH 0436/1201] -vara. --- udapi/block/ud/mr/addformsinmwt.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index f508076c..2df0b2e3 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -46,7 +46,16 @@ def process_node(self, node): node.form = 'च' + m2.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) - else: # not the possessive 'ca' + elif mwt.words[1].lemma == 'वरती': + m = re.match(r'^(.+)वर$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = 'वर' + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'चा' m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) if m: if node == mwt.words[0]: From 001e7f2cb531540633f4c667eb0b111f78709a0c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 May 2022 19:54:49 +0200 Subject: [PATCH 0437/1201] =?UTF-8?q?+=20=E0=A4=A4=E0=A5=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/mr/addformsinmwt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 2df0b2e3..7077e665 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -47,12 +47,12 @@ def process_node(self, node): else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) elif mwt.words[1].lemma == 'वरती': - m = re.match(r'^(.+)वर$', mwt.form) + m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form) if m: if node == mwt.words[0]: node.form = m.group(1) else: - node.form = 'वर' + node.form = m.group(2) else: logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) else: # not the possessive 'चा' From d339c1030b6f8a6ad9675776f2795e6dfef88440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Barbora=20Dohnalov=C3=A1?= <71558316+kybersutr@users.noreply.github.com> Date: Tue, 31 May 2022 17:23:04 +0200 Subject: [PATCH 0438/1201] Entity setter (#108) * add possibility to change the entity of a mention * keep mentions sorted * remove text * self instead of mention * add warning and tests Co-authored-by: Kybersutr --- udapi/core/coref.py | 8 ++++++-- udapi/core/tests/test_coref.py | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index edd297b4..3eb76db3 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -103,6 +103,7 @@ import collections.abc import copy import logging +import bisect @functools.total_ordering class CorefMention(object): @@ -192,9 +193,12 @@ def entity(self): @entity.setter def entity(self, new_entity): if self._entity is not None: - raise NotImplementedError('changing the entity of a mention not supported yet') + original_entity = self._entity + original_entity._mentions.remove(self) + if not original_entity._mentions: + logging.warning(f"Original entity {original_entity.eid} is now empty.") self._entity = new_entity - new_entity._mentions.append(new_entity) + bisect.insort(new_entity._mentions, self) @property def bridging(self): diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index 369e8caf..e0998b75 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -48,6 +48,11 @@ def test_edits(self): self.assertEqual(new_entity.mentions[1], m1) self.assertTrue(m2 < m1) self.assertEqual(m2.words, [first_node, second_node, second_node.next_node]) + entity2 = doc.create_coref_entity() + m1.entity = entity2 + self.assertEqual(m1.entity.eid, entity2.eid) + m2.entity = entity2 + self.assertEqual(m2.entity.eid, entity2.eid) if __name__ == "__main__": From df5b371c7b1e17dbbb0b26cffe8f84243631c452 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Jun 2022 14:52:30 +0200 Subject: [PATCH 0439/1201] util.Eval empty_nodes=1 node='my code...' 
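A sketch of the intended usage (the input file name and the tiny code
snippet are illustrative only):

    udapy util.Eval empty_nodes=1 node='print(node.form)' < in.conllu

With the parameter set, the node= code iterates over
tree.descendants_and_empty instead of tree.descendants. The parameter
is off by default, which keeps the old behavior; setting it as above will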
allow processing empty nodes with util.Eval --- udapi/block/util/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index 0f80d018..df6aaabf 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,7 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - coref_mention=None, coref_entity=None, + coref_mention=None, coref_entity=None, empty_nodes=False, expand_code=True, **kwargs): super().__init__(**kwargs) self.doc = doc @@ -44,6 +44,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.after_bundle = after_bundle self.coref_mention = coref_mention self.coref_entity = coref_entity + self.empty_nodes = empty_nodes self.expand_code = expand_code self.count = collections.Counter() @@ -115,7 +116,8 @@ def process_tree(self, tree): exec(self.expand_eval_code(self.tree)) if self.node: - for node in tree.descendants(): + nodes = tree.descendants_and_empty if self.empty_nodes else tree.descendants + for node in nodes: this = node exec(self.expand_eval_code(self.node)) From 6e9320fbb2fdccdcfdfacc0c25d2659801b54ba6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 7 Jun 2022 10:16:59 +0200 Subject: [PATCH 0440/1201] Lemmatization of Cantonese. --- udapi/block/ud/yue/lemmatize.py | 43 +++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 udapi/block/ud/yue/lemmatize.py diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py new file mode 100644 index 00000000..87279dc1 --- /dev/null +++ b/udapi/block/ud/yue/lemmatize.py @@ -0,0 +1,43 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + '𡃁仔': '笭仔', + '仲': '重', + '企': '徛', + '係咪': '係', + '出嚟': '出唻', + '可': '可以', + '啦': '喇', + '㗎喇': '㗎嘑', + '喇': '嘑', + '嚟': '唻', + '就嚟': '就唻', + '死𡃁妹': '死笭妹', + '老豆': '老頭', + '蚊': '緡', + '蛋撻': '蛋澾', + '返嚟': '返唻', + '過嚟人': '過唻人', + '過嚟': '過唻' + } + + def process_node(self, node): + """ + Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + + For Cantonese, lemmatization includes normalization of some characters. + These are the few cases where lemma differs from the surface form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From 9c26f877cb0200f7d52b64e4563c6cdd9cc5e09a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Jun 2022 16:48:13 +0200 Subject: [PATCH 0441/1201] sanity check: cannot compare nodes from different documents When `node1` and `node2` are from different documents, it makes no sense to use `node1.precedes(node2)`. Sorting nodes from different docs using `precedes` could result in unexpected bugs because with the default Python `sort` even nodes from the same document may not be in the correct order (the relation was not transitive without the added sanity check). 
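A minimal sketch of the failure mode (document loading is omitted and
the bundle/tree access is abbreviated; `doc1` and `doc2` stand for two
separately loaded documents):

    node1 = doc1.bundles[0].get_tree().descendants[0]
    node2 = doc2.bundles[0].get_tree().descendants[0]
    node1.precedes(node2)
    # before: fell through to comparing bundle numbers, i.e. an arbitrary answer
    # now:    raises ValueError("Cannot compare word order across documents: ...")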
--- udapi/core/node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index ad36aa0a..63242698 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -692,6 +692,8 @@ def precedes(self, node): return self._ord < node._ord if self._root._zone != node._root._zone: raise ValueError(f"Cannot compare word order across zones: {self} {node}") + if self._root._bundle._document is not node._root._bundle._document: + raise ValueError(f"Cannot compare word order across documents: {self} {node}") return self._root._bundle.number < node._root._bundle.number def is_leaf(self): From 02cb25bba090e49dc32bd5f74161e3266740e500 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Jun 2022 01:00:03 +0200 Subject: [PATCH 0442/1201] 02-blocks.ipynb not finished yet --- tutorial/01-visualizing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb index 382bb11f..70bea240 100644 --- a/tutorial/01-visualizing.ipynb +++ b/tutorial/01-visualizing.ipynb @@ -526,7 +526,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the next tutorial, [02-blocks.ipynb](02-blocks.ipynb), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + "In the next tutorial, 02-blocks.ipynb (not finished yet), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." ] } ], From 5216c81c600fd280867e5fd101ef05a87658c26a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Jun 2022 01:00:51 +0200 Subject: [PATCH 0443/1201] 02-blocks.ipynb not finished yet --- tutorial/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/README.md b/tutorial/README.md index 05e96d59..425f7df5 100644 --- a/tutorial/README.md +++ b/tutorial/README.md @@ -6,4 +6,4 @@ Don't display the tutorial `ipynb` files on GitHub because it cannot render the If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: - [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) -- [02-blocks.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-blocks.ipynb) +- 02-blocks.ipynb (not finished yet) From b193c2fb5d41c075fc21b194b4ddb5b88d42a6d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:17:23 +0200 Subject: [PATCH 0444/1201] Fix spurious auxiliaries in Uyghur. --- udapi/block/ud/ug/fixspuriousaux.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 udapi/block/ud/ug/fixspuriousaux.py diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py new file mode 100644 index 00000000..2ac6adc2 --- /dev/null +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -0,0 +1,27 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Uyghur.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. 
+ """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # بەر = give (used with actions done for the benefit of somebody) + if re.match(r'^(بەر)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' From 42cdc02d3b50ecf4d70ed68d510c9d81635c41a5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:28:33 +0200 Subject: [PATCH 0445/1201] =?UTF-8?q?=DA=86=D9=89=D9=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 2ac6adc2..2f2d779c 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -12,7 +12,8 @@ def process_node(self, node): """ if node.upos == 'AUX' and node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) - if re.match(r'^(بەر)$', node.lemma): + # چىق = go out + if re.match(r'^(بەر|چىق)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 8a5e45f6df8f7a9770003c979180e66d274efc12 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:49:14 +0200 Subject: [PATCH 0446/1201] =?UTF-8?q?=D9=8A=DB=88=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 2f2d779c..b770edcf 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -13,7 +13,8 @@ def process_node(self, node): if node.upos == 'AUX' and node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) # چىق = go out - if re.match(r'^(بەر|چىق)$', node.lemma): + # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + if re.match(r'^(بەر|چىق|يۈر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From c580385d1147590bb5fc43bca925f001cd623a04 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 10 Jun 2022 22:55:15 +0200 Subject: [PATCH 0447/1201] =?UTF-8?q?=D8=A6=D9=88=D9=84=D8=AA=DB=87=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index b770edcf..c03a0e5a 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -14,7 +14,8 @@ def process_node(self, node): # بەر = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - if re.match(r'^(بەر|چىق|يۈر)$', node.lemma): + # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 77024b3ee2b8648c8737dfcaecd64c1b08c78220 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:04:06 +0200 Subject: [PATCH 0448/1201] =?UTF-8?q?=D8=A8=D8=A7=D9=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index c03a0e5a..8eae5f19 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -15,7 +15,8 @@ def process_node(self, node): # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر)$', node.lemma): + # باق = to do ever? + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 8bc5af7019a62220f7480b9f13abccabff801ab6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:08:17 +0200 Subject: [PATCH 0449/1201] It does not work if we require upos=AUX. --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 8eae5f19..8ea1227e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -10,7 +10,9 @@ def process_node(self, node): Some verbs that are called auxiliary by the traditional grammar, should be analyzed in UD as VERB + non-finite xcomp. """ - if node.upos == 'AUX' and node.udeprel == 'aux': + # Sometimes there is a double error: it should not be auxiliary, it is + # attached as aux but it is not tagged AUX. So we only look at the deprel. 
+ if node.udeprel == 'aux': # بەر = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) From b679662a77d3f8735521a57f2190512a8f272a70 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:18:53 +0200 Subject: [PATCH 0450/1201] _ --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 8ea1227e..78cb86ec 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -18,7 +18,9 @@ def process_node(self, node): # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = to do ever? - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق)$', node.lemma): + # ئۆت = pass + # _ ... some putative auxiliaries do not even have a lemma + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 9003c324882539ba6201a7a18be22a08328d2c17 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 10:57:43 +0200 Subject: [PATCH 0451/1201] =?UTF-8?q?=D8=A8=D8=A7=D8=B4=D9=84=D9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 78cb86ec..da40074e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -19,8 +19,10 @@ def process_node(self, node): # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = to do ever? # ئۆت = pass + # كۆرۈش = see + # باشلى = start # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From d97891b9053ad9e2f7bd14337e43e579768688a0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:01:08 +0200 Subject: [PATCH 0452/1201] =?UTF-8?q?=D8=A8=D8=A7=D8=B4=D9=84=D9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index da40074e..4e620a2e 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -22,7 +22,7 @@ def process_node(self, node): # كۆرۈش = see # باشلى = start # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From be9d3675600e7884ca77835412d392c8a6daf817 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:10:28 +0200 Subject: [PATCH 0453/1201] =?UTF-8?q?=D9=8A=DB=95=D8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 4e620a2e..7bc8f546 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -21,8 +21,9 @@ def process_node(self, node): # ئۆت = pass # كۆرۈش = see # باشلى = start + # يەت = be enough # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From dc347add2e736251b4822578dfc4566110e8b834 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:16:04 +0200 Subject: [PATCH 0454/1201] =?UTF-8?q?=D9=82=D8=A7=D9=8A=D8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 7bc8f546..4cc038dc 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -17,13 +17,14 @@ def process_node(self, node): # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) - # باق = to do ever? + # باق = do ever? # ئۆت = pass # كۆرۈش = see # باشلى = start # يەت = be enough + # قايت = return # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From a6880f6a483d7f129159cd5a1f48f88e9a03dcbf Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:21:52 +0200 Subject: [PATCH 0455/1201] =?UTF-8?q?=DA=86=DB=88=D8=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 4cc038dc..9ccff72c 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -23,8 +23,9 @@ def process_node(self, node): # باشلى = start # يەت = be enough # قايت = return + # چۈش = fall down # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت)$', node.lemma): + if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From 470cd271e3812591f78f7ddf0d651d4571c46428 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:31:27 +0200 Subject: [PATCH 0456/1201] =?UTF-8?q?=D8=A8=D8=A7=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index 9ccff72c..bfbf8816 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -13,7 +13,7 @@ def process_node(self, node): # Sometimes there is a double error: it should not be auxiliary, it is # attached as aux but it is not tagged AUX. So we only look at the deprel. if node.udeprel == 'aux': - # بەر = give (used with actions done for the benefit of somebody) + # بەر/بار = give (used with actions done for the benefit of somebody) # چىق = go out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) @@ -25,7 +25,7 @@ def process_node(self, node): # قايت = return # چۈش = fall down # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 558bf49fe425f477671034eaef69c0bc67de8253 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:36:37 +0200 Subject: [PATCH 0457/1201] =?UTF-8?q?=D9=82=D9=89=D9=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index bfbf8816..dc06ade7 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -24,8 +24,9 @@ def process_node(self, node): # يەت = be enough # قايت = return # چۈش = fall down + # قىل = do # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. 
lexverb = node.parent From a9e39f6fe8c7dd2ef3f8d77d188b77f9ce0b073e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 11 Jun 2022 11:49:11 +0200 Subject: [PATCH 0458/1201] =?UTF-8?q?=D9=83=D9=89=D8=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/ug/fixspuriousaux.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py index dc06ade7..952644f8 100644 --- a/udapi/block/ud/ug/fixspuriousaux.py +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -15,6 +15,7 @@ def process_node(self, node): if node.udeprel == 'aux': # بەر/بار = give (used with actions done for the benefit of somebody) # چىق = go out + # چىقىش = come out # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) # باق = do ever? @@ -25,8 +26,12 @@ def process_node(self, node): # قايت = return # چۈش = fall down # قىل = do + # چاپ = jump + # قورق = fear + # كەلتۈر = cause + # كىر = enter # _ ... some putative auxiliaries do not even have a lemma - if re.match(r'^(بەر|بار|چىق|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل)$', node.lemma): + if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma): node.upos = 'VERB' # The auxiliary inherits the incoming relation of its original parent. lexverb = node.parent From 56b9dc693f1a928a719ae98eae253321bbdbf2a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 13 Jun 2022 20:37:38 +0200 Subject: [PATCH 0459/1201] even single-sentence docs should have `meta["loaded_from"]` --- udapi/core/basereader.py | 1 + udapi/core/tests/test_coref.py | 1 + 2 files changed, 2 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 9210b910..53a1129c 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -177,6 +177,7 @@ def process_document(self, document): if root.newdoc and root.newdoc is not True: document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity + document.meta['loaded_from'] = self.filename filehandle = self.filehandle if filehandle is None: diff --git a/udapi/core/tests/test_coref.py b/udapi/core/tests/test_coref.py index e0998b75..8952d6d8 100755 --- a/udapi/core/tests/test_coref.py +++ b/udapi/core/tests/test_coref.py @@ -21,6 +21,7 @@ def test_load(self): self.assertEqual(len(node.coref_entities), 1) self.assertEqual(len(node.coref_mentions), 1) self.assertEqual(node.coref_entities[0], coref_entities[0]) + self.assertEqual(docs[-1].meta["loaded_from"], data_filename) def test_edits(self): data_filename = os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu') From 3df722c2b9f63732efab41aa095a4922b98b088b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 14 Jun 2022 13:22:34 +0200 Subject: [PATCH 0460/1201] We no longer use a global list of required/allowed features per UPOS. 
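The tables were too coarse: which features are required or allowed
depends on the values of other features (case, number, gender, ...),
not only on the UPOS tag. The checks are instead built in code inside
process_node(); schematically, for nouns (the lists are copied from the
removed table):

    if node.upos == 'NOUN':
        self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity'])
        self.check_allowed_features(node, {
            'Gender': ['Masc', 'Fem', 'Neut'],
            'Animacy': ['Anim', 'Inan'],
            'Number': ['Sing', 'Dual', 'Plur'],
            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
            'Polarity': ['Pos', 'Neg'],
            'Foreign': ['Yes']})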
--- udapi/block/ud/cs/markfeatsbugs.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index e027d1cb..d7854982 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -9,36 +9,6 @@ class MarkFeatsBugs(Block): - allowed = { - 'NOUN': {'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}, - 'ADJ': {'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'NumType': ['Ord'], - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Tense': ['Pres', 'Past'], - 'Voice': ['Act', 'Pass'], - 'Foreign': ['Yes']} - } - - required = { - 'NOUN': ['Gender', 'Number', 'Case', 'Polarity'], - 'ADJ': ['Gender', 'Number', 'Case', 'Degree', 'Polarity'] - } - def bug(self, node, bugstring): bugs = [] if node.misc['Bug']: From 2d0425ac2deda455e42aadb88093171ba394d2d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 14 Jun 2022 22:18:20 +0200 Subject: [PATCH 0461/1201] Better checking of required and allowed features in Czech. --- udapi/block/ud/cs/markfeatsbugs.py | 327 ++++++++++++----------------- 1 file changed, 135 insertions(+), 192 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index d7854982..b7b6c5b1 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -9,6 +9,18 @@ class MarkFeatsBugs(Block): + # The convention used in PDT is not consistent. Adjectives are fully disambiguated + # (three genders, two animacies, three numbers, seven cases), even though some + # forms are shared among many feature combinations. On the other hand, pronouns + # and determiners omit some features in the context of certain values of other + # features (e.g., gender and animacy are not distinguished in plural if the case + # is genitive, dative, locative or instrumental). + # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like + # adjectives. + # Here we can trigger one of the two conventions. It should become a block parameter + # in the future. + pdt20 = False # True = like in PDT 2.0; False = like in ČNK + def bug(self, node, bugstring): bugs = [] if node.misc['Bug']: @@ -231,60 +243,29 @@ def process_node(self, node): 'Variant': ['Short'] }) else: # not reflexive - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - if node.feats['Person'] == '3': - if re.match(r'^(Nom|Voc)$', node.feats['Case']): - self.check_required_features(node, ['Gender']) - # In PDT, animacy of personal pronouns is distinguished only for Person=3 Case=Nom Gender=Masc Number=Plur ('oni' vs. 'ony'). - # So we will neither require nor allow it in singular and dual. 
- if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Voc'] - }) - else: # on, ona, ono, ony (Fem Plur) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Voc'] - }) - else: # non-nominatives also have PrepCase + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně # Mostly only two gender groups and no animacy: # Masc,Neut ... jeho, jemu, jej, něm, jím # Fem ... jí, ji, ní # Neut ... je - self.check_required_features(node, ['PrepCase']) - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['Gender']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc,Neut', 'Fem', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) # No gender in dual and plural: # Plur ... jich, jim, je, nich, jimi - else: - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # 1st and 2nd person do not have gender + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) self.check_allowed_features(node, { 'PronType': ['Prs'], - 'Person': ['1', '2', '3'], + 'Person': ['1', '2'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] @@ -337,44 +318,10 @@ def process_node(self, node): # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even # in the nominative, although there is no prepositional counter- # part (but similarly the locative has no prepositionless form). 
- if node.feats['Case'] == 'Nom': - if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # not Masc Plur - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # not Case=Nom - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # non-nominative dual or plural: jichž, nichž, jimž, nimž, jež, něž, jimiž, nimiž - self.check_required_features(node, ['PronType', 'Number', 'Case', 'PrepCase']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'PrepCase': ['Npr', 'Pre'] - }) + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) else: # What remains is the relative pronoun 'an'. It behaves similarly # to 'jenž' but it does not have the PrepCase feature and it @@ -439,107 +386,15 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - # Gender is annotated in all cases in singular (můj, má, mé) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc mí, mé, má; Acc mé, má). Animacy is distinguished - # in plural if gender is distinguished and masculine; in - # singular it is distinguished only in accusative (mého, můj). - # Other cases in plural are gender-less (mých, mým, mými). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. 
- if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - }) - else: - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - self.check_required_features(node, ['PronType', 'Poss', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - }) - else: - self.check_required_features(node, ['PronType', 'Poss', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'], # only if not reflexive - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - }) + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'], + 'Person': ['1', '2'], # only if not reflexive + 'Number[psor]': ['Sing', 'Plur'] # only if not reflexive + }) else: - # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished - # in plural if gender is distinguished and masculine; in - # singular it is distinguished only in accusative (toho, ten). - # Other cases in plural are gender-less (těch, těm, těmi). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. - if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - }) - else: - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - }) - else: - self.check_required_features(node, ['PronType', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - }) + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -568,14 +423,25 @@ def process_node(self, node): }) elif re.match(r'^(dva|oba)$', node.lemma): self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { @@ -686,3 +552,80 @@ def process_node(self, node): # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_featurs = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. 
+ if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) From 84334251ee1954fb42ec8cb570a38800262b68d7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 15 Jun 2022 10:12:55 +0200 Subject: [PATCH 0462/1201] Distinguish reflexive and irreflexive possessives. 
--- udapi/block/ud/cs/markfeatsbugs.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index b7b6c5b1..78acc6f8 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -386,13 +386,19 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_adjective_like(node, ['PronType', 'Poss'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'], - 'Person': ['1', '2'], # only if not reflexive - 'Number[psor]': ['Sing', 'Plur'] # only if not reflexive - }) + if node.feats['Reflex'] == 'Yes': + self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) else: self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) # NUMERALS ############################################################# From 4facb6d760aa0e0aaacf55f4b699076bf39b2cb8 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 15 Jun 2022 14:26:18 +0200 Subject: [PATCH 0463/1201] Animacy only for masculine gender, also for participles. --- udapi/block/ud/cs/markfeatsbugs.py | 36 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 78acc6f8..11ecd6d9 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -508,24 +508,36 @@ def process_node(self, node): 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist }) elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Polarity': ['Pos', 'Neg'] - }) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) else: # converb self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], 'VerbForm': ['Conv'], 'Tense': ['Past', 'Pres'], - 'Voice': 
['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Voice': ['Act'], 'Number': ['Sing', 'Dual', 'Plur'], 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy 'Polarity': ['Pos', 'Neg'] From 3feff237f673d9a65434af5241e3cb317b6c4820 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Jun 2022 11:24:49 +0200 Subject: [PATCH 0464/1201] more Windows friendly --- udapi/core/resource.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/core/resource.py b/udapi/core/resource.py index 9e5923f1..ae7320c6 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -12,7 +12,9 @@ def require_file(path): raise IOError(path + " does not exist") return os.path.abspath(path) udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) - full_path = udapi_data + '/' + path + if udapi_data is None: + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={os.environ.get('HOME')}") + full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) os.makedirs(os.path.dirname(full_path), exist_ok=True) From 4178f18d5d7527c23eea23126d8d93de0087208c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Jun 2022 01:51:32 +0200 Subject: [PATCH 0465/1201] os.environ.get('HOME') does not work on Windows --- udapi/core/resource.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/core/resource.py b/udapi/core/resource.py index ae7320c6..da2ba561 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -2,6 +2,7 @@ import logging import urllib.request import os +from os.path import expanduser BASEURL = 'http://ufallab.ms.mff.cuni.cz/tectomt/share/data/' @@ -11,9 +12,9 @@ def require_file(path): if not os.path.isfile(path): raise IOError(path + " does not exist") return os.path.abspath(path) - udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) + udapi_data = os.environ.get('UDAPI_DATA', expanduser('~')) if udapi_data is None: - raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={os.environ.get('HOME')}") + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={expanduser('~')}") full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) From b3f5dca4c0dfbe3e4f2fb6850a37fd41c745a0dd Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 27 Jul 2022 09:31:14 +0200 Subject: [PATCH 0466/1201] Catch more MWTs in Marathi. --- udapi/block/ud/mr/addformsinmwt.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py index 7077e665..bd63ee7d 100644 --- a/udapi/block/ud/mr/addformsinmwt.py +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -66,7 +66,8 @@ def process_node(self, node): logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): # Compound postpositions where the middle word is the possessive 'चा'. - if mwt.words[1].lemma == 'चा': + # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'. 
+ if re.match(r'^(चा|च्या)$', mwt.words[1].lemma): m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) if m: @@ -74,6 +75,7 @@ def process_node(self, node): node.form = m.group(1) elif node == mwt.words[1]: node.form = m.group(2) + node.lemma = 'चा' else: node.form = m.group(3) elif m2: @@ -81,9 +83,12 @@ def process_node(self, node): node.form = m2.group(1) elif node == mwt.words[1]: node.form = 'च' + m2.group(2) + node.lemma = 'चा' else: node.form = m2.group(3) else: logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[2].lemma)) else: logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) From db6ae9ba76de22af008021db6774e9e1c0db26ab Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:09:48 +0200 Subject: [PATCH 0467/1201] Added a block to fix certain instances of duplicate subjects in Danish. --- udapi/block/ud/da/fixmultisubject.py | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 udapi/block/ud/da/fixmultisubject.py diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py new file mode 100644 index 00000000..7307f0cf --- /dev/null +++ b/udapi/block/ud/da/fixmultisubject.py @@ -0,0 +1,56 @@ +""" +Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates +that have more than one subject dependent. +""" +from udapi.core.block import Block + +class FixMultiSubject(Block): + """ + Make sure that a predicate has at most one subject. Note that it can + only fix instances that follow a certain pattern observed in the Danish + data. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)] + if len(subjects) > 1: + # Pattern 1: A node is attached as xcomp to the current node, and + # one of the subjects is closer to that xcomp than to the current + # node. + xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + if len(subjects) == 2 and len(xcompchildren) == 1: + xcompnode = xcompchildren[0] + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp?
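For illustration, a toy numeric walk-through of the distance test above, with assumed word positions rather than real Danish data:

# Predicate at position 8, its xcomp at 3; the two subjects at 2 and 9.
node_ord, xcomp_ord = 8, 3
subj_ords = [2, 9]
dn = [abs(node_ord - s) for s in subj_ords]   # [6, 1]
dx = [abs(xcomp_ord - s) for s in subj_ords]  # [1, 6]
# dx[0] < dn[0] and dn[1] < dx[1], so the first subject moves to the xcomp.
assert dx[0] < dn[0] and dn[1] < dx[1]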
+ elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + +def dist(x, y): + d = x.ord - y.ord + if d < 0: + d = -d + return d From 4c844704dbe939bc83096950e5bdfa839174b021 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:11:33 +0200 Subject: [PATCH 0468/1201] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 7307f0cf..a6709718 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -3,6 +3,7 @@ that have more than one subject dependent. """ from udapi.core.block import Block +import re class FixMultiSubject(Block): """ From a3a8b808b93ecbbd01f616b37f31a046b417b945 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 11:13:25 +0200 Subject: [PATCH 0469/1201] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index a6709718..90ab6b7b 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -30,7 +30,7 @@ def process_node(self, node): subjects[0].parent = xcompnode # There are typically other dependents that should belong to the xcomp node. for c in node.children: - if dist(xcompnode, c) < dist(node, c): + if c != xcompnode and dist(xcompnode, c) < dist(node, c): c.parent = xcompnode # The xcompnode should probably be attached as something else # than xcomp, perhaps even the direction of the relation should @@ -43,7 +43,7 @@ def process_node(self, node): subjects[1].parent = xcompnode # There are typically other dependents that should belong to the xcomp node. for c in node.children: - if dist(xcompnode, c) < dist(node, c): + if c != xcompnode and dist(xcompnode, c) < dist(node, c): c.parent = xcompnode # The xcompnode should probably be attached as something else # than xcomp, perhaps even the direction of the relation should From 575440891e7a4e49de2364b7524e7f6b46b3e95b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:12:53 +0200 Subject: [PATCH 0470/1201] Another pattern for duplicate subjects in Danish. --- udapi/block/ud/da/fixmultisubject.py | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 90ab6b7b..37fe5f13 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -19,6 +19,9 @@ def process_node(self, node): # one of the subjects is closer to that xcomp than to the current # node. xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and # possibly not so many other mis-attached dependents.
+ advclchildren = [x for x in node.children if x.udeprel == 'advcl'] if len(subjects) == 2 and len(xcompchildren) == 1: xcompnode = xcompchildren[0] dn = [dist(node, x) for x in subjects] @@ -49,6 +52,20 @@ def process_node(self, node): # than xcomp, perhaps even the direction of the relation should # be reversed, but one would have to resolve this manually. xcompnode.misc['ToDo'] = 'check-xcomp' + elif len(subjects) == 2 and len(advclchildren) == 1: + advclnode = advclchildren[0] + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = advclnode def dist(x, y): d = x.ord - y.ord From d643b8d05eb698e1443b84321f2fbad4a99291ff Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:14:31 +0200 Subject: [PATCH 0471/1201] Bug fix. --- udapi/block/ud/da/fixmultisubject.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 37fe5f13..142a51d9 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -55,7 +55,7 @@ def process_node(self, node): elif len(subjects) == 2 and len(advclchildren) == 1: advclnode = advclchildren[0] dn = [dist(node, x) for x in subjects] - dx = [dist(xcompnode, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] # Is the first subject closer to advcl than it is to the current node? # At the same time, is the second subject closer to the current node than it is to advcl? if dx[0] < dn[0] and dn[1] < dx[1]: From 7ca292f2282c6627eff720316707f2ab66b0b519 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 13:19:53 +0200 Subject: [PATCH 0472/1201] Improvement: The problematic predicate can now have multiple advcl dependents. --- udapi/block/ud/da/fixmultisubject.py | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 142a51d9..453bc1c0 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -52,20 +52,22 @@ def process_node(self, node): # than xcomp, perhaps even the direction of the relation should # be reversed, but one would have to resolve this manually. xcompnode.misc['ToDo'] = 'check-xcomp' - elif len(subjects) == 2 and len(advclchildren) == 1: - advclnode = advclchildren[0] - dn = [dist(node, x) for x in subjects] - dx = [dist(advclnode, x) for x in subjects] - # Is the first subject closer to advcl than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to advcl? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the advcl node. - subjects[0].parent = advclnode - # Is the second subject closer to advcl than it is to the current node? - # At the same time, is the first subject closer to the current node than it is to advcl? 
- elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = advclnode + elif len(subjects) == 2 and len(advclchildren) > 0: + for advclnode in advclchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + break + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = advclnode + break def dist(x, y): d = x.ord - y.ord From 551f2ed23bbec82bcbed9c7780a07f1a2e65f02c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:01:27 +0200 Subject: [PATCH 0473/1201] A more lenient approach to catching the pattern. --- udapi/block/ud/da/fixmultisubject.py | 62 ++++++++++++++-------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 453bc1c0..401c054a 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -22,36 +22,38 @@ def process_node(self, node): # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and # possibly not so many other mis-attached dependents. advclchildren = [x for x in node.children if x.udeprel == 'advcl'] - if len(subjects) == 2 and len(xcompchildren) == 1: - xcompnode = xcompchildren[0] - dn = [dist(node, x) for x in subjects] - dx = [dist(xcompnode, x) for x in subjects] - # Is the first subject closer to xcomp than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to xcomp? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the xcomp node. - subjects[0].parent = xcompnode - # There are typically other dependents that should belong to the xcomp node. - for c in node.children: - if c != xcompnode and dist(xcompnode, c) < dist(node, c): - c.parent = xcompnode - # The xcompnode should probably be attached as something else - # than xcomp, perhaps even the direction of the relation should - # be reversed, but one would have to resolve this manually. - xcompnode.misc['ToDo'] = 'check-xcomp' - # Is the second subject closer to xcomp than it is to the current node? - # At the same time, is the first subject closer to the current node than it is to xcomp? - elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = xcompnode - # There are typically other dependents that should belong to the xcomp node. - for c in node.children: - if c != xcompnode and dist(xcompnode, c) < dist(node, c): - c.parent = xcompnode - # The xcompnode should probably be attached as something else - # than xcomp, perhaps even the direction of the relation should - # be reversed, but one would have to resolve this manually. 
- xcompnode.misc['ToDo'] = 'check-xcomp' + if len(subjects) == 2 and len(xcompchildren) > 0: + for xcompnode in xcompchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] <= dn[0] and dn[1] <= dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp? + elif dx[1] <= dn[1] and dn[0] <= dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break elif len(subjects) == 2 and len(advclchildren) > 0: for advclnode in advclchildren: dn = [dist(node, x) for x in subjects] From 1f48111035d1cda1ec678af3ca4574087b11023d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:10:35 +0200 Subject: [PATCH 0474/1201] Taking commas into account when assessing node distance. --- udapi/block/ud/da/fixmultisubject.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 401c054a..a690d0ba 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -72,7 +72,20 @@ def process_node(self, node): break def dist(x, y): - d = x.ord - y.ord - if d < 0: - d = -d + if x.ord < y.ord: + a = x + b = y + else: + a = y + b = x + d = b.ord - a.ord + # Count the commas between the two nodes. A comma should be seen as increasing + # the distance of the nodes, that is, decreasing the probability that they + # are in the same clause. + nc = 0 + for i in a.root.descendants: + if i.ord > a.ord and i.ord < b.ord: + if i.form == ',': + nc += 1 + d += nc * 10 return d From a430ac7a458a7c169b6fd0ae2b996c4bb20d9db0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:22:54 +0200 Subject: [PATCH 0475/1201] New pattern: "amod" with a copula. --- udapi/block/ud/da/fixmultisubject.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index a690d0ba..8bd9333a 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -22,6 +22,10 @@ def process_node(self, node): # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and # possibly not so many other mis-attached dependents. 
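For illustration, the comma-weighted distance introduced above can be checked standalone (the toy sentence is an assumed example, not from the Danish data):

forms = ['Han', 'sagde', ',', 'at', 'hun', 'kom']  # assumed Danish-like tokens

def dist(i, j):
    """Ord distance plus 10 per comma strictly between positions i and j."""
    a, b = sorted((i, j))
    commas = sum(1 for k in range(a + 1, b) if forms[k - 1] == ',')
    return (b - a) + 10 * commas

print(dist(2, 5))  # 3 positions apart, one comma in between -> 13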
advclchildren = [x for x in node.children if x.udeprel == 'advcl'] + # Pattern 3: Instead of xcomp or advcl, there is a simple amod + # (under a verb!), in fact an adjective with a copula that should + # have been advcl. + amodchildren = [x for x in node.children if x.udeprel == 'amod'] if len(subjects) == 2 and len(xcompchildren) > 0: for xcompnode in xcompchildren: dn = [dist(node, x) for x in subjects] @@ -70,6 +74,32 @@ def process_node(self, node): # The second subject should be re-attached to the xcomp node. subjects[1].parent = advclnode break + elif len(subjects) == 2 and len(amodchildren) > 0: + for amodnode in amodchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break def dist(x, y): if x.ord < y.ord: From 8589f3b64bcc070030a243186027354fe96682b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 30 Jul 2022 17:28:46 +0200 Subject: [PATCH 0476/1201] Nouns with copula are "obl" not "amod". --- udapi/block/ud/da/fixmultisubject.py | 54 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py index 8bd9333a..e9367d46 100644 --- a/udapi/block/ud/da/fixmultisubject.py +++ b/udapi/block/ud/da/fixmultisubject.py @@ -24,8 +24,9 @@ def process_node(self, node): advclchildren = [x for x in node.children if x.udeprel == 'advcl'] # Pattern 3: Instead of xcomp or advcl, there is a simple amod # (under a verb!), in fact an adjective with a copula that should - # have been advcl. - amodchildren = [x for x in node.children if x.udeprel == 'amod'] + # have been advcl. Alternatively, the nonverbal clause is headed + # by a noun, and the deprel is obl instead of amod. + amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)] if len(subjects) == 2 and len(xcompchildren) > 0: for xcompnode in xcompchildren: dn = [dist(node, x) for x in subjects] @@ -76,30 +77,31 @@ def process_node(self, node): break elif len(subjects) == 2 and len(amodchildren) > 0: for amodnode in amodchildren: - dn = [dist(node, x) for x in subjects] - dx = [dist(amodnode, x) for x in subjects] - # Is the first subject closer to amod than it is to the current node? - # At the same time, is the second subject closer to the current node than it is to amod? - if dx[0] < dn[0] and dn[1] < dx[1]: - # The first subject should be re-attached to the advcl node. 
- subjects[0].parent = amodnode - amodnode.deprel = 'advcl' - # There are typically other dependents that should belong to the amod node. - for c in node.children: - if c != amodnode and dist(amodnode, c) < dist(node, c): - c.parent = amodnode - break - # Is the second subject closer to amod than it is to the current node? - # At the same time, is the first subject closer to the current node than it is to amod? - elif dx[1] < dn[1] and dn[0] < dx[0]: - # The second subject should be re-attached to the xcomp node. - subjects[1].parent = amodnode - amodnode.deprel = 'advcl' - # There are typically other dependents that should belong to the amod node. - for c in node.children: - if c != amodnode and dist(amodnode, c) < dist(node, c): - c.parent = amodnode - break + if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break def dist(x, y): if x.ord < y.ord: From dcc759cebef6d2b2343f64a65524b60e182a0570 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 1 Aug 2022 01:33:22 +0200 Subject: [PATCH 0477/1201] bugfix in create_block _import_blocks now returns pairs (name, instance), but we need just the instance --- udapi/core/basewriter.py | 2 +- udapi/core/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cc72c6e7..cdc2c38f 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,7 +66,7 @@ def before_process_document(self, document): logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: - logging.warning('overwrite=1 but documet.meta["loaded_from"] is None') + logging.warning('overwrite=1 but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: diff --git a/udapi/core/run.py b/udapi/core/run.py index c3a4ca6f..a0cc4a9a 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -176,4 +176,4 @@ def scenario_string(self): def create_block(block, **kwargs): """A factory function for creating new block instances (handy for IPython).""" blocks = _import_blocks([block], [kwargs]) - return blocks[0] + return blocks[0][1] From d08d549ff8b04abbd3645bb256a0279f97b1b625 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 16 Aug 2022 22:44:42 +0200 Subject: [PATCH 0478/1201] Spurious auxiliaries in Hindi and Urdu. 
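The core relabelling can be sketched on a stand-in node as follows (SimpleNamespace is used here only for illustration; the real block operates on udapi nodes, as in the diff below):

from types import SimpleNamespace

COMPOUND_LEMMAS = {'ले', 'दे'}  # abbreviated; the full lists are in the block

node = SimpleNamespace(lemma='ले', upos='AUX', deprel='aux')
if node.lemma in COMPOUND_LEMMAS and node.upos == 'AUX' and node.deprel == 'aux':
    node.deprel = 'compound'  # semantically light member of a verb-verb compound
    node.upos = 'VERB'        # no longer treated as an auxiliary
print(node.upos, node.deprel)  # VERB compound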
--- udapi/block/ud/hi/fixaux.py | 45 +++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 udapi/block/ud/hi/fixaux.py diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py new file mode 100644 index 00000000..c561d4ce --- /dev/null +++ b/udapi/block/ud/hi/fixaux.py @@ -0,0 +1,45 @@ +""" +Block to fix annotation of verbs that are currently treated as auxiliaries +but they should be treated as normal verbs instead. +""" +from udapi.core.block import Block +import logging +import re + +class FixAux(Block): + + def process_node(self, node): + # The following verbs appear in verb-verb compounds as the semantically + # less salient element: le (to take), de (to give), ḍāla (to throw), + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There + # are also jā (to go) and paṛa (to fall) but we do not list them here + # because they can also act as genuine auxiliaries. + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] + urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] + recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + hiphase = ['लग', 'चुक'] + urphase = ['لگ', 'چک'] + rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' + if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + node.deprel = 'compound' + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node From 562f840ac00be75f48fbd52f6b6dab0178af13e7 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:33:37 +0200 Subject: [PATCH 0479/1201] Fix lemma before fixing auxiliary. --- udapi/block/ud/hi/fixaux.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index c561d4ce..54a9bd83 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -9,6 +9,7 @@ class FixAux(Block): def process_node(self, node): + self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There @@ -17,6 +18,7 @@ def process_node(self, node): hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + # Control and raising verbs. 
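Since recompound is built from exact alternatives only, a plain membership test would be an equivalent and arguably simpler formulation (a sketch, not the committed code):

import re

compound_lemmas = {'ले', 'दे', 'डाल'}  # abbreviated list
pattern = r'^(' + '|'.join(compound_lemmas) + r')$'
for lemma in ('ले', 'खा'):  # 'खा' is an assumed non-member
    assert bool(re.match(pattern, lemma)) == (lemma in compound_lemmas)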
hiphase = ['लग', 'चुक'] urphase = ['لگ', 'چک'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' @@ -43,3 +45,15 @@ def process_node(self, node): # occurs with pseudocopulas: "I declare him handsome." if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): c.parent = node + + def fix_lemma(self, node): + """ + Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. If they + are tagged AUX, it means that either the validator fails to recognize a + correct auxiliary, or we fail here to recognize a spurious auxiliary that + must be fixed. + """ + if node.upos == 'AUX': + # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" + if node.lemma == 'لگا': + node.lemma = 'لگ' From 1a8e7c721fe67f9fcdee86938e65ef185e745c1c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:46:53 +0200 Subject: [PATCH 0480/1201] =?UTF-8?q?=DA=86=D8=A7=DB=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 54a9bd83..f6507840 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -54,6 +54,12 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # چاہ is a wrong lemmatization of چاہتی, which is a wrong spelling of چاہیئے (cāhie) "should" + if node.lemma == 'چاہ': + node.lemma = 'چاہیئے' + if node.form == 'چاہتی': + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = 'چاہیئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 7aae1ba2f90630e1fcf84c2c33208c61ed1f726d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 11:58:16 +0200 Subject: [PATCH 0481/1201] =?UTF-8?q?=DA=86=D8=A7=DB=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f6507840..e390dc72 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -19,8 +19,10 @@ def process_node(self, node): urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. - hiphase = ['लग', 'चुक'] - urphase = ['لگ', 'چک'] + # چاہنا चाहना (cāhnā) "to want, to wish" is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) "should, ought to" (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + hiphase = ['लग', 'चुक', 'चाह'] + urphase = ['لگ', 'چک', 'چاہ'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' @@ -54,12 +56,6 @@ def fix_lemma(self, node): must be fixed. 
""" if node.upos == 'AUX': - # چاہ is a wrong lemmatization of چاہتی, which is a wrong spelling of چاہیئے (cāhie) "should" - if node.lemma == 'چاہ': - node.lemma = 'چاہیئے' - if node.form == 'چاہتی': - node.feats['Typo'] = 'Yes' - node.misc['CorrectForm'] = 'چاہیئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 47a7ef38c2c4c957b000743e8eed37681debf0f1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 12:09:53 +0200 Subject: [PATCH 0482/1201] =?UTF-8?q?=E0=A4=A6=E0=A4=BF=E0=A4=96=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e390dc72..f4cce0e4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -19,10 +19,11 @@ def process_node(self, node): urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. - # چاہنا चाहना (cāhnā) "to want, to wish" is a control verb but not an auxiliary. - # Its form چاہیئے (cāhie) "should, ought to" (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. - hiphase = ['लग', 'चुक', 'चाह'] - urphase = ['لگ', 'چک', 'چاہ'] + # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + # دکھانا दिखाना (dikhānā) “to show” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' From 5aff16b435f5baa0ecb1bb830fa57ae6e980c109 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 12:22:58 +0200 Subject: [PATCH 0483/1201] ... kar ke --- udapi/block/ud/hi/fixaux.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f4cce0e4..a1033d32 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -60,3 +60,8 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' + # The postposition ke after a verbal stem is not an auxiliary. + # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” + if node.lemma == 'کا' and node.form == 'کے': + node.upos = 'ADP' + node.deprel = 'mark' From 8acbf13ece8b280aee1ec785f61cf96f93fc3918 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:16:00 +0200 Subject: [PATCH 0484/1201] =?UTF-8?q?=DA=86=D8=A7=DB=81=D8=A6=DB=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index a1033d32..8f484546 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -57,6 +57,9 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) 
+ if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': + node.lemma = 'چاہئے' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From 2db0c9e11cb9eba23aacbe9142ae7123851a44fc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:26:54 +0200 Subject: [PATCH 0485/1201] Plural of cahie. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 8f484546..4967e9f9 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -60,6 +60,9 @@ def fix_lemma(self, node): # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': node.lemma = 'چاہئے' + if node.form == 'چاہئیں': + node.lemma = 'چاہئے' + node.feats['Number'] = 'Plur' # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" if node.lemma == 'لگا': node.lemma = 'لگ' From d7241a18b7b5d866bb7a72df95be7dcf6deb8bd5 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 13:35:03 +0200 Subject: [PATCH 0486/1201] =?UTF-8?q?=DA=AF=DB=8C=D8=A7=20is=20a=20perfect?= =?UTF-8?q?ive=20participle=20of=20=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j?= =?UTF-8?q?=C4=81n=C4=81)=20=E2=80=9Cto=20go=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 4967e9f9..5fa11356 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -63,7 +63,10 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 'Plur' - # لگا is a perfective participle of لگنا (lagnā) "to seem, to appear" + # گیا is a perfective participle of جانا‎ (jānā) “to go” + if node.lemma == 'گیا': + node.lemma = 'جا' + # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' # The postposition ke after a verbal stem is not an auxiliary. From 027fe04bb6f4195070fc536d3a26770b1b38197a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 14:50:00 +0200 Subject: [PATCH 0487/1201] =?UTF-8?q?Urdu=20pseudo-auxiliary=20=D9=BE?= =?UTF-8?q?=DA=BE=DB=8C=D9=86=DA=A9=20(phenk).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 5fa11356..a04ed04f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -11,12 +11,12 @@ class FixAux(Block): def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically - # less salient element: le (to take), de (to give), ḍāla (to throw), + # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There # are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. 
hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'بیٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. From 5a9f2b875c5195f78d8cd6a5e6ef9ff42d7d572e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:15:22 +0200 Subject: [PATCH 0488/1201] =?UTF-8?q?=E0=A4=B5=E0=A4=BE=E0=A4=B2=E0=A4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index a04ed04f..7b3b9c23 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -29,6 +29,13 @@ def process_node(self, node): node.deprel = 'compound' # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. node.upos = "VERB" + # والا (vālā) with infinitive is annotated as auxiliary but it should not. + # It is not even a verb (it does not have a verbal paradigm); it is more + # like an adjective morphologically, and like a noun syntactically. It means + # “the one who does the action of the content verb infinitive.” + elif node.lemma == 'वाला' or node.lemma == 'والا': + node.upos = 'ADJ' + node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent grandparent = secpred.parent From c1c3518d872c265391c6efdcbf35a7829c8cfe99 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:21:02 +0200 Subject: [PATCH 0489/1201] =?UTF-8?q?If=20v=C3=A1l=C3=A1=20is=20no=20longe?= =?UTF-8?q?r=20adposition,=20it=20cannot=20have=20AdpType.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 7b3b9c23..e843dd6f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -29,12 +29,15 @@ def process_node(self, node): node.deprel = 'compound' # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. node.upos = "VERB" - # والا (vālā) with infinitive is annotated as auxiliary but it should not. + # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not. # It is not even a verb (it does not have a verbal paradigm); it is more # like an adjective morphologically, and like a noun syntactically. It means # “the one who does the action of the content verb infinitive.” + # Some occurrences in the original annotation are case or mark, so we do not + # check AUX/aux here. 
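For illustration, the vālā relabelling amounts to the following, sketched on a stand-in node rather than a real udapi node:

from types import SimpleNamespace

node = SimpleNamespace(lemma='वाला', upos='AUX', deprel='aux')
if node.lemma in ('वाला', 'والا'):
    node.upos = 'ADJ'         # morphologically adjective-like
    node.deprel = 'compound'  # "the one who does the action of the infinitive"
print(node.upos, node.deprel)  # ADJ compound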
elif node.lemma == 'वाला' or node.lemma == 'والا': node.upos = 'ADJ' + node.feats['AdpType'] = '' node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent From 0e5f4909850ac14097ec1a70c625275e3402bd35 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:33:25 +0200 Subject: [PATCH 0490/1201] =?UTF-8?q?Features=20of=20v=C4=81l=C4=81.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e843dd6f..34566e06 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -38,6 +38,8 @@ def process_node(self, node): elif node.lemma == 'वाला' or node.lemma == 'والا': node.upos = 'ADJ' node.feats['AdpType'] = '' + node.feats['VerbForm'] = '' + node.feats['Aspect'] = '' node.deprel = 'compound' elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': secpred = node.parent From 783246b8c556823f7632fb0114ce4e0edbf2af94 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 15:42:37 +0200 Subject: [PATCH 0491/1201] =?UTF-8?q?=D8=B3=DA=A9=DB=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 34566e06..b18c074f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -81,6 +81,8 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + if node.lemma == 'سکے': + node.lemma = 'سک' # The postposition ke after a verbal stem is not an auxiliary. # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” if node.lemma == 'کا' and node.form == 'کے': From d357827545a125cb45e21d99670c309c30b85cd1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 16:08:49 +0200 Subject: [PATCH 0492/1201] =?UTF-8?q?=D8=A8=D9=86=D8=A7=20is=20not=20the?= =?UTF-8?q?=20lemma=20of=20=DA=A9=D8=B1.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index b18c074f..75e7f9bb 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -69,6 +69,10 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # بنانا बनाना “make, create, produce, cause to be/become” + # (I don't know why in some instances بنا was used as lemma for کر “to do”.) + if node.form == 'کر' and node.lemma == 'بنا': + node.lemma = 'کر' # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) 
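If the alternative spellings were additionally marked as typos, the usual convention, also used for چاہتی earlier in this series, would be Typo=Yes in FEATS plus CorrectForm in MISC; a stand-in sketch:

from types import SimpleNamespace

node = SimpleNamespace(form='چاہیے', feats={}, misc={})  # variant spelling
if node.form == 'چاہیے':
    node.feats['Typo'] = 'Yes'
    node.misc['CorrectForm'] = 'چاہئے'
print(node.feats, node.misc)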
if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': node.lemma = 'چاہئے' From 646b0c47d64c0977b2354a75e10bb4596538e941 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 20:19:00 +0200 Subject: [PATCH 0493/1201] =?UTF-8?q?=DA=86=DA=A9=D8=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 75e7f9bb..9d59195d 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -79,6 +79,9 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 'Plur' + # چکا is a perfective participle of چکنا (cuknā) “to be finished” + if node.lemma == 'چکا': + node.lemma = 'چک' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' From fc741a6edaabc0bdac2fd0a65293c7b561843519 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 20:27:20 +0200 Subject: [PATCH 0494/1201] =?UTF-8?q?=D8=B1=DB=81=D8=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 9d59195d..7b3de989 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -88,6 +88,10 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + # رہا is a perfective participle of رہنا (rahnā) “to stay” + if node.lemma == 'رہا': + node.lemma = 'رہ' + # sakna to be able to if node.lemma == 'سکے': node.lemma = 'سک' # The postposition ke after a verbal stem is not an auxiliary. From 2848129c7eb7ee0d3f081181b48949acbded312f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:40:14 +0200 Subject: [PATCH 0495/1201] =?UTF-8?q?Lemmatization=20of=20=D9=88=D8=A7?= =?UTF-8?q?=D9=84=DB=8C.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 7b3de989..0c811365 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -94,6 +94,10 @@ def fix_lemma(self, node): # sakna to be able to if node.lemma == 'سکے': node.lemma = 'سک' + # The compound part vālā is not an auxiliary. We handle it in process_node() + # but it must be lemmatized properly. + if node.lemma == 'والی': + node.lemma = 'والا' # The postposition ke after a verbal stem is not an auxiliary. 
# Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” if node.lemma == 'کا' and node.form == 'کے': From 51f6f1614b451969aeff200138cdfef8afcad1ce Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:52:20 +0200 Subject: [PATCH 0496/1201] =?UTF-8?q?Lemmatization=20of=20=DA=AF=D8=A7,=20?= =?UTF-8?q?=DA=AF=DB=8C,=20=DA=AF=DB=92.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 0c811365..da5d6b42 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,12 @@ def fix_lemma(self, node): # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' + # گا, گی, گے denote the future tense. They are written as separate + # words in Urdu (while they are just suffixes in Hindi). However, + # when written as a separate auxiliary, all these forms should share + # the same lemma. + if node.lemma == 'گی' or node.lemma = 'گے': + node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' From c11aa6703f0854a3f83238695656da106a88c74a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 22:53:31 +0200 Subject: [PATCH 0497/1201] Bug fix. --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index da5d6b42..1843e2de 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -86,7 +86,7 @@ def fix_lemma(self, node): # words in Urdu (while they are just suffixes in Hindi). However, # when written as a separate auxiliary, all these forms should share # the same lemma. - if node.lemma == 'گی' or node.lemma = 'گے': + if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': From 57dffd5625e94b369ac06410196ae7a3e7f3bd27 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 23:09:57 +0200 Subject: [PATCH 0498/1201] Lemmatization of lena, dena. --- udapi/block/ud/hi/fixaux.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1843e2de..00141be4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,9 @@ def fix_lemma(self, node): # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' + # دیا is a perfective participle of دینا (denā) “to give” + if node.lemma == 'دیا': + node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). 
However, # when written as a separate auxiliary, all these forms should share @@ -91,6 +94,9 @@ def fix_lemma(self, node): # گیا is a perfective participle of جانا‎ (jānā) “to go” if node.lemma == 'گیا': node.lemma = 'جا' + # لیا is a perfective participle of لینا (lenā) “to take” + if node.lemma == 'لیا': + node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' From d6a66df056d57b0fe24ded3770aa182b35ca0a3f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 21 Aug 2022 23:22:29 +0200 Subject: [PATCH 0499/1201] =?UTF-8?q?=D8=A7=D9=B9=DA=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 00141be4..28293ab6 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -16,7 +16,7 @@ def process_node(self, node): # are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -69,6 +69,9 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # اٹھنا “to rise, get up” + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' # بنانا बनाना “make, create, produce, cause to be/become” # (I don't know why in some instances بنا was used as lemma for کر “to do”.) if node.form == 'کر' and node.lemma == 'بنا': From 160d20df833612ccbb52b7b1a4586467b9051bb1 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 09:58:18 +0200 Subject: [PATCH 0500/1201] =?UTF-8?q?=D8=AF=DB=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 28293ab6..5a4351e1 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -86,7 +86,7 @@ def fix_lemma(self, node): if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” - if node.lemma == 'دیا': + if node.lemma == 'دیا' or node.lemma == 'دی': node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). 
However, From eab5615969dbe4bd7b5349ad3a58f3ced14f19ee Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:07:29 +0200 Subject: [PATCH 0501/1201] =?UTF-8?q?Wrongly=20lemmatized=20forms=20of=20?= =?UTF-8?q?=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j=C4=81n=C4=81)=20?= =?UTF-8?q?=E2=80=9Cto=20go=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 5a4351e1..b6b68ceb 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,7 +95,7 @@ def fix_lemma(self, node): if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” - if node.lemma == 'گیا': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From 18dc8b44dd288f8b125467144d56c1447c135f60 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:18:36 +0200 Subject: [PATCH 0502/1201] =?UTF-8?q?Wrongly=20lemmatized=20forms=20of=20?= =?UTF-8?q?=D8=AC=D8=A7=D9=86=D8=A7=E2=80=8E=20(j=C4=81n=C4=81)=20?= =?UTF-8?q?=E2=80=9Cto=20go=E2=80=9D.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/hi/fixaux.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index b6b68ceb..81f4653b 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,7 +95,8 @@ def fix_lemma(self, node): if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی': + # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From bd1f23a3c60e9c8e0bb1ccef87851fbafb6a4fa3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:26:57 +0200 Subject: [PATCH 0503/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 81f4653b..3b7e2f02 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -82,6 +82,9 @@ def fix_lemma(self, node): if node.form == 'چاہئیں': node.lemma = 'چاہئے' node.feats['Number'] = 'Plur' + # چاہے seems to be a wrong lemma of چاہیں_گے “would like” + if node.lemma == 'چاہے': + node.lemma = 'چاہ' # چکا is a perfective participle of چکنا (cuknā) “to be finished” if node.lemma == 'چکا': node.lemma = 'چک' @@ -108,7 +111,7 @@ def fix_lemma(self, node): if node.lemma == 'رہا': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے': + if node.lemma == 'سکے' or node.lemma == 'سکی': node.lemma = 'سک' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. From 8b23ecc0f6c2e58777bd70e2b4c11c4c76fba58d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:32:59 +0200 Subject: [PATCH 0504/1201] Fix lemmatization. 
--- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 3b7e2f02..ecf7c3b2 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -99,7 +99,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': @@ -108,10 +108,10 @@ def fix_lemma(self, node): if node.lemma == 'لگا': node.lemma = 'لگ' # رہا is a perfective participle of رہنا (rahnā) “to stay” - if node.lemma == 'رہا': + if node.lemma == 'رہا' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے' or node.lemma == 'سکی': + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': node.lemma = 'سک' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. From 356a7cda698ec00b2c0fc594c3b3157a05c50300 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:39:13 +0200 Subject: [PATCH 0505/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ecf7c3b2..f320bb98 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -12,11 +12,11 @@ def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), - # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come). There - # are also jā (to go) and paṛa (to fall) but we do not list them here + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring). + # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -99,7 +99,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” if node.lemma == 'لیا': From f8e8a0aac7e6dfb52f2ff256d1f4c4fa95a6f271 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:48:40 +0200 Subject: [PATCH 0506/1201] Fix lemmatization. 
--- udapi/block/ud/hi/fixaux.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f320bb98..0a7f8d3a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -12,11 +12,12 @@ def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), - # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring). + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), + # pahuñc (to reach). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -102,7 +103,7 @@ def fix_lemma(self, node): if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا': + if node.lemma == 'لیا' or node.lemma == 'لو': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 04068d58a6b5488691079f5538ac122c084925ec Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 10:54:52 +0200 Subject: [PATCH 0507/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 0a7f8d3a..2ce70d6a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -102,8 +102,11 @@ def fix_lemma(self, node): # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': node.lemma = 'جا' + # Wrongly lemmatized present forms of “to be”. + if node.lemma == 'ہوں' or node.lemma == 'ہوا': + node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو': + if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From d0275bec69254fc9b3b910584d10d13d4052e32c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:01:58 +0200 Subject: [PATCH 0508/1201] Fix lemmatization. 
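Individual rules can be smoke-tested in isolation. A minimal sketch, assuming create_child accepts attribute keywords as used elsewhere in udapi:

    from udapi.core.root import Root
    from udapi.block.ud.hi.fixaux import FixAux

    root = Root()
    # an AUX node carrying one of the wrong lemmas fixed below
    aux = root.create_child(form='پڑے', lemma='پڑے', upos='AUX')
    FixAux().fix_lemma(aux)
    assert aux.lemma == 'پڑ'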
--- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 2ce70d6a..2178e421 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -111,8 +111,11 @@ def fix_lemma(self, node): # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': node.lemma = 'لگ' + # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” + if node.lemma == 'پڑے': + node.lemma = 'پڑ' # رہا is a perfective participle of رہنا (rahnā) “to stay” - if node.lemma == 'رہا' or node.lemma == 'رہے': + if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': From 8d6866b473195bf97d79a9b25fd9b21984541e62 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:08:26 +0200 Subject: [PATCH 0509/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 2178e421..dcf6fe3a 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach). + # pahuñc (to reach), dekh (to look). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -100,13 +100,13 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. if node.lemma == 'ہوں' or node.lemma == 'ہوا': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی': + if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 86248be9f7d975d6d96a2daab5208b2175a1762c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:15:02 +0200 Subject: [PATCH 0510/1201] Fix lemmatization. 
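To re-lemmatize a whole treebank with this block, a pipeline along the following lines should do (file names are placeholders; -s is the usual shortcut for appending write.Conllu to the scenario):

    udapy -s ud.hi.FixAux < ur-ud-train.conllu > ur-ud-train.fixed.conllu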
--- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index dcf6fe3a..84a4b9c1 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -120,6 +120,9 @@ def fix_lemma(self, node): # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': node.lemma = 'سک' + # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': + node.lemma = 'تھا' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. if node.lemma == 'والی': From 6a726c962590fd03e9bf56096853e80e0560d319 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:21:29 +0200 Subject: [PATCH 0511/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 84a4b9c1..ea865b14 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -70,6 +70,9 @@ def fix_lemma(self, node): must be fixed. """ if node.upos == 'AUX': + # آنے is the oblique infinitive form of “to come” + if node.lemma == 'آنہ': + node.lemma = 'آ' # اٹھنا “to rise, get up” if node.lemma == 'اٹھا': node.lemma = 'اٹھ' @@ -90,7 +93,7 @@ def fix_lemma(self, node): if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” - if node.lemma == 'دیا' or node.lemma == 'دی': + if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). However, From 0cd5546c5397b4055797d3f6e9e09f3f67f0c7ef Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:29:57 +0200 Subject: [PATCH 0512/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ea865b14..df914996 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look). + # pahuñc (to reach), dekh (to look), phar (to return). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. 
@@ -117,6 +117,9 @@ def fix_lemma(self, node): # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” if node.lemma == 'پڑے': node.lemma = 'پڑ' + # پھرے is a perfective participle of پھرنا (pharnā) “to return” + if node.lemma == 'پھرے': + node.lemma = 'پھر' # رہا is a perfective participle of رہنا (rahnā) “to stay” if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' From 9046d0e4ed67f9dadd6088a55d14d390810e55d6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:40:44 +0200 Subject: [PATCH 0513/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index df914996..1507e8e4 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,11 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return). + # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر'] + urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -103,7 +103,7 @@ def fix_lemma(self, node): node.lemma = 'گا' # گیا is a perfective participle of جانا‎ (jānā) “to go” # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. - if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی': + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. if node.lemma == 'ہوں' or node.lemma == 'ہوا': From 27321a9cae361be1e251f3ffe165055c2a591ac3 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:48:48 +0200 Subject: [PATCH 0514/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1507e8e4..e6435d09 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -17,7 +17,7 @@ def process_node(self, node): # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. 
# چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. @@ -124,7 +124,7 @@ def fix_lemma(self, node): if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to - if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا': + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': node.lemma = 'سک' # Wrongly lemmatized past forms of “to be”. if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': From 3c0e186abb780fdcf6ae18064c7ea18170cdc37b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 11:56:40 +0200 Subject: [PATCH 0515/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e6435d09..440bcd80 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -106,10 +106,12 @@ def fix_lemma(self, node): if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. - if node.lemma == 'ہوں' or node.lemma == 'ہوا': + # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” - if node.lemma == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': + # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” if node.lemma == 'لگا': From 5c9c72eed8952fbd51b44963816e5215f2c51922 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:16:16 +0200 Subject: [PATCH 0516/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 440bcd80..f6d9e7d7 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -95,6 +95,9 @@ def fix_lemma(self, node): # دیا is a perfective participle of دینا (denā) “to give” if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' + # دکھائیں (dikhānā) “to show” + if node.form == 'دکھائیں': + node.lemma = 'دکھا' # گا, گی, گے denote the future tense. They are written as separate # words in Urdu (while they are just suffixes in Hindi). However, # when written as a separate auxiliary, all these forms should share From f678f17af97ee046759b677d6c3b5cda1ab06451 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:27:02 +0200 Subject: [PATCH 0517/1201] Spurious semantic auxiliaries. 
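For context: the compound list extended below feeds the relabeling at the top of process_node(), so each lemma added here also demotes the corresponding aux dependents:

    if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux':
        node.deprel = 'compound'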
--- udapi/block/ud/hi/fixaux.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index f6d9e7d7..1f2d670c 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -13,11 +13,12 @@ def process_node(self, node): # The following verbs appear in verb-verb compounds as the semantically # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk). + # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), + # caṛh (to climb), saṛ (to rot). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] - urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. From bff1cecf97b947dc44a60a385209b9fca74a9bc2 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 12:53:16 +0200 Subject: [PATCH 0518/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 1f2d670c..e6ec6c53 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -17,7 +17,7 @@ def process_node(self, node): # caṛh (to climb), saṛ (to rot). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -90,7 +90,9 @@ def fix_lemma(self, node): # چاہے seems to be a wrong lemma of چاہیں_گے “would like” if node.lemma == 'چاہے': node.lemma = 'چاہ' - # چکا is a perfective participle of چکنا (cuknā) “to be finished” + # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished” + if node.lemma == 'चुका': + node.lemma = 'चुक' if node.lemma == 'چکا': node.lemma = 'چک' # دیا is a perfective participle of دینا (denā) “to give” @@ -111,6 +113,8 @@ def fix_lemma(self, node): node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'हों': + node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' # لیا is a perfective participle of لینا (lenā) “to take” From d956b903dbe9bb3d86079e579706bd9b91cf9a6d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:06:39 +0200 Subject: [PATCH 0519/1201] Fix lemmatization. 
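If the per-lemma conditions keep multiplying, they could eventually be collapsed into a single mapping covering both scripts. A hypothetical sketch with entries taken from this file; note it would not cover the form-based special cases, which is one reason for keeping explicit conditions:

    WRONG_TO_CORRECT = {
        'चुका': 'चुक', 'چکا': 'چک',  # cuknā “to be finished”
        'दिया': 'दे', 'دیا': 'دے',  # denā “to give”
    }
    node.lemma = WRONG_TO_CORRECT.get(node.lemma, node.lemma)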
--- udapi/block/ud/hi/fixaux.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index e6ec6c53..be0eaf02 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,10 +14,10 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot). + # caṛh (to climb), saṛ (to rot), nikāl (to remove). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -95,7 +95,9 @@ def fix_lemma(self, node): node.lemma = 'चुक' if node.lemma == 'چکا': node.lemma = 'چک' - # دیا is a perfective participle of دینا (denā) “to give” + # दिया دیا is a perfective participle of देना دینا (denā) “to give” + if node.lemma == 'दिया': + node.lemma = 'दे' if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': node.lemma = 'دے' # دکھائیں (dikhānā) “to show” @@ -117,8 +119,10 @@ def fix_lemma(self, node): node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' - # لیا is a perfective participle of لینا (lenā) “to take” + # लिया لیا is a perfective participle of लेना لینا (lenā) “to take” # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'लिया': + node.lemma = 'ले' if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” From 5308b9e5063d21b17ed2548111f54c2e81e8b1c6 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:16:16 +0200 Subject: [PATCH 0520/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index be0eaf02..16dfc0aa 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -109,8 +109,10 @@ def fix_lemma(self, node): # the same lemma. if node.lemma == 'گی' or node.lemma == 'گے': node.lemma = 'گا' - # گیا is a perfective participle of جانا‎ (jānā) “to go” - # جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + # گیا is a perfective participle of जाना جانا‎ (jānā) “to go” + # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'जाना' or node.lemma == 'जान': + node.lemma = 'जा' if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. 
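(A side note on the parallel Devanagari/Perso-Arabic branches accumulating above: if the pairs keep growing, a small helper could route a lemma by script instead of duplicating condition ladders. A hypothetical sketch:

    import unicodedata

    def is_devanagari(text):
        # True if any character comes from the Devanagari block
        return any('DEVANAGARI' in unicodedata.name(ch, '') for ch in text)
)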
From 7484b1e55e5820abbb079cf19f8bc7013ae1f726 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 13:53:52 +0200 Subject: [PATCH 0521/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 16dfc0aa..595f1725 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,10 +14,10 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove). + # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल'] + hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -136,13 +136,17 @@ def fix_lemma(self, node): # پھرے is a perfective participle of پھرنا (pharnā) “to return” if node.lemma == 'پھرے': node.lemma = 'پھر' - # رہا is a perfective participle of رہنا (rahnā) “to stay” + # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay” + if node.lemma == 'रहा' or node.lemma == 'रहूं': + node.lemma = 'रह' if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': node.lemma = 'رہ' # sakna to be able to if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': node.lemma = 'سک' # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'थी': + node.lemma = 'था' if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': node.lemma = 'تھا' # The compound part vālā is not an auxiliary. We handle it in process_node() From 8f5e017e9ab0329fa9894ccde17d2e07d56a207b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 15:29:42 +0200 Subject: [PATCH 0522/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 595f1725..041f5d44 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -127,7 +127,9 @@ def fix_lemma(self, node): node.lemma = 'ले' if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': node.lemma = 'لے' - # لگا is a perfective participle of لگنا (lagnā) “to seem, to appear” + # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear” + if node.lemma == 'लगा': + node.lemma = 'लग' if node.lemma == 'لگا': node.lemma = 'لگ' # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” From 3341b087c67755c3fbe2b4724adcf1776421d453 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 15:47:29 +0200 Subject: [PATCH 0523/1201] Fix lemmatization. 
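The conditions in fix_lemma() are independent if-statements executed in sequence, so a block can be relocated, as done here for uṭhnā, provided no later condition matches a lemma that an earlier one has just assigned. The moved rule is safe because nothing below tests for the corrected lemma:

    # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up”
    if node.lemma == 'उठा':
        node.lemma = 'उठ'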
--- udapi/block/ud/hi/fixaux.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 041f5d44..feb622d9 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -74,9 +74,6 @@ def fix_lemma(self, node): # آنے is the oblique infinitive form of “to come” if node.lemma == 'آنہ': node.lemma = 'آ' - # اٹھنا “to rise, get up” - if node.lemma == 'اٹھا': - node.lemma = 'اٹھ' # بنانا बनाना “make, create, produce, cause to be/become” # (I don't know why in some instances بنا was used as lemma for کر “to do”.) if node.form == 'کر' and node.lemma == 'بنا': @@ -151,6 +148,11 @@ def fix_lemma(self, node): node.lemma = 'था' if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': node.lemma = 'تھا' + # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up” + if node.lemma == 'उठा': + node.lemma = 'उठ' + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' # The compound part vālā is not an auxiliary. We handle it in process_node() # but it must be lemmatized properly. if node.lemma == 'والی': From 326e3d200f3efce29098efc47f73e73ecf34cc21 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:02:34 +0200 Subject: [PATCH 0524/1201] Spurious semantic auxiliaries. --- udapi/block/ud/hi/fixaux.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index feb622d9..fd0c77c0 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -14,18 +14,20 @@ def process_node(self, node): # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop). + # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop), samā + # (to encounter), dhamaka (to bully). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा', 'समा', 'धमक'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. - # چاہنا चाहना (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary. # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. - # دکھانا दिखाना (dikhānā) “to show” - hiphase = ['लग', 'चुक', 'चाह', 'दिखा'] - urphase = ['لگ', 'چک', 'چاہ', 'دکھا'] + # दिखाना دکھانا (dikhānā) “to show” + # बनना بننا (bananā) “to become” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': node.deprel = 'compound' From 67dc40eb1a41ca4fd0be6554c2577767cf532c77 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:21:46 +0200 Subject: [PATCH 0525/1201] Spurious semantic auxiliaries. 
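The 'है.' entry below handles a lemma with a sentence-final period glued to it. If more such lemmas turn up, a generic cleanup might be preferable to enumerating them; a sketch (not applied here, deliberately restricted to AUX):

    if node.upos == 'AUX' and node.lemma.endswith('.'):
        node.lemma = node.lemma.rstrip('.')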
--- udapi/block/ud/hi/fixaux.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index fd0c77c0..ec61592f 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -11,14 +11,15 @@ class FixAux(Block): def process_node(self, node): self.fix_lemma(node) # The following verbs appear in verb-verb compounds as the semantically - # less salient element: le (to take), de (to give), ḍāla / phenk (to throw), + # less salient element: le (to take), de (to give), ḍāla / phenka (to throw), # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), - # pahuñc (to reach), dekh (to look), phar (to return), cal (to walk), - # caṛh (to climb), saṛ (to rot), nikāl (to remove), girā (to drop), samā - # (to encounter), dhamaka (to bully). + # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), + # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), + # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), + # gujara (to pass). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकाल', 'गिरा', 'समा', 'धमक'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -116,7 +117,7 @@ def fix_lemma(self, node): node.lemma = 'جا' # Wrongly lemmatized present forms of “to be”. # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. - if node.lemma == 'हों': + if node.lemma == 'हों' or node.lemma == 'है.': node.lemma = 'है' if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': node.lemma = 'ہے' From 7ebadbeecf5a10f5a518987822f534f7ae4aa54c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:35:58 +0200 Subject: [PATCH 0526/1201] Fix lemmatization. --- udapi/block/ud/hi/fixaux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index ec61592f..49518e05 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -132,6 +132,9 @@ def fix_lemma(self, node): node.lemma = 'लग' if node.lemma == 'لگا': node.lemma = 'لگ' + # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach” + if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच': + node.lemma = 'पहुंच' # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” if node.lemma == 'پڑے': node.lemma = 'پڑ' From 58bdbc12283ea253f33a12a3966b2166b82e463f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Mon, 22 Aug 2022 16:49:07 +0200 Subject: [PATCH 0527/1201] Spurious semantic auxiliaries. 
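A note on the list-to-regex construction extended below: rephase and recompound anchor the alternation with ^...$, so matching is exact and equivalent to list membership. A minimal illustration:

    import re
    hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा']
    rephase = r'^(' + '|'.join(hiphase) + r')$'
    assert re.match(rephase, 'करा')        # exact lemma matches
    assert not re.match(rephase, 'कराना')  # longer strings do not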
--- udapi/block/ud/hi/fixaux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py index 49518e05..004ab4af 100644 --- a/udapi/block/ud/hi/fixaux.py +++ b/udapi/block/ud/hi/fixaux.py @@ -16,10 +16,10 @@ def process_node(self, node): # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), - # gujara (to pass). + # gujara (to pass), ghera (to surround), baca (to escape). # There are also jā (to go) and paṛa (to fall) but we do not list them here # because they can also act as genuine auxiliaries. - hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर'] + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच'] urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' # Control and raising verbs. @@ -27,7 +27,7 @@ def process_node(self, node): # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. # दिखाना دکھانا (dikhānā) “to show” # बनना بننا (bananā) “to become” - hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन'] + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा'] urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': From 5a239dd41c325b1583fe635b9854a2e68d2893a9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 30 Sep 2022 13:21:36 +0200 Subject: [PATCH 0528/1201] FixLeaf: case and mark should be among the defaults. --- udapi/block/ud/fixleaf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py index d715ec01..9b4ce191 100644 --- a/udapi/block/ud/fixleaf.py +++ b/udapi/block/ud/fixleaf.py @@ -8,11 +8,11 @@ class FixLeaf(Block): """ - Make sure that aux and cop dependents are leaves unless one of the known - exceptions applies. + Make sure that function words are leaves unless one of the known exceptions + applies. """ - def __init__(self, deprels='aux,cop,cc', **kwargs): + def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs): """ Args: deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. From b2d1ea858224ffb8ef37929e4e998b524dc9d98a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 30 Sep 2022 13:53:01 +0200 Subject: [PATCH 0529/1201] Paired punctuation must not cause non-projectivity if an outside node depends on an inside node. --- udapi/block/ud/fixpunct.py | 58 +++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 95cb40d0..15d310c7 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -27,15 +27,15 @@ '{': '}', '"': '"', # ASCII double quotes "'": "'", # ASCII single quotes - '“': '”', # quotation marks used in English,... - '„': '“', # Czech, German, Russian,... - '«': '»', # French, Russian, Spanish,... + '“': '”', # quotation marks used in English, ... 
+ '„': '“', # Czech, German, Russian, ... + '«': '»', # French, Russian, Spanish, ... '‹': '›', # dtto '《': '》', # Korean, Chinese '「': '」', # Chinese, Japanese - '『': '』', # dtto - '¿': '?', # Spanish question quotation marks - '¡': '!', # Spanish exclamation quotation marks + '『': '』', # ditto + '¿': '?', # Spanish paired question marks + '¡': '!', # Spanish paired exclamation marks } FINAL_PUNCT = '.?!' @@ -65,7 +65,7 @@ def process_tree(self, root): # This may introduce multiple subroots, which will be fixed later on # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: - while node.parent.upos == "PUNCT": + while node.parent.upos == 'PUNCT': node.parent = node.parent.parent # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. @@ -83,10 +83,9 @@ def process_tree(self, root): # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type). for node in root.descendants: - if node.upos == "PUNCT" and not self._punct_type[node.ord]: + if node.upos == 'PUNCT' and not self._punct_type[node.ord]: self._fix_subord_punct(node) - # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). @@ -107,7 +106,7 @@ def process_tree(self, root): # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. if self.copy_to_enhanced: for node in root.descendants: - if node.upos == "PUNCT": + if node.upos == 'PUNCT': node.deps = [{'parent': node.parent, 'deprel': 'punct'}] def _fix_subord_punct(self, node): @@ -131,12 +130,12 @@ def _fix_subord_punct(self, node): l_cand, r_cand = node.prev_node, node.next_node if node.form in FINAL_PUNCT: r_cand = None - while l_cand.ord > 0 and l_cand.upos == "PUNCT": + while l_cand.ord > 0 and l_cand.upos == 'PUNCT': if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node: l_cand = None break l_cand = l_cand.prev_node - while r_cand is not None and r_cand.upos == "PUNCT": + while r_cand is not None and r_cand.upos == 'PUNCT': if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node: r_cand = None break @@ -193,7 +192,7 @@ def _fix_subord_punct(self, node): # We try to be conservative and keep the parent, unless we are sure it is wrong. if node.parent not in path: node.parent = cand - node.deprel = "punct" + node.deprel = 'punct' def _will_be_projective(self, node, cand): node.parent = cand @@ -206,7 +205,6 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): if (self.check_paired_punct_upos or opening_node.form == "'") and opening_node.upos != 'PUNCT': return - nested_level = 0 for node in root.descendants[opening_node.ord:]: if node.form == closing_punct: @@ -219,14 +217,31 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): nested_level += 1 def _fix_pair(self, root, opening_node, closing_node): + # Ideally, paired punctuation symbols should be attached to the single + # head of the subtree inside. Provided the inside segment is a single + # subtree. 
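+        # Hypothetical illustration: in  „ a b “ c , where a is the inside
+        # head and the outside node c depends on b, attaching both quotes
+        # to a would leave the edge b -> c non-projective (the closing
+        # quote would lie inside its span). Hence inside parents of
+        # outside nodes are collected as heads below, too.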
heads = [] punct_heads = [] - for node in root.descendants[opening_node.ord: closing_node.ord - 1]: - if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): - if node.upos == 'PUNCT': - punct_heads.append(node) - else: - heads.append(node) + for node in root.descendants: + if node == opening_node or node == closing_node: + continue + # If this is a node inside of the pair, is its parent outside? + if opening_node.precedes(node) and node.precedes(closing_node): + if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): + if node.upos == 'PUNCT': + punct_heads.append(node) + else: + heads.append(node) + # Not only the punctuation symbols must not be attached non-projectively, + # they also must not cause non-projectivity of other relations. This could + # happen if an outside node is attached to an inside node. To account for + # this, mark the inside parent as a head, too. + else: + if opening_node.precedes(node.parent) and node.parent.precedes(closing_node): + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) + else: + heads.append(node.parent) # Punctuation should not have children, but if there is no other head candidate, # let's break this rule. @@ -246,6 +261,9 @@ def _fix_pair(self, root, opening_node, closing_node): # However, this means that the paired punctuation will be attached non-projectively, # which is forbidden by the UD guidelines. # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. + # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of + # inside heads and inside parents of outside nodes). + heads.sort(key=lambda x: x.ord) opening_node.parent = heads[0] closing_node.parent = heads[-1] From 559ec080cdf924289c95468b94640053c069621b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 11 Nov 2022 10:27:35 +0100 Subject: [PATCH 0530/1201] Optionally fix non-copula auxiliaries alongside copulas. --- udapi/block/ud/fixpseudocop.py | 53 +++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ab07eaaa..ecc5f0bd 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -7,33 +7,40 @@ class FixPseudoCop(Block): - def __init__(self, lemma, **kwargs): + def __init__(self, lemmas, noncopaux=False, **kwargs): """Create the ud.FixPseudoCop block instance. Args: - lemma: the lemma of the pseudocopula that should be fixed + lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed + noncopaux: do the same for non-copula auxiliaries with the given lemma """ super().__init__(**kwargs) - self.lemma = lemma + self.lemmas = lemmas.split(',') + self.noncopaux = noncopaux def process_node(self, node): - pseudocop = self.lemma - if node.lemma == pseudocop and node.udeprel == "cop": - secpred = node.parent - grandparent = secpred.parent - node.parent = grandparent - node.deprel = secpred.deprel - secpred.parent = node - secpred.deprel = "xcomp" - ###!!! We should also take care of DEPS if they exist. - # As a copula, the word was tagged AUX. Now it should be VERB. - node.upos = "VERB" - # Examine the children of the original parent. - # Those that modify the clause should be re-attached to me. - # Those that modify the word (noun, adjective) should stay there. - for c in secpred.children: - # obl is borderline. It could modify an adjective rather than a clause. 
- # obj and iobj should not occur in copular clauses but it sometimes - # occurs with pseudocopulas: "I declare him handsome." - if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): - c.parent = node + pseudocop = self.lemmas + if node.lemma in pseudocop: + # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set). + if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux". + elif self.noncopaux and node.upos == 'AUX': + node.upos = 'VERB' From ea1c77b57cfbd3cea9e7afa26a789bc688bf7003 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 11 Nov 2022 11:17:36 +0100 Subject: [PATCH 0531/1201] Delete --use-feature=in-tree-build This feature was removed in pip 22.3 (in-tree builds are now the default) and it results in CircleCI fails. --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7be539d2..4e88d664 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,7 +35,7 @@ jobs: # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. - run: name: Install Udapi - command: pip install --use-feature=in-tree-build ".[test]" + command: pip install ".[test]" - run: name: Run pytest tests # This assumes pytest is installed via the install-package step above From 1ef23f857b8f8c9b7080b9c33d2dde56a14abf1f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 16:58:39 +0100 Subject: [PATCH 0532/1201] Reworked feature checking so that a similar block can be written for another language. --- udapi/block/ud/cs/markfeatsbugs.py | 37 ++------------- udapi/block/ud/markfeatsbugs.py | 75 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 32 deletions(-) create mode 100644 udapi/block/ud/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 11ecd6d9..3fb8d058 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -2,12 +2,15 @@ Block to identify missing or ill-valued features in Czech. Any bugs that it finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. 
+ +Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc """ -from udapi.core.block import Block +import udapi.block.ud.markfeatsbugs import logging import re -class MarkFeatsBugs(Block): +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): # The convention used in PDT is not consistent. Adjectives are fully disambiguated # (three genders, two animacies, three numbers, seven cases), even though some @@ -21,36 +24,6 @@ class MarkFeatsBugs(Block): # in the future. pdt20 = False # True = like in PDT 2.0; False = like in ČNK - def bug(self, node, bugstring): - bugs = [] - if node.misc['Bug']: - bugs = node.misc['Bug'].split('+') - if not bugstring in bugs: - bugs.append(bugstring) - node.misc['Bug'] = '+'.join(bugs) - - def check_allowed_features(self, node, allowed): - """ - We need a dictionary indexed by feature names that are allowed; for each - feature name, there is a list of allowed values. - """ - # Check for features that are not allowed but the node has them. - # For features that are allowed, check that their values are allowed. - for f in node.feats: - if f in allowed: - if not node.feats[f] in allowed[f]: - self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') - else: - self.bug(node, 'Feat' + f + 'NotAllowed') - - def check_required_features(self, node, required): - """ - We need a list of names of features whose values must not be empty. - """ - for f in required: - if not f in node.feats: - self.bug(node, 'Feat' + f + 'Missing') - def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..b24dcecb --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,75 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. + +Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block +import logging +import re + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. 
+ """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return From 5a836db5852a97a69b972646493024894a7d3ca4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:37:16 +0100 Subject: [PATCH 0533/1201] Added Latin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 608 +++++++++++++++++++++++++++++ 2 files changed, 609 insertions(+), 1 deletion(-) create mode 100644 udapi/block/ud/la/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 3fb8d058..ef203033 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..8741eabb --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,608 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. 
+
+Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html
+Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
+"""
+import udapi.block.ud.markfeatsbugs
+import logging
+import re
+
+class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs):
+
+    def process_node(self, node):
+        # NOUNS ################################################################
+        if node.upos == 'NOUN':
+            self.check_required_features(node, ['Gender', 'Number', 'Case'])
+            if node.feats['Gender'] == 'Masc':
+                self.check_required_features(node, ['Animacy'])
+                self.check_allowed_features(node, {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                    'Foreign': ['Yes']})
+            else:
+                self.check_allowed_features(node, {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                    'Foreign': ['Yes']})
+        # PROPER NOUNS #########################################################
+        elif node.upos == 'PROPN':
+            self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity'])
+            if node.feats['Gender'] == 'Masc':
+                self.check_required_features(node, ['Animacy'])
+                self.check_allowed_features(node, {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                    'Polarity': ['Pos', 'Neg'],
+                    'NameType': ['Giv', 'Sur', 'Geo'],
+                    'Foreign': ['Yes']})
+            else:
+                self.check_allowed_features(node, {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                    'Polarity': ['Pos', 'Neg'],
+                    'NameType': ['Giv', 'Sur', 'Geo'],
+                    'Foreign': ['Yes']})
+        # ADJECTIVES ###########################################################
+        elif node.upos == 'ADJ':
+            if node.feats['Poss'] == 'Yes': # possessive adjectives
+                if node.feats['Gender'] == 'Masc':
+                    self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'Poss': ['Yes'],
+                        'Gender[psor]': ['Masc', 'Fem'],
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names
+                        'Foreign': ['Yes']})
+                else:
+                    self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'Poss': ['Yes'],
+                        'Gender[psor]': ['Masc', 'Fem'],
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names
+                        'Foreign': ['Yes']})
+            elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives
+                if node.feats['Gender'] == 'Masc':
+                    self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'NumType': ['Ord'],
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Foreign': ['Yes']})
+                else:
+                    self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'NumType': ['Ord'],
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Foreign': ['Yes']})
+            elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives
+                self.check_required_features(node, ['VerbForm', 'Voice'])
+                if node.feats['Voice'] == 'Act': # active participles have tense, passives don't
+                    if node.feats['Gender'] == 'Masc':
+                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity'])
+                        self.check_allowed_features(node, {
+                            'VerbForm': ['Part'],
+                            'Aspect': ['Imp', 'Perf'],
+                            'Voice': ['Act'],
+                            'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Animacy': ['Anim', 'Inan'],
+                            'Number': ['Sing', 'Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                            'Polarity': ['Pos', 'Neg'],
+                            'Variant': ['Short'],
+                            'Foreign': ['Yes']})
+                    else:
+                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity'])
+                        self.check_allowed_features(node, {
+                            'VerbForm': ['Part'],
+                            'Aspect': ['Imp', 'Perf'],
+                            'Voice': ['Act'],
+                            'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Number': ['Sing', 'Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                            'Polarity': ['Pos', 'Neg'],
+                            'Variant': ['Short'],
+                            'Foreign': ['Yes']})
+                else:
+                    if node.feats['Gender'] == 'Masc':
+                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity'])
+                        self.check_allowed_features(node, {
+                            'VerbForm': ['Part'],
+                            'Aspect': ['Imp', 'Perf'],
+                            'Voice': ['Pass'],
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Animacy': ['Anim', 'Inan'],
+                            'Number': ['Sing', 'Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                            'Polarity': ['Pos', 'Neg'],
+                            'Variant': ['Short'],
+                            'Foreign': ['Yes']})
+                    else:
+                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity'])
+                        self.check_allowed_features(node, {
+                            'VerbForm': ['Part'],
+                            'Aspect': ['Imp', 'Perf'],
+                            'Voice': ['Pass'],
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Number': ['Sing', 'Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                            'Polarity': ['Pos', 'Neg'],
+                            'Variant': ['Short'],
+                            'Foreign': ['Yes']})
+            elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree
+                if node.feats['Gender'] == 'Masc':
+                    self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant'])
+                    self.check_allowed_features(node, {
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Polarity': ['Pos', 'Neg'],
+                        'Variant': ['Short'],
+                        'Foreign': ['Yes']})
+                else:
+                    self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant'])
+                    self.check_allowed_features(node, {
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Polarity': ['Pos', 'Neg'],
+                        'Variant': ['Short'],
+                        'Foreign': ['Yes']})
+            else: # regular adjectives
+                if node.feats['Gender'] == 'Masc':
+                    self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Degree': ['Pos', 'Cmp', 'Sup'],
+                        'Polarity': ['Pos', 'Neg'],
+                        'Foreign': ['Yes']})
+                else:
+                    self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                        'Degree': ['Pos', 'Cmp', 'Sup'],
+                        'Polarity': ['Pos', 'Neg'],
+                        'Foreign': ['Yes']})
+        # PRONOUNS #############################################################
+        elif node.upos == 'PRON':
+            self.check_required_features(node, ['PronType'])
+            if node.feats['PronType'] == 'Prs':
+                if node.feats['Reflex'] == 'Yes':
+                    self.check_required_features(node, ['PronType', 'Reflex', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Prs'],
+                        'Reflex': ['Yes'],
+                        'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'],
+                        'Variant': ['Short']
+                    })
+                else: # not reflexive
+                    if node.feats['Person'] == '3': # on, ona, ono, oni, ony
+                        if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony
+                            self.check_adjective_like(node, ['PronType', 'Person'], {
+                                'PronType': ['Prs'],
+                                'Person': ['3']
+                            })
+                        else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně
+                            # Mostly only two gender groups and no animacy:
+                            # Masc,Neut ... jeho, jemu, jej, něm, jím
+                            # Fem ... jí, ji, ní
+                            # Neut ... je
+                            # No gender in dual and plural:
+                            # Plur ... jich, jim, je, nich, jimi
+                            self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], {
+                                'PronType': ['Prs'],
+                                'Person': ['3'],
+                                'PrepCase': ['Npr', 'Pre']
+                            })
+                    else: # 1st and 2nd person do not have gender: já, ty
+                        self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case'])
+                        self.check_allowed_features(node, {
+                            'PronType': ['Prs'],
+                            'Person': ['1', '2'],
+                            'Number': ['Sing', 'Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+                            'Variant': ['Short']
+                        })
+            elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo
+                # There is no Number. Někdo and nikdo behave like singular;
+                # kdo is by default singular as well but it also occurs as a subject
+                # of plural verbs.
+                self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'],
+                    'Gender': ['Masc'],
+                    'Animacy': ['Anim'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+                })
+            elif re.match(r'^(co|což|něco|nicož)$', node.lemma):
+                # Although these pronouns behave by default as neuter singular,
+                # no Gender and Number is annotated. However, quite unusually,
+                # there is Animacy=Inan without Gender.
+                ###!!! This should probably be fixed in all Czech treebanks and
+                ###!!! in Interset. The pronoun should get Gender=Neut and no
+                ###!!! animacy. For now, let's at least make animacy an optional
+                ###!!! feature (I see that we already do not fill it in the Old
+                ###!!! Czech data).
+                self.check_required_features(node, ['PronType', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'],
+                    'Animacy': ['Inan'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+                })
+            elif node.lemma == 'ješto':
+                # Unlike 'jenžto', this relative pronoun does not inflect, it
+                # always occurs in a nominative position, but the context can
+                # be any gender and number.
+                self.check_required_features(node, ['PronType', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Rel'],
+                    'Case': ['Nom']
+                })
+            elif re.match(r'^(jenž|jenžto)$', node.lemma):
+                # The relative pronouns 'jenž', 'jenžto' inflect for gender;
+                # while we normally take this as a sign of DET (instead of PRON),
+                # these can never act as real DET because they never modify a
+                # nominal.
+                # Similarly to the personal pronoun 'on', animacy is only
+                # annotated for masculine nominative plural, non-nominative
+                # forms are merged for masculine and neuter (jehož, jemuž), and
+                # non-singular gender is only annotated in nominative (while
+                # these cases are common for all genders: jichž, jimž, jimiž).
+                # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even
+                # in the nominative, although there is no prepositional counter-
+                # part (but similarly the locative has no prepositionless form).
+                self.check_adjective_like(node, ['PronType', 'PrepCase'], {
+                    'PronType': ['Rel'],
+                    'PrepCase': ['Npr', 'Pre']
+                })
+            else:
+                # What remains is the relative pronoun 'an'. It behaves similarly
+                # to 'jenž' but it does not have the PrepCase feature and it
+                # only occurs in the nominative.
+                if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani
+                    self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Rel'],
+                        'Gender': ['Masc'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Plur'],
+                        'Case': ['Nom']
+                    })
+                else: # not Masc Plur: an, ana, ano, any
+                    self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Rel'],
+                        'Gender': ['Masc', 'Fem', 'Neut'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom']
+                    })
+        # DETERMINERS ##########################################################
+        elif node.upos == 'DET':
+            # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case.
+            # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'.
+            if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()):
+                self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Prs', 'Rel'],
+                    'Poss': ['Yes'],
+                    'Person': ['3'],
+                    'Number[psor]': ['Sing', 'Dual', 'Plur'],
+                    'Gender[psor]': ['Masc,Neut']
+                })
+            elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()):
+                # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'.
+                # Congruent gender is annotated only in singular. Masculine and
+                # neuter are merged even in nominative. Feminine singular does
+                # not distinguish case in PDT but we need it in Old Czech at
+                # least for 'jejiej'.
+                if node.feats['Number'] == 'Sing':
+                    self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Prs', 'Rel'],
+                        'Poss': ['Yes'],
+                        'Person': ['3'],
+                        'Number[psor]': ['Sing'],
+                        'Gender[psor]': ['Fem'],
+                        'Gender': ['Masc,Neut', 'Fem'],
+                        'Number': ['Sing'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    })
+                else:
+                    self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Prs', 'Rel'],
+                        'Poss': ['Yes'],
+                        'Person': ['3'],
+                        'Number[psor]': ['Sing'],
+                        'Gender[psor]': ['Fem'],
+                        'Number': ['Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    })
+            elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj'
+                if node.feats['Reflex'] == 'Yes':
+                    self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], {
+                        'PronType': ['Prs'],
+                        'Poss': ['Yes'],
+                        'Reflex': ['Yes']
+                    })
+                else:
+                    self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], {
+                        'PronType': ['Prs'],
+                        'Poss': ['Yes'],
+                        'Person': ['1', '2'],
+                        'Number[psor]': ['Sing', 'Plur']
+                    })
+            else:
+                self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']})
+        # NUMERALS #############################################################
+        elif node.upos == 'NUM':
+            self.check_required_features(node, ['NumType', 'NumForm'])
+            # Arabic digits and Roman numerals do not have inflection features.
+            if re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
+                self.check_allowed_features(node, {
+                    'NumType': ['Card'],
+                    'NumForm': ['Digit', 'Roman']
+                })
+            else:
+                ###!!! Somehow the NumValue feature from PDT via Interset is useless.
+                # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi.
+                # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma.
+                # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi.
+                # 'pět' and more have Number=Plur, Case: pět, pěti.
+                if node.lemma == 'jeden':
+                    self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'NumType': ['Card'],
+                        'NumForm': ['Word'],
+                        'NumValue': ['1,2,3'],
+                        'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    })
+                elif re.match(r'^(dva|oba)$', node.lemma):
+                    self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case'])
+                    if self.pdt20:
+                        self.check_allowed_features(node, {
+                            'NumType': ['Card'],
+                            'NumForm': ['Word'],
+                            'NumValue': ['1,2,3'],
+                            'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
+                            'Number': ['Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                        })
+                    else:
+                        self.check_allowed_features(node, {
+                            'NumType': ['Card'],
+                            'NumForm': ['Word'],
+                            'NumValue': ['1,2,3'],
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Animacy': ['Anim', 'Inan'],
+                            'Number': ['Dual', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                        })
+                else:
+                    self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+                    self.check_allowed_features(node, {
+                        'NumType': ['Card'],
+                        'NumForm': ['Word'],
+                        'NumValue': ['1,2,3'],
+                        'Number': ['Plur'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    })
+        # VERBS AND AUXILIARIES ################################################
+        elif re.match(r'^(VERB|AUX)$', node.upos):
+            self.check_required_features(node, ['Aspect', 'VerbForm'])
+            if node.feats['VerbForm'] == 'Inf':
+                # There is no voice. For some reason, PDT does not annotate that
+                # the infinitive form is active (while a passive infinitive is
+                # a combination of the infinitive with a passive participle).
+                self.check_required_features(node, ['Polarity'])
+                self.check_allowed_features(node, {
+                    'Aspect': ['Imp', 'Perf'],
+                    'VerbForm': ['Inf'],
+                    'Polarity': ['Pos', 'Neg']
+                })
+            elif node.feats['VerbForm'] == 'Fin':
+                # Voice is optional. For some reason it is not annotated with
+                # imperatives (although passive imperatives are a combination
+                # of the active imperative and a passive participle). It is
+                # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'.
+                if node.feats['Mood'] == 'Cnd':
+                    self.check_required_features(node, ['Mood', 'Person'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf'],
+                        'VerbForm': ['Fin'],
+                        'Mood': ['Cnd'],
+                        'Person': ['1', '2', '3'],
+                        'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person
+                    })
+                elif node.feats['Mood'] == 'Imp':
+                    self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf'],
+                        'VerbForm': ['Fin'],
+                        'Mood': ['Imp'],
+                        'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person)
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Polarity': ['Pos', 'Neg']
+                    })
+                else: # indicative
+                    self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf'],
+                        'VerbForm': ['Fin'],
+                        'Mood': ['Ind'],
+                        'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative
+                        'Voice': ['Act'],
+                        'Person': ['1', '2', '3'],
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Polarity': ['Pos', 'Neg'],
+                        'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist
+                    })
+            elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB
+                if node.feats['Gender'] == 'Masc':
+                    self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf'],
+                        'VerbForm': ['Part'],
+                        'Tense': ['Past'],
+                        'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Gender': ['Masc'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Polarity': ['Pos', 'Neg']
+                    })
+                else:
+                    self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf'],
+                        'VerbForm': ['Part'],
+                        'Tense': ['Past'],
+                        'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+                        'Number': ['Sing', 'Dual', 'Plur'],
+                        'Gender': ['Fem', 'Neut'],
+                        'Polarity': ['Pos', 'Neg']
+                    })
+            else: # converb
+                self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity'])
+                self.check_allowed_features(node, {
+                    'Aspect': ['Imp', 'Perf'],
+                    'VerbForm': ['Conv'],
+                    'Tense': ['Past', 'Pres'],
+                    'Voice': ['Act'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy
+                    'Polarity': ['Pos', 'Neg']
+                })
+        # ADVERBS ##############################################################
+        elif node.upos == 'ADV':
+            if node.feats['PronType'] != '':
+                # Pronominal adverbs are neither compared nor negated.
+                self.check_allowed_features(node, {
+                    'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot']
+                })
+            elif node.feats['Degree'] != '':
+                # Adverbs that are compared can also be negated.
+                self.check_required_features(node, ['Degree', 'Polarity'])
+                self.check_allowed_features(node, {
+                    'Degree': ['Pos', 'Cmp', 'Sup'],
+                    'Polarity': ['Pos', 'Neg']
+                })
+            else:
+                # The remaining adverbs are neither pronominal, nor compared or
+                # negated.
+                self.check_allowed_features(node, {})
+        # ADPOSITIONS ##########################################################
+        elif node.upos == 'ADP':
+            self.check_required_features(node, ['AdpType', 'Case'])
+            self.check_allowed_features(node, {
+                'AdpType': ['Prep', 'Voc'],
+                'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+            })
+        # THE REST: NO FEATURES ################################################
+        else:
+            self.check_allowed_features(node, {})
+
+    def check_adjective_like(self, node, r0, a0):
+        """
+        Long forms of adjectives, pronouns and determiners mostly share declension
+        paradigms and thus the sets of features that are expected. Whether the
+        actual feature sets are the same depends on the tagging convention (PDT
+        vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are
+        not; in ČNK, both adjectives and pronouns (incl. determiners) are fully
+        disambiguated. This method defines the core inflectional features while
+        any extras (such as PronType for pronouns) have to be provided by the
+        caller in parameters r0 (list) and a0 (dict).
+        """
+        required_features = []
+        allowed_features = {}
+        full_set = node.upos == 'ADJ' or not self.pdt20
+        if full_set:
+            # Even in the full set, animacy is only distinguished for the
+            # masculine gender.
+            if node.feats['Gender'] == 'Masc':
+                required_features = ['Gender', 'Animacy', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Masc'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                }
+            else:
+                required_features = ['Gender', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Fem', 'Neut'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                }
+        else:
+            # Gender is annotated in all cases in singular (ten, ta, to)
+            # but only in nominative, accusative, and vocative in plural
+            # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished
+            # in plural if gender is distinguished and it is masculine; in
+            # singular it is distinguished only in accusative (toho, ten).
+            # Other cases in plural are gender-less (těch, těm, těmi).
+            # Note that this is not consistent with adjectives, where we
+            # disambiguate gender in all cases in plural.
+            if node.feats['Number'] == 'Sing':
+                if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc':
+                    required_features = ['Gender', 'Animacy', 'Number', 'Case']
+                    allowed_features = {
+                        'Gender': ['Masc'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing'],
+                        'Case': ['Acc']
+                    }
+                else:
+                    required_features = ['Gender', 'Number', 'Case']
+                    allowed_features = {
+                        'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular
+                        'Number': ['Sing'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    }
+            elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']):
+                required_features = ['Gender', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Dual', 'Plur'],
+                    'Case': ['Nom', 'Acc', 'Voc']
+                }
+            else:
+                required_features = ['Number', 'Case']
+                allowed_features = {
+                    'Number': ['Dual', 'Plur'],
+                    'Case': ['Gen', 'Dat', 'Loc', 'Ins']
+                }
+        required_features = r0 + required_features
+        a0.update(allowed_features)
+        allowed_features = a0
+        self.check_required_features(node, required_features)
+        self.check_allowed_features(node, allowed_features)

From 0f167c2a64adcb98a61740c348e3b7579502005d Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Thu, 24 Nov 2022 17:47:54 +0100
Subject: [PATCH 0534/1201] Removed Czech-specific rules from Latin block.

For a start, the Latin rules check NOUNs and PROPNs only.
---
 udapi/block/ud/la/markfeatsbugs.py | 598 +----------------------------
 1 file changed, 11 insertions(+), 587 deletions(-)

diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py
index 8741eabb..4cf6c1b3 100644
--- a/udapi/block/ud/la/markfeatsbugs.py
+++ b/udapi/block/ud/la/markfeatsbugs.py
@@ -16,593 +16,17 @@ def process_node(self, node):
         # NOUNS ################################################################
         if node.upos == 'NOUN':
             self.check_required_features(node, ['Gender', 'Number', 'Case'])
-            if node.feats['Gender'] == 'Masc':
-                self.check_required_features(node, ['Animacy'])
-                self.check_allowed_features(node, {
-                    'Gender': ['Masc', 'Fem', 'Neut'],
-                    'Animacy': ['Anim', 'Inan'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                    'Foreign': ['Yes']})
-            else:
-                self.check_allowed_features(node, {
-                    'Gender': ['Masc', 'Fem', 'Neut'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                    'Foreign': ['Yes']})
+            self.check_allowed_features(node, {
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Foreign': ['Yes']})
         # PROPER NOUNS #########################################################
         elif node.upos == 'PROPN':
-            self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity'])
-            if node.feats['Gender'] == 'Masc':
-                self.check_required_features(node, ['Animacy'])
-                self.check_allowed_features(node, {
-                    'Gender': ['Masc', 'Fem', 'Neut'],
-                    'Animacy': ['Anim', 'Inan'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                    'Polarity': ['Pos', 'Neg'],
-                    'NameType': ['Giv', 'Sur', 'Geo'],
-                    'Foreign': ['Yes']})
-            else:
-                self.check_allowed_features(node, {
-                    'Gender': ['Masc', 'Fem', 'Neut'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                    'Polarity': ['Pos', 'Neg'],
-                    'NameType': ['Giv', 'Sur', 'Geo'],
-                    'Foreign': ['Yes']})
-        # ADJECTIVES ###########################################################
-        elif node.upos == 'ADJ':
-            if node.feats['Poss'] == 'Yes': # possessive adjectives
-                if node.feats['Gender'] == 'Masc':
-                    self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'Poss': ['Yes'],
-                        'Gender[psor]': ['Masc', 'Fem'],
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names
-                        'Foreign': ['Yes']})
-                else:
-                    self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'Poss': ['Yes'],
-                        'Gender[psor]': ['Masc', 'Fem'],
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names
-                        'Foreign': ['Yes']})
-            elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives
-                if node.feats['Gender'] == 'Masc':
-                    self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'NumType': ['Ord'],
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Foreign': ['Yes']})
-                else:
-                    self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'NumType': ['Ord'],
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Foreign': ['Yes']})
-            elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives
-                self.check_required_features(node, ['VerbForm', 'Voice'])
-                if node.feats['Voice'] == 'Act': # active participles have tense, passives don't
-                    if node.feats['Gender'] == 'Masc':
-                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity'])
-                        self.check_allowed_features(node, {
-                            'VerbForm': ['Part'],
-                            'Aspect': ['Imp', 'Perf'],
-                            'Voice': ['Act'],
-                            'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
-                            'Gender': ['Masc', 'Fem', 'Neut'],
-                            'Animacy': ['Anim', 'Inan'],
-                            'Number': ['Sing', 'Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                            'Polarity': ['Pos', 'Neg'],
-                            'Variant': ['Short'],
-                            'Foreign': ['Yes']})
-                    else:
-                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity'])
-                        self.check_allowed_features(node, {
-                            'VerbForm': ['Part'],
-                            'Aspect': ['Imp', 'Perf'],
-                            'Voice': ['Act'],
-                            'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
-                            'Gender': ['Masc', 'Fem', 'Neut'],
-                            'Number': ['Sing', 'Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                            'Polarity': ['Pos', 'Neg'],
-                            'Variant': ['Short'],
-                            'Foreign': ['Yes']})
-                else:
-                    if node.feats['Gender'] == 'Masc':
-                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity'])
-                        self.check_allowed_features(node, {
-                            'VerbForm': ['Part'],
-                            'Aspect': ['Imp', 'Perf'],
-                            'Voice': ['Pass'],
-                            'Gender': ['Masc', 'Fem', 'Neut'],
-                            'Animacy': ['Anim', 'Inan'],
-                            'Number': ['Sing', 'Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                            'Polarity': ['Pos', 'Neg'],
-                            'Variant': ['Short'],
-                            'Foreign': ['Yes']})
-                    else:
-                        self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity'])
-                        self.check_allowed_features(node, {
-                            'VerbForm': ['Part'],
-                            'Aspect': ['Imp', 'Perf'],
-                            'Voice': ['Pass'],
-                            'Gender': ['Masc', 'Fem', 'Neut'],
-                            'Number': ['Sing', 'Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                            'Polarity': ['Pos', 'Neg'],
-                            'Variant': ['Short'],
-                            'Foreign': ['Yes']})
-            elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree
-                if node.feats['Gender'] == 'Masc':
-                    self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant'])
-                    self.check_allowed_features(node, {
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Polarity': ['Pos', 'Neg'],
-                        'Variant': ['Short'],
-                        'Foreign': ['Yes']})
-                else:
-                    self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant'])
-                    self.check_allowed_features(node, {
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Polarity': ['Pos', 'Neg'],
-                        'Variant': ['Short'],
-                        'Foreign': ['Yes']})
-            else: # regular adjectives
-                if node.feats['Gender'] == 'Masc':
-                    self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Degree': ['Pos', 'Cmp', 'Sup'],
-                        'Polarity': ['Pos', 'Neg'],
-                        'Foreign': ['Yes']})
-                else:
-                    self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                        'Degree': ['Pos', 'Cmp', 'Sup'],
-                        'Polarity': ['Pos', 'Neg'],
-                        'Foreign': ['Yes']})
-        # PRONOUNS #############################################################
-        elif node.upos == 'PRON':
-            self.check_required_features(node, ['PronType'])
-            if node.feats['PronType'] == 'Prs':
-                if node.feats['Reflex'] == 'Yes':
-                    self.check_required_features(node, ['PronType', 'Reflex', 'Case'])
-                    self.check_allowed_features(node, {
-                        'PronType': ['Prs'],
-                        'Reflex': ['Yes'],
-                        'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'],
-                        'Variant': ['Short']
-                    })
-                else: # not reflexive
-                    if node.feats['Person'] == '3': # on, ona, ono, oni, ony
-                        if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony
-                            self.check_adjective_like(node, ['PronType', 'Person'], {
-                                'PronType': ['Prs'],
-                                'Person': ['3']
-                            })
-                        else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně
-                            # Mostly only two gender groups and no animacy:
-                            # Masc,Neut ... jeho, jemu, jej, něm, jím
-                            # Fem ... jí, ji, ní
-                            # Neut ... je
-                            # No gender in dual and plural:
-                            # Plur ... jich, jim, je, nich, jimi
-                            self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], {
-                                'PronType': ['Prs'],
-                                'Person': ['3'],
-                                'PrepCase': ['Npr', 'Pre']
-                            })
-                    else: # 1st and 2nd person do not have gender: já, ty
-                        self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case'])
-                        self.check_allowed_features(node, {
-                            'PronType': ['Prs'],
-                            'Person': ['1', '2'],
-                            'Number': ['Sing', 'Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
-                            'Variant': ['Short']
-                        })
-            elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo
-                # There is no Number. Někdo and nikdo behave like singular;
-                # kdo is by default singular as well but it also occurs as a subject
-                # of plural verbs.
-                self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case'])
-                self.check_allowed_features(node, {
-                    'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'],
-                    'Gender': ['Masc'],
-                    'Animacy': ['Anim'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
-                })
-            elif re.match(r'^(co|což|něco|nicož)$', node.lemma):
-                # Although these pronouns behave by default as neuter singular,
-                # no Gender and Number is annotated. However, quite unusually,
-                # there is Animacy=Inan without Gender.
-                ###!!! This should probably be fixed in all Czech treebanks and
-                ###!!! in Interset. The pronoun should get Gender=Neut and no
-                ###!!! animacy. For now, let's at least make animacy an optional
-                ###!!! feature (I see that we already do not fill it in the Old
-                ###!!! Czech data).
-                self.check_required_features(node, ['PronType', 'Case'])
-                self.check_allowed_features(node, {
-                    'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'],
-                    'Animacy': ['Inan'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
-                })
-            elif node.lemma == 'ješto':
-                # Unlike 'jenžto', this relative pronoun does not inflect, it
-                # always occurs in a nominative position, but the context can
-                # be any gender and number.
-                self.check_required_features(node, ['PronType', 'Case'])
-                self.check_allowed_features(node, {
-                    'PronType': ['Rel'],
-                    'Case': ['Nom']
-                })
-            elif re.match(r'^(jenž|jenžto)$', node.lemma):
-                # The relative pronouns 'jenž', 'jenžto' inflect for gender;
-                # while we normally take this as a sign of DET (instead of PRON),
-                # these can never act as real DET because they never modify a
-                # nominal.
-                # Similarly to the personal pronoun 'on', animacy is only
-                # annotated for masculine nominative plural, non-nominative
-                # forms are merged for masculine and neuter (jehož, jemuž), and
-                # non-singular gender is only annotated in nominative (while
-                # these cases are common for all genders: jichž, jimž, jimiž).
-                # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even
-                # in the nominative, although there is no prepositional counter-
-                # part (but similarly the locative has no prepositionless form).
-                self.check_adjective_like(node, ['PronType', 'PrepCase'], {
-                    'PronType': ['Rel'],
-                    'PrepCase': ['Npr', 'Pre']
-                })
-            else:
-                # What remains is the relative pronoun 'an'. It behaves similarly
-                # to 'jenž' but it does not have the PrepCase feature and it
-                # only occurs in the nominative.
-                if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani
-                    self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'PronType': ['Rel'],
-                        'Gender': ['Masc'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Plur'],
-                        'Case': ['Nom']
-                    })
-                else: # not Masc Plur: an, ana, ano, any
-                    self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'PronType': ['Rel'],
-                        'Gender': ['Masc', 'Fem', 'Neut'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom']
-                    })
-        # DETERMINERS ##########################################################
-        elif node.upos == 'DET':
-            # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case.
-            # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'.
-            if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()):
-                self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]'])
-                self.check_allowed_features(node, {
-                    'PronType': ['Prs', 'Rel'],
-                    'Poss': ['Yes'],
-                    'Person': ['3'],
-                    'Number[psor]': ['Sing', 'Dual', 'Plur'],
-                    'Gender[psor]': ['Masc,Neut']
-                })
-            elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()):
-                # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'.
-                # Congruent gender is annotated only in singular. Masculine and
-                # neuter are merged even in nominative. Feminine singular does
-                # not distinguish case in PDT but we need it in Old Czech at
-                # least for 'jejiej'.
-                if node.feats['Number'] == 'Sing':
-                    self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'PronType': ['Prs', 'Rel'],
-                        'Poss': ['Yes'],
-                        'Person': ['3'],
-                        'Number[psor]': ['Sing'],
-                        'Gender[psor]': ['Fem'],
-                        'Gender': ['Masc,Neut', 'Fem'],
-                        'Number': ['Sing'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                    })
-                else:
-                    self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'PronType': ['Prs', 'Rel'],
-                        'Poss': ['Yes'],
-                        'Person': ['3'],
-                        'Number[psor]': ['Sing'],
-                        'Gender[psor]': ['Fem'],
-                        'Number': ['Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                    })
-            elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj'
-                if node.feats['Reflex'] == 'Yes':
-                    self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], {
-                        'PronType': ['Prs'],
-                        'Poss': ['Yes'],
-                        'Reflex': ['Yes']
-                    })
-                else:
-                    self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], {
-                        'PronType': ['Prs'],
-                        'Poss': ['Yes'],
-                        'Person': ['1', '2'],
-                        'Number[psor]': ['Sing', 'Plur']
-                    })
-            else:
-                self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']})
-        # NUMERALS #############################################################
-        elif node.upos == 'NUM':
-            self.check_required_features(node, ['NumType', 'NumForm'])
-            # Arabic digits and Roman numerals do not have inflection features.
-            if re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
-                self.check_allowed_features(node, {
-                    'NumType': ['Card'],
-                    'NumForm': ['Digit', 'Roman']
-                })
-            else:
-                ###!!! Somehow the NumValue feature from PDT via Interset is useless.
-                # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi.
-                # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma.
-                # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi.
-                # 'pět' and more have Number=Plur, Case: pět, pěti.
-                if node.lemma == 'jeden':
-                    self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'NumType': ['Card'],
-                        'NumForm': ['Word'],
-                        'NumValue': ['1,2,3'],
-                        'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                    })
-                elif re.match(r'^(dva|oba)$', node.lemma):
-                    self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case'])
-                    if self.pdt20:
-                        self.check_allowed_features(node, {
-                            'NumType': ['Card'],
-                            'NumForm': ['Word'],
-                            'NumValue': ['1,2,3'],
-                            'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
-                            'Number': ['Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                        })
-                    else:
-                        self.check_allowed_features(node, {
-                            'NumType': ['Card'],
-                            'NumForm': ['Word'],
-                            'NumValue': ['1,2,3'],
-                            'Gender': ['Masc', 'Fem', 'Neut'],
-                            'Animacy': ['Anim', 'Inan'],
-                            'Number': ['Dual', 'Plur'],
-                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                        })
-                else:
-                    self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
-                    self.check_allowed_features(node, {
-                        'NumType': ['Card'],
-                        'NumForm': ['Word'],
-                        'NumValue': ['1,2,3'],
-                        'Number': ['Plur'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                    })
-        # VERBS AND AUXILIARIES ################################################
-        elif re.match(r'^(VERB|AUX)$', node.upos):
-            self.check_required_features(node, ['Aspect', 'VerbForm'])
-            if node.feats['VerbForm'] == 'Inf':
-                # There is no voice. For some reason, PDT does not annotate that
-                # the infinitive form is active (while a passive infinitive is
-                # a combination of the infinitive with a passive participle).
-                self.check_required_features(node, ['Polarity'])
-                self.check_allowed_features(node, {
-                    'Aspect': ['Imp', 'Perf'],
-                    'VerbForm': ['Inf'],
-                    'Polarity': ['Pos', 'Neg']
-                })
-            elif node.feats['VerbForm'] == 'Fin':
-                # Voice is optional. For some reason it is not annotated with
-                # imperatives (although passive imperatives are a combination
-                # of the active imperative and a passive participle). It is
-                # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'.
-                if node.feats['Mood'] == 'Cnd':
-                    self.check_required_features(node, ['Mood', 'Person'])
-                    self.check_allowed_features(node, {
-                        'Aspect': ['Imp', 'Perf'],
-                        'VerbForm': ['Fin'],
-                        'Mood': ['Cnd'],
-                        'Person': ['1', '2', '3'],
-                        'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person
-                    })
-                elif node.feats['Mood'] == 'Imp':
-                    self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Aspect': ['Imp', 'Perf'],
-                        'VerbForm': ['Fin'],
-                        'Mood': ['Imp'],
-                        'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person)
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Polarity': ['Pos', 'Neg']
-                    })
-                else: # indicative
-                    self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Aspect': ['Imp', 'Perf'],
-                        'VerbForm': ['Fin'],
-                        'Mood': ['Ind'],
-                        'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative
-                        'Voice': ['Act'],
-                        'Person': ['1', '2', '3'],
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Polarity': ['Pos', 'Neg'],
-                        'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist
-                    })
-            elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB
-                if node.feats['Gender'] == 'Masc':
-                    self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Aspect': ['Imp', 'Perf'],
-                        'VerbForm': ['Part'],
-                        'Tense': ['Past'],
-                        'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Gender': ['Masc'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Polarity': ['Pos', 'Neg']
-                    })
-                else:
-                    self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity'])
-                    self.check_allowed_features(node, {
-                        'Aspect': ['Imp', 'Perf'],
-                        'VerbForm': ['Part'],
-                        'Tense': ['Past'],
-                        'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
-                        'Number': ['Sing', 'Dual', 'Plur'],
-                        'Gender': ['Fem', 'Neut'],
-                        'Polarity': ['Pos', 'Neg']
-                    })
-            else: # converb
-                self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity'])
-                self.check_allowed_features(node, {
-                    'Aspect': ['Imp', 'Perf'],
-                    'VerbForm': ['Conv'],
-                    'Tense': ['Past', 'Pres'],
-                    'Voice': ['Act'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy
-                    'Polarity': ['Pos', 'Neg']
-                })
-        # ADVERBS ##############################################################
-        elif node.upos == 'ADV':
-            if node.feats['PronType'] != '':
-                # Pronominal adverbs are neither compared nor negated.
-                self.check_allowed_features(node, {
-                    'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot']
-                })
-            elif node.feats['Degree'] != '':
-                # Adverbs that are compared can also be negated.
-                self.check_required_features(node, ['Degree', 'Polarity'])
-                self.check_allowed_features(node, {
-                    'Degree': ['Pos', 'Cmp', 'Sup'],
-                    'Polarity': ['Pos', 'Neg']
-                })
-            else:
-                # The remaining adverbs are neither pronominal, nor compared or
-                # negated.
-                self.check_allowed_features(node, {})
-        # ADPOSITIONS ##########################################################
-        elif node.upos == 'ADP':
-            self.check_required_features(node, ['AdpType', 'Case'])
+            self.check_required_features(node, ['Gender', 'Number', 'Case'])
             self.check_allowed_features(node, {
-                'AdpType': ['Prep', 'Voc'],
-                'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins']
-            })
-        # THE REST: NO FEATURES ################################################
-        else:
-            self.check_allowed_features(node, {})
-
-    def check_adjective_like(self, node, r0, a0):
-        """
-        Long forms of adjectives, pronouns and determiners mostly share declension
-        paradigms and thus the sets of features that are expected. Whether the
-        actual feature sets are the same depends on the tagging convention (PDT
-        vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are
-        not; in ČNK, both adjectives and pronouns (incl. determiners) are fully
-        disambiguated. This method defines the core inflectional features while
-        any extras (such as PronType for pronouns) have to be provided by the
-        caller in parameters r0 (list) and a0 (dict).
-        """
-        required_features = []
-        allowed_features = {}
-        full_set = node.upos == 'ADJ' or not self.pdt20
-        if full_set:
-            # Even in the full set, animacy is only distinguished for the
-            # masculine gender.
-            if node.feats['Gender'] == 'Masc':
-                required_features = ['Gender', 'Animacy', 'Number', 'Case']
-                allowed_features = {
-                    'Gender': ['Masc'],
-                    'Animacy': ['Anim', 'Inan'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                }
-            else:
-                required_features = ['Gender', 'Number', 'Case']
-                allowed_features = {
-                    'Gender': ['Fem', 'Neut'],
-                    'Number': ['Sing', 'Dual', 'Plur'],
-                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                }
-        else:
-            # Gender is annotated in all cases in singular (ten, ta, to)
-            # but only in nominative, accusative, and vocative in plural
-            # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished
-            # in plural if gender is distinguished and it is masculine; in
-            # singular it is distinguished only in accusative (toho, ten).
-            # Other cases in plural are gender-less (těch, těm, těmi).
-            # Note that this is not consistent with adjectives, where we
-            # disambiguate gender in all cases in plural.
-            if node.feats['Number'] == 'Sing':
-                if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc':
-                    required_features = ['Gender', 'Animacy', 'Number', 'Case']
-                    allowed_features = {
-                        'Gender': ['Masc'],
-                        'Animacy': ['Anim', 'Inan'],
-                        'Number': ['Sing'],
-                        'Case': ['Acc']
-                    }
-                else:
-                    required_features = ['Gender', 'Number', 'Case']
-                    allowed_features = {
-                        'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular
-                        'Number': ['Sing'],
-                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
-                    }
-            elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']):
-                required_features = ['Gender', 'Number', 'Case']
-                allowed_features = {
-                    'Gender': ['Masc', 'Fem', 'Neut'],
-                    'Animacy': ['Anim', 'Inan'],
-                    'Number': ['Dual', 'Plur'],
-                    'Case': ['Nom', 'Acc', 'Voc']
-                }
-            else:
-                required_features = ['Number', 'Case']
-                allowed_features = {
-                    'Number': ['Dual', 'Plur'],
-                    'Case': ['Gen', 'Dat', 'Loc', 'Ins']
-                }
-        required_features = r0 + required_features
-        a0.update(allowed_features)
-        allowed_features = a0
-        self.check_required_features(node, required_features)
-        self.check_allowed_features(node, allowed_features)
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'NameType': ['Giv', 'Sur', 'Geo'],
+                'Foreign': ['Yes']})

From 606515a088cc9779b3fef46795a0c4a6bb1f6613 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Thu, 24 Nov 2022 17:51:05 +0100
Subject: [PATCH 0535/1201] Usage layout=compact.

---
 udapi/block/ud/cs/markfeatsbugs.py | 2 +-
 udapi/block/ud/la/markfeatsbugs.py | 2 +-
 udapi/block/ud/markfeatsbugs.py    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py
index ef203033..309e7ac8 100644
--- a/udapi/block/ud/cs/markfeatsbugs.py
+++ b/udapi/block/ud/cs/markfeatsbugs.py
@@ -3,7 +3,7 @@
 finds will be saved in the MISC column as a Bug attribute, which can be later
 used in filters and highlighted in text output.
 
-Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html
+Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html
 Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
 """
 import udapi.block.ud.markfeatsbugs
diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py
index 4cf6c1b3..8aea567f 100644
--- a/udapi/block/ud/la/markfeatsbugs.py
+++ b/udapi/block/ud/la/markfeatsbugs.py
@@ -3,7 +3,7 @@
 finds will be saved in the MISC column as a Bug attribute, which can be later
 used in filters and highlighted in text output.
 
-Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html
+Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html
 Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
 """
 import udapi.block.ud.markfeatsbugs
diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py
index b24dcecb..1bb8188b 100644
--- a/udapi/block/ud/markfeatsbugs.py
+++ b/udapi/block/ud/markfeatsbugs.py
@@ -5,7 +5,7 @@
 implements service methods. A language-specific block must be derived from
 this one and define the actual rules valid in that language.
 
-Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html
+Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html
 """
 from udapi.core.block import Block
 import logging

From 204da3bbb4bfa59c085c4c05a6bc8be2e134e27d Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Thu, 24 Nov 2022 21:05:39 +0100
Subject: [PATCH 0536/1201] More rules for Latin features (cloned from Czech).
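The Latin rules below, like the Czech ones above, are built entirely on two
service methods inherited from ud.markfeatsbugs.MarkFeatsBugs: roughly,
check_required_features() flags listed features that are missing on the node,
and check_allowed_features() flags any feature or value outside the given
whitelist. The following Python sketch is only an illustration of that
contract for readers of this series; the bug() helper and message strings are
assumptions, not the actual udapi implementation, which records the problem as
a Bug attribute in the MISC column.

    # Simplified sketch of the assumed base-class contract (not udapi source).
    def check_required_features(self, node, required):
        # Flag every feature from the list that the node does not carry.
        for feature in required:
            if node.feats[feature] == '':
                self.bug(node, 'Missing' + feature)  # bug() is an assumed helper

    def check_allowed_features(self, node, allowed):
        # Flag every feature (or feature value) outside the whitelist.
        for feature, value in node.feats.items():
            if feature not in allowed or value not in allowed[feature]:
                self.bug(node, 'Disallowed' + feature)  # assumed helper

Note that multivalues such as 'Int,Rel' are whitelisted as whole strings,
which is why the rules list them explicitly next to the plain values.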
---
 udapi/block/ud/la/markfeatsbugs.py | 148 +++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)

diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py
index 8aea567f..96c7b682 100644
--- a/udapi/block/ud/la/markfeatsbugs.py
+++ b/udapi/block/ud/la/markfeatsbugs.py
@@ -30,3 +30,151 @@ def process_node(self, node):
                 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                 'NameType': ['Giv', 'Sur', 'Geo'],
                 'Foreign': ['Yes']})
+        # ADJECTIVES ###########################################################
+        elif node.upos == 'ADJ':
+            self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree'])
+            self.check_allowed_features(node, {
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Degree': ['Pos', 'Cmp', 'Sup'],
+                'Foreign': ['Yes']})
+        # PRONOUNS #############################################################
+        elif node.upos == 'PRON':
+            self.check_required_features(node, ['PronType'])
+            if node.feats['PronType'] == 'Prs':
+                if node.feats['Reflex'] == 'Yes':
+                    self.check_required_features(node, ['PronType', 'Reflex', 'Case'])
+                    self.check_allowed_features(node, {
+                        'PronType': ['Prs'],
+                        'Reflex': ['Yes'],
+                        'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl']
+                    })
+                else: # not reflexive
+                    if node.feats['Person'] == '3': # on, ona, ono, oni, ony
+                        self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case'])
+                        self.check_allowed_features(node, {
+                            'PronType': ['Prs'],
+                            'Person': ['3'],
+                            'Gender': ['Masc', 'Fem', 'Neut'],
+                            'Number': ['Sing', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                        })
+                    else: # 1st and 2nd person do not have gender: já, ty
+                        self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case'])
+                        self.check_allowed_features(node, {
+                            'PronType': ['Prs'],
+                            'Person': ['1', '2'],
+                            'Number': ['Sing', 'Plur'],
+                            'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                        })
+        # DETERMINERS ##########################################################
+        elif node.upos == 'DET':
+            if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj'
+                self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Prs'],
+                    'Poss': ['Yes'],
+                    'Person': ['1', '2', '3'],
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Number': ['Sing', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                })
+            else:
+                self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case'])
+                self.check_allowed_features(node, {
+                    'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'],
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Number': ['Sing', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                })
+        # NUMERALS #############################################################
+        elif node.upos == 'NUM':
+            self.check_required_features(node, ['NumType', 'NumForm'])
+            # Arabic digits and Roman numerals do not have inflection features.
+            if re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
+                self.check_allowed_features(node, {
+                    'NumType': ['Card'],
+                    'NumForm': ['Digit', 'Roman']
+                })
+            else:
+                self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+                self.check_allowed_features(node, {
+                    'NumType': ['Card'],
+                    'NumForm': ['Word'],
+                    'Number': ['Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                })
+        # VERBS AND AUXILIARIES ################################################
+        elif re.match(r'^(VERB|AUX)$', node.upos):
+            self.check_required_features(node, ['Aspect', 'VerbForm'])
+            if node.feats['VerbForm'] == 'Inf':
+                self.check_allowed_features(node, {
+                    'Aspect': ['Imp', 'Perf', 'Prosp'],
+                    'VerbForm': ['Inf'],
+                    'Polarity': ['Pos', 'Neg']
+                })
+            elif node.feats['VerbForm'] == 'Fin':
+                if node.feats['Mood'] == 'Imp':
+                    self.check_required_features(node, ['Mood', 'Person', 'Number'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf', 'Prosp'],
+                        'VerbForm': ['Fin'],
+                        'Mood': ['Imp'],
+                        'Person': ['1', '2', '3'],
+                        'Number': ['Sing', 'Plur'],
+                        'Polarity': ['Pos', 'Neg']
+                    })
+                else: # indicative or subjunctive
+                    self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number'])
+                    self.check_allowed_features(node, {
+                        'Aspect': ['Imp', 'Perf', 'Prosp'],
+                        'VerbForm': ['Fin'],
+                        'Mood': ['Ind', 'Sub'],
+                        'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative
+                        'Voice': ['Act'],
+                        'Person': ['1', '2', '3'],
+                        'Number': ['Sing', 'Plur'],
+                        'Polarity': ['Pos', 'Neg']
+                    })
+            elif node.feats['VerbForm'] == 'Part':
+                self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice'])
+                self.check_allowed_features(node, {
+                    'Aspect': ['Imp', 'Perf', 'Prosp'],
+                    'VerbForm': ['Part'],
+                    'Tense': ['Past'],
+                    'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+                    'Number': ['Sing', 'Plur'],
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Polarity': ['Pos', 'Neg']
+                })
+            else: # verbal noun
+                self.check_required_features(node, ['Tense', 'Number', 'Voice'])
+                self.check_allowed_features(node, {
+                    'Aspect': ['Imp', 'Perf', 'Prosp'],
+                    'VerbForm': ['Vnoun'],
+                    'Tense': ['Past', 'Pres'],
+                    'Voice': ['Act'],
+                    'Number': ['Sing', 'Plur'],
+                    'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular
+                    'Polarity': ['Pos', 'Neg']
+                })
+        # ADVERBS ##############################################################
+        elif node.upos == 'ADV':
+            if node.feats['PronType'] != '':
+                # Pronominal adverbs are neither compared nor negated.
+                self.check_allowed_features(node, {
+                    'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']
+                })
+            else:
+                # The remaining adverbs are neither pronominal, nor compared or
+                # negated.
+                self.check_allowed_features(node, {})
+        # PARTICLES ############################################################
+        elif node.upos == 'PART':
+            self.check_allowed_features(node, {
+                'Polarity': ['Neg']
+            })
+        # THE REST: NO FEATURES ################################################
+        else:
+            self.check_allowed_features(node, {})

From 98db11584577be72a5748c8c81cb4030348270c0 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Thu, 24 Nov 2022 22:32:50 +0100
Subject: [PATCH 0537/1201] Added feature rules for Malayalam.
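Once a language-specific block such as the Malayalam one below has marked its
findings, the Bug attributes in MISC can be tallied with a few lines of the
udapi document API. The snippet is an illustration only, not part of the
patch; the input file name is hypothetical and assumes the marked CoNLL-U
output of ud.ml.MarkFeatsBugs was saved first.

    from collections import Counter
    from udapi.core.document import Document

    doc = Document()
    doc.load_conllu('ml-marked.conllu')  # hypothetical file with Bug marks
    counter = Counter()
    for bundle in doc:              # a document iterates over its bundles
        for tree in bundle:         # a bundle iterates over its trees
            for node in tree.descendants:
                if node.misc['Bug'] != '':
                    counter[node.misc['Bug']] += 1
    for bug, count in counter.most_common():
        print(count, bug)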
--- udapi/block/ud/ml/markfeatsbugs.py | 191 +++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 udapi/block/ud/ml/markfeatsbugs.py diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..a46580d1 --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,191 @@ +""" +Block to identify missing or ill-valued features in Malayalam. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # NOUNS AND PROPER NOUNS ############################################### + if re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # plural pronouns do not distinguish gender + self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', 
'2', '3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Nec'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: # verbal noun + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if 
node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From 84965b94e618f1f2b5fb2cdc3a46fca4dc897c5e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 23:44:42 +0100 Subject: [PATCH 0538/1201] Non-personal pronouns in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index a46580d1..fc25eccb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -65,6 +65,12 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) + else: # not personal + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' From cce7db13d41deeba166ff7a766ae58c6a4fb3db0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 11:41:50 +0100 Subject: [PATCH 0539/1201] Malayalam determiners have fewer features than pronouns. --- udapi/block/ud/ml/markfeatsbugs.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index fc25eccb..41d4cf09 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -73,23 +73,16 @@ def process_node(self, node): }) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Art'], + 'Definite': ['Ind'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From 8a9435f4b3a510dc1b2f6f34c98ee9f5e9e80b5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 15:09:34 +0100 Subject: [PATCH 0540/1201] Added Chinese lemmatization. 
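
The patch below takes a deliberately simple approach: a small form-to-lemma dictionary maps
the plural pronouns with the suffix -men (們 trad / 们 simp) to their singular base, and any
other form becomes its own lemma. A minimal self-contained sketch of the same
lookup-with-fallback idea (the guess_lemma helper is illustrative, not part of the patch):

    # Plural pronoun forms map to the singular base;
    # every form absent from the table lemmatizes to itself.
    PLURAL_PRONOUNS = {'我們': '我', '我们': '我', '他們': '他', '他们': '他'}

    def guess_lemma(form):
        return PLURAL_PRONOUNS.get(form, form)

    assert guess_lemma('我們') == '我'  # plural "we" -> singular "I"
    assert guess_lemma('人') == '人'    # default: the lemma equals the form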
--- udapi/block/ud/zh/lemmatize.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 udapi/block/ud/zh/lemmatize.py

diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py
new file mode 100644
index 00000000..7db798a0
--- /dev/null
+++ b/udapi/block/ud/zh/lemmatize.py
@@ -0,0 +1,34 @@
+"""Block to add missing lemmas in cases where it seems obvious what the lemma should be."""
+from udapi.core.block import Block
+import logging
+import re
+
+class Lemmatize(Block):
+
+ # dictionary: form --> lemma
+ lemma = {
+ # The plural suffix -men.
+ '我們': '我', # trad
+ '我们': '我', # simp
+ '他們': '他', # trad
+ '他们': '他', # simp
+ '它們': '它', # trad
+ '它们': '它', # simp
+ '牠們': '牠', # trad
+ '她們': '她', # trad
+ '她们': '她', # simp
+ '人們': '人', # trad
+ '人们': '人' # simp
+ }
+
+ def process_node(self, node):
+ """
+ Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization
+ of Sino-Tibetan languages is pretty straightforward most of the time,
+ as the lemma typically equals the actual word form.
+ """
+ if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes':
+ if node.form in self.lemma:
+ node.lemma = self.lemma[node.form]
+ else:
+ node.lemma = node.form

From 72f045ef84ea000f403d210301d33d1acf3f7018 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 26 Nov 2022 12:46:54 +0100
Subject: [PATCH 0541/1201] Enable rewriting of lemmas in Chinese.

--- udapi/block/ud/zh/lemmatize.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py
index 7db798a0..2b7a2dc5 100644
--- a/udapi/block/ud/zh/lemmatize.py
+++ b/udapi/block/ud/zh/lemmatize.py
@@ -5,6 +5,20 @@ class Lemmatize(Block):
+ def __init__(self, rewrite='empty', **kwargs):
+ """
+ Create the ud.zh.Lemmatize block instance.
+
+ Args:
+ rewrite=empty: set the lemma if it was empty so far; do not touch the rest
+ rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest
+ rewrite=all: set the lemma regardless of what it was previously
+ """
+ super().__init__(**kwargs)
+ if not re.match(r'^(empty|form|all)$', rewrite):
+ raise ValueError("Unexpected value of parameter 'rewrite'")
+ self.rewrite = rewrite
+
 # dictionary: form --> lemma
 lemma = {
 # The plural suffix -men.
@@ -27,8 +41,11 @@ def process_node(self, node):
 of Sino-Tibetan languages is pretty straightforward most of the time,
 as the lemma typically equals the actual word form.
 """
- if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes':
- if node.form in self.lemma:
- node.lemma = self.lemma[node.form]
- else:
- node.lemma = node.form
+ if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
+ return
+ elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
+ return
+ if node.form in self.lemma:
+ node.lemma = self.lemma[node.form]
+ else:
+ node.lemma = node.form

From b4dd844870291532089ce518bb0ad4d1f562d92a Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 26 Nov 2022 14:59:31 +0100
Subject: [PATCH 0542/1201] Use lemmatization to make copulas acceptable.
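
As far as one can tell from the change below, the motivation is UD validation: a cop/aux
relation is accepted only for an approved copula lemma, so verb forms built on the copula,
e.g. 不是 bùshì "not be", should be lemmatized to the bare copula 是. A rough standalone
sketch of the rule being added (plain strings stand in for the node attributes):

    import re

    def copula_lemma(form, upos):
        """Return (lemma, polarity) for copula-derived verbs, else (None, None)."""
        if upos in ('AUX', 'VERB') and re.search(r'是', form):
            return '是', ('Neg' if form == '不是' else None)
        return None, None

    assert copula_lemma('不是', 'AUX') == ('是', 'Neg')
    assert copula_lemma('人們', 'NOUN') == (None, None)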
--- udapi/block/ud/zh/lemmatize.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py
index 2b7a2dc5..9b4c7cba 100644
--- a/udapi/block/ud/zh/lemmatize.py
+++ b/udapi/block/ud/zh/lemmatize.py
@@ -45,7 +45,13 @@ def process_node(self, node):
 return
 elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
 return
- if node.form in self.lemma:
+ # Verbs that are derived from the copula and tagged as the copula need
+ # to have the lemma of the copula (是 shì).
+ if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos):
+ node.lemma = '是'
+ if node.form == '不是':
+ node.feats['Polarity'] = 'Neg'
+ elif node.form in self.lemma:
 node.lemma = self.lemma[node.form]
 else:
 node.lemma = node.form

From 3bea246947ce88825cc15f690e0de744b85c37ee Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sat, 26 Nov 2022 23:40:44 +0100
Subject: [PATCH 0543/1201] Another Chinese copula.

--- udapi/block/ud/zh/lemmatize.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py
index 9b4c7cba..298f3501 100644
--- a/udapi/block/ud/zh/lemmatize.py
+++ b/udapi/block/ud/zh/lemmatize.py
@@ -46,9 +46,18 @@ def process_node(self, node):
 elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
 return
 # Verbs that are derived from the copula and tagged as the copula need
- # to have the lemma of the copula (是 shì).
+ # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi).
+ # 亦為 亦为 Yì wèi také
+ # 則為 则为 Zé wèi potom
+ # 更為 更为 Gèng wèi více
+ # 認為 认为 Rènwéi myslet, věřit
+ # 以為 以为 Yǐwéi myslet, věřit
+ # 以爲 以为 Yǐwéi myslet, věřit
- if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos):
- node.lemma = '是'
+ m = re.search(r'([是爲為为])', node.form)
+ if m and re.match(r'^(AUX|VERB)$', node.upos):
+ node.lemma = m.group(1)
+ if node.lemma == '爲':
+ node.lemma = '為'
 if node.form == '不是':
 node.feats['Polarity'] = 'Neg'
 elif node.form in self.lemma:

From d9af327a10bc816334d6e0514f636206dfb44c9f Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sun, 27 Nov 2022 01:44:04 +0100
Subject: [PATCH 0544/1201] readers' parameter merge=1 so e.g. `udapy read.Conllu files=a.conllu,b.conllu merge=1` merges the two files into one document and should be equivalent to `cat a.conllu b.conllu | udapy read.Conllu from=-`.
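
A sketch of the same merging through the Python API (the file names are illustrative;
with merge on, a single call loads all listed files into one document):

    from udapi.core.document import Document
    from udapi.block.read.conllu import Conllu

    doc = Document()
    reader = Conllu(files='a.conllu,b.conllu', merge=True)
    reader.process_document(doc)
    # doc now holds the bundles of both files, in their original order.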
--- udapi/core/basereader.py | 82 +++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 53a1129c..a3b334da 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,7 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -28,6 +28,7 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -111,43 +112,48 @@ def try_fast_load(self, document): if filehandle is None: self.finished = True return True - try: - trees = self.read_trees() - except NotImplementedError: - return False - document.meta['loaded_from'] = self.filename - document.meta['global.Entity'] = self._global_entity - if trees and trees[0].newdoc and trees[0].newdoc is not True: - document.meta["docname"] = trees[0].newdoc - - bundle, last_bundle_id = None, '' - for root in trees: - add_to_the_last_bundle = False - - if self.ignore_sent_id: - root._sent_id = None - elif root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - if self.zone != 'keep': - root.zone = self.zone - - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - self.next_filehandle() - if self.filehandle is None: - self.finished = True + while True: + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True return True # pylint: disable=too-many-branches,too-many-statements @@ 
-190,7 +196,7 @@ def process_document(self, document): while True: root = self.filtered_read_tree() if root is None: - if trees_loaded == 0 and self.files.has_next_file(): + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): filehandle = self.next_filehandle() continue self.finished = not self.files.has_next_file() From e148621de92ea26550634d4972b6e0093660a103 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:11:40 +0100 Subject: [PATCH 0545/1201] Lemmatization of negated verbs in Chinese. --- udapi/block/ud/zh/lemmatize.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 298f3501..75d62716 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -45,6 +45,9 @@ def process_node(self, node): return elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return + # Lemmatize negated verbs to their affirmative forms. + # 不是 bùshì = not be + # 没有 méiyǒu = not exist # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). # 亦為 亦为 Yì wèi také @@ -53,13 +56,16 @@ def process_node(self, node): # 認為 认为 Rènwéi myslet, věřit # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit - m = re.search(r'([是爲為为])', node.form) - if m and re.match(r'^(AUX|VERB)$', node.upos): - node.lemma = m.group(1) - if node.lemma == '爲': - node.lemma = '為' - if node.form == '不是': + if re.match(r'^(AUX|VERB)$', node.upos): + m1 = re.match(r'^(不|没)(.+)$', node.form) + m2 = re.search(r'([是爲為为])', node.form) + if m1: + node.lemma = m1.group(1) node.feats['Polarity'] = 'Neg' + elif m2: + node.lemma = m2.group(1) + if node.lemma == '爲': + node.lemma = '為' elif node.form in self.lemma: node.lemma = self.lemma[node.form] else: From 40f224a9e9554d9573d9059fb7aea16ea20a731a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:26:13 +0100 Subject: [PATCH 0546/1201] Oops! Wrong part! --- udapi/block/ud/zh/lemmatize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 75d62716..7658d9b4 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -60,7 +60,7 @@ def process_node(self, node): m1 = re.match(r'^(不|没)(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: - node.lemma = m1.group(1) + node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) From 0e0d53905e40848c0e7a11e4d87aa3715a93ee33 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:35:42 +0100 Subject: [PATCH 0547/1201] =?UTF-8?q?Another=20negation=20pattern:=20?= =?UTF-8?q?=E6=9C=AA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 7658d9b4..9c492800 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
# 亦為 亦为 Yì wèi také @@ -57,7 +58,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^(不|没)(.+)$', node.form) + m1 = re.match(r'^([不没未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 74445e4722de55a7e9714642def6fd09a1d5e2ae Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:50:50 +0100 Subject: [PATCH 0548/1201] Another negation pattern. --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 9c492800..436c3587 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 沒能 méinéng = cannot # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). @@ -58,7 +59,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^([不没未])(.+)$', node.form) + m1 = re.match(r'^([不没沒未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 13088ed765f05f6f684595863a81a783e8ceafbb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 18:18:26 +0100 Subject: [PATCH 0549/1201] Lemmatization of interrogative verbs in Chinese. --- udapi/block/ud/zh/lemmatize.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 436c3587..abacf29f 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -47,25 +47,32 @@ def process_node(self, node): return # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be - # 没有 méiyǒu = not exist - # 沒能 méinéng = cannot + # 沒有 没有 méiyǒu = not exist + # 沒能 没能 méinéng = cannot # 未能 wèinéng = cannot + # Lemmatize question verbs to their base forms. + # 要不要 yàobùyào = do (you) want? + # 有没有 yǒuméiyǒu = do (you) have? # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
- # 亦為 亦为 Yì wèi také - # 則為 则为 Zé wèi potom - # 更為 更为 Gèng wèi více - # 認為 认为 Rènwéi myslet, věřit - # 以為 以为 Yǐwéi myslet, věřit - # 以爲 以为 Yǐwéi myslet, věřit + # 亦為 亦为 yìwèi = také + # 則為 则为 zéwèi = potom + # 更為 更为 gèngwèi = více + # 認為 认为 rènwéi = myslet, věřit + # 以為 以为 yǐwéi = myslet, věřit + # 以爲 以为 yǐwéi = myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): m1 = re.match(r'^([不没沒未])(.+)$', node.form) - m2 = re.search(r'([是爲為为])', node.form) + m2 = re.match(r'^(.+)([不没沒未])\1$', node.form) + m3 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) + node.feats['Mood'] = 'Int' + elif m3: + node.lemma = m3.group(1) if node.lemma == '爲': node.lemma = '為' elif node.form in self.lemma: From 64f5bc7427efd3c32a84229e2c2c901b545118b4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 1 Dec 2022 14:26:38 +0100 Subject: [PATCH 0550/1201] print also number of documents and paragraphs if any, based on newdoc and newpar annotations --- udapi/block/util/wc.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 137c95e9..e8ea2676 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -13,6 +13,7 @@ def __init__(self, tsv=False, **kwargs): """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 self.tsv = tsv def process_tree(self, tree): @@ -22,13 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): if self.tsv: - print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) else: print('%8d trees\n%8d words' % (self.trees, self.words)) if self.mwts: print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) if self.empty: print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) From c29590fefe4a045c8c33c0e8729c3a2582d1cf5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 2 Dec 2022 23:18:53 +0100 Subject: [PATCH 0551/1201] Enable separate checking of Flavio's approach to Latin morphology. --- udapi/block/ud/la/markfeatsbugs.py | 141 +++++++++++++++-------------- 1 file changed, 75 insertions(+), 66 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 96c7b682..149fcd18 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -12,15 +12,32 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. + + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. By default, a more conservative set of features and + values is expected. 
+ """ + super().__init__(**kwargs) + self.flavio = flavio + def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes']} + if self.flavio: + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': self.check_required_features(node, ['Gender', 'Number', 'Case']) @@ -32,13 +49,20 @@ def process_node(self, node): 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case', 'Degree'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Foreign': ['Yes']}) + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio does not use Degree=Pos, hence Degree is not required. + rf = [f for f in rf if f != 'Degree'] + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -81,13 +105,19 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if self.flavio: + rf.append('InflClass') + af['PronType'].append('Con') + af['InflClass'] = ['LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -98,73 +128,52 @@ def process_node(self, node): 'NumForm': ['Digit', 'Roman'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm']) self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Word'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'NumForm': ['Word'] }) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Inf'], - 'Polarity': ['Pos', 'Neg'] - }) - elif 
node.feats['VerbForm'] == 'Fin': - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Voice': ['Act'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) + rf = ['Aspect', 'VerbForm'] + af = { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], + 'Polarity': ['Pos', 'Neg']} + if node.feats['VerbForm'] == 'Fin': + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive + rf.extend(['Voice', 'Tense']) + af['Voice'] = ['Act', 'Pass'] + af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + af['Tense'] = ['Past'] + af['Voice'] = ['Act'] + af['Number'] = ['Sing', 'Plur'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] else: # verbal noun - self.check_required_features(node, ['Tense', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Vnoun'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Voice']) + af['Tense'] = ['Past', 'Pres'] + af['Voice'] = ['Act'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + if self.flavio: + # Flavio has killed Tense in his treebanks. + rf = [f for f in rf if f != 'Tense'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'AdvType': ['Loc'] }) else: # The remaining adverbs are neither pronominal, nor compared or From 8b05a49741481d20cf4b0b4ec41bf92b4a696701 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 3 Dec 2022 12:36:35 +0100 Subject: [PATCH 0552/1201] Adjusted Latin feature rules. 
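
All the rules below follow the convention introduced in the previous commit: rf collects
the names of the features a word must carry, af maps each allowed feature to its allowed
values, and both are passed to the inherited checkers, which record any violation as a Bug
attribute in MISC. An illustrative mini-checker with the same contract (names and data are
hypothetical, not part of Udapi):

    def check_feats(feats, rf, af):
        """Return bug messages for missing or unexpected features."""
        bugs = ['Feat %s is missing' % f for f in rf if f not in feats]
        bugs += ['Feat %s=%s not allowed' % (f, v) for f, v in feats.items()
                 if f not in af or v not in af[f]]
        return bugs

    feats = {'Gender': 'Masc', 'Number': 'Dual'}
    rf = ['Gender', 'Number', 'Case']
    af = {'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur']}
    print(check_feats(feats, rf, af))
    # ['Feat Case is missing', 'Feat Number=Dual not allowed']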
--- udapi/block/ud/la/markfeatsbugs.py | 209 +++++++++++++++++------------ 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 149fcd18..31d112b8 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -25,121 +25,146 @@ def __init__(self, flavio=False, **kwargs): self.flavio = flavio def process_node(self, node): + rf = [] + af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - rf = ['Gender', 'Number', 'Case'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case', 'Degree'] af = { + 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: # Flavio does not use Degree=Pos, hence Degree is not required. rf = [f for f in rf if f != 'Degree'] - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Rel', 'Ind'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + } if node.feats['PronType'] == 'Prs': - if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] - }) - else: # not reflexive - if node.feats['Person'] == '3': # on, ona, ono, oni, ony - self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) - else: # 1st and 2nd person do not have gender: já, ty - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Person'] = ['3'] + af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatAnom', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = 'Yes' + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). 
+ if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] else: - rf = ['PronType', 'Gender', 'Number', 'Case'] - af = { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} - if self.flavio: - rf.append('InflClass') - af['PronType'].append('Con') - af['InflClass'] = ['LatPron'] - af['Form'] = ['Emp'] - self.check_required_features(node, rf) - self.check_allowed_features(node, af) + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': - self.check_required_features(node, ['NumType', 'NumForm']) + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card'], + 'NumForm': ['Word', 'Roman', 'Digit'] + } # Arabic digits and Roman numerals do not have inflection features. - if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] - }) - else: - self.check_required_features(node, ['NumType', 'NumForm']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'] - }) + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - rf = ['Aspect', 'VerbForm'] + rf = ['VerbForm'] af = { - 'Aspect': ['Imp', 'Perf', 'Prosp'], 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], 'Polarity': ['Pos', 'Neg']} + # Main verbs have aspect but auxiliaries don't. + if node.upos == 'VERB': + rf.append('Aspect') + af['Aspect'] = ['Imp', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': rf.extend(['Mood', 'Person', 'Number']) af['Mood'] = ['Ind', 'Sub', 'Imp'] @@ -150,40 +175,48 @@ def process_node(self, node): af['Voice'] = ['Act', 'Pass'] af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case']) af['Tense'] = ['Past'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Number'] = ['Sing', 'Plur'] af['Gender'] = ['Masc', 'Fem', 'Neut'] - else: # verbal noun + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs'] + elif node.feats['VerbForm'] == 'Vnoun': rf.extend(['Tense', 'Voice']) af['Tense'] = ['Past', 'Pres'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Gender'] = ['Masc', 'Fem', 'Neut'] + # else: nothing to be added form VerbForm=Inf if self.flavio: # Flavio has killed Tense in his treebanks. rf = [f for f in rf if f != 'Tense'] # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + if node.feats['VerbForm'] == 'Part': + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'AdvType': ['Loc'] - }) - else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + } + if self.flavio: + af['Compound'] = 'Yes' + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': - self.check_allowed_features(node, { + af = { + 'PartType': ['Int'], 'Polarity': ['Neg'] - }) + } + if self.flavio: + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From ab86f1b93d6e20bf4f42c18c1af9f3c22c5e4f64 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 12:07:10 +0100 Subject: [PATCH 0553/1201] Refined features of pronouns in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 71 +++++++++++++----------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 41d4cf09..96cf8b55 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -27,50 +27,38 @@ def process_node(self, node): 'Foreign': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + } if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + af['Case'] = [c for c in af['Case'] if c != 'Nom' and c != 'Voc'] else: # not reflexive - if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # plural pronouns do not distinguish gender - 
self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # not personal - self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['PronType'] == 'Art': @@ -82,7 +70,8 @@ def process_node(self, node): else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From cd9b962cb602eced89466af00ee077afd20d63bc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 15:04:02 +0100 Subject: [PATCH 0554/1201] Write sentences in a HTML list. --- udapi/block/write/sentenceshtml.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/write/sentenceshtml.py diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. + + Parameters: + if_missing: What to do if `root.text` is `None`? (default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. 
+ * `empty`: print an empty line
+ * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()`
+ * `fatal`: raise an exception
+ """
+ super().__init__(**kwargs)
+ self.title = title
+ self.if_missing = if_missing
+
+ def before_process_document(self, document):
+ super().before_process_document(document)
+ print('<html>\n<head>\n<meta charset="utf-8"/>\n')
+ print('<title>' + self.title + '</title>')
+ print('</head>\n<body>\n<ol>\n')
+
+ def after_process_document(self, document):
+ print("</ol>\n</body>\n</html>\n")
+ super().after_process_document(document)
+
+ def process_tree(self, tree):
+ print('<li id="%s">%s</li>' % (tree.sent_id, tree.get_sentence(self.if_missing)))

From faeecb50d2b93da49be3004a1ea38727f44c5f62 Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Sun, 4 Dec 2022 19:30:44 +0100
Subject: [PATCH 0555/1201] Refined feature tests for Malayalam.

--- udapi/block/ud/ml/markfeatsbugs.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py
index 96cf8b55..2372bd23 100644
--- a/udapi/block/ud/ml/markfeatsbugs.py
+++ b/udapi/block/ud/ml/markfeatsbugs.py
@@ -57,6 +57,15 @@ def process_node(self, node):
 elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur':
 rf.append('Clusivity')
 af['Clusivity'] = ['In', 'Ex']
+ # Interrogative pronouns, too, can be case-marked. Therefore, the
+ # base form must have Case=Nom.
+ # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan)
+ # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional)
+ # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why"
+ # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?)
+ elif node.feats['PronType'] == 'Int':
+ rf.append('Animacy')
+ af['Animacy'] = ['Anim', 'Inan']
 self.check_required_features(node, rf)
 self.check_allowed_features(node, af)
 # DETERMINERS ##########################################################
 elif node.upos == 'DET':
@@ -101,13 +110,18 @@ def process_node(self, node):
 })
 elif node.feats['VerbForm'] == 'Fin':
 if node.feats['Mood'] == 'Imp':
- self.check_required_features(node, ['Mood', 'Voice'])
+ # Unlike other forms, the imperative distinguishes politeness.
+ # The verb stem serves as an informal imperative: തുറ tuṟa "open"
+ # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open"
+ # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open"
+ self.check_required_features(node, ['Mood', 'Voice', 'Polite'])
 self.check_allowed_features(node, {
 'Aspect': ['Imp', 'Perf', 'Prog'],
 'VerbForm': ['Fin'],
 'Mood': ['Imp'],
 'Polarity': ['Pos', 'Neg'],
- 'Voice': ['Act', 'Pass', 'Cau']
+ 'Voice': ['Act', 'Pass', 'Cau'],
+ 'Polite': ['Infm', 'Form']
 })
 else:
 self.check_required_features(node, ['Mood', 'Tense', 'Voice'])

From 9f1c9adadd6b5e53aa9cf5aaea9cd8e26cdfe663 Mon Sep 17 00:00:00 2001
From: "Federica Gamba (PhD"
Date: Tue, 6 Dec 2022 10:24:48 +0100
Subject: [PATCH 0556/1201] Further adjusted Latin feature rules.

--- udapi/block/ud/la/markfeatsbugs.py | 155 ++++++++++++++++++++---------
 1 file changed, 107 insertions(+), 48 deletions(-)

diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py
index 31d112b8..323f60f7 100644
--- a/udapi/block/ud/la/markfeatsbugs.py
+++ b/udapi/block/ud/la/markfeatsbugs.py
@@ -29,7 +29,7 @@ def process_node(self, node):
 af = {}
 # NOUNS ################################################################
 if node.upos == 'NOUN':
- if not node.feats['Abbr'] == 'Yes':
+ if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns
 rf = ['Gender', 'Number', 'Case']
 af = {
 'Gender': ['Masc', 'Fem', 'Neut'],
 'Number': ['Sing', 'Plur'],
 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
 'Degree': ['Dim'],
 'Abbr': ['Yes'],
 'Foreign': ['Yes']}
 if self.flavio:
 # Flavio added InflClass but not everywhere, so it is not required.
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - if not node.feats['Abbr'] == 'Yes': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'NameType': ['Giv', 'Sur', 'Geo'], 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['Proper'] = ['Yes'] + af['Compound'] = 'Yes' + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - if not node.feats['Abbr'] == 'Yes': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] af = { 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'Polarity': ['Neg']} if self.flavio: # Flavio does not use Degree=Pos, hence Degree is not required. - rf = [f for f in rf if f != 'Degree'] + # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind'], + 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - af['Gender'] = ['Masc'] - af['Number'] = ['Sing'] + # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
+ af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] - af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] else: # not reflexive: ego, tu, is, nos rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] @@ -104,22 +114,34 @@ def process_node(self, node): if node.feats['Person'] == '3': # is, id rf.append('Gender') af['Gender'] = ['Masc', 'Fem', 'Neut'] - elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - rf = ['PronType', 'Gender', 'Number', 'Case'] + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'] + } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) af['PronType'] = ['Prs'] @@ -131,11 +153,16 @@ def process_node(self, node): rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # NUMERALS ############################################################# @@ -151,50 +178,59 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. + # Flavio added InflClass but not everywhere, so it is not required. # e.g. 
duodecim
 af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron']
 self.check_required_features(node, rf)
 self.check_allowed_features(node, af)
 # VERBS AND AUXILIARIES ################################################
 elif re.match(r'^(VERB|AUX)$', node.upos):
- rf = ['VerbForm']
+ rf = ['VerbForm', 'Aspect']
 af = {
- 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'],
- 'Polarity': ['Pos', 'Neg']}
+ 'VerbForm': ['Inf', 'Fin', 'Part'],
+ 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'],
+ 'Polarity': ['Neg']
+ }
+ if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']):
+ rf.append('Tense')
+ af['Tense'] = ['Pres', 'Fut']
+ if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum
+ rf.append('Voice')
+ af['Voice'] = ['Act', 'Pass']
 # Main verbs have aspect but auxiliaries don't.
- if node.upos == 'VERB':
- rf.append('Aspect')
- af['Aspect'] = ['Imp', 'Perf', 'Prosp']
- if node.feats['VerbForm'] == 'Fin':
+ # TODO: apparently AUXs have aspect as well
+ # if node.upos == 'VERB':
+ # rf.append('Aspect')
+ # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp']
+ if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive
 rf.extend(['Mood', 'Person', 'Number'])
+ af['Tense'].extend(['Past', 'Pqp'])
 af['Mood'] = ['Ind', 'Sub', 'Imp']
 af['Person'] = ['1', '2', '3']
 af['Number'] = ['Sing', 'Plur']
- if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive
- rf.extend(['Voice', 'Tense'])
- af['Voice'] = ['Act', 'Pass']
- af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut']
 elif node.feats['VerbForm'] == 'Part':
- rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case'])
- af['Tense'] = ['Past']
- af['Voice'] = ['Act', 'Pass']
+ rf.extend(['Gender', 'Number', 'Case'])
 af['Number'] = ['Sing', 'Plur']
 af['Gender'] = ['Masc', 'Fem', 'Neut']
 af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
- af['Degree'] = ['Abs']
- elif node.feats['VerbForm'] == 'Vnoun':
- rf.extend(['Tense', 'Voice'])
- af['Tense'] = ['Past', 'Pres']
- af['Voice'] = ['Act', 'Pass']
+ af['Degree'] = ['Abs', 'Cmp']
 af['Gender'] = ['Masc', 'Fem', 'Neut']
+ af['Tense'].append('Past')
+ # else: nothing to be added for VerbForm=Inf
 if self.flavio:
 # Flavio has killed Tense in his treebanks.
 rf = [f for f in rf if f != 'Tense']
+ af['VerbForm'].append('Vnoun')
 # Flavio added InflClass but not everywhere, so it is not required.
- af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] - if node.feats['VerbForm'] == 'Part': + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] + af['VerbForm'].append('Vnoun') self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -202,20 +238,43 @@ def process_node(self, node): af = { 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Polarity': ['Neg'] } if self.flavio: - af['Compound'] = 'Yes' - af['Form'] = 'Emp' + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card', 'Ord'] # e.g., primum + af['VerbForm'] = ['Part'] + af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': af = { - 'PartType': ['Int'], + 'PartType': ['Int', 'Emp'], 'Polarity': ['Neg'] } if self.flavio: - af['Form'] = 'Emp' + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + if self.flavio: + af = { + 'VerbForm': ['Part'], + 'Proper': ['Yes']} self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 3de5c225d9fc8e1a56922bd13b2feea8f4ca7bf4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 6 Dec 2022 11:21:33 +0100 Subject: [PATCH 0557/1201] Usage: the new parameter merge=1 implemented by Martin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 2 +- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 309e7ac8..30ee90b2 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 31d112b8..74a06a07 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. 
Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2372bd23..b286a27c 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging From 1544c7474cdf91aa2f1c52b3566dedf11a127e5f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 14 Dec 2022 17:17:00 +0100 Subject: [PATCH 0558/1201] update for newer versions of termcolor and colorama --- requirements.txt | 2 +- udapi/block/write/textmodetrees.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index f3f6e007..41539670 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,4 +1,5 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys @@ -344,8 +345,12 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R` + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. + os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): print('%s = %s' % (key, value)) From 9b0d20115a4dfea531519bf54f8fe5326ac77261 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 16 Dec 2022 23:03:43 +0100 Subject: [PATCH 0559/1201] read.Sentences newdoc_if_empty_line=1 --- udapi/block/read/sentences.py | 14 ++++++++++++-- udapi/core/document.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 356e196f..9b428331 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -9,6 +9,8 @@ class Sentences(BaseReader): Args: ignore_empty_lines: if True, delete empty lines from the input. Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. 
Default=False. rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, @@ -16,8 +18,12 @@ class Sentences(BaseReader): As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. """ - def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line self.rstrip = rstrip super().__init__(**kwargs) @@ -38,11 +44,15 @@ def read_tree(self, document=None): # (or '\r\n' if reading a Windows file on a Unix machine). if line == '': return None - if self.ignore_empty_lines: + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: while line in {'\n', '\r\n'}: + preceded_by_empty_line = True line = self.filehandle.readline() if line == '': return None root = Root() root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root diff --git a/udapi/core/document.py b/udapi/core/document.py index dcf146ea..d6a84f0e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -23,7 +23,7 @@ def __init__(self, filename=None, **kwargs): No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. You can pass additional parameters for `udapi.block.read.sentences` - (`ignore_empty_lines` and `rstrip`). + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). """ self.bundles = [] self._highest_bundle_id = 0 From 83989865bf94f3ae9355364f05ac32aef84e8979 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 20 Dec 2022 11:12:47 +0100 Subject: [PATCH 0560/1201] bugfix: logging.warning takes multiple *args to be substituted for %s, not a single argument; see https://docs.python.org/3/library/logging.html#logging.debug. However, using f-strings seems to be less error-prone. --- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index d703fb26..7e59e2f9 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -73,7 +73,7 @@ def parse_comment_line(self, line, root): if entity_match is not None: global_entity = entity_match.group(1) if self._global_entity and self._global_entity != global_entity: - logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity)) + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") self._global_entity = global_entity root.comment += '$GLOBAL.ENTITY\n' return From f93d4c92a64b9aad8bcdf1d2a8045bc6ae554cc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:04:47 +0100 Subject: [PATCH 0561/1201] fix a bug that prevented loading two conllu files into two zones BaseReader calls ``` if self.zone != 'keep': root.zone = self.zone ``` so it assumes that root.sent_id will reflect the new zone. Originally, `root.sent_id` was computed each time on the fly, but after optimization it is cached in `root._sent_id`. 
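A minimal sketch of the stale-cache scenario (an illustration based on the description above, not code from this patch; `Root` and its `_sent_id` slot are real, the values are invented):
```
from udapi.core.root import Root
root = Root()
root._sent_id = '12'  # cached when sent_id was first computed
root.zone = 'en'      # before this fix, the zone setter did not refresh the cache,
                      # so root.sent_id kept returning '12' instead of '12/en'
```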
--- udapi/core/root.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/root.py b/udapi/core/root.py index 0132566a..6a5717a2 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,6 +95,12 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + if self._bundle is not None: + self._sent_id = self._bundle.address() + '/' + zone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + else: + self._sent_id = '?/' + zone @property def parent(self): From 187a2b20139a60c0ca3ad8f08325b3851a695e86 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:08:59 +0100 Subject: [PATCH 0562/1201] util.MarkDiff ignore_parent=1 sometimes we may not be interested in differences in the topology --- udapi/block/util/markdiff.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 3d183f57..6c57ab36 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -9,7 +9,7 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, print_stats=0, **kwargs): + mark=1, add=False, print_stats=0, ignore_parent=False, **kwargs): """Create the Mark block object. Params: gold_zone: Which of the zones should be treated as gold? @@ -20,6 +20,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. print_stats: How many lines of statistics should be printed? -1 means all. 
+        ignore_parent: ignore differences in dependency parents """ super().__init__(**kwargs) self.gold_zone = gold_zone @@ -27,6 +28,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc self.mark = mark self.add = add self.print_stats = print_stats + self.ignore_parent = ignore_parent self.stats = collections.Counter() def process_tree(self, tree): @@ -60,7 +62,7 @@ def process_tree(self, tree): edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: p_node.misc['Mark'] = self.mark g_node.misc['Mark'] = self.mark self.stats['ONLY-PARENT-CHANGED'] += 1 @@ -76,7 +78,7 @@ def process_tree(self, tree): p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) if p_value != g_value: self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: self.stats['PARENT-CHANGED'] += 1 pred_lo, gold_lo = pred_lo + n, gold_lo + n for node in gold_nodes[gold_lo:gold_hi]: From 2ad4922b5f9fe4196c5b67a00f42f45039f83c3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:09:42 +0100 Subject: [PATCH 0563/1201] write.TextModeTreesHtml prints zones side by side by default --- udapi/block/write/textmodetreeshtml.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 9f9f6aa2..7fedc1b8 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. """ - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -82,3 +83,15 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + print("<table>") + for tree in bundle: + if self._should_process_tree(tree): + print("<tr>") + print("<td><pre>") + self.process_tree(tree) + print("</pre></td>") + else: + super().process_bundle(bundle) From a49785d844e85771d499b3431cf8d8c9f3878307 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:28:09 +0100 Subject: [PATCH 0564/1201] empty zone does not need a slash in sent_id --- udapi/core/root.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/core/root.py b/udapi/core/root.py index 6a5717a2..3e6bf62b 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,12 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' if self._bundle is not None: - self._sent_id = self._bundle.address() + '/' + zone + self._sent_id = self._bundle.address() + slashzone elif self._sent_id: - self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone else: - self._sent_id = '?/' + zone + self._sent_id = '?' + slashzone @property def parent(self): From 5a7ccdc00b7466d1a1469fec9b2a0a63efce1880 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 11 Jan 2023 14:43:05 +0100 Subject: [PATCH 0565/1201] Case=Ben allowed in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index b286a27c..47437e2a 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,7 +19,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { 'Foreign': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -97,7 +97,7 @@ def process_node(self, node): 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) # VERBS ################################################################ elif node.upos == 'VERB': From e9fe589322d5f6d03d318862bc93ec9eba26bd85 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 18 Jan 2023 12:49:10 +0100 Subject: [PATCH 0566/1201] Comment: link to the issue where "interleaved" is defined. https://github.com/ufal/corefUD/issues/25 --- udapi/block/corefud/fixinterleaved.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index c5a1b3ed..026b19f3 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -3,7 +3,9 @@ import itertools class FixInterleaved(Block): - """Fix mentions with interleaved or crossing spans.""" + """Fix mentions with interleaved or crossing spans. 
+ https://github.com/ufal/corefUD/issues/25 + """ def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): @@ -58,8 +60,8 @@ def process_tree(self, tree): pass deleted.add(mB) - # By changing the mA.words, we could have create another error: - # making the span same as another mention. Let's fix it + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. sA = set(mA.words) for mC in mentions: if mC in deleted or mC is mA or mC is mB: From 6a9501b6522fca2fe4d38c2fcdf8946170ae69c4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:44:30 +0100 Subject: [PATCH 0567/1201] Updated feature checking for ml. --- udapi/block/ud/ml/markfeatsbugs.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 47437e2a..54119030 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,18 +19,21 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], - 'Foreign': ['Yes']}) + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -74,13 +77,15 @@ def process_node(self, node): self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { 'PronType': ['Art'], - 'Definite': ['Ind'] + 'Definite': ['Ind'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'Deixis': ['Prox', 'Remt'] + 'Deixis': ['Prox', 'Remt'], + 'Typo': ['Yes'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': @@ -89,24 +94,27 @@ def process_node(self, node): if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] + 'NumForm': ['Digit', 'Roman'], + 'Typo': ['Yes'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] }) # VERBS ################################################################ elif node.upos == 'VERB': - self.check_required_features(node, 
['VerbForm', 'Voice']) + self.check_required_features(node, ['VerbForm']) if node.feats['VerbForm'] == 'Inf': self.check_allowed_features(node, { 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': if node.feats['Mood'] == 'Imp': @@ -121,26 +129,39 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], - 'Polite': ['Infm', 'Form'] + 'Polite': ['Infm', 'Form'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['Mood', 'Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Nec'], + 'Mood': ['Ind', 'Pot'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Voice']) + self.check_required_features(node, ['Tense']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Part'], 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: # verbal noun self.check_required_features(node, ['Tense', 'Voice']) @@ -151,6 +172,7 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': @@ -161,7 +183,8 @@ def process_node(self, node): 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] }) else: # indicative or subjunctive self.check_required_features(node, ['Mood', 'Tense']) @@ -171,23 +194,26 @@ def process_node(self, node): 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'] + 'Typo': ['Yes'] }) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] }) else: # The remaining adverbs are neither pronominal, nor compared or # negated. 
- self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] }) # THE REST: NO FEATURES ################################################ else: - self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) From 448bba23b9aa90f8741efcd7565a516a7c84c85b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:45:28 +0100 Subject: [PATCH 0568/1201] bug fix --- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 54119030..4741d2fa 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -193,7 +193,7 @@ def process_node(self, node): 'VerbForm': ['Fin'], 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 7524bd5cdbe88661eb09eb46f88bc3de07f5716e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:59:48 +0100 Subject: [PATCH 0569/1201] Updated feature checking for ml. --- udapi/block/ud/ml/markfeatsbugs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4741d2fa..be084e22 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -25,6 +25,7 @@ def process_node(self, node): # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { + 'VerbForm': ['Part'], 'Foreign': ['Yes'], 'Typo': ['Yes']}) # PRONOUNS ############################################################# @@ -66,9 +67,9 @@ def process_node(self, node): # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) - elif node.feats['PronType'] == 'Int': - rf.append('Animacy') - af['Animacy'] = ['Anim', 'Inan'] + #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## @@ -122,13 +123,12 @@ def process_node(self, node): # The verb stem serves as an informal imperative: തുറ tuṟa "open" # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" - self.check_required_features(node, ['Mood', 'Voice', 'Polite']) + self.check_required_features(node, ['Mood', 'Polite']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'], 'Polite': ['Infm', 'Form'], 'Typo': ['Yes'] }) @@ -164,7 +164,9 @@ def process_node(self, node): 'Typo': ['Yes'] }) else: # verbal noun - self.check_required_features(node, ['Tense', 'Voice']) + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. 
+ # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Vnoun'], From 0c0e0a257896741295c27661397e5d263aa8d1dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:12:49 +0100 Subject: [PATCH 0570/1201] AUX allows Vnoun. --- udapi/block/ud/ml/markfeatsbugs.py | 45 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index be084e22..4f17c45f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -174,28 +174,45 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': self.check_required_features(node, ['VerbForm']) - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'], - 'Typo': ['Yes'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Tense']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 94e7e85033515b101873c58e16a97dcd7b465dd9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:15:41 +0100 Subject: [PATCH 0571/1201] Foreign VERB --- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4f17c45f..2cb4f791 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -115,6 +115,7 @@ def process_node(self, node): 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': @@ -130,6 +131,7 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Polite': ['Infm', 'Form'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['Mood'] == 'Nec': @@ -140,6 +142,7 @@ def process_node(self, node): 'Mood': ['Nec'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: @@ -151,6 +154,7 @@ def process_node(self, node): 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': @@ -161,6 +165,7 @@ def process_node(self, node): 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: # verbal noun @@ -176,6 +181,7 @@ def process_node(self, node): 'Voice': ['Act', 'Pass', 'Cau'], # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## From e79bd16052f39cad08782315887df7849177ce3d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 22:46:50 +0100 Subject: [PATCH 0572/1201] Conditional in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2cb4f791..75552c36 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -32,7 +32,7 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] } @@ -150,7 +150,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Pot'], + 'Mood': ['Ind', 'Pot', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], From 337e7f6d159cf68bacb88529ea843c6c8b67a18d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 00:05:13 +0100 Subject: [PATCH 0573/1201] Conditional in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 75552c36..5ca2b4fb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -202,7 +202,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], + 'Mood': ['Ind', 'Sub', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] }) From b6600ea65e001d76ffbec656382384d60511d76c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 00:07:40 +0100 Subject: [PATCH 0574/1201] Don't print empty tables if no trees will be printed in a given bundle Fixes #110 --- udapi/block/write/textmodetreeshtml.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 7fedc1b8..5ccceb78 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -86,12 +86,22 @@ def print_headers(self, root): def process_bundle(self, bundle): if self.zones_in_rows: - print("<table>") + # Don't print <table> if no tree will be printed in this bundle. + marked_trees = [] + for tree in bundle: + if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + print("<table><tr>") + for tree in marked_trees: print("<td><pre>") self.process_tree(tree) print("</pre></td>") + print("</tr></table>") else: super().process_bundle(bundle) From b8b68bf6474751dbf5ec7205ea40936c19c5aa73 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 09:56:25 +0100 Subject: [PATCH 0575/1201] Do not check foreign words for Malayalam features. --- udapi/block/ud/ml/markfeatsbugs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 5ca2b4fb..12e2ef0f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -13,8 +13,17 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. + if node.feats['Foreign'] == 'Yes': + pass # NOUNS AND PROPER NOUNS ############################################### - if re.match(r'^(NOUN|PROPN)$', node.upos): + elif re.match(r'^(NOUN|PROPN)$', node.upos): self.check_required_features(node, ['Animacy', 'Number', 'Case']) self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], From 1335522492d7c6cc528ab576dfb3142d4aac67e3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 21:59:37 +0100 Subject: [PATCH 0576/1201] improve definition of almost_forest in PrintMentions --- udapi/block/corefud/printmentions.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 7ed31b0d..12db433a 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -10,7 +10,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, - print_total=True, + print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i random.seed(42) self.print_other_forms = print_other_forms self.print_total = print_total, + self.print_should = print_should, print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -61,7 +62,9 @@ def _ok(self, condition, value): return (condition and value == 'only') or (not condition and value=='exclude') def _is_auxiliary_etc(self, node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}: + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': return True if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True @@ -79,8 +82,25 @@ def _is_forest(self, 
mention, mwords, almost): for ch in w.children: if ch not in mwords: if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). + # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False return True From 0178372e381accb9c28795bcfff5f21366e48520 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 Jan 2023 22:49:12 +0100 Subject: [PATCH 0577/1201] Malayalam adpositions can have the Case feature. --- udapi/block/ud/ml/markfeatsbugs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 12e2ef0f..c2a8e0f4 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -242,6 +242,13 @@ def process_node(self, node): # The remaining adverbs are neither pronominal, nor compared or # negated. self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { From c3da386bf36609774e34464899a048700631b4b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 Jan 2023 11:08:43 +0100 Subject: [PATCH 0578/1201] ud.SetTranslation (e.g. lines from Google Translate) --- udapi/block/ud/settranslation.py | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 udapi/block/ud/settranslation.py diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py new file mode 100644 index 00000000..487cca06 --- /dev/null +++ b/udapi/block/ud/settranslation.py @@ -0,0 +1,59 @@ +""" +Block SetTranslation for setting of sentence-level translation (the attribute +text_en for English translation) from a separate text file (one sentence per +line). For example, one can export the original sentences using write.SentencesHtml, +then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain +text editor, save them as translations.txt and import them using this block. 
+ +Usage: +udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class SetTranslation(Block): + """ + Set text_en to the next available translation. + """ + + def __init__(self, file, overwrite=False, **kwargs): + """ + Create the SetTranslation block. + + Parameters: + file: the name of the text file with the translations (one sentence per line) + overwrite=1: set the translation even if the sentence already has one + (default: do not overwrite existing translations) + """ + super().__init__(**kwargs) + self.file = file + fh = open(self.file, 'r', encoding='utf-8') + self.trlines = fh.readlines() + self.nlines = len(self.trlines) + self.iline = 0 + self.overwrite = overwrite + + def process_tree(self, tree): + if self.iline < self.nlines: + translation = self.trlines[self.iline] + self.iline += 1 + comments = [] + if tree.comment: + comments = tree.comment.split('\n') + i_tr = -1 + for i in range(len(comments)): + # The initial '#' character has been stripped. + if re.match(r'\s*text_en\s*=', comments[i]): + i_tr = i + break + if i_tr >= 0: + if self.overwrite: + comments[i_tr] = ' text_en = ' + translation + else: + comments.append(' text_en = ' + translation) + tree.comment = '\n'.join(comments) + elif self.iline == self.nlines: + logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines) From a75ab8d8bd9754b776911c41977fbcacdcf3b521 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 00:52:29 +0100 Subject: [PATCH 0579/1201] first draft of a coreference-visualization writer --- udapi/block/write/corefhtml.py | 123 +++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 udapi/block/write/corefhtml.py diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..fc49dfb4 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,123 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +class CorefHtml(BaseWriter): + + def __init__(self, path_to_js='web', **kwargs): + super().__init__(**kwargs) + self.path_to_js = path_to_js + + def process_document(self, doc): + print('') + print('Udapi CorefUD viewer') + print('') + #print('') #$(window).on("load", function() {...} + #print('') + print('') + print('\n') + + for tree in doc.trees: + self.process_tree(tree) + + print('') + print('') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + sent_mentions = [] + for mention in mentions: + mspan = mention.span + if ',' not in mspan: + sent_mentions.append(mention) + else: + entity = mention.entity + head_str = str(mention.words.index(mention.head) + 1) + subspans = mspan.split(',') + for idx,subspan in enumerate(subspans, 1): + subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' + subspan_words = span_to_nodes(tree, subspan) + fake_entity = CorefEntity(subspan_eid, entity.etype) + fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) + if mention._other: + fake_mention._other = mention._other + if mention._bridging and 
idx == 1: + fake_mention._bridging = mention._bridging + sent_mentions.append(fake_mention) + sent_mentions.sort(reverse=True) + + opened = [] + print('<p>') + for node in nodes_and_empty: + while sent_mentions and sent_mentions[-1].words[0] == node: + m = sent_mentions.pop() + e = m.entity + classes = f'{e.eid} {e.etype or "other"}' + if all(w.is_empty() for w in m.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' + print(f'<span class="{classes}" title="{title}">', end='') + opened.append(m) + + is_head = self._is_head(node) + if is_head: + print('<b>', end='') + if node.is_empty(): + print('<i>', end='') + print(node.form, end='') + if node.is_empty(): + print('</i>', end='') + if is_head: + print('</b>', end='') + + while opened and opened[-1].words[-1] == node: + print('</span>', end='') + opened.pop() + + if not node.no_space_after: + print(' ', end='') + + print('</p>') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + +# id needs to be a valid DOM querySelector +# so it cannot contain # nor / and it cannot start with a digit +def _id(node): + if node is None: + return 'null' + return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') From e3ae1c3f65f62e64a72da39d10a1e51c2dbbf9a6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 13:25:49 +0100 Subject: [PATCH 0580/1201] fix visualization of discontinuous mentions introduce CorefMentionSubspan instead of fake mentions (should be used also in store_coref_to_misc() in future) --- udapi/block/write/corefhtml.py | 40 +++++++++++----------------- udapi/core/coref.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index fc49dfb4..890b172a 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -44,44 +44,30 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) - sent_mentions = [] + subspans = [] for mention in mentions: - mspan = mention.span - if ',' not in mspan: - sent_mentions.append(mention) - else: - entity = mention.entity - head_str = str(mention.words.index(mention.head) + 1) - subspans = mspan.split(',') - for idx,subspan in enumerate(subspans, 1): - subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' - subspan_words = span_to_nodes(tree, subspan) - fake_entity = CorefEntity(subspan_eid, entity.etype) - fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) - if mention._other: - fake_mention._other = mention._other - if mention._bridging and idx == 1: - fake_mention._bridging = mention._bridging - sent_mentions.append(fake_mention) - sent_mentions.sort(reverse=True) + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) opened = [] print('<p>') for node in nodes_and_empty: - while sent_mentions and sent_mentions[-1].words[0] == node: - m = sent_mentions.pop() + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + m = subspan.mention e = m.entity - classes = f'{e.eid} {e.etype or "other"}' + classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' - if all(w.is_empty() for w in m.words): + if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: classes += ' singleton' title += f'\n{m.other}' print(f'<span class="{classes}" title="{title}">', end='') #data-eid="{e.eid}" + opened.append(subspan) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..1a6d1f95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precede (word-order wise) `another` mention? @@ -247,6 +258,32 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + assert False + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -886,7 +923,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. """ if not nodes: From b78ef7eea0b76c4f41f8408d918092681d9c5fad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:16:46 +0100 Subject: [PATCH 0581/1201] util.Normalize: sort attributes in FEATS and MISC --- udapi/block/util/normalize.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 udapi/block/util/normalize.py diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..5b4270cc --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,40 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block class Normalize(Block): """Normalize the ordering of attributes in the FEATS and MISC columns. 
+ + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + # TODO: normalize also standardized comments like text, sent_id,... + + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None From 90f338de077467acb4cb9ebebce68179419a0d77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:29:30 +0100 Subject: [PATCH 0582/1201] allow writing to node.sdeprel, add tests --- udapi/core/node.py | 8 ++++++++ udapi/core/tests/test_node.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..e188e134 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..8bc7f182 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -119,7 +119,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +145,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting and setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. From 5817af214df034e42cf09ef2c08f0c8d15b3a0d9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 16:31:50 +0100 Subject: [PATCH 0583/1201] write.CorefHtml marks subspans of discontinuous mentions with a red border --- udapi/block/write/corefhtml.py | 26 +++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/udapi/block/write/corefhtml.py 
From 355e7bdc32ab854827aff1f7277b069f5c5a8bc0 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 30 Jan 2023 17:57:48 +0100
Subject: [PATCH 0584/1201] write.CorefHtml also shows crossing mentions using
 valid (well-nested) html

---
 udapi/block/write/corefhtml.py | 56 +++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py
index e0ab830b..3efe9793 100644
--- a/udapi/block/write/corefhtml.py
+++ b/udapi/block/write/corefhtml.py
@@ -18,7 +18,8 @@ def process_document(self, doc):
         #print('')
         print('')
@@ -35,15 +74,37 @@ def process_document(self, doc):
         for tree in doc.trees:
             self.process_tree(tree, mention_ids)
-        print('')
+        print('')
         print('')

     def _start_subspan(self, subspan, mention_ids, crossing=False):
@@ -74,8 +135,10 @@ def process_tree(self, tree, mention_ids):
         subspans.extend(mention._subspans())
         subspans.sort(reverse=True)

+        if tree.newpar:
+            print('


    ') opened = [] - print('

    ') + print(f'

    ') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() From 9e11bd515e19fa59c0bdbc50654d29544b13a21b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Feb 2023 18:03:19 +0100 Subject: [PATCH 0591/1201] util.Normalize now normalizes also sent_id --- udapi/block/util/normalize.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 5b4270cc..298bea42 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,16 +20,33 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers + `start_sent_id`: the first sent_id number """ super().__init__(**kwargs) self.feats = feats self.misc = misc - # TODO: normalize also standardized comments like text, sent_id,... + self.sent_id = sent_id + self.next_sent_id = start_sent_id + # TODO: normalize also the order of standardized comments like text, sent_id,... + + def process_bundle(self, bundle): + if self.sent_id: + bundle.bundle_id = str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + for node in tree.descendants: + self.process_node(node) def process_node(self, node): if self.feats: From 4e1b75678dab1f2602cc26b641a31de977a98f14 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 00:47:37 +0100 Subject: [PATCH 0592/1201] sent_id should not be normalized by default Unlike feats and misc ordering, we can lose information this way - the original sent_id, so it is potentially dangerous. --- udapi/block/util/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 298bea42..48cd6dc1 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,12 +20,12 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. - `sent_id`: normalize sent_id so it forms a sequence of integers + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
`start_sent_id`: the first sent_id number """ super().__init__(**kwargs) From b899af14c12c7ba4c9750ba39bf5f5544783ba59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 09:53:59 +0100 Subject: [PATCH 0593/1201] write.Conllu path=another/directory keeps the file name, but changes the directory --- udapi/core/basewriter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..93f6463a 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,7 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os import udapi.core.coref from udapi.core.block import Block @@ -11,7 +12,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +30,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + self.path = path @property def filename(self): @@ -60,9 +62,11 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + docname = os.path.join(self.path, os.path.split(docname)[1]) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: From 9d183c1d979c50fabff9b3a295a0d8194a09c790 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 10:14:59 +0100 Subject: [PATCH 0594/1201] etype mismatch is stored in mention.other["orig_etype"] which allows easier debugging --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1a13d9fb..12dda239 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -665,6 +665,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype From 5b3ed0268ccf76f5332fcce87ac0da9a42b221b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:19:33 +0100 Subject: [PATCH 0595/1201] allow using e.g. 
 write.CorefHtml path='html/*.html'
---
 udapi/core/basewriter.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py
index 93f6463a..e17a64c3 100644
--- a/udapi/core/basewriter.py
+++ b/udapi/core/basewriter.py
@@ -66,11 +66,21 @@ def before_process_document(self, document):
             docname = document.meta.get('loaded_from', None)
             if docname is not None:
                 if self.path:
-                    docname = os.path.join(self.path, os.path.split(docname)[1])
+                    old_dir, old_filename = os.path.split(docname)
+                    new_dir, new_filename = os.path.split(self.path)
+                    old_file, old_ext = os.path.splitext(old_filename)
+                    new_file, new_ext = os.path.splitext(new_filename)
+                    if new_dir in ('', '*'):
+                        new_dir = old_dir
+                    if new_file in ('', '*'):
+                        new_file = old_file
+                    if new_ext in ('', '*'):
+                        new_ext = old_ext
+                    docname = os.path.join(new_dir, new_file + new_ext)
                 logging.info('Writing to file %s.', docname)
                 sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline)
             else:
-                logging.warning('overwrite=1 but document.meta["loaded_from"] is None')
+                logging.warning('using overwrite or path but document.meta["loaded_from"] is None')
         else:
             sys.stdout = self.orig_stdout
     else:

From 34aa19d7d892790b81b2b79579fc4391c07a23ed Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Thu, 2 Feb 2023 14:42:30 +0100
Subject: [PATCH 0596/1201] write.Conllu path=my_dir should be interpreted as
 path=my_dir/

---
 udapi/core/basewriter.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py
index e17a64c3..6e1b7446 100644
--- a/udapi/core/basewriter.py
+++ b/udapi/core/basewriter.py
@@ -30,6 +30,9 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='
             raise ValueError("overwrite=1 is not compatible with files=" + files)
         if overwrite and docname_as_file:
             raise ValueError("overwrite=1 is not compatible with docname_as_file=1")
+        # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/
+        if path and path[-1] != os.sep and '*' not in path:
+            path += os.sep
         self.path = path

     @property
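An editorial sketch of the path logic from the two commits above, rewritten as a
standalone function for illustration only (the real code lives in
BaseWriter.before_process_document and reads document.meta['loaded_from']):

    import os

    def combine(loaded_from, path):
        """Replace the directory/stem/extension of loaded_from by those in path;
        empty or '*' components keep the original value."""
        old_dir, old_filename = os.path.split(loaded_from)
        new_dir, new_filename = os.path.split(path)
        old_file, old_ext = os.path.splitext(old_filename)
        new_file, new_ext = os.path.splitext(new_filename)
        new_dir = old_dir if new_dir in ('', '*') else new_dir
        new_file = old_file if new_file in ('', '*') else new_file
        new_ext = old_ext if new_ext in ('', '*') else new_ext
        return os.path.join(new_dir, new_file + new_ext)

    print(combine('data/train.conllu', 'html/*.html'))  # html/train.html
    print(combine('data/train.conllu', 'out/'))         # out/train.conllu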
From 301b808082254a9b45a2bd4cfe162719dc02bc23 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 4 Feb 2023 01:36:25 +0100
Subject: [PATCH 0597/1201] corefud.GuessSpan: add empty nodes that are causing
 gaps

---
 udapi/block/corefud/guessspan.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py
index 5c3c6c12..d6093ece 100644
--- a/udapi/block/corefud/guessspan.py
+++ b/udapi/block/corefud/guessspan.py
@@ -4,6 +4,30 @@ class GuessSpan(Block):
     """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head"""

     def process_coref_mention(self, mention):
-        mention.words = mention.head.descendants(add_self=True)
-        # TODO add empty nodes that are causing gaps
+        mwords = mention.head.descendants(add_self=True)
         # TODO add heuristics from corefud.PrintMentions almost_forest=1
+
+        # Add empty nodes that are causing gaps.
+        # A node "within the span" whose enhanced parent is in the mention
+        # must be added to the mention as well.
+        # "within the span" includes also empty nodes "on the boundary".
+        # However, don't add empty nodes which are in a gap caused by non-empty nodes.
+        to_add = []
+        min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1
+        max_ord = int(mwords[-1].ord) + 1
+        root = mention.head.root
+        for empty in root.empty_nodes:
+            if empty in mwords:
+                continue
+            if empty.ord > max_ord:
+                break
+            if empty.ord > min_ord:
+                if any(enh['parent'] in mwords for enh in empty.deps):
+                    to_add.append(empty)
+            elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1:
+                prev_nonempty = root.descendants[int(empty.ord) - 1]
+                next_nonempty = root.descendants[int(empty.ord)]
+                if prev_nonempty in mwords and next_nonempty in mwords:
+                    to_add.append(empty)
+            #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}'
+        mention.words = sorted(mwords + to_add)

From 2285d27f5e9444d3db7a8a0b8db227b38e5c082b Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sun, 5 Feb 2023 01:06:32 +0100
Subject: [PATCH 0598/1201] write.CorefHtml: distinguish entities using colors,
 show eid and docname

---
 udapi/block/write/corefhtml.py | 41 +++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py
index 8503854f..0a06b7e5 100644
--- a/udapi/block/write/corefhtml.py
+++ b/udapi/block/write/corefhtml.py
@@ -1,19 +1,21 @@
 """CorefHtml class is a writer for HTML+JavaScript visualization of coreference."""
 from udapi.core.basewriter import BaseWriter
 from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention
+from collections import Counter
 import udapi.block.write.html

 ETYPES = 'person place organization animal plant object substance time number abstract event'.split()

 CSS = '''
 .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;}
+.sentence span .eid {display:block; font-size: 10px;}
 .showtree {float:left; margin: 5px;}
 .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px}
 .empty {color: gray;}
-.singleton {border-style: dotted;}
+.sentence .singleton {border-style: dotted;}
 .crossing:before {content: "!"; display: block; background: #ffd500;}
 .active {border: 1px solid red !important;}
-.selected {background: red !important;}
+.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;}
 .other {background: hsl(0, 0%, 85%);}
 '''

@@ -50,9 +52,11 @@ class CorefHtml(BaseWriter):

-    def __init__(self, show_trees=True, **kwargs):
+    def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs):
         super().__init__(**kwargs)
         self.show_trees = show_trees
+        self.show_eid = show_eid
+        self.colors = colors

     def process_document(self, doc):
         print('')
@@ -63,16 +67,25 @@ def process_document(self, doc):
         print('')
         print('\n')
         mention_ids = {}
+        entity_colors = {}
+        entities_of_type = Counter()
         for entity in doc.coref_entities:
+            if self.colors:
+                count = entities_of_type[entity.etype]
+                entities_of_type[entity.etype] = count + 1
+                entity_colors[entity] = f'c{count % self.colors}'
             for idx, mention in enumerate(entity.mentions, 1):
                 mention_ids[mention] = f'{entity.eid}e{idx}'

         for tree in doc.trees:
-            self.process_tree(tree, mention_ids)
+            self.process_tree(tree, mention_ids, entity_colors)
         print('')
         print('')

-    def _start_subspan(self, subspan, mention_ids, crossing=False):
+    def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False):
         m = subspan.mention
         e = m.entity
         classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}'
-        title =
f'eid={subspan.subspan_eid}\ntype={e.etype}\nhead={m.head.form}' + title = f'eid={subspan.subspan_eid}\ntype={e.etype} ({entity_colors[e]})\nhead={m.head.form}' + if self.colors: + classes += f' {entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -121,9 +136,11 @@ def _start_subspan(self, subspan, mention_ids, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') #data-eid="{e.eid}" + print(f'', end='') + if self.show_eid: + print(f'{subspan.subspan_eid}', end='') - def process_tree(self, tree, mention_ids): + def process_tree(self, tree, mention_ids, entity_colors): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -135,14 +152,16 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) - if tree.newpar: + if tree.newdoc: + print(f'


    {tree.newdoc if tree.newdoc is not True else ""}


    ') + elif tree.newpar: print('
    ') opened = [] print(f'

    ')
         for node in nodes_and_empty:
             while subspans and subspans[-1].words[0] == node:
                 subspan = subspans.pop()
-                self._start_subspan(subspan, mention_ids)
+                self._start_subspan(subspan, mention_ids, entity_colors)
                 opened.append(subspan)

             is_head = self._is_head(node)
@@ -180,7 +199,7 @@ def process_tree(self, tree, mention_ids):
             opened = new_opened
             print('' * (len(endings) + len(brokens)), end='')
             for broken in brokens:
-                self._start_subspan(broken, mention_ids, True)
+                self._start_subspan(broken, mention_ids, entity_colors, True)
                 opened.append(subspan)

             if not node.no_space_after:

From cae7c37efe8548c2e432b108e4aa06df3b778e3a Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 6 Feb 2023 15:07:42 +0100
Subject: [PATCH 0599/1201] `read.Conllu max_docs=3` will load only the first
 three documents

This is useful for debugging coreference files, where we cannot load just
the first N sentences because there may be Bridge/SplitAnte annotations
referring to an unknown eid. This way we load whole documents.
---
 udapi/block/read/conllu.py | 22 ++++++++++++++++++++--
 udapi/core/basereader.py   | 31 ++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index bba69696..d5623fba 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -81,8 +81,26 @@ def parse_comment_line(self, line, root):
         root.comment += line[1:] + "\n"

     def read_trees(self):
-        return [self.read_tree_from_lines(s.split('\n')) for s in
-                self.filehandle.read().split('\n\n') if s]
+        if not self.max_docs:
+            return [self.read_tree_from_lines(s.split('\n')) for s in
+                    self.filehandle.read().split('\n\n') if s]
+        # udapi.core.basereader takes care of the max_docs parameter.
+        # However, we can make the loading much faster by not reading
+        # the whole file if the user wants just the first N documents.
+        trees, lines, loaded_docs = [], [], 0
+        for line in self.filehandle:
+            line = line.rstrip()
+            if line == '':
+                tree = self.read_tree_from_lines(lines)
+                lines = []
+                if tree.newdoc:
+                    if loaded_docs == self.max_docs:
+                        return trees
+                    loaded_docs += 1
+                trees.append(tree)
+            else:
+                lines.append(line)
+        return trees

     def read_tree(self):
         if self.filehandle is None:
diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py
index a3b334da..a841bf1b 100644
--- a/udapi/core/basereader.py
+++ b/udapi/core/basereader.py
@@ -13,7 +13,8 @@ class BaseReader(Block):
     # pylint: disable=too-many-arguments
     def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig',
-                 sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs):
+                 sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False,
+                 max_docs=0, **kwargs):
         super().__init__(**kwargs)
         if filehandle is not None:
             files = None
@@ -29,6 +30,8 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e
         self.split_docs = split_docs
         self.ignore_sent_id = ignore_sent_id
         self.merge = merge
+        self.max_docs = max_docs
+        self._docs_loaded = 0

     # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file.
    # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader.
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -126,6 +129,11 @@ def try_fast_load(self, document): bundle, last_bundle_id = None, '' for root in trees: + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 add_to_the_last_bundle = False if self.ignore_sent_id: @@ -180,8 +188,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -204,6 +214,17 @@ def process_document(self, document): if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -222,6 +243,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -231,6 +255,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: From ae34d8024d8ee95db6e1bf39581e44fc08bcbc73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:25:50 +0100 Subject: [PATCH 0600/1201] refactor code duplication --- udapi/block/write/corefhtml.py | 29 +++-------------------------- udapi/block/write/html.py | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 0a06b7e5..c7950ce9 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -50,6 +50,8 @@ }); ''' +WRITE_HTML = udapi.block.write.html.Html() + class CorefHtml(BaseWriter): def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): @@ -90,32 +92,7 @@ def process_document(self, doc): print('') print('') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..48431900 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,26 @@ def process_document(self, doc): print('\n') print('

    ')
+
+    def print_doc_json(self, doc):
         print('data=[')
         for (bundle_number, bundle) in enumerate(doc, 1):
-            # TODO: if not self._should_process_bundle(bundle): continue
             if bundle_number != 1:
                 print(',', end='')
             print('{"zones":{', end='')
             first_zone = True
             desc = ''
             for tree in bundle.trees:
-                # TODO: if not self._should_process_tree(tree): continue
                 zone = tree.zone
                 if first_zone:
                     first_zone = False
@@ -101,24 +111,16 @@ def process_document(self, doc):
                 print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address()))
                 desc += ',["[%s]","label"],[" ","space"]' % zone
                 for node in tree.descendants:
-                    desc += self.print_node(node)
+                    desc += self.print_node_json(node)
                 desc += r',["\n","newline"]'
             print(']}}}')
             # print desc without the extra starting comma
             print('},"desc":[%s]}' % desc[1:])
         print('];')
-        print("$('#treex-view').treexView(data);")
-        print('''function saveTree() {
-            var svg_el = jQuery('svg');
-            if (svg_el.length) {
-                var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"});
-                saveAs(svg, 'tree.svg');
-            }
-        }''')
-        print('')
+
     @staticmethod
-    def print_node(node):
+    def print_node_json(node):
         """JSON representation of a given node."""
         # pylint does not understand `.format(**locals())` and falsely alarms for unused vars
         # pylint: disable=too-many-locals,unused-variable

From ca4d2b7f8240a0faca55f9aad6513d9a94968a08 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Mon, 6 Feb 2023 19:53:25 +0100
Subject: [PATCH 0601/1201] write.CorefHtml: add side panel with an overview of
 entities

---
 udapi/block/write/corefhtml.py | 62 ++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py
index c7950ce9..280fc213 100644
--- a/udapi/block/write/corefhtml.py
+++ b/udapi/block/write/corefhtml.py
@@ -6,7 +6,25 @@
 ETYPES = 'person place organization animal plant object substance time number abstract event'.split()

+HEADER = '''
+
+Udapi CorefUD viewer
+
+'''
+# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;}
+# so that the width of #overview can be changed by dragging the bottom right corner.
+# The following lines would make the whole right border draggable:
+#
+#
+#
    CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} .sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} @@ -23,10 +41,16 @@ $("span").click(function(e) { let was_selected = $(this).hasClass("selected"); $("span").removeClass("selected"); - if (!was_selected){$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); +window.onhashchange = function() { + $("span").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + $("span").hover( function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} @@ -60,10 +84,18 @@ def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): self.show_eid = show_eid self.colors = colors + def _representative_word(self, entity): + # return the first PROPN or NOUN. Or the most frequent one? + heads = [m.head for m in entity.mentions] + lemma_or_form = lambda n: n.lemma if n.lemma else n.form + for upos in ('PROPN', 'NOUN'): + nodes = [n for n in heads if n.upos == upos] + if nodes: + return lemma_or_form(nodes[0]) + return lemma_or_form(heads[0]) + def process_document(self, doc): - print('') - print('Udapi CorefUD viewer') - print('') + print(HEADER) if self.show_trees: print('') print('') - print('\n') + print('\n\n
    ') mention_ids = {} entity_colors = {} @@ -86,8 +118,21 @@ def process_document(self, doc): for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' + print('
    ') + print('' + '' + '\n') + for entity in doc.coref_entities: + print(f'' + f'' + f'') + print('
    eid#mword
    {entity.eid}{len(entity.mentions)}{self._representative_word(entity)}
    ') + print('
    ') + + print('
    ') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) + print('
    ') print('') - print('') + print('
    ') def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention @@ -113,7 +158,10 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{e.eid}" ' + print(f'', end='') if self.show_eid: print(f'{subspan.subspan_eid}', end='') From bbd702aa35fcf4e13d2a4ab2d3972a7efd89fcc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 16:22:03 +0100 Subject: [PATCH 0602/1201] Python glob.glob does not support {dir1,dir2} anyway --- udapi/core/files.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..c6973dad 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) From a5acaf43b1edb3468dfc493da6e7ae87f2d99966 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 17:58:45 +0100 Subject: [PATCH 0603/1201] ud.ComplyWithText: use node.misc['CorrectForm'] instead of node.misc['OrigForm'] which was a misleading name because the previous form value is usually not the real original form. --- udapi/block/ud/complywithtext.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..bacc56a2 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -34,7 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,24 +54,33 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + previous_form_attr - when changing node.form, we store the previous value + in node.misc[previous_form_attr] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + node.misc[self.previous_form_attr] = node.form def process_tree(self, root): text = root.text @@ -203,7 +212,7 @@ def solve_diff(self, nodes, form): if ' ' in form: if len(nodes) == 1 and node.form == form.replace(' ', ''): if self.allow_space(form): - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form elif self.allow_goeswith: forms = form.split() @@ -235,7 +244,7 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form From a69c7a158edb91d12d2907f6802c3104d946ee0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 18:00:46 +0100 Subject: [PATCH 0604/1201] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U so even if there are diffs which cannot be resolved, and thus we cannot fill SpaceAfter=No in the rest of the sentence, we must execute the "if self.fix_text:..." code, which changes the root.text (instead of changing the annotation of nodes). --- udapi/block/ud/complywithtext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index bacc56a2..1a13a4ec 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -121,7 +121,7 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. 
if self.fix_text: From fde163c32837ccc02a9b89d535be9769d4414340 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Wed, 8 Feb 2023 14:23:05 +0100 Subject: [PATCH 0605/1201] further adjusted Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 122 ++++++++++++++++++----------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 323f60f7..111bceb9 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -37,11 +37,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Dim'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Compound'] = ['Yes'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -76,14 +76,12 @@ def process_node(self, node): 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} if self.flavio: - # Flavio does not use Degree=Pos, hence Degree is not required. - # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -93,15 +91,16 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] @@ -122,6 +121,19 @@ def process_node(self, node): rf = [f for f in rf if f != 'Case'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + elif node.lemma in ['quicumque', 'qui', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] @@ -140,7 +152,9 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Cmp', 'Abs', 'Sup'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) @@ -152,8 +166,24 @@ def process_node(self, node): if node.feats['Person[psor]'] != '3': rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] - else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + af['PronType'].append('Rel') + elif node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] @@ -170,8 +200,8 @@ def process_node(self, node): rf = ['NumType', 'NumForm'] af = { 'NumType': ['Card'], - 'NumForm': ['Word', 'Roman', 'Digit'] - } + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. 
if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -186,40 +216,40 @@ def process_node(self, node): elif re.match(r'^(VERB|AUX)$', node.upos): rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] } - if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') - af['Tense'] = ['Pres', 'Fut'] - if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB': rf.append('Voice') af['Voice'] = ['Act', 'Pass'] - # Main verbs have aspect but auxiliaries don't. - # TODO: apparently, apparently AUXs have aspect as well - # if node.upos == 'VERB': - # rf.append('Aspect') - # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) - af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] elif node.feats['VerbForm'] == 'Part': rf.extend(['Gender', 'Number', 'Case']) - af['Number'] = ['Sing', 'Plur'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] af['Degree'] = ['Abs', 'Cmp'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] - af['Tense'].append('Past') - # else: nothing to be added for VerbForm=Inf + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') if self.flavio: - # Flavio has killed Tense in his treebanks. - rf = [f for f in rf if f != 'Tense'] - af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] if 'Degree' in af: @@ -228,23 +258,22 @@ def process_node(self, node): af['Degree'] = ['Dim'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] - if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] - af['VerbForm'].append('Vnoun') + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': af = { - 'AdvType': ['Loc', 'Tim'], + 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['NumType'] = ['Card', 'Ord'] # e.g., primum af['VerbForm'] = ['Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) @@ -262,7 +291,8 @@ def process_node(self, node): elif re.match(r'^[CS]CONJ$', node.upos): af = { 'PronType': ['Rel', 'Con'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'Compound': ['Yes']} if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] @@ -271,10 +301,14 @@ def process_node(self, node): self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } if self.flavio: - af = { - 'VerbForm': ['Part'], - 'Proper': ['Yes']} + af['VerbForm'] = ['Part'], + af['Proper'] = ['Yes'] self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 29fb09caccd678560845ea3d80b2027145231c90 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:04:56 +0100 Subject: [PATCH 0606/1201] improve ud.ComplyWithText for KorKor --- udapi/block/ud/complywithtext.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 1a13a4ec..02904731 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,6 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -54,6 +55,14 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. 
node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). previous_form_attr - when changing node.form, we store the previous value in node.misc[previous_form_attr] (so no information is lost). Default="CorrectForm" because we expect that the previous value @@ -62,6 +71,7 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ the original spelling with typos as found in the raw text. CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. """ super().__init__(**kwargs) self.fix_text = fix_text @@ -70,17 +80,20 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.max_mwt_length = max_mwt_length self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``"): + if node.form not in ("''", "``") and self.previous_form_attr: node.misc[self.previous_form_attr] = node.form + if self.previous_form_attr == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text @@ -190,18 +203,38 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc['Added'] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -210,20 +243,25 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1 and node_form == form.replace(' ', ''): if self.allow_space(form): self.store_previous_form(node) node.form = form elif self.allow_goeswith: + self.store_previous_form(node) forms = form.split() node.form = forms[0] + node.feats['Typo'] = 'Yes' for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + new = node.create_child(form=split_form, deprel='goeswith', upos='X') new.shift_after_node(node) else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -244,8 +282,13 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_previous_form(node) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc['Added'] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): From d5a1a2a756ef13629984eb40af7b5853dbd8c7a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:06:45 +0100 Subject: [PATCH 0607/1201] udapy hints when using a wrong block name or parameter name thanks to @michnov for this idea --- udapi/core/block.py | 23 +++++++++++++++++++---- udapi/core/run.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..fdcad9fa 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -73,7 +88,7 @@ def process_document(self, document): p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -85,8 +100,8 @@ def process_document(self, document): if p_bundle or p_tree or p_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..418baca6 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,26 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bname = [c for c in dir(module) if c.lower() == sname][0] + blocks.append(f"{pname}.{bname}") + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. @@ -92,8 +112,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + blocks = _blocks_in_a_package(package_name) + if not blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available block in {package_name} are:\n" + + "\n".join(_blocks_in_a_package(package_name))) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. 
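A short illustration of the new behavior (an editorial sketch, not from the
patch; the exact wording of the error message may differ):

    from udapi.block.util.normalize import Normalize

    try:
        Normalize(fetas=True)   # a typo for feats=True
    except TypeError as err:
        print(err)  # reports the extra parameter and lists all valid
                    # parameters of util.Normalize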
From 49ed44d2e309523cdf3361c599934d5dbf58a2a8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:23:36 +0100 Subject: [PATCH 0608/1201] read.XY files='!*.conllu' should iterated over sorted files glob.glob() returns files in an arbitrary order (as `ls -U`) --- udapi/core/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index c6973dad..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -65,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': From 1a4241104709e7647cf75ff84dbc68df3428fbe0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Feb 2023 23:49:11 +0100 Subject: [PATCH 0609/1201] improve ud.ComplyWithText (for KorKor) --- udapi/block/ud/complywithtext.py | 70 ++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 02904731..c850018e 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,8 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_attr='CorrectForm', **kwargs): + previous_form_label='CorrectForm', previous_text_label='CorrectText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -63,8 +64,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") node2(form="in", deprel="goeswith", upos="X", parent=node1) node3(form="law", deprel="goeswith", upos="X", parent=node1). - previous_form_attr - when changing node.form, we store the previous value - in node.misc[previous_form_attr] (so no information is lost). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). Default="CorrectForm" because we expect that the previous value (i.e. the value of node.form before applying this block) contained the corrected spelling, while root.text contains @@ -72,6 +73,12 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. + previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. + Default="CorrectText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". 
""" super().__init__(**kwargs) self.fix_text = fix_text @@ -81,7 +88,9 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct self.allow_hyphen_goeswith = allow_hyphen_goeswith - self.previous_form_attr = previous_form_attr + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): @@ -90,9 +99,9 @@ def allow_space(form): def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``") and self.previous_form_attr: - node.misc[self.previous_form_attr] = node.form - if self.previous_form_attr == 'CorrectForm': + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': node.feats['Typo'] = 'Yes' def process_tree(self, root): @@ -140,7 +149,8 @@ def process_tree(self, root): if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -152,6 +162,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -208,12 +222,11 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): elif edit == 'insert': forms = text[text_lo:text_hi].split(' ') if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: - #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') next_node = char_nodes[tree_lo] for f in reversed(forms): new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') new.shift_before_node(next_node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: logging.warning('Unable to insert nodes\n%s', _diff2str(diff, tree_chars, text)) @@ -246,18 +259,26 @@ def solve_diff(self, nodes, form): node_form = node.form if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: node_form = node_form.replace('-', '') - if len(nodes) == 1 and node_form == form.replace(' ', ''): - if self.allow_space(form): - self.store_previous_form(node) - node.form = form - elif self.allow_goeswith: - self.store_previous_form(node) - forms = form.split() - node.form = forms[0] - node.feats['Typo'] = 'Yes' - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos='X') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in 
reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: @@ -283,9 +304,10 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): - new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: self.store_previous_form(node) node.form = form @@ -313,6 +335,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) From 3abb76df036f7aa2e8f39437aa7d5b80032ae850 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:08:12 +0100 Subject: [PATCH 0610/1201] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U even if the raw texts include double spaces or no-break spaces (TODO: alternatively, we could annotate these using SpacesAfter). --- udapi/block/ud/complywithtext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index c850018e..351ebc01 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -109,9 +109,13 @@ def process_tree(self, root): if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. 
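        # Illustrative aside (not part of this diff): str.split() with no
        # arguments splits on any Unicode whitespace, including U+00A0, so
        # ' '.join('a \u00a0 b'.split()) == 'a b'.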
text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return From 0c6f946802345cc670ece9663fc7007ff05efd73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:09:36 +0100 Subject: [PATCH 0611/1201] corefud.PrintMentions should show Entity annotations in MISC by default --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 12db433a..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -12,7 +12,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): From f9dd071481e49944fe6c70629bf9d56a90bd86d6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:27:46 +0100 Subject: [PATCH 0612/1201] keep newdoc and global.Entity when using read.Conllu sent_id_filter=regex The global.Entity comment will be read automatically by read.Conllu and then inserted automatically by write.Conllu, but only for trees with tree.newdoc, so we need to keep this annotation as well (move it to the new first tree in a given document). --- udapi/core/basereader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a841bf1b..71d57159 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -97,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): From b036d572af97a9f06482ccdcd7e90cfe4f0f5655 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 15:15:48 +0100 Subject: [PATCH 0613/1201] update ord of empty nodes when deleting preceding nonempty nodes TODO: add tests, solve also deleting of empty nodes --- udapi/core/node.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 618e75eb..8a764498 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -516,6 +516,7 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). 
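        A short illustration of the options above (a sketch only):
            node.remove()                        # delete node incl. its whole subtree
            node.remove(children='rehang')       # re-attach children to node.parent
            node.remove(children='rehang_warn')  # ditto, but also log a warning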
""" self._parent._children.remove(self) + empty_follows = None if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -523,6 +524,16 @@ def remove(self, children=None): self._parent._children.extend(self._children) self._parent._children.sort() self._children.clear() + elif self._root.empty_nodes: + will_be_removed = self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.empty: + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) @@ -536,14 +547,29 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + for empty in self._root.empty_nodes: + if empty > self: + empty.ord = round(empty.ord - 1, 1) else: # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". + if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty.ord + (empty.ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" From 6c289d3bda8134a683f6362198888ee920520203 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 16:32:51 +0100 Subject: [PATCH 0614/1201] ud.ComplyWithText: the previous root.text value is better described as OrigText Unlike the previous node.form values, it is (usually) the original raw text including typos etc, so the label "CorrectText" was completely misleading. --- udapi/block/ud/complywithtext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 351ebc01..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,7 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_label='CorrectForm', previous_text_label='CorrectText', + previous_form_label='CorrectForm', previous_text_label='OrigText', added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -74,8 +74,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
previous_text_label - when we are not able to adapt the annotation to match root.text - and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. - Default="CorrectText". When setting this parameter to an empty string, + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, no values will be stored to root.comment. added_label - when creating new nodes because allow_add_punct=True, we mark these nodes as new_node.misc[added_label] = 1. Default="Added". From 043f4d73745a0155db76d5f4776d77f7ceeeba8a Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Fri, 17 Feb 2023 16:47:25 +0100 Subject: [PATCH 0615/1201] minor changes in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 111bceb9..fde3b0bd 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -27,8 +27,11 @@ def __init__(self, flavio=False, **kwargs): def process_node(self, node): rf = [] af = {} + # PROIEL-specific: greek words without features + if node.lemma == 'greek.expression': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': + elif node.upos == 'NOUN': if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { @@ -125,14 +128,14 @@ def process_node(self, node): af['PronType'] = [] if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - elif node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['quicumque', 'qui', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. @@ -176,7 +179,7 @@ def process_node(self, node): af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') - if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') elif node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') From e84741a6e78acaaf13739945bd17814d569e3601 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:06:56 +0100 Subject: [PATCH 0616/1201] Remove NOCOREF entities e.g. from AnCora. 
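A minimal usage sketch (assuming an input file in.conllu; the block can
equally be run as a step in an udapy scenario):

    from udapi.core.document import Document
    from udapi.block.corefud.removenocorefentities import RemoveNoCorefEntities

    doc = Document()
    doc.load_conllu('in.conllu')    # hypothetical input containing NOCOREF* entities
    RemoveNoCorefEntities().process_document(doc)
    doc.store_conllu('out.conllu')  # written without the NOCOREF* entities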
---
 udapi/block/corefud/removenocorefentities.py | 21 ++++++++++++++++++++
 udapi/core/coref.py | 2 +-
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 udapi/block/corefud/removenocorefentities.py

diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py
new file mode 100644
index 00000000..8baba086
--- /dev/null
+++ b/udapi/block/corefud/removenocorefentities.py
@@ -0,0 +1,21 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import re
+import logging
+
+class RemoveNoCorefEntities(Block):
+    """
+    Some corpora (e.g., AnCora) include annotation of named entities that are
+    not annotated for coreference. To distinguish them, their cluster ID starts
+    with 'NOCOREF' (optionally followed by entity type, so that one cluster
+    still has just one type). We may want to remove such entities from datasets
+    that are used to train coreference resolvers, to prevent the resolvers from
+    thinking that all members of a NOCOREF cluster are coreferential. That is
+    what this block does.
+    """
+
+    def process_document(self, doc):
+        entities = doc.coref_entities
+        if not entities:
+            return
+        doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)]

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 12dda239..4cd656f1 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -300,7 +300,7 @@ def __init__(self, eid, etype=None):
         self.split_ante = []

     def __lt__(self, another):
-        """Does this CorefEntity precedes (word-order wise) `another` entity?
+        """Does this CorefEntity precede (word-order wise) `another` entity?

         This method defines a total ordering of all entities
         by the first mention of each entity (see `CorefMention.__lt__`).

From 16c3a48ed3eb7861757092649a6ece22b893151c Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Feb 2023 22:27:19 +0100
Subject: [PATCH 0617/1201] Another method of removing entities.
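A plausible reading of this follow-up (not stated explicitly in the commit
message): doc.coref_entities is derived from the internal doc._eid_to_entity
mapping, so assigning a filtered list to it does not persist, and the block
now rewrites the mapping itself. Note that the regex test amounts to a plain
prefix check:

    # equivalent to re.match(r'^NOCOREF', e.eid)
    e.eid.startswith('NOCOREF')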
---
 udapi/block/corefud/removenocorefentities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py
index 8baba086..4551873c 100644
--- a/udapi/block/corefud/removenocorefentities.py
+++ b/udapi/block/corefud/removenocorefentities.py
@@ -18,4 +18,4 @@ def process_document(self, doc):
         entities = doc.coref_entities
         if not entities:
             return
-        doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)]
+        doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)}

From 8b442889aca3c1b881d7d53896d1eb0547635cfa Mon Sep 17 00:00:00 2001
From: Michal Novak
Date: Tue, 21 Feb 2023 15:52:18 +0100
Subject: [PATCH 0618/1201] CorefUD: counting sentence sequences with no coref annotation

---
 udapi/block/corefud/countgaps.py | 67 ++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 udapi/block/corefud/countgaps.py

diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py
new file mode 100644
index 00000000..c8ee8d76
--- /dev/null
+++ b/udapi/block/corefud/countgaps.py
@@ -0,0 +1,67 @@
+from udapi.core.block import Block
+from collections import Counter
+
+class CountGaps(Block):
+    """Block corefud.CountGaps searches for sentence sequences with no coref annotation."""
+
+    def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs):
+        super().__init__(**kwargs)
+        self.report_per_newdoc = report_per_newdoc
+        self.report_per_file = report_per_file
+        self.report_total = report_total
+        self._total_counter = Counter()
+
+    def _report_stats(self, counter=None, header_id=None):
+        if not counter:
+            counter = self._total_counter
+        if header_id:
+            print(f"============ {header_id} ============")
+        for key in sorted(counter):
+            print(f"{key:2d}: {counter[key]}")
+
+    def _count_empty_seqs(self, empty_seqs):
+        counter = Counter()
+        for seq in empty_seqs:
+            counter[len(seq)] += 1
+        return counter
+
+    def process_document(self, doc):
+        file_counter = Counter()
+        empty_seqs = []
+        curr_seq = []
+        newdoc = None
+        for i, tree in enumerate(doc.trees):
+            if tree.newdoc:
+                if i:
+                    if curr_seq:
+                        empty_seqs.append(curr_seq)
+                    newdoc_counter = self._count_empty_seqs(empty_seqs)
+                    file_counter.update(newdoc_counter)
+                    if self.report_per_newdoc:
+                        self._report_stats(newdoc_counter, header_id=newdoc)
+                newdoc = tree.newdoc
+                empty_seqs = []
+                curr_seq = []
+
+            has_mention = any(node.coref_mentions for node in tree.descendants)
+            if not has_mention:
+                curr_seq.append(tree.sent_id)
+            elif curr_seq:
+                empty_seqs.append(curr_seq)
+                curr_seq = []
+
+        if curr_seq:
+            empty_seqs.append(curr_seq)
+        newdoc_counter = self._count_empty_seqs(empty_seqs)
+        file_counter.update(newdoc_counter)
+        if self.report_per_newdoc:
+            self._report_stats(newdoc_counter, header_id=newdoc)
+
+        if self.report_per_file:
+            self._report_stats(file_counter, header_id="FULL DOC")
+
+        self._total_counter.update(file_counter)
+
+    def process_end(self):
+        if self.report_total:
+            self._report_stats(header_id="TOTAL")

From 716461fe3b67711f71a8cee028668fe34ceffef0 Mon Sep 17 00:00:00 2001
From: Michal Novak
Date: Tue, 21 Feb 2023 19:22:33 +0100
Subject: [PATCH 0619/1201] besides sequences, counting also paragraphs with no coref mentions

---
 udapi/block/corefud/countgaps.py | 63 +++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/udapi/block/corefud/countgaps.py
b/udapi/block/corefud/countgaps.py
index c8ee8d76..fc45540a 100644
--- a/udapi/block/corefud/countgaps.py
+++ b/udapi/block/corefud/countgaps.py
@@ -1,5 +1,5 @@
 from udapi.core.block import Block
-from collections import Counter
+from collections import defaultdict, Counter

 class CountGaps(Block):
     """Block corefud.CountGaps searches for sentence sequences with no coref annotation."""
@@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=T
         self.report_per_newdoc = report_per_newdoc
         self.report_per_file = report_per_file
         self.report_total = report_total
-        self._total_counter = Counter()
+        self._total_counter = defaultdict(Counter)

-    def _report_stats(self, counter=None, header_id=None):
-        if not counter:
-            counter = self._total_counter
+    def _report_stats(self, counter, header_id=None):
         if header_id:
             print(f"============ {header_id} ============")
         for key in sorted(counter):
             print(f"{key:2d}: {counter[key]}")
+        print("-------")
+        print(f"SUM: {sum([k*counter[k] for k in counter])}")

     def _count_empty_seqs(self, empty_seqs):
         counter = Counter()
@@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs):
         return counter

     def process_document(self, doc):
-        file_counter = Counter()
+        file_counters = defaultdict(Counter)
         empty_seqs = []
+        empty_pars = []
         curr_seq = []
+        curr_par = []
+        is_empty_par = True
         newdoc = None
         for i, tree in enumerate(doc.trees):
             if tree.newdoc:
                 if i:
                     if curr_seq:
                         empty_seqs.append(curr_seq)
-                    newdoc_counter = self._count_empty_seqs(empty_seqs)
-                    file_counter.update(newdoc_counter)
+                    newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
+                    file_counters["seq"].update(newdoc_seq_counter)
+                    if is_empty_par:
+                        empty_pars.append(curr_par)
+                    newdoc_par_counter = self._count_empty_seqs(empty_pars)
+                    file_counters["par"].update(newdoc_par_counter)
                     if self.report_per_newdoc:
-                        self._report_stats(newdoc_counter, header_id=newdoc)
+                        self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}")
+                        self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}")
                 newdoc = tree.newdoc
                 empty_seqs = []
+                empty_pars = []
                 curr_seq = []
+                curr_par = []
+                is_empty_par = True
+            if tree.newpar:
+                if not tree.newdoc and is_empty_par:
+                    empty_pars.append(curr_par)
+                curr_par = []
+                is_empty_par = True

             has_mention = any(node.coref_mentions for node in tree.descendants)
             if not has_mention:
                 curr_seq.append(tree.sent_id)
-            elif curr_seq:
-                empty_seqs.append(curr_seq)
-                curr_seq = []
+                curr_par.append(tree.sent_id)
+            else:
+                if curr_seq:
+                    empty_seqs.append(curr_seq)
+                    curr_seq = []
+                is_empty_par = False

         if curr_seq:
             empty_seqs.append(curr_seq)
-        newdoc_counter = self._count_empty_seqs(empty_seqs)
-        file_counter.update(newdoc_counter)
+        newdoc_seq_counter = self._count_empty_seqs(empty_seqs)
+        file_counters["seq"].update(newdoc_seq_counter)
+        if curr_par:
+            empty_pars.append(curr_par)
+        newdoc_par_counter = self._count_empty_seqs(empty_pars)
+        file_counters["par"].update(newdoc_par_counter)
         if self.report_per_newdoc:
-            self._report_stats(newdoc_counter, header_id=newdoc)
+            self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}")
+            self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}")

         if self.report_per_file:
-            self._report_stats(file_counter, header_id="FULL DOC")
+            self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE")
+            self._report_stats(file_counters["par"], header_id="PAR STATS, FILE")

-        self._total_counter.update(file_counter)
+
self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) def process_end(self): if self.report_total: - self._report_stats(header_id="TOTAL") + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") From c147469f5a4a9267902974846c6ff2d804447cdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 00:25:12 +0100 Subject: [PATCH 0620/1201] write.CorefHtml add visualization menu show: eid, trees, line breaks, paragraphs --- udapi/block/write/corefhtml.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 280fc213..20f68291 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,7 +11,7 @@ Udapi CorefUD viewer ''' -# I use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} # so that the width of #overview can be changed by dragging the bottom right corner. # The following lines would make the whole right border draggable: # @@ -25,9 +25,19 @@ display: grid; border-right: double; padding: 5px; width: 20em; background: #ddd; border-radius: 5px; } +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence .tree span {border: none; padding: 0; display:inline;} .sentence span .eid {display:block; font-size: 10px;} -.showtree {float:left; margin: 5px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} .sentence .singleton {border-style: dotted;} @@ -55,16 +65,22 @@ function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} ); + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + ''' SCRIPT_SHOWTREE = ''' $(".sentence").each(function(index){ var sent_id = this.id; - $(this).before( + $(this).prepend( $("
    ')
             print('
    ')
+            print('\n' + '\n')
         for tree in doc.trees:
             self.process_tree(tree, mention_ids, entity_colors)
         print('
    ')
@@ -180,7 +203,7 @@ def process_tree(self, tree, mention_ids, entity_colors):
         if tree.newdoc:
             print(f'
    {tree.newdoc if tree.newdoc is not True else ""}
    ')
         elif tree.newpar:
-            print('
    ')
+            print('
    ')
         opened = []
         print(f'
    ')
         for node in nodes_and_empty:
@@ -188,7 +211,7 @@ def process_tree(self, tree, mention_ids, entity_colors):
             subspan = subspans.pop()
             self._start_subspan(subspan, mention_ids, entity_colors)
             opened.append(subspan)
-
+
             is_head = self._is_head(node)
             if is_head:
                 print('', end='')
@@ -199,7 +222,7 @@ def process_tree(self, tree, mention_ids, entity_colors):
             print('', end='')
             if is_head:
                 print('', end='')
-
+
             while opened and opened[-1].words[-1] == node:
                 print('', end='')
                 opened.pop()
@@ -229,7 +252,7 @@ def process_tree(self, tree, mention_ids, entity_colors):
         if not node.no_space_after:
             print(' ', end='')
-
+
         print('
    ')

     def _is_head(self, node):

From 0b30f5b75ab2a53ed5e0425d536094dee5c56f02 Mon Sep 17 00:00:00 2001
From: Martin Popel
Date: Sat, 25 Feb 2023 02:53:43 +0100
Subject: [PATCH 0621/1201] more visualization options

---
 udapi/block/write/corefhtml.py | 65 +++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py
index 20f68291..fd500e7d 100644
--- a/udapi/block/write/corefhtml.py
+++ b/udapi/block/write/corefhtml.py
@@ -11,13 +11,7 @@
 Udapi CorefUD viewer
 '''
-# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;}
-# so that the width of #overview can be changed by dragging the bottom right corner.
-# The following lines would make the whole right border draggable:
-#
-#
-#
    + CSS = ''' #wrap {display: flex; align-items: flex-start;} #main {width: 100%; padding: 5px; background: white; z-index:100;} @@ -27,15 +21,19 @@ } #main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} #menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} #menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} .change .b1 {transform: translate(0, 9px) rotate(-45deg);} .change .b2 {opacity: 0;} .change .b3 {transform: translate(0, -9px) rotate(45deg);} -.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} -.sentence .tree span {border: none; padding: 0; display:inline;} -.sentence span .eid {display:block; font-size: 10px;} +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} .showtree {margin: 5px; user-select: none;} .display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} @@ -48,22 +46,22 @@ ''' SCRIPT_BASE = ''' -$("span").click(function(e) { +$(".m").click(function(e) { let was_selected = $(this).hasClass("selected"); - $("span").removeClass("selected"); + $(".m").removeClass("selected"); if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); window.onhashchange = function() { - $("span").removeClass("selected"); + $(".m").removeClass("selected"); var fragment = window.location.hash.substring(1); if (fragment) {$("." + fragment).addClass("selected");} } -$("span").hover( - function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, - function(e) {$("span").removeClass("active");} +$(".m").hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} ); function menuclick(x) { @@ -94,10 +92,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): + def __init__(self, show_trees=True, show_eid=False, show_etype=False, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees self.show_eid = show_eid + self.show_etype = show_etype self.colors = colors def _representative_word(self, entity): @@ -120,6 +119,10 @@ def process_document(self, doc): if self.colors: for i in range(self.colors): print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}') + if not self.show_eid: + print('.eid {display: none;}') + if not self.show_etype: + print('.etype {display: none;}') print('') print('\n\n
    ')
             print('
    ')
             print('
    ')
-        print('\n' '\n')
-        for tree in doc.trees:
-            self.process_tree(tree, mention_ids, entity_colors)
-        print('
    ')
-        print('')
         print('
    ') - def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): + def _start_subspan(self, subspan, crossing=False): m = subspan.mention e = m.entity - classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"} m' + classes = f'{e.eid} {self._mention_ids[m]} {e.etype or "other"} m' title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' if self.colors: - classes += f' {entity_colors[e]}' + classes += f' {self._entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -252,7 +303,7 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): f'{subspan.subspan_eid}' f' {e.etype}', end='') - def process_tree(self, tree, mention_ids, entity_colors): + def process_tree(self, tree): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -273,7 +324,7 @@ def process_tree(self, tree, mention_ids, entity_colors): for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids, entity_colors) + self._start_subspan(subspan) opened.append(subspan) is_head = self._is_head(node) @@ -311,7 +362,7 @@ def process_tree(self, tree, mention_ids, entity_colors): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, entity_colors, True) + self._start_subspan(broken, True) opened.append(subspan) if not node.no_space_after: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 48431900..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,7 +79,9 @@ def process_document(self, doc): print('\n') print('
    ') def print_doc_json(self, doc): - print('data=[') + print('[') for (bundle_number, bundle) in enumerate(doc, 1): if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -116,7 +122,7 @@ def print_doc_json(self, doc): print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') + print(']') @staticmethod From 327bb6f9083f6131b4f986dac9b56f2570957f60 Mon Sep 17 00:00:00 2001 From: Federica Gamba Date: Thu, 30 Mar 2023 12:22:27 +0200 Subject: [PATCH 0626/1201] adjustments in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index fde3b0bd..dce4592d 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -28,7 +28,8 @@ def process_node(self, node): rf = [] af = {} # PROIEL-specific: greek words without features - if node.lemma == 'greek.expression': + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: pass # NOUNS ################################################################ elif node.upos == 'NOUN': @@ -41,12 +42,14 @@ def process_node(self, node): 'Degree': ['Dim'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'VerbForm': ['Part']} + 'VerbForm': ['Part', 'Vnoun']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) @@ -61,10 +64,10 @@ def process_node(self, node): 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - af['Compound'] = 'Yes' + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] - if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### @@ -72,7 +75,7 @@ def process_node(self, node): if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: rf = ['Gender', 'Number', 'Case'] af = { - 'NumType': ['Ord', 'Dist'], + 'NumType': ['Dist', 'Mult', 'Ord'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], @@ -83,9 +86,10 @@ def process_node(self, node): 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) @@ -112,10 +116,10 @@ def process_node(self, node): rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - # 1st and 2nd person do not have gender + # 3rd person must have gender if node.feats['Person'] == '3': # is, id rf.append('Gender') - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -126,20 +130,20 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] # lexical check of PronTypes af['PronType'] = [] - if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - if node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['qui', 'quicumque', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['LatAnom', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] af['Compound'] = ['Yes'] af['Polarity'] = ['Neg'] af['Form'] = ['Emp'] @@ -175,25 +179,26 @@ def process_node(self, node): if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: if not af['PronType'] == ['Prs']: af['PronType'].append('Prs') - elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') - elif node.lemma in ['qui', 'quantus', 'quot']: + if node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') - elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: af['PronType'].append('Dem') - elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] af['Compound'] = ['Yes'] af['Form'] = ['Emp'] af['NumType'] = ['Card'] af['Degree'].append('Dim') + af['PronType'].append('Art') if re.match(r'^(unus|ambo)', node.lemma): af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) @@ -202,7 +207,7 @@ def process_node(self, node): elif node.upos == 'NUM': rf = ['NumType', 'NumForm'] af = { - 'NumType': ['Card'], + 'NumType': ['Card', 'Ord'], 'NumForm': ['Word', 'Roman', 'Digit'], 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. @@ -212,7 +217,9 @@ def process_node(self, node): af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. # e.g. 
duodecim - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ @@ -227,7 +234,7 @@ def process_node(self, node): if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] - if node.upos == 'VERB': + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): rf.append('Voice') af['Voice'] = ['Act', 'Pass'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive @@ -255,6 +262,7 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] if 'Degree' in af: af['Degree'].append('Dim') else: @@ -262,7 +270,12 @@ def process_node(self, node): af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -271,13 +284,13 @@ def process_node(self, node): 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], - 'NumType': ['Card', 'Ord'], # e.g., primum + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['VerbForm'] = ['Part'] + af['VerbForm'] = ['Fin', 'Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ @@ -289,6 +302,7 @@ def process_node(self, node): if self.flavio: af['Form'] = ['Emp'] af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) # CONJUNCTIONS ######################################################### elif re.match(r'^[CS]CONJ$', node.upos): @@ -301,6 +315,8 @@ def process_node(self, node): af['Form'] = ['Emp'] af['VerbForm'] = ['Fin'] af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': @@ -310,9 +326,13 @@ def process_node(self, node): 'Abbr': ['Yes'] } if self.flavio: - af['VerbForm'] = ['Part'], + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From 1ddfce4aec593e222a0e3d26e8f74acf561d1356 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 31 Mar 2023 19:42:35 +0200 Subject: [PATCH 0627/1201] gzip the docs/* json and html files --- udapi/block/write/corefhtml.py | 49 
++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index cd0db1e5..6129b335 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -17,6 +17,7 @@ from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention from collections import Counter import udapi.block.write.html +import gzip import sys import os @@ -26,6 +27,7 @@ Udapi CorefUD viewer + ''' CSS = ''' @@ -87,21 +89,26 @@ $("#main-menu").toggle(); } -function load_doc(doc_num) { +async function load_doc(doc_num) { loading_now = true; - console.log("loading doc" + doc_num + ".html"); - $.get(docs_dir + "/doc" + doc_num + ".html", function(data){ - $("#main").append(data); - add_mention_listeners($("#doc" + doc_num + " .m")); - $("#doc" + doc_num + " .sentence").each(add_show_tree_button); - loading_now = false; - }).fail(function(){ + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ if (! load_fail_reported) { load_fail_reported = true; - alert("Cannot load " + docs_dir + "/doc" + doc_num - + ".html\\nLocal files do not support lazy loading. Run a web server 'python -m http.server'"); + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); } - }); + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + loading_now = false; } var docs_loaded = 1; @@ -126,7 +133,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("