diff --git a/.circleci/config.yml b/.circleci/config.yml index 4e88d664..9530d5c7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,13 +15,16 @@ orbs: # See: https://circleci.com/docs/2.0/configuration-reference/#jobs jobs: build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + parameters: + python-version: + type: string # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container # Change the version below to your required version of python docker: - - image: cimg/python:3.9 + - image: cimg/python:<< parameters.python-version >> # Checkout the code as the first step. This is a dedicated CircleCI step. # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. @@ -31,15 +34,15 @@ jobs: - checkout - python/install-packages: pkg-manager: pip - # app-dir: ~/project/package-directory/ # If you're requirements.txt isn't in the root directory. - # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. 
- run: name: Install Udapi command: pip install ".[test]" + - run: mkdir -p test-results - run: name: Run pytest tests - # This assumes pytest is installed via the install-package step above - command: pytest + command: pytest --junitxml=test-results/junit.xml -o junit_family=legacy + - store_test_results: + path: test-results - run: name: Color TextModeTrees command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 @@ -51,7 +54,9 @@ jobs: # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows workflows: - sample: # This is the name of the workflow, feel free to change it to better match your workflow. - # Inside the workflow, you define the jobs you want to run. + test-matrix: jobs: - - build-and-test + - build-and-test: + matrix: + parameters: + python-version: ["3.9", "3.11", "3.13"] diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..0285eddb --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,70 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. 
+ python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. + # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + url: https://pypi.org/p/udapi + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ diff --git a/.gitignore b/.gitignore index a75e7c05..adc7bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.cache .idea +*.egg-info/ *.pyc -.cache +dist/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..8804cc4e --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,23 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Currently, RTD needs to select an OS with OpenSSL>=1.1.1 because of +# urllib3's dependence on that system library. 
(alternately, pin urllib3<2 +# See https://github.com/urllib3/urllib3/issues/2168 +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: false + +python: + install: + - requirements: docs/requirements.txt diff --git a/CHANGES.txt b/CHANGES.txt index 67ced748..98e26605 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,20 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. +0.5.1 2025-11-05 + - make udapy compatible with Python 3.13 + +0.5.0 2025-10-18 + - added mwt.feats + - added root.prev_tree and root.next_tree + - .github/workflows/python-publish.yml + - edits by Dan Zeman in block.ud.* + +0.4.0 2025-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + 0.3.0 2022-04-06 - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) - edits by Dan Zeman in block.ud.* diff --git a/README.md b/README.md index 0b41297f..36465c78 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,24 @@ Python framework for processing Universal Dependencies data [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.6 or higher. -- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, - make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)) - and install UDPipe with `pip3 install --user --upgrade ufal.udpipe`. +- You need Python 3.9 or higher. +- It is recommended to install Udapi in a Python virtual environment. +- If you need the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser (to be used from Udapi) + install it (with `pip install --upgrade ufal.udpipe`). 
## Install Udapi for developers -Let's clone the git repo to `~/udapi-python/`, install dependencies -and setup `$PATH` and `$PYTHONPATH` accordingly. +Let's clone the git repo e.g. to `~/udapi-python/` and make an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) ```bash cd git clone https://github.com/udapi/udapi-python.git -pip3 install --user -r udapi-python/requirements.txt -echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc -echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc -echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc -source ~/.bashrc # or open new bash +cd udapi-python +pip install -e . ``` ## Install Udapi for users This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. ``` -pip3 install --user --upgrade udapi +pip install --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. diff --git a/bin/udapy b/bin/udapy index 528e3577..83c7a6f2 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,116 +1,7 @@ #!/usr/bin/env python3 -import os -import gc +"""Thin wrapper for backward compatibility. Calls udapi.cli.main().""" import sys -import atexit -import logging -import argparse +from udapi.cli import main -from udapi.core.run import Run - -# Parse command line arguments. 
-argparser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - usage="udapy [optional_arguments] scenario", - epilog="See http://udapi.github.io", - description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" - "Examples of usage:\n" - " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" - " udapy -T < sample.conllu | less -R\n" - " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") -argparser.add_argument( - "-q", "--quiet", action="store_true", - help="Warning, info and debug messages are suppressed. Only fatal errors are reported.") -argparser.add_argument( - "-v", "--verbose", action="store_true", - help="Warning, info and debug messages are printed to the STDERR.") -argparser.add_argument( - "-s", "--save", action="store_true", - help="Add write.Conllu to the end of the scenario") -argparser.add_argument( - "-T", "--save_text_mode_trees", action="store_true", - help="Add write.TextModeTrees color=1 to the end of the scenario") -argparser.add_argument( - "-H", "--save_html", action="store_true", - help="Add write.TextModeTreesHtml color=1 to the end of the scenario") -argparser.add_argument( - "-A", "--save_all_attributes", action="store_true", - help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") -argparser.add_argument( - "-C", "--save_comments", action="store_true", - help="Add print_comments=1 (to be used after -T and -H)") -argparser.add_argument( - "-M", "--marked_only", action="store_true", - help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") -argparser.add_argument( - "-N", "--no_color", action="store_true", - help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") -argparser.add_argument( - "-X", "--extra", action="append", - help="Add a specified parameter (or a block name) to the end of the scenario\n" - "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") 
-argparser.add_argument( - "--gc", action="store_true", - help="By default, udapy disables Python garbage collection and at-exit cleanup\n" - "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" - "when processing many files and running out of memory, you can disable this\n" - "optimization (i.e. enable garbage collection) with 'udapy --gc'.") -argparser.add_argument( - 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") - -args = argparser.parse_args() - -# Set the level of logs according to parameters. -if args.verbose: - level = logging.DEBUG -elif args.quiet: - level = logging.CRITICAL -else: - level = logging.INFO - -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', - level=level) - -# Process and provide the scenario. if __name__ == "__main__": - - # Disabling garbage collections makes the whole processing much faster. - # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup - # (atexit hooks are called in reversed order of their registration, - # so flushing stdio buffers etc. will be still done before the os._exit(0) call). - # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 - # Is it safe to disable GC? - # OS will free the memory allocated by this process after it ends anyway. - # The udapy wrapper is aimed for one-time tasks, not a long-running server, - # so in a typical case a document is loaded and almost no memory is freed before the end. - # Udapi documents have a many cyclic references, so running GC is quite slow. 
- if not args.gc: - gc.disable() - atexit.register(os._exit, 0) - atexit.register(sys.stderr.flush) - if args.save: - args.scenario = args.scenario + ['write.Conllu'] - if args.save_text_mode_trees: - args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] - if args.save_html: - args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] - if args.save_all_attributes: - args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] - if args.save_comments: - args.scenario = args.scenario + ['print_comments=1'] - if args.marked_only: - args.scenario = args.scenario + ['marked_only=1'] - if args.no_color: - args.scenario = args.scenario + ['color=0'] - if args.extra: - args.scenario += args.extra - - runner = Run(args) - # udapy is often piped to head etc., e.g. - # `seq 1000 | udapy -s read.Sentences | head` - # Let's prevent Python from reporting (with distracting stacktrace) - # "BrokenPipeError: [Errno 32] Broken pipe" - try: - runner.execute() - except BrokenPipeError: - pass + sys.exit(main()) diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain path to udapi-python. +py %~dp$PATH:0\udapy %* diff --git a/docs/conf.py b/docs/conf.py index 45966b57..b7d0f6e5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # General information about the project. project = 'Udapi' -copyright = '2017, Martin Popel' +copyright = '2023, Martin Popel' author = 'Martin Popel' # The version info for the project you're documenting, acts as replacement for @@ -61,14 +61,14 @@ # The short X.Y version. version = '0' # The full version, including alpha/beta/rc tags. 
-release = '2' +release = '3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -167,7 +167,7 @@ def run_apidoc(_): module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) print(module) - from sphinx.apidoc import main + from sphinx.ext.apidoc import main main(['--separate', '-o', cur_dir, module, '--force']) def setup(app): diff --git a/docs/requirements.txt b/docs/requirements.txt index a994db47..a537f220 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,4 @@ -colorama +colorama>=0.4.6 termcolor +ufal.udpipe +sphinx_rtd_theme diff --git a/pyproject.toml b/pyproject.toml index 374b58cb..18d5c717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,36 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel" -] +requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" + +[project] +name = "udapi" +version = "0.5.2" +description = "Python framework for processing Universal Dependencies data" +readme = "README.md" +requires-python = ">=3.9" +license = "GPL-3.0-or-later" +authors = [ + {name = "Martin Popel", email = "popel@ufal.mff.cuni.cz"} +] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "colorama", + "termcolor", +] + +[project.urls] +Homepage = "https://github.com/udapi/udapi-python" + +[project.optional-dependencies] +test = ["pytest"] +udpipe = ["ufal.udpipe"] + +[project.scripts] +udapy = "udapi.cli:main" + +[tool.setuptools] +packages = {find = {}} +include-package-data = true diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a14145ab..00000000 --- a/setup.cfg +++ /dev/null @@ -1,29 +0,0 @@ -[metadata] -name = udapi -version = 0.3.0 -author = Martin Popel -author_email = popel@ufal.mff.cuni.cz -description = Python framework for processing Universal Dependencies data -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/udapi/udapi-python -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) - Operating System :: OS Independent - -[options] -packages = find: -python_requires = >=3.6 -include_package_data = True -scripts = - bin/udapy -install_requires = - colorama - termcolor - -[options.extras_require] -test = - pytest - - diff --git a/setup.py b/setup.py deleted file mode 100644 index 7f1a1763..00000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -from setuptools import setup - -if __name__ == "__main__": - setup() diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..d27ff8c4 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..86d975b6 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..fc45540a --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,94 @@ +from udapi.core.block import Block +from collections import defaultdict, Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + 
super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = defaultdict(Counter) + + def _report_stats(self, counter, header_id=None): + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counters = defaultdict(Counter) + empty_seqs = [] + empty_pars = [] + curr_seq = [] + curr_par = [] + is_empty_par = True + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") + newdoc = tree.newdoc + empty_seqs = [] + empty_pars = [] + curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + 
empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") + + if self.report_per_file: + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") + + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) + + def process_end(self): + if self.report_total: + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py new file mode 100644 index 00000000..5aaf94e7 --- /dev/null +++ b/udapi/block/corefud/delete.py @@ -0,0 +1,84 @@ +"""Delete coreference annotation (Entity|Bridge|SplitAnte) and optionally also empty nodes.""" + +from udapi.core.block import Block +import udapi.core.coref +import logging + +class Delete(Block): + + def __init__(self, coref=True, empty=False, misc=False, **kwargs): + """Args: + coref: delete coreference attributes in MISC, i.e (Entity|Bridge|SplitAnte) + empty: delete all empty nodes and references to them (from DEPS and MISC[Functor]) + misc: delete all attributes in MISC except for SpaceAfter + """ + super().__init__(**kwargs) + self.coref = coref + self.empty = empty + self.misc = misc + + def is_root_reachable_by_deps(self, node, parents_to_ignore=None): + """ Check if the root node is reachable from node, possibly after deleting the parents_to_ignore nodes. 
+ """ + stack = [(node, [])] + while stack: + proc_node, path = stack.pop() + # root is reachable + if proc_node == node.root: + return True + # path forms a cycle, the root cannot be reached through this branch + if proc_node not in path: + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] not in parents_to_ignore: + # process the parent recursively + stack.append((dep['parent'], path + [proc_node])) + return False + + def _deps_ignore_nodes(self, node, parents_to_ignore): + """ Retrieve deps from the node, recursively ignoring specified parents. + """ + newdeps = [] + stack = [(node, [])] + while stack: + proc_node, skipped_nodes = stack.pop() + if proc_node not in skipped_nodes: + for dep in proc_node.deps: + if dep['parent'] in parents_to_ignore: + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + else: + # keep deps with a parent that shouldn't be ignored + newdeps.append(dep) + # If no newdeps were found (because of a cycle), return the root. + return newdeps if newdeps else [{'parent': node.root, 'deprel': 'root'}] + + def process_document(self, doc): + # This block should work both with coreference loaded (deserialized) and not. + if self.coref: + doc._eid_to_entity = None + for root in doc.trees: + if self.empty: + for node in root.descendants: + # process only the nodes dependent on empty nodes + if '.' in node.raw_deps: + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + node.deps = self._deps_ignore_nodes(node, root.empty_nodes) + # This needs to be done even if '.' not in node.raw_deps. + if '.' 
in node.misc['Functor'].split(':')[0]: + del node.misc['Functor'] + root.empty_nodes = [] + + if self.coref or self.misc: + for node in root.descendants + root.empty_nodes: + if self.misc: + node.misc = 'SpaceAfter=No' if node.no_space_after else None + if self.coref: + node._mentions = [] + if not self.misc: + for attr in ('Entity', 'Bridge', 'SplitAnte'): + del node.misc[attr] diff --git a/udapi/block/corefud/fixentityacrossnewdoc.py b/udapi/block/corefud/fixentityacrossnewdoc.py new file mode 100644 index 00000000..61e5e4f6 --- /dev/null +++ b/udapi/block/corefud/fixentityacrossnewdoc.py @@ -0,0 +1,25 @@ +from udapi.core.block import Block +import udapi.core.coref +import logging + +class FixEntityAcrossNewdoc(Block): + """ + Fix the error reported by validate.py --coref: + "[L6 Coref entity-across-newdoc] Same entity id should not occur in multiple documents" + by making the entity IDs (eid) unique in each newdoc document. + + This block uses Udapi's support for loading GUM-like GRP document-wide IDs + (so the implementation is simple, although unnecessarily slow). + After applying this block, IDs of all entities are prefixed with document numbers, + e.g. "e45" in the 12th document changes to "d12.e45". + If you prefer simple eid, use corefud.IndexClusters afterwards. 
+ """ + + def process_document(self, doc): + if not doc.eid_to_entity: + logging.warning(f"No entities in document {doc.meta}") + udapi.core.coref.store_coref_to_misc(doc) + assert doc.meta["global.Entity"].startswith("eid") + doc.meta["global.Entity"] = "GRP" + doc.meta["global.Entity"][3:] + udapi.core.coref.load_coref_from_misc(doc) + doc.meta["global.Entity"] = "eid" + doc.meta["global.Entity"][3:] diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index c5a1b3ed..b4a42a43 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -3,7 +3,9 @@ import itertools class FixInterleaved(Block): - """Fix mentions with interleaved or crossing spans.""" + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): @@ -25,7 +27,7 @@ def process_tree(self, tree): if self.same_entity_only and mA.entity != mB.entity: continue - # Fully nested spans are OK, expect for same-subspan + # Fully nested spans are OK, except for same-subspan. sA, sB = set(mA.words), set(mB.words) if (sA <= sB) or (sB <= sA): if not self.nested_same_subspan: @@ -58,10 +60,10 @@ def process_tree(self, tree): pass deleted.add(mB) - # By changing the mA.words, we could have create another error: - # making the span same as another mention. Let's fix it + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. 
sA = set(mA.words) - for mC in mentions: + for mC in sorted(mentions): if mC in deleted or mC is mA or mC is mB: continue if sA != set(mC.words): diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py new file mode 100644 index 00000000..bc8e6504 --- /dev/null +++ b/udapi/block/corefud/fixparentheses.py @@ -0,0 +1,31 @@ +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). + If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py new file mode 100644 index 00000000..d6093ece --- /dev/null +++ b/udapi/block/corefud/guessspan.py @@ -0,0 +1,33 @@ +from udapi.core.block import Block + +class GuessSpan(Block): + """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" + + def 
process_coref_mention(self, mention): + mwords = mention.head.descendants(add_self=True) + # TODO add heuristics from corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py new file mode 100644 index 00000000..08296531 --- /dev/null +++ b/udapi/block/corefud/link2cluster.py @@ -0,0 +1,137 @@ +import logging +from udapi.core.block import Block + +class Link2Cluster(Block): + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. + + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). + delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. 
id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. 
+ """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): + super().__init__(**kwargs) + self.id_attr = id_attr + self.ante_attr = ante_attr + self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention + + def process_document(self, doc): + id2node = {} + links = [] + bridges = [] + for node in doc.nodes_and_empty: + this_id = node.misc[self.id_attr] + if this_id != '': + id2node[this_id] = node + ante_id = node.misc[self.ante_attr] + if ante_id != '': + if ante_id == this_id: + logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}") + else: + links.append([ante_id, this_id]) + if self.delete_orig_attrs: + for attr in (self.id_attr, self.ante_attr): + del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, 
node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] + + # It seems faster&simpler to process the links in any order and implement entity merging, + # rather than trying to sort the links so that no entity merging is needed. + for ante_id, this_id in links: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if not this_node.coref_mentions and not ante_node.coref_mentions: + # None of the nodes is part of any mention/entity. Let's create them. + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) + elif this_node.coref_mentions and ante_node.coref_mentions: + # Both of the nodes are part of mentions in different entities. + # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). + # While the official API supports "stealing" a single mention (m.entity = another_entity), + # the implementation below using _mentions and _entity is a bit faster. + e_ante, e_this = this_node.coref_entities[0], ante_node.coref_entities[0] + assert e_ante != e_this + for mention in e_ante.mentions: + mention._entity = e_this + e_this._mentions.extend(e_ante.mentions) + e_this._mentions.sort() + e_ante._mentions.clear() + else: + # Only one of the nodes is part of an entity. Let's add the second one to this entity. 
+ if ante_node.coref_mentions: + self._new_mention(ante_node.coref_entities[0], this_node) + else: + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. + + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. 
intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. + nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def 
_ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB = ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue + if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + 
{self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 7ed31b0d..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -10,9 +10,9 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, - print_total=True, + print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): @@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i random.seed(42) self.print_other_forms = print_other_forms self.print_total = print_total, + self.print_should = print_should, print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -61,7 +62,9 @@ def _ok(self, condition, value): return (condition and value == 'only') or (not condition and value=='exclude') def _is_auxiliary_etc(self, node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}: + if 
node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': return True if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True @@ -79,8 +82,25 @@ def _is_forest(self, mention, mwords, almost): for ch in w.children: if ch not in mwords: if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). + # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). 
if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False return True diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..4551873c --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. 
+ """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} diff --git a/udapi/block/corefud/singleparent.py b/udapi/block/corefud/singleparent.py new file mode 100644 index 00000000..ee9b1948 --- /dev/null +++ b/udapi/block/corefud/singleparent.py @@ -0,0 +1,47 @@ +"""If an empty node has multiple (enhanced-deps) parents, only the highest one is kept.""" +from udapi.core.block import Block +from collections import Counter +from udapi.core.node import find_minimal_common_treelet +import logging + +class SingleParent(Block): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._reasons = Counter() + + def process_tree(self, tree): + for empty in tree.empty_nodes: + self._reasons['_empty'] += 1 + if len(empty.deps) > 1: + self._reasons['_more-parents'] += 1 + parents = [d['parent'] for d in empty.deps] + nonempty_parents = [p for p in parents if not p.is_empty()] + if len(nonempty_parents) != len(parents): + self._reasons['empty-parent'] += 1 + #empty.misc['Mark'] = f"empty-parent:{empty.deps}" + logging.warning(f"Empty node {empty} has an empty parent.") + if not nonempty_parents: + empty.deps = [] + self._reasons['no-nonempty-parent'] += 1 + continue + (highest, added_nodes) = find_minimal_common_treelet(*nonempty_parents) + if highest in nonempty_parents: + self._reasons['one-governs'] += 1 + empty.deps = [d for d in empty.deps if d['parent'] is highest] + continue + nonempty_parents.sort(key=lambda n:n._get_attr('depth')) + if len(nonempty_parents)>1 and nonempty_parents[0]._get_attr('depth') == nonempty_parents[0]._get_attr('depth'): + self._reasons['same-depth'] += 1 + #empty.misc['Mark'] = f"same-depth:{empty.deps}" + else: + self._reasons['one-highest'] += 1 + #empty.misc['Mark'] = f"one-highest:{empty.deps}" + empty.deps = [d for d in empty.deps if d['parent'] is nonempty_parents[0]] + + def 
after_process_document(self, document): + message = "\n" + for k, v in self._reasons.most_common(): + message += f"{k}={v}\n" + #document.meta["bugs"] = message + logging.info(message) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index cdd84e7a..527159e9 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -1,44 +1,75 @@ from udapi.core.block import Block from collections import Counter +import re class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entities=True, - report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', - exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs): + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, report_words_per_doc=False, report_entity_range=False, + selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', + exclude_singletons=False, exclude_nonsingletons=False, style='human', + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + highlight_docnames=None, + **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max - self.c_len_max = c_len_max + self.e_len_max = e_len_max + self.report_basics = report_basics self.report_mentions = report_mentions self.report_entities = report_entities self.report_details = report_details + self.report_words_per_doc = report_words_per_doc + self.report_entity_range = report_entity_range self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = exclude_nonsingletons self.style = style - if style not in 'tex human'.split(): - raise ValueError(f'Unknown style f{style}') + if style not in 'tex tex-table tex-doc human'.split(): + raise ValueError(f'Unknown style {style}') + self.per_doc = per_doc + self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc 
filename'.split(): + raise ValueError(f'Unknown style {style}') + self.docname = docname + self.docname_len = docname_len + self.highlight_docnames = highlight_docnames + self._header_printed = False + self._lines_printed = None self.counter = Counter() self.mentions = 0 self.entities = 0 + self.singletons = 0 self.total_nodes = 0 self.longest_mention = 0 self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() + self.entity_ranges = [] def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) + self.counter['documents'] += 1 + node2docord, current_docord = {}, 0 + if self.report_entity_range: + for node in doc.nodes_and_empty: + node2docord[node] = current_docord + current_docord += 1 + for entity in doc.coref_entities: len_mentions = len(entity.mentions) + if len_mentions == 1: + self.singletons += 1 if len_mentions == 1 and self.exclude_singletons: continue elif len_mentions > 1 and self.exclude_nonsingletons: continue + if self.report_entity_range: + self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head]) self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions - self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1 + self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 self.entities += 1 if not self.report_mentions and not self.report_details: @@ -66,28 +97,85 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 self.counter['m_nontreelet'] += 1 if heads > 1 else 0 - def process_end(self): + if self.report_basics: + doc_words = 0 + for tree in doc.trees: + self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) + if tree.newdoc: + self.counter['newdocs'] += 1 + if doc_words > self.counter['max_words_per_doc']: + self.counter['max_words_per_doc'] = 
doc_words + doc_words = 0 + doc_words += len(tree.descendants) + + def after_process_document(self, doc): + if self.per_doc: + self.process_end(skip=False, doc=doc) + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + self.entity_ranges = [] + + def process_end(self, skip=True, doc=None): + if not self._lines_printed: + self.print_header() + self._lines_printed = 0 + if self.per_doc: + if skip: + self.print_footer() + return + else: + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + if self.style.startswith('tex'): + if self.highlight_docnames and re.search(self.highlight_docnames, docname): + docname = r"\NEW " + docname + docname = docname.replace('_', r'\_') + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') + elif self.style.startswith('tex-'): + print(f"{self.counter['documents']:4} documents &") + self._lines_printed += 1 + mentions_nonzero = 1 if self.mentions == 0 else self.mentions entities_nonzero = 1 if self.entities == 0 else self.entities total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns =[ ] + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:6,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:9,}"), + ('empty', f"{self.counter['empty']:7,}"),] + if self.report_words_per_doc: + columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"), + ('words/doc', f"{self.counter['words']/self.counter['newdocs']:7,.0f}"),] if self.report_entities: columns += [('entities', f"{self.entities:7,}"), ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), ('longest_entity', f"{self.longest_entity:6}"), ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] - for i in range(1, self.c_len_max + 1): + if 
self.report_entity_range: + self.entity_ranges.sort() + percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0 + columns += [('entity_range_95percentile', f"{percentile:6,}"),] + for i in range(1, self.e_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero - columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}")) + columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) if self.report_mentions: columns += [('mentions', f"{self.mentions:7,}"), ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), ('longest_mention', f"{self.longest_mention:6}"), ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] - for i in range(0, self.m_len_max + 1): - percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero - columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero + columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) if self.report_details: columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"), ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"), @@ -99,8 +187,119 @@ def process_end(self): for upos in upos_list: columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) - if self.style == 'tex': - print(" & ".join(c[1] for c in columns)) + if self.style.startswith('tex'): + print(" &".join(c[1] for c in columns), end=" \\\\\n") elif self.style == 'human': for c in columns: print(f"{c[0]:>15} = {c[1].strip():>10}") + if not self.per_doc: + self.print_footer() + elif self._lines_printed > self.max_rows_per_page: + self.print_footer(False) + self._lines_printed = 0 + + def print_header(self): + if not 
self.style.startswith('tex-'): + return + if self.style == 'tex-doc': + if self._lines_printed is None: + print(r'\documentclass[multi=mypage]{standalone}') + print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}') + print(r'\title{Udapi coreference statistics}') + print(r'\begin{document}') + print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') + lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{text size} ' + lines[2] += r'& \MC{4}{total number of} ' + lines[3] += r'& docs & sents & words &empty n.' + if self.report_words_per_doc: + lines[0] += "rr " + lines[1] += r'& & ' + lines[2] += r'&\MC{2}{words/doc}' + lines[3] += r'& max & avg ' + if self.report_entities: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.report_entity_range: + lines[0] += "r " + lines[1] += r'& ' + lines[2] += r'& range ' + lines[3] += r'& p95 ' + if self.e_len_max: + for i in range(1, self.e_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' + if self.report_mentions: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7 + if 
self.report_details: + lines[0] += "rrrr " + lines[1] += r'& \MC{3}{mention type} ' + lines[2] += r'&w/empty& w/gap&non-tree' + lines[3] += r'& [\%] ' * 3 + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + lines[0] += "@{~}r" * len(upos_list) + lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}" + lines[2] += ''.join(f'&{upos:7}' for upos in upos_list) + lines[3] += r'& [\%] ' * len(upos_list) + lines[0] += r'@{}}\toprule' + last_col = 1 + lines[1] += r'\\' + lines[2] += r'\\' + lines[3] += r'\\\midrule' + if self.report_basics: + lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}' + lines[2] += r'\cmidrule(lr){2-5}' + last_col += 4 + if self.report_words_per_doc: + lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}' + last_col += 2 + if self.report_entities: + _cols = 5 if self.report_entity_range else 5 + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += _cols + if self.e_len_max: + last_col += self.e_len_max + lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' + if self.report_mentions: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 + if self.m_len_max: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}' + last_col += self.m_len_max + 1 + if self.report_details: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}" + lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}' + print("\n".join(lines)) + + def print_footer(self, end_doc=True): + if not self.style.startswith('tex-'): + return + print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}') + if self.style == 'tex-doc' and end_doc: + 
print(r'\end{document}') diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index ca5510e4..e4889770 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -126,7 +126,7 @@ def process_tree(self, tree): i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) @@ -156,6 +156,13 @@ def process_tree(self, tree): self._pred[x] += 1 self._total[x] += 1 + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and adposition. +""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? 
+ 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 'dovnitř+Gen': 'Ill', + 'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? 
+ 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným + 'místo+Loc': 'Sbs', # annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 
'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 'v_jméno+Gen': 'Pur', + 'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', 
+ 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 'kontra+Nom': 'Adv', + 'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 
def process_node(self, node):
    """
    Derive a phrase-level case value for a content node and store it as
    MSFCase in MISC.

    The value is taken from the node's morphological case combined with its
    adposition dependents; if that yields nothing, subordinators ('mark') and
    finally coordinators ('cc') are consulted, all via self.adposmap.
    Function words are only flagged with MSFFunc=Yes and left alone.
    """
    # Function words carry no phrase-level case of their own.
    if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']:
        node.misc['MSFFunc'] = 'Yes'
        return
    node.misc['MSFFunc'] = 'No'
    # Congruent attributes ('det', 'amod') only inherit Case from their head
    # noun by agreement, so no phrase case is computed for them.
    if node.udeprel in ['det', 'amod']:
        return

    def marker_lemmas(udeprel):
        """Lemmas of children with the given udeprel; a child with outgoing
        'fixed' relations is a multiword marker, so its fixed lemmas are
        appended with underscores."""
        lemmas = []
        for c in node.children:
            if c.udeprel == udeprel:
                lemma = c.lemma
                fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed']
                if fixedchildren:
                    lemma += '_' + '_'.join(fixedchildren)
                lemmas.append(lemma)
        return lemmas

    # We assume that all features were copied from FEATS to MISC in mwe.MsfInit.
    # They may have been further processed there, so we take the input from there.
    msfcase = node.misc['MSFCase']
    adpositions = marker_lemmas('case')
    if adpositions:
        caseadpostring = '_'.join(adpositions) + '+' + msfcase
        if caseadpostring in self.adposmap:
            msfcase = self.adposmap[caseadpostring]
        else:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(f"No Case value found for '{caseadpostring}'.")
            msfcase = caseadpostring
    # Cases from subordinators are collected, too, but only if we do not
    # already have a case from morphology or adpositions.
    if not msfcase:
        subordinators = marker_lemmas('mark')
        if subordinators:
            subordstring = '_'.join(subordinators)
            if subordstring in self.adposmap:
                msfcase = self.adposmap[subordstring]
    # To lump coordinators with all the above makes even less sense but for
    # the moment we do it.
    if not msfcase:
        coordinators = marker_lemmas('cc')
        if coordinators:
            coordstring = '_'.join(coordinators)
            if coordstring in self.adposmap:
                msfcase = self.adposmap[coordstring]
    node.misc['MSFCase'] = msfcase
def process_node(self, node):
    """
    If a finite verb carries subject-agreement features (MSFNumber, MSFPerson,
    MSFGender, MSFAnimacy, MSFPolite) but has no overt subject, create an
    abstract (empty) pronoun node as its 'nsubj' and move those features there.
    Not suitable for languages with polypersonal agreement.
    """
    agreement = ['Number', 'Person', 'Gender', 'Animacy', 'Polite']
    if node.misc['MSFVerbForm'] != 'Fin':
        return
    if not any(node.misc['MSF' + f] for f in agreement):
        return
    # Does the finite predicate already have a subject among its children?
    if not any(c.udeprel in ('nsubj', 'csubj') for c in node.children):
        # An abstract subject might already exist; it would only show up as an
        # enhanced dependency of this node.
        if not any(re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps):
            # No subject anywhere: build an abstract personal pronoun in the
            # nominative, carrying the verb's agreement features.
            subject = node.create_empty_child('nsubj')
            subject.upos = 'PRON'
            for feature, value in (('PronType', 'Prs'), ('Case', 'Nom')):
                subject.feats[feature] = value
                subject.misc['MSF' + feature] = value
            for f in agreement:
                value = node.misc['MSF' + f]
                if value:
                    subject.feats[f] = value
                    subject.misc['MSF' + f] = value
            subject.misc['MSFFunc'] = 'No'
    # Whether or not a subject existed, the agreement features do not belong
    # on the verb any longer.
    ###!!! We also may want to check if the pre-existing subject has all the features.
    for f in agreement:
        node.misc['MSF' + f] = ''
def process_node(self, node):
    """
    Copy the morphosyntactically interesting features from FEATS to MISC as
    MSF* attributes, guess a missing Case value where possible, and promote
    Phrase* features (periphrastic verb forms) to their MSF* counterparts.
    """
    # Certain features are irrelevant for the morphosyntactic annotation.
    uninteresting = ('Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant')
    for feature in node.feats:
        if feature not in uninteresting:
            node.misc['MSF' + feature] = node.feats[feature]
    # Some nominals lack Case (acronyms, digits). A single-word adposition
    # child may still reveal the expected case; multiword ('fixed')
    # adpositions are not helpful here.
    if not node.feats['Case']:
        adpositions = [c for c in node.children if c.udeprel == 'case' and c.upos == 'ADP']
        if len(adpositions) == 1:
            has_fixed = [c for c in adpositions[0].children if c.udeprel == 'fixed']
            if not has_fixed and adpositions[0].feats['Case']:
                node.misc['MSFCase'] = adpositions[0].feats['Case']
    # Next fallback: read the case off an agreeing adjectival modifier or determiner.
    if not node.misc['MSFCase']:
        agreeing = [c for c in node.children if c.udeprel in ['amod', 'det'] and c.feats['Case']]
        if agreeing:
            node.misc['MSFCase'] = agreeing[0].feats['Case']
    # Last resort: guess the case from the node's own deprel.
    if not node.misc['MSFCase']:
        if node.udeprel == 'nsubj':
            node.misc['MSFCase'] = 'Nom'
        elif node.udeprel == 'obj':
            node.misc['MSFCase'] = 'Acc'
    # Phrase* features in MISC (periphrastic verb forms detected by Lenka's
    # code) override the morphological MSF* features.
    for pf in [m for m in node.misc if re.match(r"^Phrase[A-Z]", m)]:
        if pf == 'PhraseForm':
            target = 'MSFVerbForm'
        else:
            target = re.sub(r"Phrase", 'MSF', pf)
        node.misc[target] = node.misc[pf]
        node.misc[pf] = ''
def process_node(self, node):
    """
    Fix the phrase case of counted-noun phrases like Czech 'pět mužů' (five men).

    A noun governed by a quantifier ('nummod:gov' / 'det:numgov') is
    morphologically genitive, but the whole phrase bears the quantifier's
    case (typically nominative or accusative), so MSFCase is taken from the
    quantifier instead.
    """
    quantifiers = [c for c in node.children if c.deprel in ['nummod:gov', 'det:numgov']]
    if not quantifiers:
        return
    if node.misc['MSFCase'] not in ('Gen', ''):
        return
    phrase_case = quantifiers[0].misc['MSFCase']
    # Quantifiers such as digits may lack Case; try to guess it.
    if phrase_case == '':
        # Only a single-word adposition (no 'fixed' children) is informative.
        adpositions = [c for c in node.children if c.udeprel == 'case' and c.upos == 'ADP']
        if len(adpositions) == 1:
            has_fixed = [c for c in adpositions[0].children if c.udeprel == 'fixed']
            if not has_fixed and adpositions[0].feats['Case']:
                phrase_case = adpositions[0].feats['Case']
    # Last resort: guess from the deprel of the noun itself.
    if phrase_case == '':
        if node.udeprel == 'nsubj':
            phrase_case = 'Nom'
        elif node.udeprel == 'obj':
            phrase_case = 'Acc'
    node.misc['MSFCase'] = phrase_case
def __init__(self, feature_prefix='CW', **kwargs):
    """
    Initialize the phrase-feature writer.

    Parameters:
    feature_prefix (string) - The prefix of phrase features (e.g. 'CW',
        'Phrase'); default is 'CW'.
        (The docstring previously claimed the default was 'CG', which
        contradicted the actual default value.)
    """
    super().__init__(**kwargs)
    self.feature_prefix = feature_prefix

    # Maps write_node_info() keyword names to MISC attribute names.
    # NOTE(review): 'analytic' deliberately maps to the unprefixed
    # 'Analytic' attribute — confirm this asymmetry is intended.
    self.dictionary = {
        'person': f'{feature_prefix}Person',
        'number': f'{feature_prefix}Number',
        'mood': f'{feature_prefix}Mood',
        'tense': f'{feature_prefix}Tense',
        'voice': f'{feature_prefix}Voice',
        'aspect': f'{feature_prefix}Aspect',
        'form': f'{feature_prefix}Form',
        'reflex': f'{feature_prefix}Reflex',
        'polarity': f'{feature_prefix}Polarity',
        'gender': f'{feature_prefix}Gender',
        'animacy': f'{feature_prefix}Animacy',
        'ords': feature_prefix,
        'expl': f'{feature_prefix}Expl',
        'analytic': 'Analytic',
    }

    # A dictionary where the key is the lemma of a negative particle and the
    # value is a list of the lemmas of their possible children that have a
    # 'fixed' relation. We do not want to include these negative particles
    # in the phrase; these are expressions like "never", "not only", etc.
    self.negation_fixed = {
        # Belarusian
        'ні': ['раз'],
        'ня': ['толькі'],

        # Upper Sorbian
        'nic': ['naposledku'],

        # Polish
        'nie': ['mało'],

        # Pomak
        'néma': ['kak'],

        # Slovenian
        'ne': ['le'],

        # Russian and Old East Slavic
        'не': ['то', 'токмо'],
        'ни': ['в', 'раз', 'шатко'],
        'нет': ['нет']
    }
def write_node_info(self, node,
                    tense=None,
                    person=None,
                    number=None,
                    mood=None,
                    voice=None,
                    form=None,
                    reflex=None,
                    polarity=None,
                    ords=None,
                    gender=None,
                    animacy=None,
                    aspect=None,
                    expl=None,
                    analytic=None):
    """
    Store the given grammatical categories of a (periphrastic) verb form in
    node.misc, using the attribute names from self.dictionary.

    Only categories that were explicitly passed (not None) are written; an
    empty string is a valid value and is written out. The previous
    implementation introspected locals(), which silently depends on the
    parameter list and on nothing else being defined first; an explicit
    mapping is robust and equally concise.
    """
    categories = {
        'tense': tense, 'person': person, 'number': number, 'mood': mood,
        'voice': voice, 'form': form, 'reflex': reflex, 'polarity': polarity,
        'ords': ords, 'gender': gender, 'animacy': animacy, 'aspect': aspect,
        'expl': expl, 'analytic': analytic,
    }
    for key, val in categories.items():
        if val is not None:
            node.misc[self.dictionary[key]] = val

def has_fixed_children(self, node):
    """
    Return True if the node has a 'fixed' child whose lemma, together with
    the node's own lemma, is listed in self.negation_fixed (i.e. the particle
    is part of a fixed expression like "never" and should not count as
    negation).
    """
    fixed_children = [x for x in node.children if x.udeprel == 'fixed']
    if fixed_children:
        # NOTE(review): only the first fixed child is inspected — confirm
        # multiword fixed expressions never need the rest.
        if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []):
            return True
    return False

def get_polarity(self, nodes):
    """
    Return 'Neg' if exactly one of the given nodes has Polarity=Neg.
    Return '' for zero or more than one such node (double negation cancels
    out and no PhrasePolarity attribute should be generated).
    """
    neg_count = sum(1 for node in nodes if node.feats['Polarity'] == 'Neg')
    return 'Neg' if neg_count == 1 else ''
def get_negative_particles(self, nodes):
    """
    Collect all negative particles found among the children of the given
    nodes, skipping particles that form a fixed expression listed in
    self.negation_fixed (those do not negate the phrase).
    """
    particles = []
    for parent in nodes:
        hits = [child for child in parent.children
                if child.upos == 'PART'
                and child.feats['Polarity'] == 'Neg'
                and child.udeprel == 'advmod'
                and not self.has_fixed_children(child)]
        if hits:
            particles += hits
    return particles


def get_is_reflex(self, node, refl):
    """Value for the Reflex phrase feature: 'Yes' for middle voice or when a
    reflexive expletive is present, otherwise the node's own Reflex value."""
    if node.feats['Voice'] == 'Mid':
        return 'Yes'
    if not refl:
        return node.feats['Reflex']
    return 'Yes'


def get_expl_type(self, node, refl):
    """Value for the Expl phrase feature derived from the reflexive
    expletive's deprel ('Pv' for plain expl, capitalized subtype otherwise)."""
    if node.feats['Voice'] == 'Mid':
        return 'Pv'
    if not refl:
        return ''
    deprel = refl[0].deprel
    if deprel == 'expl':
        return 'Pv'
    return deprel.split(':')[1].capitalize()


def is_expl_pass(self, refl):
    """True iff the first reflexive expletive marks a passive (expl:pass)."""
    if not refl:
        return False
    return refl[0].deprel == 'expl:pass'


def get_voice(self, node, refl):
    """The node's Voice, overridden to 'Pass' by a reflexive passive."""
    if self.is_expl_pass(refl):
        return 'Pass'
    return node.feats['Voice']


def get_analytic_bool(self, node):
    """'Yes' if the node has at least one auxiliary child, else 'No'."""
    has_aux = [child for child in node.children if child.udeprel == 'aux']
    return 'Yes' if has_aux else 'No'
def process_node(self, node):
    """
    Normalize Romance treebank annotation before phrase detection.

    - Porttinari: the negative adverb 'não' lacks Polarity=Neg; add it.
    - Negative adverbs annotated with PronType=Neg get Polarity=Neg instead.
    - Romanian RRT: Voice is not annotated; infer passive from an aux:pass child.
    """
    if node.upos == 'ADV' and node.lemma == 'não':
        node.feats['Polarity'] = 'Neg'

    if node.upos == 'ADV' and node.feats['PronType'] == 'Neg':
        node.feats['PronType'] = ''
        node.feats['Polarity'] = 'Neg'

    has_passive_aux = [child for child in node.children if child.deprel == 'aux:pass']
    if has_passive_aux:
        node.feats['Voice'] = 'Pass'
def process_node(self, node):
    """
    Identify the verb form (simple, periphrastic, copular, modal) headed by
    this node and dispatch to the matching handler, collecting the member
    words' ords and the phrase polarity on the way.
    """
    # Already annotated as part of another phrase.
    if node.misc[self.feature_prefix] != '':
        return

    def ords_and_polarity(groups, neg_nodes):
        # Sorted member ords of the phrase; negative adverbs are included
        # (and polarity set) only when negation processing is enabled.
        member_ords = [node.ord] + [m.ord for group in groups for m in group]
        polarity = ''
        if self.neg is True:
            member_ords += [n.ord for n in neg_nodes]
            if neg_nodes:
                polarity = 'Neg'
        member_ords.sort()
        return member_ords, polarity

    cop = [c for c in node.children if c.udeprel == 'cop']
    # Reflexive expletives: only expl / expl:pv, not expl:impers or expl:pass.
    refl = [c for c in node.children
            if (c.lemma == 'se' or c.lemma == 'soi') and c.upos == 'PRON'
            and c.udeprel == 'expl'
            and c.deprel != 'expl:impers' and c.deprel != 'expl:pass']
    expl = 'Pv' if refl else None

    if cop:
        # Separate plain auxiliaries, modal verbs and the modals' own
        # auxiliaries/negation among the children of the content word.
        auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node)
        adp = [c for c in node.children if c.upos == 'ADP']
        if modals:
            # Modals themselves are treated as separate verb forms.
            self.process_modal_verbs(modals, modal_auxes, modal_neg)
        if auxes:
            phrase_ords, polarity = ords_and_polarity([cop, auxes, refl, adp], neg)
            self.process_periphrastic_verb_forms(cop[0], auxes, expl, polarity, phrase_ords, node)
        else:
            # No auxiliaries, only the copula.
            phrase_ords, polarity = ords_and_polarity([cop, refl, adp], neg)
            self.process_copulas(node, cop, expl, polarity, phrase_ords)
        return

    if node.upos != 'VERB':  # TODO maybe add "or node.feats['VerbForm'] == 'Part'"?
        return

    auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node)
    aux_pass = [a for a in auxes if a.deprel == 'aux:pass']
    auxes_without_pass = [a for a in auxes if a.deprel != 'aux:pass']

    # An infinitive with its own subject acts as a (future) subjunctive.
    subj = [c for c in node.children if c.udeprel == 'subj']
    if node.feats['VerbForm'] == 'Inf' and subj:
        self.write_node_info(node,
                             person=node.feats['Person'],
                             number=node.feats['Number'],
                             mood='Sub',
                             form='Fin',
                             tense=Tense.FUT.value,
                             gender=node.feats['Gender'],
                             voice=node.feats['Voice'],
                             expl=expl,
                             analytic=self.get_analytic_bool(node),
                             ords=[node.ord])
        return

    if modals:
        # Modals themselves are treated as separate verb forms.
        self.process_modal_verbs(modals, modal_auxes, modal_neg)

    if not auxes:
        phrase_ords, polarity = ords_and_polarity([refl], neg)
        self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node)
        self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node)
    elif not aux_pass:
        # No passive auxiliaries.
        phrase_ords, polarity = ords_and_polarity([auxes, refl], neg)
        self.process_periphrastic_verb_forms(node, auxes, expl, polarity, phrase_ords, node)
    elif not auxes_without_pass:
        # Only a passive auxiliary and nothing else.
        phrase_ords, polarity = ords_and_polarity([auxes, refl], neg)
        # TODO phrase-level features are currently determined based on the first
        # passive auxiliary, but there could be more than one.
        self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node)
        self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node)
    else:
        # Passive auxiliary plus other auxiliaries.
        phrase_ords, polarity = ords_and_polarity([auxes, refl], neg)
        self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, expl, polarity, phrase_ords, node)
def find_auxes_and_neg(self, node):
    """
    Classify the auxiliary and negation children of *node*.

    Returns a 5-tuple:
    (auxiliaries of the node, negative adverbs of the node, modal verbs,
     auxiliaries of the modal verb, negative adverbs of the modal verb).
    Auxiliaries and negations encountered before a modal are assumed to
    modify the modal rather than the content verb.
    """
    plain_auxes = []
    plain_negs = []
    modals = []
    modal_auxes = []
    modal_negs = []

    for child in node.children:
        if child.udeprel == 'aux':
            if child.lemma in MODALS:
                modals.append(child)
                # Reassign everything collected so far to the modal.
                modal_auxes, plain_auxes = plain_auxes, []
                modal_negs, plain_negs = plain_negs, []
            else:
                plain_auxes.append(child)
        elif child.upos == 'ADV' and child.feats['Polarity'] == 'Neg':
            plain_negs.append(child)

    return plain_auxes, plain_negs, modals, modal_auxes, modal_negs


def process_modal_verbs(self, modals, modal_auxes, modal_neg):
    """
    Annotate a modal verb form with the Phrase* attributes.

    The modal (with its own auxiliaries and negation) is kept as a single
    verb form, without the infinitive of the content word. Currently only the
    first modal in *modals* is considered.
    """
    if not modal_auxes:
        polarity = ''
        if self.neg is True:
            phrase_ords = sorted([modals[0].ord] + [n.ord for n in modal_neg])
            if modal_neg:
                polarity = 'Neg'
        else:
            phrase_ords = [modals[0].ord]
        self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0])
        self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0])
    else:
        polarity = ''
        if self.neg is True:
            phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + [n.ord for n in modal_neg]
            if modal_neg:
                polarity = 'Neg'
        else:
            phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes]
        phrase_ords.sort()
        self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0])
def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node):
    """
    Annotate periphrastic constructions headed by 'ir'/'aller'/'estar'/'ter'
    with an xcomp content verb (e.g. European Portuguese 'estar a' +
    infinitive, intentional futures with 'ir'). Writes the Phrase* attributes
    on the last xcomp and marks the auxiliary-like head with PeriAux=Yes.
    """
    aspect = ''
    tense = node.feats['Tense']

    # Phrase already annotated.
    if head_node.misc[self.feature_prefix] != '':
        return

    xcomps = [c for c in node.children if c.udeprel == 'xcomp']
    if not (xcomps and node.upos == 'VERB' and node.lemma in ['ir', 'aller', 'estar', 'ter']):
        return
    node.misc['PeriAux'] = 'Yes'

    voice = node.feats['Voice']
    auxes = [c for c in xcomps[0].children if c.udeprel == 'aux']
    aux_pass = [a for a in auxes if a.deprel == 'aux:pass']
    auxes_without_pass = [a for a in auxes if a.deprel != 'aux:pass']

    if node.lemma == 'estar':
        # European Portuguese: estar + a + Inf (progressive periphrases).
        tense, aspect = {
            'Pres': (Tense.PRES.value, Aspect.PROG.value),
            'Imp': (Tense.PAST.value, Aspect.IMPPROG.value),
            'Past': (Tense.PAST.value, Aspect.PERFPROG.value),
            'Fut': (Tense.FUT.value, Aspect.PROG.value),
        }.get(node.feats['Tense'], (tense, aspect))
    elif node.lemma == 'ter' and len(xcomps) > 1:
        tense = Tense.PAST.value
        aspect = Aspect.PROG.value
        xcomps[0].misc['PeriAux'] = 'Yes'
    elif node.feats['Tense'] == 'Pres':
        tense = Tense.FUT.value
    elif node.feats['Tense'] == 'Imp':
        tense = Tense.PASTFUT.value
        aspect = Aspect.IMP.value
    elif node.feats['Tense'] == 'Fut':
        tense = Tense.FUTFUT.value
    elif node.feats['Tense'] == 'Past':
        tense = Tense.PASTFUT.value
        aspect = Aspect.PERF.value

    if auxes_without_pass:
        # NOTE(review): inspects auxes[0], not auxes_without_pass[0] — kept
        # as in the original; confirm this is intended.
        if auxes[0].lemma == 'estar':
            aspect += 'Prog'
        if auxes[0].lemma == 'haber':
            aspect += 'Perf'

    adp_a = [c for c in xcomps[-1].children if c.lemma == 'a' and c.udeprel == 'mark']
    cop = [c for c in xcomps[0].children if c.udeprel == 'cop']
    member_ords = [node.ord] + [c.ord for c in xcomps] + [c.ord for c in auxes] + [c.ord for c in cop]
    if adp_a:
        member_ords += [c.ord for c in adp_a]
    if aux_pass:
        voice = 'Pass'
    member_ords.sort()

    self.write_node_info(xcomps[-1],
                         tense=tense,
                         number=node.feats['Number'],
                         person=node.feats['Person'],
                         aspect=aspect,
                         mood=node.feats['Mood'],
                         form='Fin',
                         voice=voice,
                         expl=expl,
                         polarity=polarity,
                         analytic='Yes',
                         ords=member_ords)


def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node):
    """
    Annotate simple (synthetic) verb forms, or passives consisting only of a
    passive auxiliary.

    Parameters
    node: the relevant node — the head verb, or the passive auxiliary when
        the head verb is passive.
    expl: value for the PhraseExpl attribute.
    polarity: value for the PhrasePolarity attribute.
    phrase_ords: ords of all member words of the verb form.
    head_node: the node that receives the Phrase* attributes.
    """
    # Already consumed as an auxiliary-like head of a periphrastic phrase.
    if node.misc['PeriAux'] != '':
        return

    aspect = ''
    tense = node.feats['Tense']
    form = node.feats['VerbForm']
    mood = node.feats['Mood']

    if mood == 'Ind':
        if node.feats['Tense'] == 'Imp':
            # Pt pretérito imperfeito / Es pretérito imperfecto / It imperfetto.
            tense = Tense.PAST.value
            aspect = Aspect.IMP.value
        elif node.feats['Tense'] == 'Past':
            # Pt pretérito perfeito / Es pretérito perfecto / It passato remoto.
            aspect = Aspect.PERF.value
        elif node.feats['Tense'] == 'Pqp':
            # Pt pretérito mais-que-perfeito simples.
            tense = Tense.PAST.value
            aspect = Aspect.PQP.value
        else:
            # Romanian colloquial future ('viitorul popular'): o + să + verb.
            part_o = [c for c in node.children if c.lemma == 'o' and c.upos == 'PART']
            part_sa = [c for c in node.children if c.lemma == 'să' and c.upos == 'PART']
            if part_o and part_sa:
                tense = Tense.FUT.value
                phrase_ords.append(part_o[0].ord)
                phrase_ords.append(part_sa[0].ord)
                phrase_ords.sort()

    if mood == 'Sub':
        if node.feats['Tense'] == 'Past':
            aspect = Aspect.IMP.value
        if node.feats['Tense'] == 'Imp':
            # Subjunctive imperfect (Pt/Es/It).
            tense = Tense.PAST.value
            aspect = Aspect.IMP.value

    if mood == 'Cnd':
        # Conditional present (Pt futuro do pretérito / Es pospretérito / It
        # condizionale presente).
        aspect = ''
        tense = Tense.PRES.value

    # A participle introduced by 'en' behaves as a gerund.
    adp_en = [c for c in head_node.children if c.upos == 'ADP' and c.lemma == 'en' and c.udeprel == 'mark']
    if node.feats['VerbForm'] == 'Part' and adp_en:
        phrase_ords.append(adp_en[0].ord)
        phrase_ords.sort()
        form = 'Ger'

    self.write_node_info(head_node,
                         person=node.feats['Person'],
                         aspect=aspect,
                         number=node.feats['Number'],
                         mood=mood,
                         form=form,
                         tense=tense,
                         gender=head_node.feats['Gender'],
                         voice=head_node.feats['Voice'],
                         expl=expl,
                         polarity=polarity,
                         analytic=self.get_analytic_bool(head_node),
                         ords=phrase_ords)
+ """ + + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + if len(auxes) == 1: + # Cnd + if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'): + + # Portuguese + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Portuguese + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # Antepospretérito -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale passato -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + polarity=polarity, + voice=head_node.feats['Voice'], + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + 
tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if node.feats['VerbForm'] == 'Ger': + + # Portuguese + Spanish + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Portuguese + Spanish + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # Portuguese + Spanish + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Auxiliary 'ter' / 'haber' / 'avere' / 'essere' followed by a participle + if node.feats['VerbForm'] == 'Part': + + # Portuguese + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + + # Spanish + # Futuro compuesto antefuturo -> PhraseTense=Fut, PhraseAspect=Perf + + # Italian + # Futuro anteriore -> 
PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + form='Fin' + mood=auxes[0].feats['Mood'] + + adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark'] + if auxes[0].feats['VerbForm'] == 'Part' and adp_en: + tense=Tense.PAST.value + aspect='' + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form='Ger' + + + # Romanian + # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].lemma == 'avea': + tense = Tense.PAST.value + aspect = Aspect.PERF.value + form = 'Fin' + + # Spanish + # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Pres': + + # Portuguese + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub': + tense = Tense.PASTPRES.value + + # subjonctive mood not annotated in Romanian data + if auxes[0].lemma == 'fi': + mood='Sub' + else: + tense=Tense.PAST.value + + # Portuguese + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + + # Spanish + # pretérito pluscuamperfecto -> PhraseTense=Past, PhraseAspect=Pqp + + # Italian + # Trapassato prossimo -> PhraseTense=Past, PhraseAspect=Pqp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # Spanish + # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant + + # Italian + # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant + + # French + # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant + elif auxes[0].feats['Tense'] 
== 'Past': + tense=Tense.PAST.value + aspect = Aspect.ANT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=mood, + aspect=aspect, + form=form, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # auxiliary 'ir' or 'vrea' followed by infinitive + if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf': + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].lemma == 'vrea': + tense = Tense.FUT.value + aspect = '' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect='' + if auxes[0].lemma == 'avea' and node.feats['VerbForm'] == 'Inf': + tense=Tense.PRES.value + aspect='' + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + 
polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect='' + # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect='' + if node.feats['VerbForm'] == 'Fin': + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + if sa: + phrase_ords.append(sa[0].ord) + phrase_ords.sort() + + tense=Tense.FUT.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=head_node.feats['Number'], + person=head_node.feats['Person'], + mood=head_node.feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + elif len(auxes) == 2: + # Romanian + # viitor anterior -> PhraseTense=Fut, PhraseAsoect=Perf + if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PERF.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ perfect -> PhraseTense=Past + if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + aspect='', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Portuguese + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] 
== 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar + if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter cond + estar pp + gerund -> PhraseTense=Past, 
PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + aspect=Aspect.PQPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, expl, polarity, phrase_ords): + """ + Annotate non-verbal predicates with copula using the Phrase* attributes. + + This method is specialized for non-periphrastic copulas. + If any auxiliaries are present, process_periphrastic_verb_forms() is called instead. + + Parameters + node (udapi.core.node.Node): The non-verbal predicate that should receive the Phrase* attributes, i.e., the head of the phrase. + cop (list[udapi.core.node.Node]): The copula nodes. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. 
+ """ + + # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb) + self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node) + + # adjust PhraseAspect based on the lemma of the copula + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + node.misc['PeriAspect'] = Aspect.PERF.value + elif cop[0].lemma == 'estar': + node.misc['PeriAspect'] = Aspect.IMP.value \ No newline at end of file diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..9d15418f --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,97 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' 
in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + person = '3' + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + for cop_verb in cop: + if cop_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=person, + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..32714630 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,94 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl), + analytic=self.get_analytic_bool(node) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass', + analytic=self.get_analytic_bool(node) + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = 
self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + tense=copVerb.feats['Tense'], + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + form='Conv', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + voice=self.get_voice(copVerb, refl), + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py new file mode 100644 index 00000000..9cc17717 --- /dev/null +++ b/udapi/block/msf/slavic/future.py @@ -0,0 +1,207 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects future tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Future(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # future tense for Serbian and Croatian + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')] + if node.upos != 'AUX' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice + cop = [x for x in node.children if x.deprel == 'cop'] + + phrase_nodes = [node] + refl + aux_other + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + if not cop: + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema + form='Fin', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + else: + prep = [x for x in node.children if x.upos == 'ADP'] + phrase_nodes += prep + phrase_ords += [x.ord for x in prep] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + return + + # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense + # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + + if node.feats['Tense'] == 'Pres' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # future tense of perfect verbs + # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data + # in some languages ​​(e.g. in Russian) these verbs have the Tense Fut, in others (e.g. 
in Czech) they have the Tense Pres + if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + form='Fin', + aspect='Perf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + # future tense of imperfect verbs and passive voice + # in some languages ​​the verb is in the infinitive, in some it is in the l-participle + # the condition node.upos == 'ADJ' is due to the passive voice - the n-participle is marked as ADJ, but the auxiliary verb is not cop, but aux + if node.upos == 'VERB' or node.upos == 'ADJ': + + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut'] + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + if aux: + auxVerb = aux[0] + self.write_node_info(node, + tense='Fut', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # simple future tense - e.g. 
in Serbian, the future tense can be formed by combining a verb with a full meaning and an auxiliary verb into one word, i.e. without an auxiliary verb + # or verbs like pojede, půjdeme... in Czech + + if not aux and node.feats['Tense'] == 'Fut': + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut'] + if cop: + copVerb = cop[0] + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Fut', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py new file mode 100644 index 00000000..5a30d05e --- /dev/null +++ b/udapi/block/msf/slavic/imperative.py @@ -0,0 +1,89 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects imperative verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in 
phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Imp', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py new file mode 100644 index 00000000..83bc0766 --- /dev/null +++ b/udapi/block/msf/slavic/infinitive.py @@ -0,0 +1,107 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects infinitive verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Infinitive(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB': + aux = [x for x in node.children if x.udeprel == 'aux'] + if not aux: # the list of auxiliary list must be empty - we don't want to mark infinitives which are part of any other phrase (for example the infinititive is part of the future tense in Czech) + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes == neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 
'Inf'] + if aux and not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Pass', + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + number=node.feats['Number'], + analytic=self.get_analytic_bool(node) + ) + return + + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + if cop and not aux_forb: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + voice=self.get_voice(cop[0], refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + # there is a rare verb form called supine in Slovenian, it is used instead of infinitive as the argument of motion verbs + if node.feats['VerbForm'] == 'Sup': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Act', + form='Sup', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py new file mode 100644 index 00000000..130d972d --- /dev/null +++ b/udapi/block/msf/slavic/past.py @@ -0,0 +1,212 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects past tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Past(udapi.block.msf.phrase.Phrase): + + def get_person_for_langs_with_simple_past(self, node, person): + """ + returns the person which is known from subject, languages with the simple past tense (e. g. Russian) do not express person in these verb forms + if the person was not taken from the subject, the third person would be filled in automatically due to languages ​​with a compound past but simple forms for the third person (e. g. Czech) + """ + subj = [x for x in node.children if x.udeprel == 'nsubj'] + if subj: + subj = subj[0] + if subj.feats['Person'] != '': + person = subj.feats['Person'] + return person + + def process_node(self, node): + + past_tenses = ['Past', 'Imp', 'Pqp'] + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)] + + # there is person 0 in Polish and Ukrainian which is for impersonal statements + # in Polish, verbs with Person=0 have also Tense=Past, in Ukrainian the tense is not specified + if node.feats['Person'] == '0': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood=node.feats['Mood'], + voice='Act', #In Polish, impersonal statements 
are annotated with Voice=Act. In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + # compound past tense + if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in ['Pres', '']] + aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + aux_pqp + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux: + person = aux[0].feats['Person'] + + elif not aux: + person = '3' + + if aux_pqp: + person = aux_pqp[0].feats['Person'] + + # in Slovenian, the participles are not annotated as Tense='Past', the Tense feature is missing here + # but in Bulgarian, there are cases where the participles are annotated as Tense='Imp' + tense = 'Past' + if node.feats['Tense'] == 'Imp': + tense = 'Imp' + if node.feats['Tense'] == 'Pqp': + tense = 'Pqp' + + self.write_node_info(node, + tense=tense, + person=person, + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + 
gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + # the past tense of some Slavic languages ​​is formed only by a verb without an auxiliary verb (e.g. Polish) + # or imperfect (special case of the past tense) e.g. in Bulgarian or Croatian + elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv': + + # the past tense is formed only by a content verb, not with an auxiliary + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + + if not aux_forb: + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + + # passive + elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux_past_tense: + aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. 
the auxiliary 'jsem' in the phrase 'byl jsem přinucen' + + phrase_nodes = [node] + aux_past_tense + aux_pres_tense + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_pres_tense: + person = aux_pres_tense[0].feats['Person'] + person = self.get_person_for_langs_with_simple_past(node, person) + + self.write_node_info(node, + tense=aux_past_tense[0].feats['Tense'], + person=person, + number=aux_past_tense[0].feats['Number'], + mood='Ind', + voice='Pass', + form='Fin', + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + else: + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if cop and not aux_cnd: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux_past_tense + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_past_tense: + person = aux_past_tense[0].feats['Person'] + + # In ru, be, uk, the person is not expressed in past tense and the verbform is Fin, not Part + if cop[0].feats['VerbForm'] == 'Fin': + person = '' + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + tense=cop[0].feats['Tense'], + person=person, + number=cop[0].feats['Number'], + mood='Ind', + voice=self.get_voice(cop[0], refl), + form='Fin', + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + 
gender=cop[0].feats['Gender'], + animacy=cop[0].feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py new file mode 100644 index 00000000..0672812b --- /dev/null +++ b/udapi/block/msf/slavic/preprocessor.py @@ -0,0 +1,83 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block serves as a preprocessor for Slavic languages before the other blocks +are applied to detect periphrastic verb forms. It improves harmonization of +annotations across the treebanks by addressing some known divergences. +""" + +from udapi.core.block import Block + +class Preprocessor(Block): + + def process_node(self,node): + + # in Ukrainian the active verb forms are not marked as PhraseVoice=Act + if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '': + node.feats['Voice'] = 'Act' + + # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ + # we change the UPOS to ADJ when a participle expresses case + #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + # node.upos = 'ADJ' + + # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the last Slavic languages ​​feats['Mood'] == 'Cnd' + if node.deprel == 'aux:cnd': + node.feats['Mood'] = 'Cnd' + + # unify polarities - some languages ​​mark only Neg (Russian), some mark both Neg and Pos (Czech) + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + + # In Ukrainian, there is no explicit annotation of reflexive verbs + # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly + # We add the feature Voice=Mid to reflexive verbs + if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')): + node.feats['Voice'] = 'Mid' + + 
# makedonstina tvori budouci cas pomoci pomocneho slova ќе, u nejz neni nijak vyznaceno, ze se podili na tvorbe budouciho casu + # stejne tak bulharstina pomoci pomocneho slova ще + # makedonstina a bulharstina + if node.feats['Tense'] == 'Pres': + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + if len(aux) == 1: + aux[0].feats['Tense'] = 'Fut' + + # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender + if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc': + subj = [x for x in node.children if x.udeprel == 'nsubj'] + + # for relative pronouns, only one gender is indicated + if len(subj) == 1: + conj = [x for x in subj[0].children if x.deprel == 'conj'] + if len(conj) == 0: + node.feats['Gender'] = subj[0].feats['Gender'] + node.feats['Number'] = subj[0].feats['Number'] + + # participles in passive are sometimes annotated as VERB, sometimes as ADJ + #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + # node.upos = 'ADJ' + + # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. 
Macedonian treebank) + # we add the Reflex=Yes feature + if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers': + node.feats['Reflex'] = 'Yes' + + # fixing the mistake in Macedonian treebank (mk_mtb-ud-test.conllu), in sent_id=other0010, there is personal pronoun 'ми' marked as expl:pv, it should be iobj + if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs': + node.deprel = '' + node.udeprel = 'iobj' + + # in Old Church Slavonic, there is feature Mood=Sub, but this is a notation for conditional mood + if node.feats['Mood'] == 'Sub': + node.feats['Mood'] = 'Cnd' + + # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation + if node.feats['VerbForm'] == 'Inf': + node.feats['Tense'] = '' + + # in the russian Syntagrus corpus, the negative particles have no Polarity=Neg feature + if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod': + node.feats['Polarity'] = 'Neg' + + # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech + # but there are many verbs with no Aspect value, so the problem is still there diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py new file mode 100644 index 00000000..7521a08d --- /dev/null +++ b/udapi/block/msf/slavic/present.py @@ -0,0 +1,132 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects present tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if 
x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/mwe/normalize.py b/udapi/block/mwe/normalize.py new file mode 100644 index 00000000..e7ebf24f --- /dev/null +++ b/udapi/block/mwe/normalize.py @@ -0,0 +1,68 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and normalizes it so that the type is always annotated at the first word of + the expression.""" +from udapi.core.block import Block +import logging +import re + +class Normalize(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. 
+ # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves them back but makes sure that the type is annotated at the + first word of the expression (as opposed to the syntactic head or to + any other word). + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for n in nodes: + # Erase the previous MWE annotations so we can start from scratch. + n.misc['Mwe'] = '' + # There may be multiple MWEs this node is member of. + annotations = [] + for m in mwes_by_nodes[n.ord]: + if n.ord == mwes[m]['nodes'][0]: + annotations.append("%s:%s" % (m, mwes[m]['type'])) + else: + annotations.append(m) + if annotations: + n.misc['Mwe'] = ';'.join(annotations) diff --git a/udapi/block/mwe/possessives.py b/udapi/block/mwe/possessives.py new file mode 100644 index 00000000..0849a210 --- /dev/null +++ b/udapi/block/mwe/possessives.py @@ -0,0 +1,74 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC, + looks for dependent possessive pronouns and reports how they are treated.""" +from udapi.core.block import Block +import logging +import re + +class Possessives(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. 
+ The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then surveys the possessive pronouns. + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for m in mwes: + mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]] + mweheads = [x for x in mwenodes if not x.parent in mwenodes] + mwedescendantset = set() + for x in mweheads: + mwedescendantset = mwedescendantset.union(set(x.descendants)) + mwedescendants = list(sorted(mwedescendantset)) + # Is there a possessive pronoun? 
+ possprons = [x for x in mwedescendants if x.upos == 'PRON' and x.feats['Poss'] == 'Yes'] + inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]] + outpp = [x for x in possprons if not m in mwes_by_nodes[x.ord]] + observation = '' + if inpp and outpp: + observation = 'both' + elif inpp: + observation = 'in' + elif outpp: + observation = 'out' + if observation: + expression = ' '.join([x.form if m in mwes_by_nodes[x.ord] else '('+x.form+')' for x in mwedescendants]) + print(observation + ': ' + expression) diff --git a/udapi/block/mwe/tosubdeprels.py b/udapi/block/mwe/tosubdeprels.py new file mode 100644 index 00000000..3682c0c7 --- /dev/null +++ b/udapi/block/mwe/tosubdeprels.py @@ -0,0 +1,62 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and projects it to subtypes of dependency relation labels. The motivation is + that a parser could learn to predict the multiword expressions.""" +from udapi.core.block import Block +import logging +import re + +class ToSubDeprels(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. 
+ # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves the type of the MWE as a subtype of the deprels inside. + """ + nodes = root.descendants + (mwes, mwes_by_nodes) = self.collect_mwes(root) + # Now we hopefully know the type of every multiword expression in the sentence. + for n in nodes: + if mwes_by_nodes[n.ord]: + for m in mwes_by_nodes[n.ord]: + type = re.sub(r"\.", '', mwes[m]['type'].lower()) + # Add the MWE type to the DEPREL if the parent is also in the same MWE. + if n.parent.ord > 0 and m in mwes_by_nodes[n.parent.ord]: + n.deprel += ':' + type diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py new file mode 100644 index 00000000..4f5fc877 --- /dev/null +++ b/udapi/block/read/addbratann.py @@ -0,0 +1,230 @@ +"""Add Brat coreference annotation from *.ann files. + +So far, tested on French LitBank data only. 
+ +T12 HIST 362 366 qui +T13 HIST 349 362 une aventure +R1431 Coreference Arg1:T12 Arg2:T13 + +""" + +from udapi.core.block import Block +from udapi.core.files import Files +import logging +from bisect import bisect_left +import networkx as nx + +def _m(range_s, range_e, offset): + return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" + +class AddBratAnn(Block): + + def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True, + coref_attr="R", no_type_value='_Unsorted_', + **kwargs): + """Args: + files: file names with the coreference annotations (*.ann) + offset: what number to substract from the chatacter indices in the ann files + detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset + """ + super().__init__(**kwargs) + self.zone = zone + self.files = Files(filenames=files) + self.offset = offset + self.detect_bom = detect_bom + self.keep_mention_id = keep_mention_id + self.coref_attr = coref_attr + self.no_type_value = no_type_value + + def process_document(self, document): + + # Read all the important info from the *.ann file. + mentions, attrs, split_ante, clusters = {}, [], [], [] + ann_filehandle = self.files.next_filehandle() + offset = self.offset + if self.detect_bom: + txt_filename = self.files.filename.replace("ann", "txt") + with open(txt_filename, 'rb') as txt_fh: + raw_bytes = txt_fh.read(3) + if raw_bytes == b'\xef\xbb\xbf': + offset += 1 + + for line in ann_filehandle: + line = line.rstrip('\n') + if not "\t" in line: + logging.warning(f"Unexpected line without tabs: {line}") + elif line.startswith("T"): + # T13 HIST 349 362 une aventure + try: + mention_id, type_and_range, form = line.split("\t") + # Usually range are two numbers, but can be more, e.g. 
type_and_range="Abstract 605 653;654 703" + # Let's take the first and last number only.´ + parts = type_and_range.split() + ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1]) + + # If form ends with spaces, remove them and adjust range_e + stripped_form = form.rstrip(" ") + if form != stripped_form: + num_spaces = len(form) - len(stripped_form) + logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})") + form = stripped_form + range_e = range_e - num_spaces + + + mentions[mention_id] = [ne_type, range_s, range_e, form] + if self.keep_mention_id: + attrs.append(["mention_id", mention_id, mention_id]) + except Exception as e: + logging.warning(f"Unexpected mention line: {line}\n{e}") + elif line.startswith(self.coref_attr): + try: + cor_attr, mention_ids = line.rstrip().split("\t") + parts = mention_ids.split() + assert(parts[0] == "Coreference") + except Exception as e: + logging.warning(f"Unexpected coref line: '{line}'\n{e}") + clusters.append([p.split(":")[1] for p in parts[1:]]) + elif line.startswith("#"): + pass # Let's ignore annotators' comments + else: + logging.warning(f"Unexpected line in {self.files.filename}:\n{line}") + + # Some Brat ann files use link-based representation, e.g. + # R123 Coreference Arg1:T11 Arg2:T13 + # R124 Coreference Arg1:T12 Arg2:T14 + # R125 Coreference Arg1:T13 Arg2:T14 + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). + # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]] + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]] + # Note that if creating entities for link, in their original order, + # R123 and R125 would result in creating two entities and when hitting R125 + # we would need to merge them, i.e. delete one of them and move their mentions to the other. 
+ # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components. + coref_graph = nx.Graph() + for mention_ids in clusters: + coref_graph.add_node(mention_ids[0]) + for mention_id in mention_ids[1:]: + coref_graph.add_node(mention_id) + coref_graph.add_edge(mention_id, mention_ids[0]) + clusters = [list(component) for component in nx.connected_components(coref_graph)] + + # Create entity objects for non-singletons. + entity_map = {} + for mention_ids in clusters: + etype, etype_index = None, 0 + for index, m_id in enumerate(mention_ids): + if mentions[m_id][0] == self.no_type_value: + pass + elif etype is None: + etype, etype_index = mentions[m_id][0], index + elif etype != mentions[m_id][0]: + logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.") + if etype is None: + etype = "other" + entity = document.create_coref_entity(etype=etype) + for m_id in mention_ids: + if m_id in entity_map: + logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}") + else: + entity_map[m_id] = entity + + # Collect TokenRange (as pre-filled by UDPipe) for each token. + tokens, starts, ends = [], [], [] + for tree in document.trees: + for token in tree.token_descendants: + tokens.append(token) + range_s, range_e = token.misc["TokenRange"].split(":") + starts.append(int(range_s)) + ends.append(int(range_e)) + + # Create mention objects. + mention_map = {} + for mention_id, mention_values in mentions.items(): + + # Find Udapi tokens for each mention. 
+ ne_type, range_s, range_e, form = mention_values + index_s = bisect_left(starts, range_s - offset) + if starts[index_s] != range_s - offset and index_s > 0: + index_s -= 1 + index_e = bisect_left(ends, range_e - offset) + mtokens = tokens[index_s : index_e+1] + token_s, token_e = tokens[index_s], tokens[index_e] + + # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries. + # If the start token is a multi-word token (MWT), + # we can still try to find the proper word within the MWT. + ok_s, ok_e = True, True + if starts[index_s] != range_s - offset: + ok_s = False + if token_s.is_mwt(): + mtokens.pop(0) + first_form = form.split()[0] + new_start = ends[index_s] + for w in reversed(token_s.words): + mtokens = [w] + mtokens + new_start -= len(w.form) + if w.form == first_form or new_start < range_s - offset: + ok_s = True + break + + # similarly for the end token + if ends[index_e] != range_e - offset: + ok_e = False + if token_e.is_mwt(): + mtokens.pop() + last_form = form.split()[-1] + new_end = starts[index_e] + for w in token_e.words: + mtokens.append(w) + new_end += len(w.form) + if w.form == last_form or new_end > range_e - offset: + ok_e = True + break + + if not ok_s or not ok_e: + logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})" + f" crosses token boundaries: {token_s.misc} ({token_s.form}) " + f".. {token_e.misc} ({token_e.form})") + + # Project tokens (including MWTs) to words and check forms match. + words, udapi_form = [], "" + for token in mtokens: + words += token.words + udapi_form += token.form + if not token.no_space_after: + udapi_form += " " + udapi_form = udapi_form.rstrip() + if form != udapi_form: + logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'") + + # Make sure all words of the mention are in the same sentence. 
+ root = words[0].root + mwords = [words[0]] + for word in words[1:]: + if word.root is root: + mwords.append(word) + else: + logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.") + + # Create entities for singletons + if mention_id not in entity_map: + entity_map[mention_id] = document.create_coref_entity(etype=ne_type) + + # Create the Udapi mention object + mention = entity_map[mention_id].create_mention(words=mwords) + mention_map[mention_id] = mention + + # Fill-in the additional mention attributes. + for attr_name, mention_id, attr_value in attrs: + if mention_id in mention_map: + mention_map[mention_id].other[attr_name] = attr_value + + # Fill-in split antecedents + for arg1, arg2 in split_ante: + if arg1 in entity_map and arg2 in entity_map: + if entity_map[arg1] in entity_map[arg2].split_ante: + logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})") + else: + entity_map[arg2].split_ante.append(entity_map[arg1]) + else: + logging.warning(f"{arg1} or {arg2} not indexed in entity_map") diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py new file mode 100644 index 00000000..4d0b7771 --- /dev/null +++ b/udapi/block/read/addtext.py @@ -0,0 +1,59 @@ +"""read.AddText is a reader for adding word-wrapped plain-text to existing trees.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +import logging + +class AddText(BaseReader): + r"""A reader for plain-text files to be stored to existing trees. + + For example LitBank conll files are segmented to sentences and tokenized, + but the SpacesAfter attributes are missing. We need to load the original + (raw) texts, which are not tokenized and not segmented, only word-wrapped + (to 70 characters per line). 
+ + Args: + add_newpar: add newpar CoNLL-U annotations on empty lines (and the beginning of file) + """ + def __init__(self, zone='', add_newpar=True, **kwargs): + super().__init__(zone=zone, **kwargs) + self.add_newpar = add_newpar + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def process_document(self, document): + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return + text = ''.join(self.filehandle.readlines()) + i, end, was_newpar = 0, len(text)-1, True + while i <= end and text[i].isspace(): + i += 1 + + for bundle in document.bundles: + root = bundle.get_tree(zone=self.zone) + if self.add_newpar and was_newpar: + root.newpar = True + was_newpar = False + for node in root.token_descendants: + if text[i:i+len(node.form)] == node.form: + i += len(node.form) + if i > end or text[i].isspace(): + del node.misc['SpaceAfter'] + was_newpar = i+1 < end and text[i+1] == '\n' and text[i] == '\n' + while i <= end and text[i].isspace(): + i += 1 + else: + node.misc['SpaceAfter'] = 'No' + was_newpar = False + else: + logging.warning('Node %s does not match text "%s"', node, text[i:i+20]) + return + root.text = root.compute_text() + self.finished = not self.files.has_next_file() diff --git a/udapi/block/read/ccv.py b/udapi/block/read/ccv.py new file mode 100644 index 00000000..eb449362 --- /dev/null +++ b/udapi/block/read/ccv.py @@ -0,0 +1,78 @@ +"""Ccv class is a reader for Corpus of Czech Verse json files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText +import json + +class Ccv(BaseReader): + r"""A reader for Corpus of Czech Verse json files. + + See https://github.com/versotym/corpusCzechVerse + Each verse (line) is stored as one tree (although it is quite often not a whole sentence). 
+ Start of each stanza is marked with `newpar`. + Start of each poem is marked with `newdoc = [poem_id]`. + + Args: + tokenize: create nodes + """ + def __init__(self, tokenize=True, **kwargs): + self.tokenize = tokenize + self._cache = None + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self): + if self._cache: + return self._cache.pop() + else: + trees = self.read_trees() + if not trees: + return None + self._cache = list(reversed(trees[1:])) + return trees[0] + + def read_trees(self): + if self.filehandle is None: + return None + poems = json.load(self.filehandle) + all_trees = [] + for poem in poems: + poem_trees = [] + for stanza in poem["body"]: + stanza_trees = [] + for line in stanza: + root = Root() + root.text = line["text"] + root.json["rhyme"] = line["rhyme"] + root.json["metre"] = line["metre"] + root.json["stress"] = line["stress"] + stanza_trees.append(root) + if self.tokenize: + words = [[]] + [[w] for w in line["words"]] + for index, puncts in line["punct"].items(): + for punct in puncts: + words[int(index)].append({"token": punct, "lemma": punct}) + for word in words: + for w in word: + node = root.create_child(form=w["token"], lemma=w["lemma"]) + if "morph" in w: + node.xpos = w["morph"] + node.misc["xsampa"] = w["xsampa"] + node.misc["phoebe"] = w["phoebe"] + SetSpaceAfterFromText.process_tree(None, root) + stanza_trees[0].newpar = True + poem_trees.extend(stanza_trees) + root = poem_trees[0] + root.newdoc = poem["poem_id"] + root.json["p_author"] = poem["p_author"] + root.json["b_author"] = poem["b_author"] + root.json["biblio"] = poem["biblio"] + all_trees.extend(poem_trees) + return all_trees diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py index f64cd9ff..d0aef1ee 100644 --- a/udapi/block/read/conll.py +++ b/udapi/block/read/conll.py @@ -79,22 +79,24 
@@ def parse_node_line(self, line, root, nodes, parents, mwts): # but it allows for arbitrary columns node = root.create_child() for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] if attribute_name == 'head': try: - parents.append(int(fields[n_attribute])) + parents.append(int(value)) except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': + if not self.strict and value == '_': if self.empty_parent == 'warn': logging.warning("Empty parent/head index in '%s'", line) parents.append(0) else: raise exception elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_' and fields[n_attribute] != '_': - setattr(node, attribute_name, fields[n_attribute]) + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, value) nodes.append(node) @@ -134,11 +136,10 @@ def read_tree_from_lines(self, lines): if node is parent: if self.fix_cycles: logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py new file mode 100644 index 00000000..2adbd00f --- /dev/null +++ b/udapi/block/read/conll2012.py @@ -0,0 +1,153 @@ +""""Conll2012 is a reader block for the coreference in CoNLL-2012 format. + +This implementation was tested on the LitBank files only +(and quickly on Portuguese Corref-PT and Summ-it++v2), so far. 
+LitBank does not use most of the columns, so the implementation
+should be improved to handle other types of CoNLL-2012 files.
+"""
+import json
+import logging
+import re
+
+import udapi.block.read.conllu
+from udapi.core.root import Root
+from udapi.core.node import Node
+
+RE_BEGIN = re.compile(r'^#begin document ([^ ]+)')
+
+class Conll2012(udapi.block.read.conllu.Conllu):
+ """A reader of the Conll2012 files."""
+
+ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs):
+ """Create the Conll2012 reader object.
+
+ Args:
+ attributes: comma-separated list of column names in the input files
+ (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref' suitable for LitBank)
+ For ignoring a column, use "_" as its name.
+ Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based)
+ word-order number/index (usually called ID).
+ For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'.
+ For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'.
+ For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'.
+ emptyval: a symbol that represents an empty value, especially in the coref column
+ (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2)
+ For FantasyCoref, use emptyval='-'.
+ """
+ super().__init__(**kwargs)
+ self.node_attributes = attributes.split(',')
+ self._docname = 'd'
+ self.emptyval = emptyval
+
+ def parse_comment_line(self, line, root):
+ if line.startswith("#end document"):
+ return
+ match = RE_BEGIN.match(line)
+ if match:
+ docname = match.group(1)
+ # LitBank and FantasyCoref use e.g.
+ # #begin document (1023_bleak_house_brat); part 0
+ if docname.startswith('(') and docname.endswith(');'):
+ docname = docname[1:-2]
+ # Summ-it++v2 uses e.g.
+ # #begin document /home/andre/Recursos-fontes/Summit/Summ-it_v3.0/corpusAnotado_CCR/CIENCIA_2002_22010/CIENCIA_2002_22010.txt + elif docname.startswith('/home/'): + docname = docname.split('/')[-1] + # Corref-PT-SemEval uses e.g. + # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml + docname = docname.replace('.txt', '').replace('.xml', '') + # FantasyCoref may use parentheses within the document ID e.g. + # #begin document (051_Fundevogel_(Bird-foundling)); part 000 + docname = docname.replace('(', '').replace(')', '') + + root.newdoc = docname + self._global_entity = 'eid-etype-head-other' + root.comment += '$GLOBAL.ENTITY\n' + self._docname = docname + else: + logging.warning(f"Unexpected comment line: {line}") + + def parse_node_line(self, line, root, nodes): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'docname': + # FantasyCoref may use parentheses within the document ID + value = value.replace('(', '').replace(')', '') + if value != self._docname: + logging.warning(f"Document name mismatch {value} != {self._docname}") + + # convert the zero-based index to one-based + # but Corref-PT uses a mix of one-based and zero-based + elif attribute_name == 'ord': + #setattr(node, 'ord', int(value) + 1) + if node.ord not in(int(value) + 1, int(value)): + logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") + + elif attribute_name == 'coref': + if value and value != self.emptyval: + # LitBank always separates chunks by a vertical bar, e.g. (13)|10) + # Summ-it++v2 does not, e.g. 
(13)10) + if '|' in value: + chunks = value.split("|") + else: + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', value) if x] + modified_entities = [] + escaped_docname = self._docname.replace("-", "") + for entity in chunks: + entity_num = entity.replace("(", "").replace(")","") + modified_entity = f"{escaped_docname}_e{entity_num}--1" + if entity.startswith("(") and entity.endswith(")"): + modified_entity = "(" + modified_entity + ")" + elif entity.startswith("("): + modified_entity = "(" + modified_entity + elif entity.endswith(")"): + modified_entity = f"{escaped_docname}_e{entity_num}" + ")" + + # to avoid parentheses clashes, put the entities with ")" first + if modified_entity.startswith("("): + modified_entities.append(modified_entity) + else: + modified_entities.insert(0, modified_entity) + node.misc['Entity'] = ''.join(modified_entities) + + elif attribute_name == 'form' or (attribute_name != '_' and value != '_'): + setattr(node, attribute_name, value) + nodes.append(node) + + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + for line in lines: + if line == '': + pass + elif line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + return root + + def read_trees(self): + if self.max_docs: + raise NotImplementedError("TODO implement max_docs in read.Conll2012") + # Corref-PT does not put an empty line before #end document, + # so we need to split both on #end document and empty lines. 
+ return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+|\n#end document\n', self.filehandle.read()) if s] + + def read_tree(self): + raise NotImplementedError("TODO implement read_tree in read.Conll2012") diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index d703fb26..e19cd676 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -27,7 +27,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs strict: raise an exception if errors found (default=False, i.e. a robust mode) empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. - fix_cycles: fix cycles by attaching a node in the cycle to the root + fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range """ super().__init__(**kwargs) self.strict = strict @@ -73,7 +73,7 @@ def parse_comment_line(self, line, root): if entity_match is not None: global_entity = entity_match.group(1) if self._global_entity and self._global_entity != global_entity: - logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity)) + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") self._global_entity = global_entity root.comment += '$GLOBAL.ENTITY\n' return @@ -81,8 +81,32 @@ def parse_comment_line(self, line, root): root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + # Valid CoNLL-U files must have sentences separated by a single empty line. + # However, some users have to work with invalid files e.g. ending with two empty lines. 
+ # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow + # as s.split('\n\n') and this time is negligble + # relative to the main CoNLL-U parsing in read_tree_from_lines(). + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+', self.filehandle.read()) if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. + trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + if tree: + trees.append(tree) + else: + lines.append(line) + return trees def read_tree(self): if self.filehandle is None: @@ -169,15 +193,18 @@ def read_tree_from_lines(self, lines): try: parent = nodes[parents[node_ord]] except IndexError: - raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if self.fix_cycles: + logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}") + parent = root + else: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) if node is parent: if self.fix_cycles: - logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: @@ -193,8 +220,12 @@ def read_tree_from_lines(self, lines): # Create multi-word tokens. 
for fields in mwts: - range_start, range_end = fields[0].split('-') + try: + range_start, range_end = fields[0].split('-') + except ValueError: + logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}") + raise words = nodes[int(range_start):int(range_end) + 1] - root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9]) return root diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. + +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). + +This code has been only tested on Hungarian KorKor files for CorefUD so far. +However, in the end, it is not used there (xtsv files are used instead conllup). +""" +import logging +import re + +import udapi.block.read.conll +from udapi.core.root import Root +from udapi.core.node import Node + +RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') +COLUMN_MAP = { + 'ID': 'ord', +} +NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + +class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overriden). + Default='autodetect' which means the column names will be loaded from the global.columns header. + For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. 
It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). + """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 356e196f..7487d580 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -9,6 +9,8 @@ class Sentences(BaseReader): Args: ignore_empty_lines: if True, delete empty lines from the input. Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. 
You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, @@ -16,8 +18,12 @@ class Sentences(BaseReader): As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. """ - def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line self.rstrip = rstrip super().__init__(**kwargs) @@ -38,11 +44,20 @@ def read_tree(self, document=None): # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None - if self.ignore_empty_lines: + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: while line in {'\n', '\r\n'}: + preceded_by_empty_line = True line = self.filehandle.readline() if line == '': return None root = Root() root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root + + # The first line in a file also marks a start of new document + def after_process_document(self, document): + if self.newdoc_if_empty_line: + document.bundles[0].trees[0].newdoc = True diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py new file mode 100644 index 00000000..161b6b6e --- /dev/null +++ b/udapi/block/read/text.py @@ -0,0 +1,74 @@ +"""Text class is a reader for word-wrapped plain-text files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root + + +class Text(BaseReader): + r"""A reader for plain-text files with sentences on one or more lines. + + Sentences are separated by one or more empty lines. + Newlines within sentences are substituted by a space. 
+ + Args: + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. + empty_line: how empty lines are handled. Default 'new_sentence' preserves + the current behaviour (empty lines mark sentence boundaries). Use + 'keep' to read the entire file content into a single sentence (tree), including + empty lines. Use 'newpar' to behave like 'new_sentence' but also set + `root.newpar = True` on each sentence. + """ + def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs): + if empty_line not in {'new_sentence', 'keep', 'newpar'}: + raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'") + self.rstrip = rstrip + self.empty_line = empty_line + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self, document=None): + if self.filehandle is None: + return None + if self.empty_line == 'keep': + content = self.filehandle.read() + if content == '': + return None + root = Root() + root.text = content + return root + lines = [] + line = None + while True: + line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). 
+ if line == '': + if not lines: + return None + else: + break + elif line in {'\n', '\r\n'}: + if not lines: + continue + else: + break + else: + lines.append(line.rstrip(self.rstrip)) + + root = Root() + root.text = " ".join(lines) + if self.empty_line == 'newpar': + root.newpar = True + return root diff --git a/udapi/block/transform/flatten.py b/udapi/block/transform/flatten.py index ded64fb1..d218ad27 100644 --- a/udapi/block/transform/flatten.py +++ b/udapi/block/transform/flatten.py @@ -4,6 +4,22 @@ class Flatten(Block): """Apply `node.parent = node.root; node.deprel = 'root'` on all nodes.""" - def process_node(self, node): - node.parent = node.root - node.deprel = 'root' + def __init__(self, oneroot=False, **kwargs): + """Args: + oneroot: only the first node will have deprel 'root'. + All other nodes will depend on the first node with deprel 'dep'. + This option makes the trees valid according to the validator. + (default=False) + """ + super().__init__(**kwargs) + self.oneroot = oneroot + + def process_tree(self, tree): + for node in tree.descendants: + node.parent = node.root + node.deprel = 'root' + if self.oneroot: + first = tree.descendants[0] + for node in tree.descendants[1:]: + node.parent = first + node.deprel = 'dep' diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 2d251989..e7eb3989 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -15,6 +15,9 @@ def process_node(self, node): orig_attr[attr] = getattr(node, attr) orig_attr['feats'] = node.feats.copy() orig_attr['misc'] = node.misc.copy() + # Defaults for the newly created MWT + mwt_misc = node.misc.copy() + mwt_form = node.form forms = analysis['form'].split() main = analysis.get('main', 0) @@ -37,6 +40,7 @@ def process_node(self, node): elif orig_attr['form'][0].isupper(): nodes[0].form = nodes[0].form.title() + node.misc = None for attr in 'lemma upos xpos feats deprel misc'.split(): if attr in analysis: values = analysis[attr].split() @@ -47,6 +51,17 
@@ def process_node(self, node): logging.warning("%s = %s" % (attr, analysis.get(attr, ''))) if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) + # No MISC attribute should be duplicated on the word level and token level, + # so if copying MISC to a new_node, delete mwt_misc. + # However, SpaceAfter should be annotated only on the token level, + # so make sure it is not accidentally copied on the word level. + if attr == 'misc': + orig_attr['misc'].clear() + for a in 'SpaceAfter SpacesAfter SpacesBefore'.split(): + if new_node.misc[a]: + orig_attr['misc'][a] = new_node.misc[a] + del new_node.misc[a] + elif attr == 'feats' and '*' in values[i]: new_node.feats = values[i] for feat_name, feat_value in list(new_node.feats.items()): @@ -55,8 +70,23 @@ def process_node(self, node): else: setattr(new_node, attr, values[i]) - mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc']) - node.misc = None + # Entity (coreference) annotation should be only on the word level, + # so make sure it does not stay on the token level. + if mwt_misc['Entity']: + nodes[0].misc['Entity'] = mwt_misc['Entity'] + del mwt_misc['Entity'] + + # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT. + if node.multiword_token: + mwt_words = node.multiword_token.words + mwt_form = node.multiword_token.form + if node.multiword_token.misc: + mwt_misc.update(node.multiword_token.misc) + node.multiword_token.remove() + mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes + nodes = mwt_words + + mwt = node.root.create_multiword_token(words=nodes, form=mwt_form, misc=mwt_misc) self.postprocess_mwt(mwt) def multiword_analysis(self, node): diff --git a/udapi/block/ud/addpuncttype.py b/udapi/block/ud/addpuncttype.py new file mode 100644 index 00000000..f5f20e06 --- /dev/null +++ b/udapi/block/ud/addpuncttype.py @@ -0,0 +1,91 @@ +""" +Some UD treebanks use features PunctType and PunctSide that classify +punctuation symbols. 
This block can be used to add such features to data where
+they are missing – the classification is mostly deterministic. If the input
+data already contains such features, their values will be overwritten.
+"""
+from udapi.core.block import Block
+
+# TODO We need to know the language, there are many other quotation styles,
+# e.g. Finnish and Swedish use the same symbol for opening and closing: ”X”.
+# Danish uses the French quotes, but switched: »X«.
+
+PUNCT_TYPES = {
+    '(': 'Brck',
+    ')': 'Brck',
+    '[': 'Brck',
+    ']': 'Brck',
+    '{': 'Brck',
+    '}': 'Brck',
+    '.': 'Peri',
+    '...': 'Elip',
+    '…': 'Elip',
+    ',': 'Comm',
+    ';': 'Semi',
+    ':': 'Colo',
+    '!': 'Excl',
+    '¡': 'Excl', # Spanish initial exclamation mark
+    '?': 'Qest',
+    '¿': 'Qest', # Spanish initial question mark
+    '/': 'Colo', # it is used this way in AnCora
+    '-': 'Dash',
+    '–': 'Dash',
+    '—': 'Dash',
+    '"': 'Quot',
+    "'": 'Quot',
+    '`': 'Quot',
+    '“': 'Quot', # opening English, closing Czech
+    '”': 'Quot', # closing English
+    '„': 'Quot', # opening Czech
+    '‘': 'Quot', # opening English, closing Czech
+    '’': 'Quot', # closing English
+    '‚': 'Quot', # opening Czech
+    '«': 'Quot', # opening French, closing Danish
+    '»': 'Quot', # closing French, opening Danish
+    '‹': 'Quot',
+    '›': 'Quot',
+    '《': 'Quot', # Korean, Chinese
+    '》': 'Quot',
+    '「': 'Quot', # Chinese, Japanese
+    '」': 'Quot',
+    '『': 'Quot',
+    '』': 'Quot'
+}
+
+PUNCT_SIDES = {
+    '(': 'Ini',
+    ')': 'Fin',
+    '[': 'Ini',
+    ']': 'Fin',
+    '{': 'Ini',
+    '}': 'Fin',
+    '¡': 'Ini', # Spanish initial exclamation mark
+    '!': 'Fin', # but outside Spanish people may expect empty value
+    '¿': 'Ini', # Spanish initial question mark
+    '?': 'Fin',
+    '《': 'Ini', # Korean, Chinese
+    '》': 'Fin',
+    '「': 'Ini', # Chinese, Japanese
+    '」': 'Fin',
+    '『': 'Ini',
+    '』': 'Fin'
+}
+
+
+class AddPunctType(Block):
+    """Add features PunctType and PunctSide where applicable."""
+
+    def process_node(self, node):
+        # The two features apply only to PUNCT.
If they already occur elsewhere, erase them. + if node.upos != 'PUNCT': + node.feats['PunctType'] = '' + node.feats['PunctSide'] = '' + else: + if node.form in PUNCT_TYPES: + node.feats['PunctType'] = PUNCT_TYPES[node.form] + else: + node.feats['PunctType'] = '' + if node.form in PUNCT_SIDES: + node.feats['PunctSide'] = PUNCT_SIDES[node.form] + else: + node.feats['PunctSide'] = '' diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py new file mode 100644 index 00000000..a4b359ff --- /dev/null +++ b/udapi/block/ud/ar/fixedeprels.py @@ -0,0 +1,699 @@ +"""Block to fix case-enhanced dependency relations in Arabic.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'أَنَّ': [], + 'أَن': [], + 'إِنَّ': [], + 'إِذَا': [], + 'لَو': [], + 'حَيثُ': [], + 'مِثلَ': [], + 'لِأَنَّ': [], + 'كَمَا': [], +# 'فِي_حِينَ': [], + 'فَ': [] + } + + # Reduction and normalization of prepositions and conjunctions, including + # the derived and compound ones. The Latin transliterations are not really + # needed in the process. We include them here as documentation, but also + # to help the poor editor with rendering the lines. Ideally, each line + # should have left-to-right text at both the beginning and end. 
+ substitution = [ + {'target': ('min:gen', 'مِن:gen'), + 'sources': + [('ibtida min', 'اِبتِدَاء_مِن')] + }, + {'target': ('ʾiṯra:gen', 'إِثرَ:gen'), # ʾiṯra = right after + 'sources': + [('ʾiṯra', 'إِثرَ')] + }, + {'target': ('ʾaṯnāʾa:gen', 'أَثنَاءَ:gen'), # ʾaṯnāʾa = during + 'sources': + [('ʾaṯnāʾa', 'أَثنَاءَ')] + }, + {'target': ('ʾiḏ', 'إِذ'), # ʾiḏ = because + 'sources': + [('ʾiḏ', 'إِذ'), + ('ʾiḏ ʾanna', 'إِذ_أَنَّ')] + }, + {'target': ('ʾiḏā', 'إِذَا'), # ʾiḏā = if + 'sources': + [('ʾiḏā', 'إِذَا'), + ('ʾiḏā', 'إِذًا')] + }, + ] + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'اِبتِدَاء_مِن': 'مِن:gen', + 'إِثرَ': 'إِثرَ:gen', # ʾiṯra = right after + 'أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'إِذ': 'إِذ', # ʾiḏ = because + 'إِذ_أَنَّ': 'إِذ', # ʾiḏ ʾanna + 'إِذًا': 'إِذَا', + 'إِذَا': 'إِذَا', # remove morphological case; ʾiḏā = if + 'إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'أَلَّا': 'إِلَّا', + 'إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_إِذَا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَن': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ_هُوَ': 'إِلَّا', # ʾillā = except, unless + 'إِلَى': 'إِلَى:gen', # ʾilā = to + 'إِلَى_أَن': 'إِلَى:gen', + 'إِلَى_أَنَّ': 'إِلَى_أَنَّ', # until? that? 
+ 'إِلَى_أَنَّ_لَدَى': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن_بَينَ': 'إِلَى_أَنَّ', + 'إِلَى_بَعدَ': 'إِلَى:gen', + 'إِلَى_بَينَ': 'إِلَى_بَينِ:gen', # ʾilā bayni = to between + 'إِلَى_جَانِب': 'إِلَى_جَانِبِ:gen', # ʾilā ǧānibi = beside + 'إِلَى_حَوَالَى': 'إِلَى:gen', # ila hawala = to around X + 'إِلَى_حَوَالَى_مِن': 'إِلَى:gen', # ila hawala min + 'إِلَى_حَيثُ': 'إِلَى:gen', + 'إِلَى_حِينَ': 'فِي_حِينِ', # during + 'إِلَى_خَارِجَ': 'إِلَى_خَارِجِ:gen', # ʾilā ḫāriǧi = out + 'إِلَى_فِي': 'إِلَى:gen', + 'إِلَى_قَبلَ': 'إِلَى_قَبلِ:gen', # ʾilā qabli = until before X (e.g. until one year ago) + 'إِلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'إِلَى_نَحوَ': 'إِلَى:gen', # to about N + 'أَمَّا': 'أَمَامَ:gen', + 'إِمَّا_لِ': 'لِ:gen', + 'أَمَامَ': 'أَمَامَ:gen', # ʾamāma = in front of + 'أَمَامَ_مِن': 'أَمَامَ:gen', + 'أَن': 'أَنَّ', # remove morphological case; ʾanna = that + 'أَنَّ': 'أَنَّ', # remove morphological case; ʾanna = that + 'إِن': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّ': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّمَا': 'إِنَّ', + 'إِيَّا': 'إِلَّا', + 'بِ': 'بِ:gen', # bi = for, with + 'بِ_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'بِ_إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'بِ_اِستِثنَاء': 'بِاِستِثنَاءِ:gen', # biistiṯnāʾi = with exception of + 'بِ_اِسم': 'بِاِسمِ:gen', # biismi = in name of + 'بِ_إِضَافَة_إِلَى': 'بِاَلإِضَافَةِ_إِلَى:gen', # bi-al-ʾiḍāfati ʾilā = in addition to + 'بِ_إِضَافَة_إِلَى_أَنَّ': 'إِلَى_أَنَّ', + 'بِ_إِضَافَة_لِ': 'بِاَلإِضَافَةِ_إِلَى:gen', # in addition to + 'بِ_اِعتِبَار': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_أَنَّ': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_مِن': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِمَاد_عَلَى': 'بِاَلِاعتِمَادِ_عَلَى:gen', # 
bi-al-i-ʼʿtimādi ʿalā = depending on + 'بِ_إِلَى': 'بِ:gen', + 'بِ_أَنَّ': 'أَنَّ', # that + 'بِ_أَن': 'بِ:gen', + 'بِ_إِنَّ': 'بِ:gen', + 'بِ_أَنَّ_أَمَامَ': 'أَنَّ', # that + 'بِ_أَنَّ_لَا': 'أَنَّ', # that + 'بِ_أَنَّ_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هما_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ_عَلَى': 'أَنَّ', # that + 'بِ_اِنطِلَاق': 'بِ:gen', + 'بِ_تَالِي_إِنَّ': 'بِ:gen', + 'بِ_تَعَاوُن_مَعَ': 'بِاَلتَّعَاوُنِ_مَعَ:gen', # bi-at-taʿāwuni maʿa = in cooperation with + 'بِ_تُهمَة': 'بِتُهمَةِ:gen', # bituhmati = on charges of + 'بِ_تَوَازِي_مَعَ': 'بِاَلتَّوَازِي_مَعَ:gen', # bi-at-tawāzī maʿa = in parallel with + 'بِ_ثُمَّ': 'بِ:gen', + 'بِ_جَانِب': 'بِجَانِبِ:gen', # biǧānibi = next to + 'بِ_جِهَة': 'بِ:gen', + 'بِ_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'بِ_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'بِ_حُضُور': 'فِي_حُضُورِ:gen', # together with + 'بِ_حَقّ': 'بِ:gen', + 'بِ_حُكم': 'بِ:gen', + 'بِ_حُلُول': 'بِ:gen', + 'بِ_حَوَالَى': 'بِ:gen', # bi hawala = with around X + 'بِ_حَيثُ': 'بِ:gen', + 'بِ_خُصُوص': 'بِخُصُوصِ:gen', # biḫuṣūṣi = with regard + 'بِ_خِلَاف': 'بِخِلَافِ:gen', # biḫilāfi = in addition to + 'بِ_دَاخِلَ': 'دَاخِلَ:gen', + 'بِ_دَعوَى': 'بِ:gen', + 'بِ_دَور': 'بِ:gen', # bidawri = with role, in turn? + 'بِ_دُون': 'دُونَ:gen', + 'بِ_دُونَ': 'دُونَ:gen', # bi dūni = without + 'بِ_دُونَ_أَن': 'دُونَ:gen', # bi dūni ʾan = without + 'بِ_رِعَايَة': 'بِ:gen', + 'بِ_رَغم': 'رَغمَ:gen', # despite + 'بِ_رَغم_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَن': 'بِ:gen', + 'بِ_رَغم_مِن_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَنَّ_هُوَ': 'بِ:gen', + 'بِ_رِفقَة': 'بِرِفقَةٍ:gen', # birifqatin = in company of + 'بِ_رِئَاسَة': 'بِ:gen', + 'بِ_سَبّ': 'بِ:gen', + 'بِ_سَبَب': 'بِسَبَبِ:gen', # bisababi = because of + 'بِ_شَأن': 'بِشَأنِ:gen', # bišaʾni = about, regarding (lit. 
with + matter) + 'بِ_شَرط_أَن': 'بِ:gen', + 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to + 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of + 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as + 'بِ_صُورَة': 'بِ:gen', + 'بِ_عَكس': 'بِ:gen', + 'بِ_عَلَى': 'بِ:gen', + 'بِ_عَن': 'بِ:gen', + 'بِ_عَين': 'بِ:gen', + 'بِ_غَضّ_نَظَر_عَن': 'بِغَضِّ_اَلنَّظَرِ_عَن:gen', # biġaḍḍi an-naẓari ʿan = regardless of + 'بِ_فَضل': 'بِفَضلِ:gen', # bifaḍli = thanks to + 'بِ_فِي': 'بِ:gen', + 'بِ_قَدر': 'بِ:gen', + 'بِ_قُرب_مِن': 'بِاَلقُربِ_مِن:gen', # bi-al-qurbi min = near (with proximity to) + 'بِ_قَصد': 'بِقَصدِ:gen', # biqaṣdi = with intention + 'بِ_كَ': 'بِ:gen', + 'بِ_لِ': 'بِ:gen', + 'بِ_لَا': 'بِ:gen', + 'بِ_مَا_أَنَّ': 'بِ:gen', + 'بِ_مَثَابَة': 'بِ:gen', + 'بِ_مِثلَ': 'مِثلَ', # miṯla = like + 'بِ_مُجَرَّد': 'بِ:gen', + 'بِ_مُسَاعَدَة': 'بِ:gen', + 'بِ_مُشَارَكَة': 'بِمُشَارَكَةِ:gen', # bimušārakati = with participation of + 'بِ_مُقَارَنَة_بِ': 'بِاَلمُقَارَنَةِ_بِ:gen', # bi-al-muqāranati bi = in comparison to + 'بِ_مُقتَضَى': 'بِمُقتَضَى:gen', # bimuqtaḍā = with requirement of + 'بِ_مِقدَار': 'بِ:gen', + 'بِ_مِن': 'بِ:gen', + 'بِ_مُنَاسَبَة': 'بِمُنَاسَبَةِ:gen', # bimunāsabati = on the occasion of + 'بِ_مُوجِب': 'بِمُوجِبِ:gen', # bimūǧibi = with motive + 'بِ_نَتِيجَة': 'بِ:gen', + 'بِ_نَحوَ': 'بِ:gen', # by about N + 'بِ_نِسبَة': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati (bin-nisbati) = in proportion/relation to + 'بِ_نِسبَة_إِلَى': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati ʾilā (bin-nisbati ʾilā) = in proportion/relation to + 'بِ_نِسبَة_لِ': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نِسبَة_لِ_مِن': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نَظَر_إِلَى': 'بِ:gen', + 'بِ_نِيَابَة_عَن': 'بِاَلنِّيَابَةِ_عَن:gen', # bi-an-niyābati ʿan = on behalf of + 'بِ_هَدَف': 'بِهَدَفِ:gen', # bihadafi = with goal + 'بِ_وَ_لِ': 'بِ:gen', + 
'بِ_وَاسِطَة': 'بِوَاسِطَةِ:gen', # biwāsiṭati = by means of + 'بِ_وَاقِع': 'بِ:gen', + 'بِ_وَسَط': 'بِوَسَطِ:gen', # biwasaṭi = in the middle of + 'بِ_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'بِ_وَصف': 'بِ:gen', + 'بازاء': 'بِ:gen', + 'بالتسخين': 'بِ:gen', + 'بَدَلًا_مِن': 'بَدَلًا_مِن:gen', # badalan min = instead of + 'بدون': 'دُونَ:gen', # without + 'بشان': 'بِشَأنِ:gen', + 'بَعدَ': 'بَعدَ:gen', # baʿda = after + 'بَعدَ_أَن': 'بَعدَ:gen', # baʿda ʾan = after + clause + 'بَعدَ_حَوَالَى': 'بَعدَ:gen', # baada hawala + 'بَعدَ_نَحوَ': 'بَعدَ:gen', # after about N + 'بَعدَمَا': 'بَعدَ:gen', # baʿdamā = after + 'بُعَيدَ': 'بُعَيدَ:gen', # buʿayda = shortly after + 'بَل': 'قَبلَ:gen', + 'بِنَاء_عَلَى': 'بناء_عَلَى:gen', + 'بناء_عَلَى': 'بناء_عَلَى:gen', # bnāʾ ʿalā = based on + 'بناء_لِ': 'لِ:gen', + 'بَيدَ': 'بِ:gen', + 'بَيدَ_أَنَّ': 'بِ:gen', + 'بَينَ': 'بَينَ:gen', # bayna = between + 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala + 'بينا': 'بَينَ:gen', # bayna = between + 'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between + 'بَينَمَا': 'بَينَ:gen', + 'بَينَمَا_لَم': 'بَينَ:gen', + 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing + 'تَحتَ': 'تَحتَ:gen', # tahta = under + 'ثَمَّ': 'بِ:gen', + 'ثُمَّ': 'بِ:gen', + 'جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'حَتَّى': 'حَتَّى:gen', # ḥattā = until + 'حَتَّى_أَنَّ': 'حَتَّى:gen', # before + 'حَتَّى_إِنَّ': 'حَتَّى:gen', # before + 'حَتَّى_بِ': 'حَتَّى:gen', # before + 'حَتَّى_لَو': 'لَو', # even if + 'حَتَّى_وَ_لَو': 'لَو', # even if + 'حَتَّى_وإن': 'إِنَّ', + 'حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَسَبَمَا': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَوَالَى': 'حَوَالَى', # ḥawālā = around, about + 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X + 'حَولَ': 'حَولَ:gen', # ḥawla = about + 'حولما_إِذَا': 'إِذَا', + 'حَولَ_مَا_إِذَا': 'إِذَا', + 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning + 'حَيثُ': 'حَيثُ', # remove morphological case; ḥayṯu = where 
(SCONJ, not ADV) + 'حِينَمَا': 'فِي_حِينِ', # during + 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during + 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind + 'دَاخِل': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِلَ': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دُونَ': 'دُونَ:gen', # dūna = without + 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without + 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without + 'دونما': 'دُونَ:gen', + 'ذٰلِكَ_بَعدَمَا': 'بَعدَ:gen', + 'ذٰلِكَ_عِندَمَا': 'بِ:gen', + 'ذٰلِكَ_لِأَنَّ': 'لِأَنَّ', # because + 'ذٰلِكَ_لِكَي': 'لِكَي', # li-kay = in order to + 'ذٰلِكَ_نَظَر_لِ': 'بِ:gen', + 'رَغمَ': 'رَغمَ:gen', # raġma = despite + 'رَغمَ_أَنَّ': 'رَغمَ:gen', # raġma ʾanna = despite + clause + 'رَغمَ_أَنَّ_مِن': 'رَغمَ:gen', # raġma ʾanna min = despite + 'رَهنَ': 'رَهنَ:gen', # rahna = depending on + 'رَيثَمَا': 'رَهنَ:gen', # rahna = depending on + 'سِوَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_أَنَّ_هُوَ': 'سِوَى:gen', # siwā = except for + 'سِوَى_بِ': 'سِوَى:gen', # siwā = except for + 'سِوَى_عَلَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for + 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against + 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among + 'طَالَمَا': + 'طَالَمَا', # ṭālamā = as long as + 'طالَما': + 'طَالَمَا', # ṭālamā = as long as + 'طَالَمَا_أَنَّ': + 'طَالَمَا', # ṭālamā = as long as + 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout + 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during + 'عبر': 'عَبرَ:gen', + 'عَبرَ': 'عَبرَ:gen', # ʿabra = via + 'عَدَا': 'عَدَا:gen', # ʿadā = except for + 'عَقِبَ': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_أَن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_مِن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَلَى': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أبواب': 'عَلَى:gen', + 'عَلَى_إِثرَ': 'إِثرَ:gen', # ʿalā ʾiṯri = right after + 'عَلَى_أَثَر': 'عَلَى:gen', + 'عَلَى_اِختِلَاف': 'عَلَى:gen', + 'عَلَى_أَسَاس': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based 
on + 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on + 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that + 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'عَلَى_الفور': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_إِلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_عَلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_مِن_شَأن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ_لَدَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ_فِي': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بَينَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حَدّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'عَلَى_حَولَ': 'عَلَى:gen', + 'عَلَى_رَأس': 'عَلَى_رَأسِ:gen', # ʿalā raʾsi = on top of + 'عَلَى_رَغم': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغمَ_أَنَّ': 'رَغمَ:gen', # ʿalā raġma ʾanna = despite + clause + 'عَلَى_رَغم_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ_هُوَ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_طَرِيقَة': 'عَلَى_طَرِيقَةِ:gen', # ʿalā ṭarīqati = on the way + 'عَلَى_عَكس': 'عَلَى:gen', + 'عَلَى_غِرَار': 'عَلَى_غِرَارِ:gen', # ʿalā ġirāri = similar to + 'عَلَى_قَيد': 'عَلَى:gen', + 'عَلَى_لِسَان': 'عَلَى:gen', + 'عَلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'عَلَى_مدى': 'عَلَى:gen', + 'عَلَى_مَدَى': 'عَلَى_مَدَى:gen', # ʿalā madā = on period + 'عَلَى_مَقرَبَة_مِن': 'عَلَى_مَقرَبَةٍ_مِن:gen', # ʿalā maqrabatin min = in the vicinity of + 'عَلَى_مِن': 'عَلَى:gen', + 'عَلَى_نَحوَ': 'عَلَى:gen', # to about N + 'عَلَى_يَد': 
'عَلَى:gen', + 'عَن': 'عَن:gen', # ʿan = about, from + 'عَن_أَن': 'عَن:gen', + 'عَن_أَنَّ': 'عَن:gen', + 'عَن_أَنَّ_وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'عَن_بِ': 'عَن:gen', + 'عَن_طَرِيق': 'عَن_طَرِيقِ:gen', # ʿan ṭarīqi = via + 'عَن_فِي_أَن': 'عَن:gen', + 'عَن_قُربَ': 'قُربَ:gen', # qurba = near + 'عَن_مِثلَ': 'مِثلَ', # miṯla = like + 'عَن_مِن': 'عَن:gen', + 'عِندَ': 'عِندَمَا', # ʿinda = when + 'عِندَمَا': 'عِندَمَا', # ʿindamā = when + 'غَيرَ': 'إِلَّا', + 'فَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_إِذَا': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَدَل_مِن_أَن': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَينَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_عَلَى': 'فَ', # fa = so (advcl or coordination) + 'فَ_فِي': 'فَ', # fa = so (advcl or coordination) + 'فَ_مِن': 'فَ', # fa = so (advcl or coordination) + 'فَورَ': 'فَورَ:gen', # fawra = as soon as + 'فَوقَ': 'فَوقَ:gen', # fawqa = above, over + 'فِي': 'فِي:gen', # fī = in + 'فِي_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'فِي_أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'فِي_إِطَار': 'فِي_إِطَار:gen', # fī ʾiṭār = in frame + 'فِي_اعقاب': 'فِي_أَعقَابِ:gen', + 'فِي_إِلَى': 'فِي:gen', + 'فِي_أَن': 'فِي:gen', + 'فِي_أَنَّ': 'فِي:gen', + 'فِي_أَنَّ_عَلَى': 'فِي:gen', + 'فِي_أَنَّ_لَدَى': 'فِي:gen', + 'فِي_أَنَّ_مِن': 'فِي:gen', + 'فِي_بِ': 'فِي:gen', + 'فِي_بِ_فِي': 'فِي:gen', + 'فِي_بَاطِن': 'فِي:gen', + 'فِي_بَعدَ': 'فِي:gen', + 'فِي_بَينَ': 'بَينَ:gen', + 'فِي_حَال': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَدّ': 'فِي:gen', + 'فِي_حُضُور': 'فِي_حُضُورِ:gen', # fī ḥuḍūri = in presence of + 'فِي_حَقّ': 'فِي:gen', + 'فِي_حُكم': 'فِي:gen', + 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X + 'فِي_حِين': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِين_أَنَّ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ_أَنَّ_هُوَ': + 'فِي_حِينِ', # fī ḥīni = while + 
'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِلَالَ': 'فِي:gen', + 'فِي_دَاخِل': + 'دَاخِلَ:gen', + 'فِي_دَاخِلَ': 'فِي:gen', + 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to + 'فِي_سِيَاق': 'فِي:gen', + 'فِي_شَأن': 'فِي_شَأنِ:gen', # fī šaʾni = in regard of + 'فِي_شَكل': 'فِي:gen', + 'فِي_صَفّ': 'فِي:gen', + 'فِي_صُورَة': 'فِي:gen', + 'فِي_ضَوء': 'فِي_ضَوءِ:gen', # fī ḍawʾi = in light of + 'فِي_ظِلّ': 'فِي_ظِلِّ:gen', # fī ẓilli = in light of + 'فِي_عُقب': 'فِي_أَعقَابِ:gen', # fī ʾaʿqābi = in the aftermath of + 'فِي_غَضن': 'فِي:gen', + 'فِي_غُضُون': 'فِي:gen', + 'فِي_مَا': 'فِي:gen', + 'فِي_مِثلَ': 'مِثلَ', # miṯla = like + 'فِي_مَجَال': 'فِي_مَجَالِ:gen', # fī maǧāli = in the area of + 'فِي_مستشفى': 'فِي:gen', + 'فِي_مَعَ': 'فِي:gen', + 'فِي_مُقَابِلَ': 'مُقَابِلَ:gen', + 'فِي_مَقدَم': 'فِي:gen', + 'فِي_مِن': 'فِي:gen', + 'فِي_مُنَاسَبَة': 'فِي_مُنَاسَبَةِ:gen', # fī munāsabati = on the occasion of + 'فِي_مُوَاجَهَة': 'فِي:gen', + 'فِي_نَحوَ': 'فِي:gen', # in about N + 'فِي_نِطَاق': 'فِي:gen', + 'فِي_وَجه': 'فِي:gen', + 'فِي_وَسط': 'وَسطَ:gen', + 'فِي_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'فِيمَا': 'فِيمَا', # fīmā = while + 'قُبَالَةَ': 'قُبَالَةَ:gen', # qubālata = in front of, facing + 'قَبلَ': 'قَبلَ:gen', # qabla = before + 'قَبلَ_أَن': 'قَبلَ:gen', # qabla = before + 'قَبلَ_حَوَالَى': 'قَبلَ:gen', # qabla hawala + 'قَبلَ_نَحوَ': 'قَبلَ:gen', # before about N + 'قُبَيلَ': 'قُبَيلَ:gen', # qubayla = before + 'قُربَ': 'قُربَ:gen', # qurba = near + 'قَيدَ': 'فِي:gen', + 'كَ': 'كَ:gen', # ka = in (temporal?) 
+ 'كَ_أَنَّ': 'كَ:gen', + 'كَ_لِ': 'كَ:gen', + 'كَ_وَ_وَ': 'كَ:gen', + 'كَأَنَّمَا': 'كَأَنَّمَا', # ka-ʾannamā = as if + 'كُلَّمَا': 'كُلَّمَا', # kullamā = whenever + 'كَمَا': 'كَمَا', # remove morphological case; kamā = as + 'كَي': 'لِكَي', # kay = in order to + 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 'لِ:gen', + 'لِ': 'لِ:gen', # li = to + 'لِ_أَجَلّ': 'لِ:gen', + 'لِ_إِلَى': 'لِ:gen', + 'لِ_أَمَامَ_وَ': 'لِ:gen', + 'لِ_أَن': 'لِ:gen', + 'لِ_بِ': 'لِ:gen', + 'لِ_جِهَة': 'لِ:gen', + 'لِ_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'لِ_حَوَالَى': 'لِ:gen', # li hawala = for around X + 'لِ_خَارِجَ': 'لِخَارِجِ:gen', # liḫāriǧi = out + 'لِ_دُخُول': 'لِ:gen', + 'لِ_دَرَجَة_أَنَّ': 'لِ:gen', + 'لِ_سَبَب': 'لِ:gen', + 'لِ_صَالِح': 'لِصَالِحِ:gen', # liṣāliḥi = in interest of + 'لِ_عَلَى': 'لِ:gen', + 'لِ_عَن': 'لِ:gen', + 'لِ_عِندَ': 'لِ:gen', + 'لِ_فِي': 'لِ:gen', + 'لِ_فِي_بَينَ': 'لِ:gen', + 'لِ_كَون': 'لِكَونِ', # likawni = because + 'لِ_لِئَلّا': 'لِ:gen', + 'لِ_مِثلَ': 'مِثلَ', # miṯla = like + 'لِ_مَعَ': 'لِ:gen', + 'لِ_مِن': 'لِ:gen', + 'لِ_نَحوَ': 'لِ:gen', # to/for about N + 'لِ_وَ': 'لِ:gen', + 'لِ_وَ_فِي': 'لِ:gen', + 'لَا': 'إِلَّا', + 'لَا_سِيَّمَا_بَعدَ': 'بَعدَ:gen', + 'لَا_سِيَّمَا_وَ_أَنَّ': 'أَنَّ', + 'لَا_سِيَّمَا_وَ_أَنَّ_هُوَ': 'أَنَّ', + 'لِأَنَّ': 'لِأَنَّ', # remove morphological case; li-ʾanna = because + 'لدى': 'لَدَى:gen', + 'لَدَى': 'لَدَى:gen', # ladā = with, by, of, for + 'لِذَا': 'لِذَا', # liḏā = so, therefore + 'لِذَا_فَ': 'لِ:gen', + 'لِذٰلِكَ': 'لِذَا', # liḏā = so, therefore + 'لٰكِنَّ': 'مَعَ:gen', + 'لكن_إِذَا': 'إِذَا', + 'لكن_بِ': 'بِ:gen', + 'لٰكِن_بَعدَ': 'بَعدَ:gen', + 'لكن_دَاخِلَ': 'دَاخِلَ:gen', + 'لكن_لَدَى': 'لَدَى:gen', + 'لٰكِن_مَعَ': 'مَعَ:gen', + 'لِكَي': 'لِكَي', # li-kay = in order to + 'لَمَّا': 'كُلَّمَا', + 'لَمَّا_لِ': 'كُلَّمَا', + 'لَو': 'لَو', # law = if + 'لَو_أَنَّ': 'لَو', # if + 'لَو_مِن': 'لَو', # if + 'ما': 'مِمَّا', + 'مَا': 'مِمَّا', + 'ما_دَام': 'مِمَّا', + 'مادامت': 'مِمَّا', + 
'مَالَم': 'مَالَم', # mālam = unless + 'مَا_إِذَا': 'إِذَا', + 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like + 'مِثلَمَا': 'مِثلَ', # miṯla = like + 'مَعَ': 'مَعَ:gen', # maʿa = with + 'مَعَ_أَنَّ': 'مَعَ:gen', + 'مَعَ_بِ': 'مَعَ:gen', + 'مَعَ_فِي': 'مَعَ:gen', + 'مَعَ_مِن_بَينَ': 'بَينَ:gen', + 'مقابل': 'مُقَابِلَ:gen', + 'مُقَابِلَ': 'مُقَابِلَ:gen', # muqābila = in exchange for, opposite to, corresponding to + 'مُقَابِلَ_حَوَالَى': 'مُقَابِلَ:gen', # muqabila hawala + 'مُقَارَن_بِ': 'بِ:gen', + 'مِمَّا': 'مِمَّا', # mimmā = that, which + 'مِمَّا_لَدَى': 'مِمَّا', # mimmā = that, which + 'مِن': 'مِن:gen', # min = from + 'مِن_اجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل_أَن': 'مِن:gen', + 'مِن_إِلَى': 'مِن:gen', + 'مِن_أَن': 'مِن:gen', + 'مِن_أَنَّ': 'مِن:gen', + 'مِن_بِ': 'مِن:gen', + 'مِن_بَعدَ': 'مِن:gen', + 'مِن_بَينَ': 'بَينَ:gen', + 'مِن_تَحتَ': 'مِن:gen', + 'مِن_ثَمَّ': 'مِن:gen', + 'مِن_ثُمَّ': 'مِن:gen', + 'مِن_جَانِب': 'إِلَى_جَانِبِ:gen', # min ǧānibi = beside + 'مِن_جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'مِن_حَوَالَى': 'مِن:gen', # min hawala = from around X + 'مِن_حَولَ': 'مِن:gen', + 'مِن_حَيثُ': 'مِن:gen', + 'مِن_خَارِج': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خَارِجَ': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خِلَالَ': 'مِن_خِلَالِ:gen', # min ḫilāli = through, during + 'مِن_دَاخِلَ': 'مِن_دَاخِلِ:gen', # min dāḫili = from inside + 'مِن_دُون': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُونَ': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُون_أَن': 'مِن_دُونِ:gen', + 'مِن_دُونَ_أَن': 'مِن_دُونِ:gen', # min dūni ʾan = without, beneath, underneath + clause + 'مِن_زَاوِيَة': 'مِن:gen', + 'مِن_شَأن': 'مِن_شَأنِ:gen', # min šaʾni = from matter + 'مِن_ضِمنَ': 'مِن_ضِمنِ:gen', # min ḍimni = from within = including + 'مِن_طَرَف': 'مِن:gen', + 'مِن_عَلَى': 'مِن:gen', + 'مِن_عِندَ': 
'مِن:gen', + 'مِن_غَير_أَن': 'مِن:gen', + 'مِن_فَوقَ': 'مِن_فَوقِ:gen', # min fawqi = from above + 'مِن_فِي': 'مِن:gen', + 'مِن_قَبلَ': 'مِن_قِبَلِ:gen', + 'مِن_قِبَل': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_قِبَل_بِ_فِي': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_مِثلَ': 'مِثلَ', # miṯla = like + 'مِن_مِن': 'مِن:gen', + 'مِن_مِن_بَينَ': 'بَينَ:gen', + 'مِن_مَوقِع': 'مِن:gen', + 'مِن_نَاحِيَة': 'مِن:gen', + 'مِن_وَرَاءَ': 'مِن_وَرَاءِ:gen', # min warāʾi = from behind + 'مُنذُ': 'مُنذُ:gen', # munḏu = since + 'مُنذُ_أَن': 'مُنذُ:gen', + 'مُنذُ_نَحوَ': 'مُنذُ:gen', # since about N + 'مُنذُ_وَ_فِي': 'مُنذُ:gen', + 'مَهمَا': 'مَهمَا', # mahmā = regardless + 'نَاهِيك_بِ': 'بِ:gen', + 'نَتِيجَة_لِ': 'لِ:gen', + 'نَحوَ': 'نَحوَ', # naḥwa = about, approximately + 'نَحوَ_بِ': 'بِ:gen', # about by N + 'هذا_بالأضافة': 'بِ:gen', + 'وان': 'أَنَّ', + 'وإن': 'إِنَّ', + 'وبشان': 'بِشَأنِ:gen', + 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'وِفقَ': 'وِفقَ:gen', # wifqa = according to + 'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to + 'ولو': 'إِذَا', # walaw = even if + 'ولو_أَنَّ': 'إِذَا' # walaw = even if + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). 
+ cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + assert(bdeprel[-1] != ':') + edeprel = bdeprel + if cdeprel: + assert(cdeprel[0] != ':') + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. + """ + for node in tree.descendants_and_empty: + for edep in node.deps: + if edep['deprel'] == 'advcl:pred:إِذَن' or edep['deprel'] == 'advcl:pred:كدا' or edep['deprel'] == 'advcl:pred:لكن': + edep['deprel'] = 'advcl:pred' + continue + if edep['deprel'] == 'nmod:بِأَسْرِ:gen': + edep['deprel'] = 'nmod' + continue + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) + if m: + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Arabic clauses often start with وَ wa "and", which does not add + # much to the meaning but sometimes gets included in the enhanced + # case label. Remove it if there are more informative subsequent + # morphs. + cdeprel = re.sub(r'^وَ_', r'', cdeprel) + cdeprel = re.sub(r'^وَ:', r'', cdeprel) + cdeprel = re.sub(r'^وَ$', r'', cdeprel) + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. 
+ for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # Split preposition from morphological case (if any), normalize + # the preposition and add the fixed morphological case where + # applicable. + m = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel) + adposition = m.group(1) if m else cdeprel + if adposition in self.unambiguous: + cdeprel = self.unambiguous[adposition] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/ca/elque.py b/udapi/block/ud/ca/elque.py new file mode 100644 index 00000000..6b3ad22b --- /dev/null +++ b/udapi/block/ud/ca/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que...'). +It is written for Catalan but a similar block should work for Spanish and other +Romance languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. 
+ fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if node.lemma == 'que' and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. + if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. 
+ attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. 
+ """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,7 +34,9 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, + previous_form_label='CorrectForm', previous_text_label='OrigText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,33 +56,66 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). 
+ Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. + previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form - if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" + if node.form not in ("''", "``") and 
self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return @@ -112,13 +147,14 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -130,6 +166,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -181,18 +221,37 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc[self.added_label] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -201,20 +260,33 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): - if self.allow_space(form): - self.store_orig_form(node, form) - node.form = form - elif self.allow_goeswith: - forms = form.split() - node.form = forms[0] - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -235,8 +307,14 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_orig_form(node, form) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc[self.added_label] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): @@ -261,6 +339,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 4c203ddc..a690c95b 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -1,17 +1,30 @@ """Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" import udapi.block.ud.addmwt +import re +import logging +# Define static rules for 'aby', 'kdyby' and similar forms. 
MWTS = { - 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, - 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ 
Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. + 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, } for v in MWTS.values(): v['upos'] = 'SCONJ AUX' @@ -25,23 +38,52 @@ person = '1' elif 'Person=2' in v['feats']: person = '2' - v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) v['deprel'] = '* aux' v['lemma'] = v['form'].split()[0] + ' být' v['main'] = 0 v['shape'] = 'siblings' +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. # nač -> na + co -for prep in 'na za o'.split(): +for prep in 'na o za'.split(): MWTS[prep + 'č'] = { 'form': prep + ' co', 'lemma': prep + ' co', 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } +# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!) +# A few additional instances were found in older texts, too (e.g. 16th century). +# We must do it separately, as the preposition is vocalized. 
+MWTS['seč'] = { + 'form': 'se' + ' co', + 'lemma': 's' + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RV--4---------- PQ--4----------', + 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', +} + +# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below). +MWTS['toliť'] = { + 'form': 'to li ť', + 'lemma': 'ten li ť', + 'upos': 'DET SCONJ PART', + 'xpos': '* J,------------- TT-------------', + 'feats': '* _ _', + 'deprel': '* mark discourse', + 'main': 0, + 'shape': 'siblings' +} + class AddMwt(udapi.block.ud.addmwt.AddMwt): @@ -49,25 +91,153 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis - - # There is no VerbType=verbconj in the UD_Czech data. - # The purpose of this rule is rather to show that - # it is possible to write such "dynamic" rules - # (which cannot be included in static MWTS). - if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': - return { - 'form': node.form.lower()[:-1] + ' neboť', - 'lemma': '* neboť', - 'upos': '* CCONJ', - 'xpos': 'Vt-S---3P-NA--2 J^-------------', - 'feats': '* _', - 'deprel': '* cc', - 'main': 0, - 'shape': 'subtree', - } + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) 
+ if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'jest': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jest', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---3P-AAI-2', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # dajžto = dajž + to + if subtokens[1] == 'to': + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ten', + 'upos': '* DET', + 'xpos': '* PDNS4----------', + 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason si that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. 
+ if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } return None def postprocess_mwt(self, mwt): diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index f2f76b4b..4e2be633 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Czech.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -12,18 +11,25 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { + 'aby': [], 'ač': [], 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... 
ale možná ne když je doprovázeno předložkou + 'ať': [], 'byť': [], 'i_když': [], 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], 'protože': [], 'takže': [], - 'třebaže': [] + 'třebaže': [], + 'že': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -32,35 +38,54 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") 'abi': 'aby', 'aby_na': 'na:loc', 'ačkoliv': 'ačkoli', 'ať': 'ať', # remove morphological case 'ať_forma': 'formou:gen', + 'ať_jako': 'jako', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', 'ať_z_strana': 'ze_strany:gen', 'až_do': 'do:gen', 'až_o': 'o:acc', 'během': 'během:gen', 'bez': 'bez:gen', 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! 
a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) 'bez_zřetel_k': 'bez_zřetele_k:dat', 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', 'cesta': 'cestou:gen', + 'co_jako': 'jako', + 'coby': 'coby', # remove morphological case 'daleko': 'nedaleko:gen', 'daleko_od': 'od:gen', 'dík': 'díky:dat', 'díky': 'díky:dat', 'dle': 'dle:gen', 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', 'do_k': 'k:dat', 'do_oblast': 'do_oblasti:gen', 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', 'jak_aby': 'jak', 'jak_ad': 'jak', 'jakkoliv': 'jakkoli', @@ -68,33 +93,52 @@ class FixEdeprels(Block): 'jako_kupříkladu': 'jako', 'jakoby': 'jako', 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', 'k': 'k:dat', 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', 'kdykoliv': 'kdykoli', 'kol': 'kolem:gen', 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', 'konec': 'koncem:gen', 'krom': 'kromě:gen', 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', 'liž': 'li', 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', 'na_báze': 'na_bázi:gen', 'na_čelo': 'na_čele:gen', 'na_mimo': 'na:loc', # na kurtě i mimo něj 'na_než': 'na:acc', # na víc než čtyři a půl kilometru 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is same 'na_podklad': 'na_podkladě:gen', 'na_rozdíl_od': 'na_rozdíl_od:gen', + 
'na_strana': 'na_straně:gen', + 'na_účet': 'na_účet:gen', 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', 'na_základ': 'na_základě:gen', 'na_základna': 'na_základně:gen', 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... 'namísto': 'namísto:gen', 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', 'narozdíl_od': 'na_rozdíl_od:gen', 'následek': 'následkem:gen', 'navzdory': 'navzdory:dat', @@ -104,39 +148,58 @@ class FixEdeprels(Block): 'o_jako': 'jako', 'o_o': 'o:acc', 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče 'ohledně': 'ohledně:gen', 'okolo': 'okolo:gen', 'oproti': 'oproti:dat', 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', 'po_vzor': 'po_vzoru:gen', 'poblíž': 'poblíž:gen', 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', 'podle': 'podle:gen', 'pomoc': 'pomocí:gen', 'pomocí': 'pomocí:gen', 'postup': 'postupem:gen', 'pouze_v': 'v:loc', 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', 'prostřednictví': 'prostřednictvím:gen', 'prostřednictvím': 'prostřednictvím:gen', 'proti': 'proti:dat', + 'proto_aby': 'aby', 'protože': 'protože', # remove morphological case 'před_během': 'během:gen', # před a během utkání 'před_po': 'po:loc', # před a po vyloučení Schindlera 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error 'přestože': 'přestože', # remove morphological case 'při': 'při:loc', + 'při_pro': 'při:loc', 
'při_příležitost': 'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X 's_ohled_k': 's_ohledem_k:dat', 's_ohled_na': 's_ohledem_na:acc', 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', 's_přihlédnutí_k': 's_přihlédnutím_k:dat', 's_přihlédnutí_na': 's_přihlédnutím_na:acc', 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', 's_vyloučení': 's_vyloučením:gen', 's_zřetel_k': 'se_zřetelem_k:dat', 's_zřetel_na': 'se_zřetelem_na:acc', @@ -146,20 +209,29 @@ class FixEdeprels(Block): 'směr_k': 'směrem_k:dat', 'směr_na': 'směrem_na:acc', 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', 'společně_s': 'společně_s:ins', 'spolu': 'spolu_s:ins', 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', 'takže': 'takže', # remove morphological case 'takže_a': 'takže', 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', 'u': 'u:gen', 'u_příležitost': 'u_příležitosti:gen', 'uprostřed': 'uprostřed:gen', 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', 'v_čelo': 'v_čele:gen', 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', 'v_dohoda_s': 'v_dohodě_s:ins', 'v_duch': 'v_duchu:gen', 'v_důsledek': 'v_důsledku:gen', @@ -170,12 +242,14 @@ class FixEdeprels(Block): 'v_konfrontace_s': 'v_konfrontaci_s:ins', 'v_kontext_s': 'v_kontextu_s:ins', 'v_na': 'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', 'v_oblast': 'v_oblasti:gen', 'v_oblast_s': 's:ins', 'v_obor': 'v_oboru:gen', 'v_otázka': 'v_otázce:gen', 'v_podoba': 'v_podobě:gen', 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', 'v_proces': 'v_procesu:gen', 'v_prospěch': 've_prospěch:gen', 'v_protiklad_k': 'v_protikladu_k:dat', @@ -183,27 +257,34 @@ class 
FixEdeprels(Block): 'v_případ': 'v_případě:gen', 'v_případ_že': 'v_případě_že', 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', 'v_rozpor_s': 'v_rozporu_s:ins', 'v_řada': 'v_řadě:gen', 'v_shoda_s': 've_shodě_s:ins', 'v_služba': 've_službách:gen', 'v_směr': 've_směru:gen', 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc 'v_smysl': 've_smyslu:gen', 'v_součinnost_s': 'v_součinnosti_s:ins', 'v_souhlas_s': 'v_souhlasu_s:ins', 'v_soulad_s': 'v_souladu_s:ins', 'v_souvislost_s': 'v_souvislosti_s:ins', 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', 'v_spojený_s': 've_spojení_s:ins', 'v_spojitost_s': 've_spojitosti_s:ins', 'v_spolupráce_s': 've_spolupráci_s:ins', 'v_s_spolupráce': 've_spolupráci_s:ins', 'v_srovnání_s': 've_srovnání_s:ins', 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', 'v_věc': 've_věci:gen', 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', 'v_zájem': 'v_zájmu:gen', 'v_záležitost': 'v_záležitosti:gen', 'v_závěr': 'v_závěru:gen', @@ -212,9 +293,12 @@ class FixEdeprels(Block): 'v_znamení': 've_znamení:gen', 'včetně': 'včetně:gen', 'vedle': 'vedle:gen', + 'versus': 'versus:nom', 'vina': 'vinou:gen', 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', 'vzhledem': 'vzhledem_k:dat', 'vzhledem_k': 'vzhledem_k:dat', 'z': 'z:gen', @@ -225,6 +309,7 @@ class FixEdeprels(Block): 'z_strana': 'ze_strany:gen', 'z_nedostatek': 'z_nedostatku:gen', 'z_titul': 'z_titulu:gen', + 'z_začátek': 'ze_začátku:gen', 'za_pomoc': 'za_pomoci:gen', 'za_účast': 'za_účasti:gen', 'za_účel': 'za_účelem:gen', @@ -253,261 +338,333 @@ def copy_case_from_adposition(self, node, adposition): else: return None - def process_node(self, node): + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the 
basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). + cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + edeprel = bdeprel + if cdeprel: + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): """ Occasionally the edeprels automatically derived from the Czech basic trees do not match the whitelist. For example, the noun is an abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. """ - for edep in node.deps: - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) - if m: - bdeprel = m.group(1) - solved = False - # Issues caused by errors in the original annotation must be fixed early. - # Especially if acl|advcl occurs with a preposition that unambiguously - # receives a morphological case in the subsequent steps, and then gets - # flagged as solved. - edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! 
- edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' - edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) - # Removing 'až' must be done early. The remainder may be 'počátek' - # and we will want to convert it to 'počátkem:gen'. - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - # If one of the following expressions occurs followed by another preposition - # or by morphological case, remove the additional case marking. For example, - # 'jako_v' becomes just 'jako'. - for x in self.outermost: - exceptions = self.outermost[x] - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and m.group(2) and not x+m.group(2) in exceptions: - edep['deprel'] = m.group(1)+':'+x - solved = True - break - if solved: - continue - for x in self.unambiguous: - # All secondary prepositions have only one fixed morphological case - # they appear with, so we can replace whatever case we encounter with the correct one. 
- m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) - if m: - edep['deprel'] = m.group(1)+':'+self.unambiguous[x] - solved = True - break - if solved: - continue - # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + for node in tree.descendants_and_empty: + for edep in node.deps: + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) if m: - adpcase = self.copy_case_from_adposition(node, m.group(2)) - if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): - edep['deprel'] = m.group(1)+':'+adpcase + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + if re.match(r'advcl', bdeprel): + # The following advcl should in fact be obl. + if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'do:gen' + elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'k:dat' + elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych' + bdeprel = 'obl' + cdeprel = 'místo:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! 
+ bdeprel = 'obl' + cdeprel = 'od:gen' + elif re.fullmatch(r'podle(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'podle:gen' + elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu. + bdeprel = 'obl' + cdeprel = 's:ins' + elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'v_duchu:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'obl' + cdeprel = 'v:loc' + # byl by pro, abychom... ###!!! Opravit i konverzi stromu. + elif re.fullmatch(r'pro(?::acc)?', cdeprel): + cdeprel = 'aby' + elif re.match(r'acl', bdeprel): + # The following acl should in fact be nmod. + if re.fullmatch(r'k(?::dat)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'k:dat' + elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"' + bdeprel = 'nmod' + cdeprel = 'na_způsob:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'od:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'nmod' + cdeprel = 'v:loc' + else: # bdeprel is 'obl' or 'nmod' + # The following subordinators should be removed if they occur with nominals. + if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + cdeprel = '' + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. + elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel): + cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel) + elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září + cdeprel = 'gen' + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. 
+ for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: continue - if re.match(r'^(acl|advcl):', edep['deprel']): - # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). - edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating - edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - if edep['deprel'] == 'acl:v' and node.form == 'patře': - edep['deprel'] = 'nmod:v:loc' - node.deprel = 'nmod' - node.lemma = 'patro' - node.upos = 'NOUN' - node.xpos = 'NNNS6-----A----' - node.feats['Aspect'] = '' - node.feats['Gender'] = 'Neut' - node.feats['Tense'] = '' - node.feats['VerbForm'] = '' - node.feats['Voice'] = '' - elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): - if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': - # This is a same-case noun-noun modifier, which just happens to be in the locative. 
- # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has - # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. - edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' - elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' - elif edep['deprel'] == 'nmod:voc': - # 'v 8. čísle tiskoviny Ty rudá krávo' - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'nmod:co:nom': - # Annotation error: 'kompatibilní znamená tolik co slučitelný' - # 'co' should be relative pronoun rather than subordinating conjunction. 
- edep['deprel'] = 'acl:relcl' - node.deprel = 'acl:relcl' - elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): - edep['deprel'] = 'advcl:li' - elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): - edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^obl:místo_za:acc$', edep['deprel']): - # 'chytají krávu místo za rohy spíše za ocas' - # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. - for c in node.children: - if c.form == 'místo': - c.upos = 'ADV' - c.deprel = 'cc' - edep['deprel'] = 'obl:za:acc' - elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): - edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) - # The case is unknown. We need 'acc' or 'loc'. - # The locative is probably more frequent but it is not so likely with every noun. - # If there is an nummod:gov child, it must be accusative and not locative. - # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) 
- if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:arg:na_konec$', edep['deprel']): - # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' - edep['deprel'] = 'obl:arg:na:acc' - elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel) + if m: + cdeprel = self.unambiguous[x] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if re.match(r'(obl|nmod)', bdeprel): + m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(1)) + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): + cdeprel = adpcase + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + ###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well. + if re.match(r'^(acl|advcl):', edep['deprel']): + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. 
+ edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): edep['deprel'] += ':acc' - else: + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. 
+ for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. 
+ if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): - # Annotation error. 
- if node.form == 's': - ohled = node.next_node - na = ohled.next_node - noun = na.next_node - self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') - self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^nmod:pára:nom$', edep['deprel']): - # Annotation error: 'par excellence'. - edep['deprel'] = 'nmod' - for c in node.children: - if c.udeprel == 'case' and c.form.lower() == 'par': - c.lemma = 'par' - c.upos = 'ADP' - c.xpos = 'RR--X----------' - c.feats['Case'] = '' - c.feats['Gender'] = '' - c.feats['Number'] = '' - c.feats['Polarity'] = '' - c.feats['AdpType'] = 'Prep' - elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): - # Accusative would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): - # Genitive would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': - # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. - # Find the content nominal. 
- cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] - vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] - if len(cnouns) > 0 and len(vs) > 0: - cnoun = cnouns[0] - v = vs[0] - self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') - self.set_basic_and_enhanced(v, cnoun, 'case', 'case') - self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') - elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): - # ':nom' occurs in 'karneval v Rio de Janeiro' - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. 
+ edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): - # There is just one occurrence and it is an error: - # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' - # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. - edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): - # Instrumental would be possible but unlikely. - edep['deprel'] += ':acc' - else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py new file mode 100644 index 00000000..7fcb0e12 --- /dev/null +++ b/udapi/block/ud/cs/fixmorpho.py @@ -0,0 +1,471 @@ +""" +A Czech-specific block to fix lemmas, UPOS and morphological features in UD. +It should increase consistency across the Czech treebanks. It focuses on +individual closed-class verbs (such as the auxiliary "být") or on entire classes +of words (e.g. whether or not nouns should have the Polarity feature). It was +created as part of the Hičkok project (while importing nineteenth-century Czech +data) but it should be applicable on any other Czech treebank. +""" +from udapi.core.block import Block +import logging +import re + +class FixMorpho(Block): + + def process_node(self, node): + # Do not touch words marked as Foreign or Typo. They may not behave the + # way we expect in Czech data. 
+ if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes': + return + #---------------------------------------------------------------------- + # NOUNS, PROPER NOUNS, AND ADJECTIVES + #---------------------------------------------------------------------- + # Nouns do not have polarity but the Prague-style tagsets may mark it. + if node.upos in ['NOUN', 'PROPN']: + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + elif node.feats['Polarity'] == 'Neg': + logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).') + # For some nouns, there is disagreement in whether to tag and lemmatize + # them as proper nouns. We must be careful and not add too many to this + # rule, as many of them could be used as surnames and then they should + # be PROPN. + if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()): + node.lemma = node.lemma.lower() + node.upos = 'NOUN' + # Lemmatization. + if node.upos == 'NOUN' and node.lemma == 'zem': + node.lemma = 'země' + if node.upos == 'ADJ': + # Adjectives should be lemmatized to lowercase even if they are part of + # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized + # to "malý". Exception: Possessive adjectives derived from personal + # names, e.g., "Karlův". + if node.feats['Poss'] != 'Yes': + node.lemma = node.lemma.lower() + # Short forms of adjectives are rare in Modern Czech and uninflected + # (they are used as predicates), so they lack the Case feature. But + # they were inflected for Case in the past, so it is better to add + # Case=Nom for consistency. 
+ if node.feats['Variant'] == 'Short' and node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + #---------------------------------------------------------------------- + # PRONOUNS AND DETERMINERS + #---------------------------------------------------------------------- + # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form. + if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()): + node.feats['Variant'] = 'Short' + # Forms of "my" should be lemmatized as "já". + if node.upos == 'PRON' and node.lemma == 'my': + node.lemma = 'já' + # Forms of "vy" should be lemmatized as "ty". + if node.upos == 'PRON' and node.lemma == 'vy': + node.lemma = 'ty' + # Forms of "oni" should be lemmatized as "on" and cases that allow + # a preposition should have PrepCase. + if node.upos == 'PRON' and node.lemma in ['on', 'oni']: + node.lemma = 'on' + if node.feats['Case'] not in ['Nom', 'Voc']: + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + elif re.match(r'[nň]', node.form.lower()): + node.feats['PrepCase'] = 'Pre' + # In 19th century data, the grammaticalized usages of "se", "si" are + # tagged as PART (rather than a reflexive PRON, which is the standard). + # Even if it already was tagged PRON, some features may have to be added. + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: + node.lemma = 'se' + node.upos = 'PRON' + node.feats['PronType'] = 'Prs' + node.feats['Reflex'] = 'Yes' + if node.form.lower() == 'se': + # Occasionally "se" can be genitive: "z prudkého do se dorážení". + if not node.feats['Case'] == 'Gen': + node.feats['Case'] = 'Acc' + else: + node.feats['Case'] = 'Dat' + node.feats['Variant'] = 'Short' + # As the genitive/accusative form of "on", "jeho" should have PrepCase. 
+ if node.upos == 'PRON' and node.form.lower() == 'jeho': + node.feats['PrepCase'] = 'Npr' + # Possessive pronouns have Person, Gender[psor] and Number[psor]. + # Although it is questionable, plural possessors are lemmatized to singular + # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj". + # Some source corpora lack Person and [psor] features, others do not respect + # the lemmatization rule, so in the end we have to look at the forms; but + # there are potentially many variants, especially in old texts. + if node.upos == 'DET' and node.feats['Poss'] == 'Yes': + if node.form.lower().startswith('m'): + # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('t'): + # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('n'): + # náš našeho našemu našem naším naše naší naši našich našim našimi + node.lemma = 'můj' + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower().startswith('v'): + # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi + node.lemma = 'tvůj' + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower() == 'jeho': + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'jehož', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + 
node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jejíž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jich|jejich', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichž|jejichž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichžto|jejichžto', node.form.lower()): + node.lemma = 'jehožto' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif node.lemma == 'čí': + node.feats['Poss'] = 'Yes' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Reflexive possessive pronoun should not forget the Reflex=Yes feature. + if node.upos == 'DET' and node.lemma == 'svůj': + node.feats['Reflex'] = 'Yes' + # Demonstrative, interrogative, relative, negative, total and indefinite + # pronouns (or determiners, because some of them get the DET tag). + if node.upos in ['PRON', 'DET']: + # Relative pronoun "jenž" should be PRON, not DET + # (it inflects for Gender but it can never be used as congruent attribute). + if re.fullmatch(r'(jenž|jenžto)', node.lemma): + node.upos = 'PRON' + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + else: + node.feats['PrepCase'] = 'Pre' + # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above) + # (it inflects for Gender but it can never be used as congruent attribute). + elif node.form.lower() in ['ješto', 'ježto']: + node.lemma = 'jenžto' + node.upos = 'PRON' + node.feats['PrepCase'] = 'Npr' + # Relative pronoun "an" is PRON (not DET). + elif node.lemma == 'an': + node.upos = 'PRON' + node.feats['PronType'] = 'Rel' + # Pronoun "kdo" is PRON (not DET). 
+ elif node.lemma == 'kdo': + node.lemma = 'kdo' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "kdož" is PRON (not DET). + elif node.lemma == 'kdož': + node.lemma = 'kdož' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "někdo", "kdosi" is PRON (not DET). + elif re.fullmatch(r'(kdosi|někdo)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "nikdo" is PRON (not DET). + elif node.lemma == 'nikdo': + node.lemma = 'nikdo' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "co" is PRON (not DET). + elif node.lemma == 'co': + node.lemma = 'co' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "což" is PRON (not DET). 
+ elif node.lemma in ['což', 'cože']: + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "něco" is PRON (not DET). + elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "nic" is PRON (not DET). + elif node.lemma == 'nic': + node.lemma = 'nic' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "týž" is DET and PronType=Dem. + elif re.fullmatch(r'(tentýž|týž)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + # Pronoun "každý" is DET and PronType=Tot. + elif node.lemma == 'každý': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot. + elif node.form.lower() == 'vše': + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif node.lemma == 'všechen': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()): + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp. 
+ elif node.lemma in ['sám', 'samý']: + node.lemma = 'samý' + node.upos = 'DET' + node.feats['PronType'] = 'Emp' + node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else '' + #---------------------------------------------------------------------- + # PRONOMINAL NUMERALS AND ADVERBS + #---------------------------------------------------------------------- + # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot. + if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba': + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['NumForm'] = 'Word' + node.feats['PronType'] = 'Tot' + # Pronominal cardinal numerals should be DET, not NUM. + if node.upos == 'NUM': + if re.fullmatch(r'(mnoho|málo|několik)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Ind' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho? + elif re.fullmatch(r'(toliko?)', node.lemma): + node.lemma = 'tolik' + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kolik)', node.lemma): + node.upos = 'DET' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + if node.upos in ['ADV', 'NUM']: + if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Ind' + elif re.fullmatch(r'(tolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Dem' + elif re.fullmatch(r'(kolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Pronominal adverbs have PronType but most of them do not have Degree + # and Polarity. 
+ if node.upos == 'ADV': + if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma): + node.feats['PronType'] = 'Dem' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma): + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kdežto)', node.lemma): + node.feats['PronType'] = 'Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma): + node.feats['PronType'] = 'Ind' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma): + node.feats['PronType'] = 'Neg' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + # Total pronominals can be negated ("nevždy"). Then they get Degree, too. + elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma): + node.feats['PronType'] = 'Tot' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # VERBS AND AUXILIARIES + #---------------------------------------------------------------------- + # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless + # of the fact that it can participate in purely existential constructions + # where it no longer acts as a copula. Czech tagsets typically do not + # distinguish AUX from VERB, which means that converted data may have to + # be fixed. 
+ if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']: + node.upos = 'AUX' + if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()): + node.upos = 'ADV' + node.lemma = 'lze' # not 'nelze' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + node.feats['Aspect'] = '' + node.feats['Mood'] = '' + node.feats['Tense'] = '' + node.feats['Person'] = '' + node.feats['Number'] = '' + node.feats['Degree'] = 'Pos' + if node.upos in ['VERB', 'AUX']: + # Most non-passive verb forms have Voice=Act, and infinitives should + # have it, too. Passive infinitives are always periphrastic. + # (This is not done in the PDT tagset, but we should add it.) + if node.feats['VerbForm'] == 'Inf': + node.feats['Voice'] = 'Act' + # Same for imperatives. + elif node.feats['Mood'] == 'Imp': + node.feats['Voice'] = 'Act' + # Some verbs lack the Aspect feature although they are not biaspectual. + if node.feats['Aspect'] == '': + if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma): + node.feats['Aspect'] = 'Perf' + # We must look at word form to distinguish imperfective "stát" from perfective "stát se". + elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Perf' + # Present forms of perfective verbs normally have Tense=Pres despite + # meaning future. 
However, a few imperfective verbs have a separate + # future form (distinct from present form), which gets Tense=Fut + # despite inflecting similarly to present forms. + if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()): + node.feats['Tense'] = 'Fut' + # Passive participles (including the short forms) should be ADJ, not VERB. + # But they keep the verbal features of VerbForm, Voice, Aspect. + if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + # But now we need an adjectival lemma. + ###!!! Bohužel to občas zahodí normalizaci, kterou tam Martinův tým zavedl ručně, např. "rozhřita" mělo lemma "rozehřát", ale já teď místo "rozehřátý" vyrobím "rozhřitý". + ###!!! odepříno - odepříný místo odepřený + ###!!! dovolíno - dovolíný místo dovolený + ###!!! vyslyšána - vyslyšaný místo vyslyšený + ###!!! obmezený místo omezený, oslyšaný místo oslyšený + node.misc['LDeriv'] = node.lemma + node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower()) + node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán + if node.feats['Polarity'] == 'Neg': + node.lemma = re.sub(r'^ne', '', node.lemma) + if node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + node.feats['Variant'] = 'Short' + #---------------------------------------------------------------------- + # ADVERBS + #---------------------------------------------------------------------- + # Words that indicate the speaker's attitude are tagged ADV in UD, + # although the Czech tagsets often treat them as particles. 
+ if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma): + node.upos = 'ADV' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + node.misc['CzechParticle'] = 'Yes' + # Adverb "brzo" should be lemmatized as "brzy". + if node.upos == 'ADV' and node.form.lower() == 'brzo': + node.lemma = 'brzy' + if node.upos == 'ADV' and node.form.lower() == 'teprv': + node.lemma = 'teprve' + # All non-pronominal adverbs (and also some pronominal ones) should + # have Degree and Polarity. At least for now we also exclude adverbial + # numerals, e.g. "jednou" – "nejednou". + if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '': + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + if node.feats['Polarity'] == '': + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # PREPOSITIONS + #---------------------------------------------------------------------- + # Preposition "u" may combine with Case=Loc|Acc in old texts, and then + # it functions as a vocalized counterpart of "v". Nevertheless, we always + # lemmatize it as "u" and thus AdpType is Prep, not Voc. + if node.upos == 'ADP' and node.form.lower() == 'u': + node.lemma = 'u' + node.feats['AdpType'] = 'Prep' + #---------------------------------------------------------------------- + # CONJUNCTIONS + #---------------------------------------------------------------------- + # As a conjunction (and not particle/adverb), "ani" is coordinating and + # not subordinating. 
+ if node.upos == 'SCONJ' and node.lemma == 'ani': + node.upos = 'CCONJ' + if node.upos == 'CCONJ' and node.lemma == 'nebť': + node.lemma = 'neboť' + #---------------------------------------------------------------------- + # PARTICLES (other than those already grabbed above) + #---------------------------------------------------------------------- + # "jako" should be SCONJ but 19th century data have it as PART. + if node.upos == 'PART': + if node.lemma == 'jako': + node.upos = 'SCONJ' + elif node.lemma == 'ti': + node.lemma = 'ť' diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 11ecd6d9..da9f5bda 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -2,12 +2,14 @@ Block to identify missing or ill-valued features in Czech. Any bugs that it finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ -from udapi.core.block import Block -import logging +import udapi.block.ud.markfeatsbugs import re -class MarkFeatsBugs(Block): +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): # The convention used in PDT is not consistent. Adjectives are fully disambiguated # (three genders, two animacies, three numbers, seven cases), even though some @@ -21,59 +23,45 @@ class MarkFeatsBugs(Block): # in the future. 
pdt20 = False # True = like in PDT 2.0; False = like in ČNK - def bug(self, node, bugstring): - bugs = [] - if node.misc['Bug']: - bugs = node.misc['Bug'].split('+') - if not bugstring in bugs: - bugs.append(bugstring) - node.misc['Bug'] = '+'.join(bugs) - - def check_allowed_features(self, node, allowed): - """ - We need a dictionary indexed by feature names that are allowed; for each - feature name, there is a list of allowed values. - """ - # Check for features that are not allowed but the node has them. - # For features that are allowed, check that their values are allowed. - for f in node.feats: - if f in allowed: - if not node.feats[f] in allowed[f]: - self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') - else: - self.bug(node, 'Feat' + f + 'NotAllowed') - - def check_required_features(self, node, required): - """ - We need a list of names of features whose values must not be empty. - """ - for f in required: - if not f in node.feats: - self.bug(node, 'Feat' + f + 'Missing') - def process_node(self, node): + # Czech constraints should not be applied to foreign words. + if node.feats['Foreign'] == 'Yes': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) - if node.feats['Gender'] == 'Masc': + elif node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['VerbForm'] == 'Vnoun': + # verbal nouns: bytí, dělání, ... 
+ self.check_allowed_features(node, { + 'VerbForm': ['Vnoun'], + 'Gender': ['Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] + }) + elif node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) self.check_allowed_features(node, { @@ -81,17 +69,17 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 
'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': if node.feats['Poss'] == 'Yes': # possessive adjectives @@ -104,8 +92,10 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { @@ -114,31 +104,42 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': 
['Yes']}) + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives self.check_required_features(node, ['VerbForm', 'Voice']) - if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -150,9 +151,12 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -163,10 +167,13 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -176,10 +183,14 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -188,29 +199,12 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) - elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 
'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: # regular adjectives + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: # regular adjectives, including short forms if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -220,7 +214,10 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -229,7 +226,10 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -249,13 +249,24 @@ def process_node(self, node): 'PronType': ['Prs'], 'Person': ['3'] }) + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). + self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Variant': ['Short'] + }) else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně # Mostly only two gender groups and no animacy: - # Masc,Neut ... jeho, jemu, jej, něm, jím + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím # Fem ... 
jí, ji, ní # Neut ... je # No gender in dual and plural: # Plur ... jich, jim, je, nich, jimi + # Here we require PrepCase but disallow Variant. self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { 'PronType': ['Prs'], 'Person': ['3'], @@ -270,18 +281,22 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] }) - elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo # There is no Number. Někdo and nikdo behave like singular; - # kdo is by default singular as well but it also occurs as a subject - # of plural verbs. + # kdo is by default singular as well but it also occurs as subject + # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni"). + # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody) + # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kdo to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Gender': ['Masc'], 'Animacy': ['Anim'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) - elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma): # Although these pronouns behave by default as neuter singular, # no Gender and Number is annotated. However, quite unusually, # there is Animacy=Inan without Gender. @@ -290,9 +305,11 @@ def process_node(self, node): ###!!! animacy. For now, let's at least make animacy an optional ###!!! 
feature (I see that we already do not fill it in the Old ###!!! Czech data). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, co to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Animacy': ['Inan'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) @@ -300,6 +317,9 @@ def process_node(self, node): # Unlike 'jenžto', this relative pronoun does not inflect, it # always occurs in a nominative position, but the context can # be any gender and number. + # Update from the Hičkok project: 'ješto' is lemmatized to + # 'jenžto' (see below), meaning that this branch should not be + # needed for the new data. self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { 'PronType': ['Rel'], @@ -318,10 +338,24 @@ def process_node(self, node): # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even # in the nominative, although there is no prepositional counter- # part (but similarly the locative has no prepositionless form). - self.check_adjective_like(node, ['PronType', 'PrepCase'], { - 'PronType': ['Rel'], - 'PrepCase': ['Npr', 'Pre'] - }) + # Update from the Hičkok project: In Old Czech, both 'jenž' and + # 'jenžto' (or its variant 'ješto') can be used uninflected, + # accompanied by a resumptive pronoun which provides the inflection. + # In this case, the Hičkok data will not annotate Gender, Animacy, + # Number and Case of the relative pronoun. Therefore, we require + # the full set of features if any of them is present; otherwise, + # we only expect PronType and PrepCase. 
+ if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) else: # What remains is the relative pronoun 'an'. It behaves similarly # to 'jenž' but it does not have the PrepCase feature and it @@ -340,6 +374,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'PronType': ['Rel'], 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom'] }) @@ -347,45 +382,131 @@ def process_node(self, node): elif node.upos == 'DET': # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. 
- if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut'] + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). }) - elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. + # Normally determiners do not change j->n after prepositions but we + # have an example in Old Czech (štěpové zlatí, na nichžto větviech...) 
+ elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). + }) + # Feminine personal possessive determiner. + elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - # Congruent gender is annotated only in singular. Masculine and - # neuter are merged even in nominative. Feminine singular does - # not distinguish case in PDT but we need it in Old Czech at - # least for 'jejiej'. 
- if node.feats['Number'] == 'Sing': + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Rel'], 'Poss': ['Yes'], - 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Number': ['Dual', 'Plur'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 
'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) - elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma): if node.feats['Reflex'] == 'Yes': self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { 'PronType': ['Prs'], @@ -399,8 +520,46 @@ def process_node(self, node): 'Person': ['1', '2'], 'Number[psor]': ['Sing', 'Plur'] }) + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". + # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc. + self.check_adjective_like(node, ['PronType', 'NumType'], { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Ord', 'Mult'] + }) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. 
+ if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). + self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) else: - self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -411,29 +570,44 @@ def process_node(self, node): 'NumForm': ['Digit', 'Roman'] }) else: - ###!!! Somehow the NumValue feature from PDT via Interset is useless. + if node.feats['NumType'] == 'Sets': + # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery' + # Number should perhaps be only Plur because the counted noun will be Plur. + # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny', + # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy). 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Sets'], + 'PronType': ['Tot'], # for 'oboje' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. # 'pět' and more have Number=Plur, Case: pět, pěti. - if node.lemma == 'jeden': - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). 
+ elif node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) elif re.match(r'^(dva|oba)$', node.lemma): - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) if self.pdt20: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm 'Number': ['Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] @@ -441,33 +615,73 @@ def process_node(self, node): else: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + elif re.match(r'^(dvé|obé)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'obé' + 'NumForm': ['Word'], + 'Gender': ['Neut'], + 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 
'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Number': ['Plur'], + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) # VERBS AND AUXILIARIES ################################################ - elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. 
+ self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: # There is no voice. For some reason, PDT does not annotate that # the infinitive form is active (while a passive infinitive is # a combination of the infinitive with a passive participle). self.check_required_features(node, ['Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Inf'], + 'VerbForm': ['Inf', 'Sup'], 'Polarity': ['Pos', 'Neg'] }) elif node.feats['VerbForm'] == 'Fin': @@ -475,24 +689,46 @@ def process_node(self, node): # imperatives (although passive imperatives are a combination # of the active imperative and a passive participle). It is # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). if node.feats['Mood'] == 'Cnd': - self.check_required_features(node, ['Mood', 'Person']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Cnd'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person - }) + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) elif node.feats['Mood'] == 'Imp': self.check_required_features(node, 
['Mood', 'Person', 'Number', 'Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) 'Number': ['Sing', 'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] }) else: # indicative self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) @@ -505,7 +741,8 @@ def process_node(self, node): 'Person': ['1', '2', '3'], 'Number': ['Sing', 'Dual', 'Plur'], 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Emph': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB if node.feats['Gender'] == 'Masc': @@ -532,43 +769,136 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'] }) else: # converb - self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Conv'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy - 'Polarity': ['Pos', 'Neg'] - }) + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. 
+ if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. + if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. 
self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] - }) - elif node.feats['Degree'] != '': - # Adverbs that are compared can also be negated. - self.check_required_features(node, ['Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) 
+ self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': self.check_required_features(node, ['AdpType', 'Case']) self.check_allowed_features(node, { 'AdpType': ['Prep', 'Voc'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." = "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] }) # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) self.check_allowed_features(node, {}) def check_adjective_like(self, node, r0, a0): @@ -583,7 +913,7 @@ def check_adjective_like(self, node, r0, a0): caller in parameters r0 (list) and a0 (dict). 
""" required_features = [] - allowed_featurs = {} + allowed_features = {} full_set = node.upos == 'ADJ' or not self.pdt20 if full_set: # Even in the full set, animacy is only distinguished for the diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 8381c69f..ac753ed5 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -8,13 +8,13 @@ import udapi.block.ud.addmwt MWTS = { - 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur'}, - 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur'}, - 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur'}, - 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing'}, - 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Gender=*|Number=Sing'}, + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Definite=Def|Gender=Neut|Number=Plur|PronType=Art'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Plur|PronType=Art'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Definite=Def|Gender=*|Number=Sing|PronType=Art'}, } # shared values for all entries in MWTS diff --git a/udapi/block/ud/es/elque.py b/udapi/block/ud/es/elque.py new file mode 100644 index 00000000..4d14b98d --- /dev/null +++ b/udapi/block/ud/es/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que, el cual...'). 
+It is written for Spanish but a similar block should work for other Romance +languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if re.match(r'^(que|cual)$', node.lemma) and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. + if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. 
+ stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. + attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. 
+ """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py new file mode 100644 index 00000000..a2e4439c --- /dev/null +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -0,0 +1,103 @@ +""" +Block ud.FixAdvmodByUpos will change the dependency relation from advmod to something else +if the UPOS is not ADV. +""" +from udapi.core.block import Block + + +class FixAdvmodByUpos(Block): + """ + Make sure advmod is not used with UPOS it should not be used with. + """ + + def process_node(self, node): + if node.udeprel == 'advmod': + if node.upos in ['NOUN', 'PROPN', 'PRON', 'DET', 'NUM']: + node.deprel = 'obl' + elif node.upos == 'VERB': + node.deprel = 'advcl' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos in ['ADP', 'SCONJ']: + if node.parent.upos == 'VERB': + node.deprel = 'mark' + else: + node.deprel = 'case' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + else: + node.deprel = 'dep' + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. 
+ elif node.udeprel == 'expl': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.udeprel == 'mark': + if node.upos in ['PRON', 'DET']: + node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.upos == 'NOUN': + node.deprel = 'dep' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'NUM': + node.deprel = 'nummod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' + elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 
'dislocated', 'expl', 'nmod']: + node.deprel = 'nmod' diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py new file mode 100644 index 00000000..90596e35 --- /dev/null +++ b/udapi/block/ud/fixcompoundname.py @@ -0,0 +1,46 @@ +""" +Block ud.FixCompoundName finds compound relations between PROPN nodes and converts +them to flat:name. This is not necessarily correct in all situations. The difference +between compound and flat is that compound allows to distinguish head and modifier. +Multiword person names (given name and surname, or various other patterns) typically +should be analyzed as flat but there are treebanks that incorrectly use compound +for person names. This block can be used to fix them. +""" +from udapi.core.block import Block +import regex as re +import logging + + +class FixCompoundName(Block): + """ + Converts a compound relation between two PROPN nodes into a flat relation. + Compounds of a PROPN and a non-PROPN will be left alone, although they are + suspicious, too. + """ + + def process_node(self, node): + if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + origparent = node.parent + grandparent = origparent.parent + outdeprel = origparent.deprel + # See if there are other PROPN compound siblings. + # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.) + namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)] + # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. + # This is wrong but it is also different from personal names we are targeting here. + # Hence, we will skip "names" that contain numbers. + if any(re.search(r"\d", x.form) for x in namewords): + #logging.info(str([x.misc['Translit'] for x in namewords])) + ###!!! 
We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. + if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. + head = namewords[0] + rest = namewords[1:] + if head != origparent: + head.parent = grandparent + head.deprel = outdeprel + for n in rest: + n.parent = head + n.deprel = 'flat:name' diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py new file mode 100644 index 00000000..485b85f0 --- /dev/null +++ b/udapi/block/ud/fixmultiobjects.py @@ -0,0 +1,47 @@ +""" +Block ud.FixMultiObjects will ensure that no node has more than one (direct) object child. +""" +from udapi.core.block import Block + + +class FixMultiObjects(Block): + """ + Make sure there is at most one object. + """ + + def process_node(self, node): + objects = [x for x in node.children if x.udeprel == 'obj'] + if len(objects) > 1: + subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']] + # Some heuristics that could work in AnCora: + # If all objects are after the verb, keep the one that is closest to the verb. 
+ if objects[0].ord > node.ord: + objects = objects[1:] + for o in objects: + o.deprel = 'obl:arg' + o.deps[0]['deprel'] = 'obl:arg' + elif objects[-1].ord < node.ord: + objects = objects[:-1] + for o in objects: + o.deprel = 'dislocated' + o.deps[0]['deprel'] = 'dislocated' + # ho experimenta tot + elif objects[-1].lemma in ['tot', 'todo']: + objects[-1].parent = objects[0] + objects[-1].deprel = 'nmod' + objects[-1].deps[0]['parent'] = objects[0] + objects[-1].deps[0]['deprel'] = 'nmod' + # X se llama Y + elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']: + objects[-1].deprel = 'xcomp' + objects[-1].deps[0]['deprel'] = 'xcomp' + elif len(subjects) == 0: + objects[0].deprel = 'nsubj' + objects[0].deps[0]['deprel'] = 'nsubj' + else: + objects[0].deprel = 'dislocated' + objects[0].deps[0]['deprel'] = 'dislocated' + # For the moment, we take the dummiest approach possible: The first object survives and all others are forced to a different deprel. + #objects = objects[1:] + #for o in objects: + # o.deprel = 'iobj' diff --git a/udapi/block/ud/fixmultisubjects.py b/udapi/block/ud/fixmultisubjects.py new file mode 100644 index 00000000..f8aeca06 --- /dev/null +++ b/udapi/block/ud/fixmultisubjects.py @@ -0,0 +1,23 @@ +""" +Block ud.FixMultiSubjects will ensure that no node has more than one subject child (except those +marked as :outer). +""" +import re +from udapi.core.block import Block + + +class FixMultiSubjects(Block): + """ + Make sure there is at most one subject that is not marked as :outer. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r"^[nc]subj(:|$)", x.deprel) and not re.search(r":outer$", x.deprel)] + # For the moment, we take the dummiest approach possible: The first subject survives and all others are forced to a different deprel. 
+ if len(subjects) > 1: + subjects = subjects[1:] + for s in subjects: + if re.match(r"^n", s.deprel): + s.deprel = 'obl' + else: + s.deprel = 'advcl' diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ecc5f0bd..f4d9a1ec 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -2,7 +2,6 @@ but they should be treated as normal verbs (with secondary predication) instead.""" from udapi.core.block import Block -import logging import re class FixPseudoCop(Block): diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index 15d310c7..854a24a8 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -50,22 +50,34 @@ def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwar Args: check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT. The default is false, which means that fixed punctuation is detected only - based on the form with the exception of single quote / apostrophe character, - which is frequently ambiguous, so UPOS=PUNCT is checked always. - copy_to_enhanced: for all PUNCT nodes, let the enhanced depencies be the same - as the basic dependencies. + based on the form with the exception of single & double quote character, + which is frequently ambiguous*, so UPOS=PUNCT is checked always. + *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol. + copy_to_enhanced: for all upos=PUNCT, let the enhanced depencies + be the same as the basic dependencies. 
""" super().__init__(**kwargs) self._punct_type = None self.check_paired_punct_upos = check_paired_punct_upos self.copy_to_enhanced = copy_to_enhanced + def _is_punct(self, node): + if node.upos == 'PUNCT': + return True + if self.check_paired_punct_upos: + return False + if node.form in "'\"": + return False + if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values(): + return True + return False + def process_tree(self, root): # First, make sure no PUNCT has children. # This may introduce multiple subroots, which will be fixed later on # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: - while node.parent.upos == 'PUNCT': + while self._is_punct(node.parent): node.parent = node.parent.parent # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. @@ -77,7 +89,7 @@ def process_tree(self, root): self._punct_type = [None] * (1 + len(root.descendants)) for node in root.descendants: if self._punct_type[node.ord] != 'closing': - closing_punct = PAIRED_PUNCT.get(node.form, None) + closing_punct = PAIRED_PUNCT.get(node.form) if closing_punct is not None: self._fix_paired_punct(root, node, closing_punct) @@ -99,6 +111,8 @@ def process_tree(self, root): # This may not hold if the original subroot was a paired punctuation, which was rehanged. 
if root.children[0].udeprel != 'root': root.children[0].udeprel = 'root' + if self.copy_to_enhanced: + root.children[0].deps = [{'parent': root, 'deprel': 'root'}] for another_node in root.children[0].descendants: if another_node.udeprel == 'root': another_node.udeprel = 'punct' @@ -107,7 +121,7 @@ def process_tree(self, root): if self.copy_to_enhanced: for node in root.descendants: if node.upos == 'PUNCT': - node.deps = [{'parent': node.parent, 'deprel': 'punct'}] + node.deps = [{'parent': node.parent, 'deprel': node.deprel}] def _fix_subord_punct(self, node): # Dot used as the ordinal-number marker (in some languages) or abbreviation marker. @@ -148,13 +162,13 @@ def _fix_subord_punct(self, node): if l_cand is None or l_cand.is_root(): l_cand, l_path = None, [] else: - while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) - and not node.precedes(l_cand.descendants(add_self=1)[-1])): + while (not l_cand.parent.is_root() and l_cand.parent < node + and not node < l_cand.descendants(add_self=1)[-1]): l_cand = l_cand.parent l_path.append(l_cand) if r_cand is not None: - while (not r_cand.parent.is_root() and node.precedes(r_cand.parent) - and not r_cand.descendants(add_self=1)[0].precedes(node)): + while (not r_cand.parent.is_root() and node < r_cand.parent + and not r_cand.descendants(add_self=1)[0] < node): r_cand = r_cand.parent r_path.append(r_cand) @@ -203,7 +217,7 @@ def _causes_gap(self, node): def _fix_paired_punct(self, root, opening_node, closing_punct): if (self.check_paired_punct_upos - or opening_node.form == "'") and opening_node.upos != 'PUNCT': + or opening_node.form in "'\"") and opening_node.upos != 'PUNCT': return nested_level = 0 for node in root.descendants[opening_node.ord:]: @@ -226,8 +240,8 @@ def _fix_pair(self, root, opening_node, closing_node): if node == opening_node or node == closing_node: continue # If this is a node inside of the pair, is its parent outside? 
- if opening_node.precedes(node) and node.precedes(closing_node): - if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): + if node > opening_node and node < closing_node: + if node.parent < opening_node or node.parent > closing_node: if node.upos == 'PUNCT': punct_heads.append(node) else: @@ -236,12 +250,11 @@ def _fix_pair(self, root, opening_node, closing_node): # they also must not cause non-projectivity of other relations. This could # happen if an outside node is attached to an inside node. To account for # this, mark the inside parent as a head, too. - else: - if opening_node.precedes(node.parent) and node.parent.precedes(closing_node): - if node.parent.upos == 'PUNCT': - punct_heads.append(node.parent) - else: - heads.append(node.parent) + elif node.parent > opening_node and node.parent < closing_node: + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) + else: + heads.append(node.parent) # Punctuation should not have children, but if there is no other head candidate, # let's break this rule. diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b --- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. +""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. Change their deprel to 'parataxis'. 
+ - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index d328212d..4ea23d06 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -242,7 +242,7 @@ def merge_reduplication(self, node): hyph.remove() node.remove() first.misc['SpaceAfter'] = '' - mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) + mwt = root.create_multiword_token([first, second], form=first.form + second.form, misc=mwtmisc) else: first.form = first.form + '-' + node.form if node.no_space_after: @@ -288,7 +288,7 @@ def merge_reduplication(self, node): prefix.remove() hyph.remove() stem.misc['SpaceAfter'] = '' - mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) + mwt = root.create_multiword_token([stem, second], form=stem.form + second.form, misc=mwtmisc) else: stem.form = prefix.form + '-' + stem.form prefix.remove() @@ -345,7 +345,7 @@ def fix_satu_satunya(self, node): if mwt: mwtmisc = mwt.misc.copy() mwt.remove() - mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc) + mwt = root.create_multiword_token([satu0, nya], form=satu0.form + nya.form, misc=mwtmisc) satu0.misc['SpaceAfter'] = '' root.text = root.compute_text() if 
node.multiword_token and node.no_space_after: diff --git a/udapi/block/ud/joinasmwt.py b/udapi/block/ud/joinasmwt.py index 02c54206..be93bd3c 100644 --- a/udapi/block/ud/joinasmwt.py +++ b/udapi/block/ud/joinasmwt.py @@ -22,19 +22,30 @@ def process_node(self, node): if node.multiword_token: return mwt_nodes = [node] - while (node.no_space_after and node.next_node and not node.next_node.multiword_token - and node.form[-1].isalpha() and node.next_node.form[0].isalpha()): + while (node.next_node and not node.next_node.multiword_token + and self.should_join(node, node.next_node)): node = node.next_node mwt_nodes.append(node) if len(mwt_nodes) > 1: - mwt_form = ''.join([n.form for n in mwt_nodes]) - mwt = node.root.create_multiword_token(mwt_nodes, mwt_form) - if node.misc['SpaceAfter'] == 'No': - mwt.misc['SpaceAfter'] = 'No' + self.create_mwt(mwt_nodes) + + def should_join(self, node, next_node): + return node.no_space_after and node.form[-1].isalpha() and next_node.form[0].isalpha() + + def create_mwt(self, mwt_nodes): + mwt_form = ''.join([n.form for n in mwt_nodes]) + mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form) + if mwt_nodes[0].node.misc['SpaceAfter'] == 'No': + mwt.misc['SpaceAfter'] = 'No' + for mwt_node in mwt_nodes: + del mwt_node.misc['SpaceAfter'] + if self.revert_orig_form: for mwt_node in mwt_nodes: - del mwt_node.misc['SpaceAfter'] - if self.revert_orig_form: - for mwt_node in mwt_nodes: - if mwt_node.misc['OrigForm']: - mwt_node.form = mwt_node.misc['OrigForm'] - del mwt_node.misc['OrigForm'] + if mwt_node.misc['OrigForm']: + mwt_node.form = mwt_node.misc['OrigForm'] + del mwt_node.misc['OrigForm'] + self.postprocess_mwt() + + # a helper method to be overriden + def postprocess_mwt(self, mwt): + pass diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a 
given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. (The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) + Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. + """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. 
+ """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. 
+ for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). + prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. + node.remove() diff --git a/udapi/block/ud/la/addmwt.py b/udapi/block/ud/la/addmwt.py new file mode 100644 index 00000000..27831151 --- /dev/null +++ b/udapi/block/ud/la/addmwt.py @@ -0,0 +1,41 @@ +""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """ +import udapi.block.ud.addmwt + +MWTS = { + 'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'secum': {'lemma': 'sui cum', 'form': 'se cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, # can be singular or plural + 'nonne': {'lemma': 
'non ne', 'form': 'non ne', 'upos': 'PART PART', 'feats': 'Polarity=Neg Clitic=Yes|PartType=Int', 'deprel': 'advmod:neg discourse', 'shape': 'sibling'} +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + # v['xpos'] = '' # treebank-specific + if 'shape' not in v: + v['shape'] = 'subtree' + v['main'] = 0 + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + if node.form.endswith('.') and len(node.form) > 1 and node.form != '...': + # currently under discussion + return {'form': node.form[:-1] + ' .', + 'lemma': '* .', + 'upos': '* PUNCT', + 'xpos': '_ _', + 'feats': '* _', + 'deprel': '* punct', + 'main': 0, + 'shape': 'subtree'} + diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..a7b506e8 --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,338 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. 
+ + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. By default, a more conservative set of features and + values is expected. + """ + super().__init__(**kwargs) + self.flavio = flavio + + def process_node(self, node): + rf = [] + af = {} + # PROIEL-specific: greek words without features + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'VerbForm': ['Part', 'Vnoun']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + af['InflClass'] = ['Ind', 'IndEurA', 
'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] + af = { + 'NumType': ['Dist', 'Mult', 'Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Person'] = ['3'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 3rd person must have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + 
af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + if node.lemma in ['qui', 'quicumque', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: + af['PronType'].append('Int') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] + } + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = 'Yes' + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). 
+ if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: + af['PronType'].append('Con') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + af['PronType'].append('Art') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card', 'Ord'], + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} + # Arabic digits and Roman numerals do not have inflection features. + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + rf = ['VerbForm', 'Aspect'] + af = { + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], + 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], + 'Polarity': ['Neg'], + 'Typo': ['Yes'] + } + if node.feats['VerbForm'] not in ['Part', 'Conv']: + rf.append('Tense') + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): + rf.append('Voice') + af['Voice'] = ['Act', 'Pass'] + if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['VerbForm'] 
== 'Part': + rf.extend(['Gender', 'Number', 'Case']) + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs', 'Cmp'] + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum + 'Polarity': ['Neg'] + } + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin', 'Part'] + af['Degree'].append('Dim') + self.check_allowed_features(node, af) + # PARTICLES ############################################################ + 
elif node.upos == 'PART': + af = { + 'PartType': ['Int', 'Emp'], + 'Polarity': ['Neg'] + } + if self.flavio: + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg'], + 'Compound': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } + if self.flavio: + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) diff --git a/udapi/block/ud/lt/fixedeprels.py b/udapi/block/ud/lt/fixedeprels.py new file mode 100644 index 00000000..9b1cb98d --- /dev/null +++ b/udapi/block/ud/lt/fixedeprels.py @@ -0,0 +1,144 @@ +"""Block to fix case-enhanced dependency relations in Lithuanian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. 
+ outermost = { + 'kaip': [], + 'lyg': [], + 'negu': [], + 'nei': [], + 'nes': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'apie': 'apie:acc', # about (topic) + 'dėl': 'dėl:gen', # because of + 'iki': 'iki:gen', # until + 'iš': 'iš:gen', # from, out of + 'į': 'į:acc', # to, into, in + 'jei': 'jei', # remove morphological case # if + 'jeigu': 'jeigu', # remove morphological case # if + 'jog': 'jog', # remove morphological case # because + 'kadangi': 'kadangi', # remove morphological case # since, because + 'kai': 'kai', # remove morphological case # when + 'kaip': 'kaip', # remove morphological case # as, than + 'lyg': 'lyg', # remove morphological case # like + 'negu': 'negu', # remove morphological case # than + 'nei': 'nei', # remove morphological case # more than + 'nes': 'nes', # remove morphological case # because + 'nors': 'nors', # remove morphological case # though, although, when, if + 'nuo': 'nuo:gen', # from + 'pagal': 'pagal:acc', # according to, under, by + 'pagal_dėl': 'pagal:acc', + 'per': 'per:acc', # through, over (přes) + 'prie': 'prie:gen', # to, at, near, under + 'prieš': 'prieš:acc', # against + 'su': 'su:ins', # with + 'tarp': 'tarp:gen', # between + 'tarsi': 'tarsi', # remove morphological case # as if + 'virš': 'virš:gen' # above + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. 
+ prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. Exclude 'nom' and 'voc', which cannot + # be correct. + m = re.match(r'^(obl(?::arg)?|nmod):(po|už)(?::(?:nom|voc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase and not re.search(r':(nom|voc)$', adpcase): + edep['deprel'] = m.group(1)+':'+adpcase + continue + # The remaining instance of 'po' should be ':acc'. + elif m.group(2) == 'po': + edep['deprel'] = m.group(1)+':po:acc' + continue + # The remaining 'už' are ':acc' (they are second conjuncts + # in coordinated oblique modifiers). + elif m.group(2) == 'už': + edep['deprel'] = m.group(1)+':už:acc' + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. 
+ ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 5ca0f703..ee58084a 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. - if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': @@ -127,22 +127,22 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - - subject_children = [n for n in node.children if 'subj' in n.udeprel] + subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: - self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child') - - object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')] + self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') + + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. 
+ # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. + object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') - # In addition to http://universaldependencies.org/svalidation.html - if parent.udeprel == 'punct': - self.log(node, 'punct-child', 'parent.deprel=punct') - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -154,7 +154,7 @@ def process_node(self, node): # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) @@ -184,14 +184,6 @@ def process_node(self, node): if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') - # http://universaldependencies.org/u/dep/cc.html says - # "cc is the relation between a conjunct and a preceding - # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." - # No other upos is allowed in the documentation, although e.g. PART is common in the data. - # There are clear cases of adverbs in role of cc (e.g. "respektive" in Swedish and Czech). 
- if udeprel == 'cc' and upos not in ('CCONJ', 'ADV'): - self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) - if udeprel == 'cop': lemma = node.lemma if node.lemma != '_' else form self.cop_nodes[lemma].append(node) diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..26c5624d --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,73 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. + +Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. 
+ """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..13c8434c --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,279 @@ +""" +Block to identify missing or ill-valued features in Malayalam. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. 
+ +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. + if node.feats['Foreign'] == 'Yes': + pass + # NOUNS AND PROPER NOUNS ############################################### + elif re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'NumType': ['Ord'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 
'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': + rf = ['PronType'] + else: # not reflexive + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] + if node.feats['Number'] == 'Sing': + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) 
+ #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) + self.check_allowed_features(node, { + 'PronType': ['Art'], + 'Definite': ['Ind'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['PronType']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Frac'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + # Unlike other forms, the imperative 
distinguishes politeness. + # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Polite']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Polite': ['Infm', 'Form'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Pot', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. 
+ #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes']}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {'Abbr': ['Yes'], 'Typo': ['Yes']}) diff --git a/udapi/block/ud/printfixed.py b/udapi/block/ud/printfixed.py new file mode 100644 index 00000000..313943bb --- /dev/null +++ b/udapi/block/ud/printfixed.py @@ -0,0 +1,104 @@ +""" +Block PrintFixed prints occurrences of fixed multiword expressions in UD. It +can be run twice in a row, first collecting known fixed expressions and then +also reporting other occurrences of these expressions where they are not +annotated as fixed. 
class PrintFixed(Block):
    """
    Print fixed multiword expressions.
    """

    def __init__(self, only_forms=False, known_expressions=None, **kwargs):
        """
        Create the PrintFixed block.

        Parameters:
        only_forms=1: print the word forms but not tags and other info;
            This can be used to create the list of known forms that we want to
            identify even if they are not annotated as fixed.
        known_expressions: the name of the text file with the expressions
            (one expression per line, words separated by single spaces)
        """
        super().__init__(**kwargs)
        self.only_forms = only_forms
        self.known_expressions = {}   # expression string -> how many times it was read
        self.first_words = {}         # first word of a known expression -> 1 (set-like lookup)
        self.max_length = 2           # length in words of the longest known expression
        if known_expressions:
            n = 0
            # Use a context manager so the file handle is closed even on error
            # (the previous version leaked the handle).
            with open(known_expressions, 'r', encoding='utf-8') as fh:
                for expression in fh:
                    expression = expression.rstrip('\n')
                    self.known_expressions[expression] = self.known_expressions.get(expression, 0) + 1
                    logging.info("Read known fixed expression '%s'" % expression)
                    n += 1
                    words = expression.split(' ')
                    self.first_words[words[0]] = 1
                    if len(words) > self.max_length:
                        self.max_length = len(words)
            logging.info('Read %d known fixed expressions.' % n)

    def process_node(self, node):
        """Print the fixed expression headed by this node, or report a known
        expression that starts here but is not annotated as fixed."""
        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
        if len(fixed_children) > 0:
            # Fixed children are always to the right of the parent. But there
            # may be other nodes in between that are not fixed children (for
            # example, there may be punctuation that is attached to one of the
            # fixed nodes). Such gaps are rendered as 'X'.
            n = node
            list_of_forms = [node.form.lower()]
            list_of_tags = [node.upos]
            while n != fixed_children[-1]:
                n = n.next_node
                if n.parent == node and n.udeprel == 'fixed':
                    list_of_forms.append(n.form.lower())
                    list_of_tags.append(n.upos)
                else:
                    list_of_forms.append('X')
                    list_of_tags.append('X')
            forms = ' '.join(list_of_forms)
            tags = ' '.join(list_of_tags)
            if self.only_forms:
                print(forms)
            else:
                print("%s / %s / %s" % (forms, tags, node.deprel))
        else:
            # If this is not the first word of a fixed expression, check whether
            # something that looks like a known fixed expression starts here.
            # Note that it is also possible that a known expression starts here
            # but only a subset is actually marked as such; we currently do not
            # account for this.
            if node.form.lower() in self.first_words:
                n = node
                list_of_forms = [node.form.lower()]
                list_of_tags = [node.upos]
                for i in range(self.max_length - 1):
                    n = n.next_node
                    if not n:
                        break
                    ###!!! At present we cannot identify known expressions with gaps ('X').
                    list_of_forms.append(n.form.lower())
                    list_of_tags.append(n.upos)
                    forms = ' '.join(list_of_forms)
                    if forms in self.known_expressions:
                        if self.only_forms:
                            print(forms)
                        else:
                            tags = ' '.join(list_of_tags)
                            print("%s / %s / NOT FIXED" % (forms, tags))
                        break
+ return token.form.isalnum() and token.form.lower() != 'al' + + def process_tree(self, root): + tokens, i = root.token_descendants, 1 + while i+1 < len(tokens): + start_i = i-1 + if tokens[i].form == "-" and self._ok(tokens[i-1]) and self._ok(tokens[i+1]): + while i+3 < len(tokens) and tokens[i+2].form == "-" and self._ok(tokens[i+3]): + i += 2 + compound, words = tokens[start_i:i+2], [] + for token in compound: + words += token.words + heads = [w for w in words if w.parent not in words] + cuckolds = [w for w in words if w not in heads and any(c not in words for c in w.children)] + if len(heads) > 1: + for h in heads: + h.misc["ToDo"] = 'NonCatenaCompound' + elif cuckolds: + for c in cuckolds: + c.misc["ToDo"] = 'HasChildrenOutsideCompound' + else: + compound_form = "".join(t.form for t in compound) + for hyphen in compound[1::2]: + hyphen.remove() + root.create_multiword_token([w for w in words if w.form != '-'], compound_form) + root.text = None + i += 1 diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index c5321221..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,10 @@ class SetSpaceAfterFromText(Block): """Block for 
class SetTranslation(Block):
    """
    Set text_en to the next available translation.
    """

    def __init__(self, file, overwrite=False, **kwargs):
        """
        Create the SetTranslation block.

        Parameters:
        file: the name of the text file with the translations (one sentence per line)
        overwrite=1: set the translation even if the sentence already has one
            (default: do not overwrite existing translations)
        """
        super().__init__(**kwargs)
        self.file = file
        # Use a context manager so the file handle does not leak. Strip the
        # trailing newline from each line here: the previous version kept it,
        # which embedded a '\n' inside the comment line and produced an extra
        # malformed (empty) comment on output.
        with open(self.file, 'r', encoding='utf-8') as fh:
            self.trlines = [line.rstrip('\n') for line in fh]
        self.nlines = len(self.trlines)
        self.iline = 0  # index of the next unused translation line
        self.overwrite = overwrite

    def process_tree(self, tree):
        """Attach the next translation line as a 'text_en' sentence-level comment."""
        if self.iline < self.nlines:
            translation = self.trlines[self.iline]
            self.iline += 1
            comments = []
            if tree.comment:
                comments = tree.comment.split('\n')
            # Find an existing 'text_en =' comment line, if any.
            i_tr = -1
            for i in range(len(comments)):
                # The initial '#' character has been stripped.
                if re.match(r'\s*text_en\s*=', comments[i]):
                    i_tr = i
                    break
            if i_tr >= 0:
                # An existing translation is replaced only on request.
                if self.overwrite:
                    comments[i_tr] = ' text_en = ' + translation
            else:
                comments.append(' text_en = ' + translation)
            tree.comment = '\n'.join(comments)
        elif self.iline == self.nlines:
            # Warn only once when we run out of translations; bump the counter
            # past nlines so subsequent sentences do not repeat the warning.
            self.iline += 1
            logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines)
+""" +from udapi.core.block import Block +import re +import logging + + +class SplitToken(Block): + """ + Split a token into two or more. A MISC attribute is used to mark the tokens + that should be split. (The attribute may have been set by an annotator or + by a previous block that tests the specific conditions under which splitting + is desired.) Multiword tokens are currently not supported: The node to be + split cannot belong to a MWT. Note that the result will not be a MWT either + (use the block ud.AddMwt if that is desired). There will be simply a new + attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes + (indicating that this was an error in the source text). + """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. + It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. 
We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. + if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. 
For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node + last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 094f181a..44575e0c 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -23,7 +23,7 @@ class SplitUnderscoreTokens(Block): Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4. """ - def __init__(self, deprel=None, default_deprel='flat', **kwargs): + def __init__(self, deprel=None, default_deprel='flat', lemma='split', **kwargs): """Create the SplitUnderscoreTokens block instance. Args: @@ -31,14 +31,21 @@ def __init__(self, deprel=None, default_deprel='flat', **kwargs): Most common values are: flat, fixed, compound. Default=None. 
class Lemmatize(Block):
    """Add missing lemmas in cases where it seems obvious what the lemma should be."""

    def __init__(self, rewrite='empty', **kwargs):
        """
        Create the ud.zh.Lemmatize block instance.

        Args:
        rewrite=empty: set the lemma if it was empty so far; do not touch the rest
        rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest
        rewrite=all: set the lemma regardless of what it was previously
        """
        super().__init__(**kwargs)
        if not re.match(r'^(empty|form|all)$', rewrite):
            raise ValueError("Unexpected value of parameter 'rewrite'")
        self.rewrite = rewrite

    # Class-level dictionary shared by all instances: form --> lemma.
    lemma = {
        # The plural suffix -men.
        '我們': '我', # trad
        '我们': '我', # simp
        '他們': '他', # trad
        '他们': '他', # simp
        '它們': '它', # trad
        '它们': '它', # simp
        '牠們': '牠', # trad
        '她們': '她', # trad
        '她们': '她', # simp
        '人們': '人', # trad
        '人们': '人' # simp
    }

    def process_node(self, node):
        """
        Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization
        of Sino-Tibetan languages is pretty straightforward most of the time,
        as the lemma typically equals to the actual word form.
        """
        # NOTE(review): 'and' binds tighter than 'or', so the guard below reads
        # lemma == '' OR (lemma == '_' AND form != '_' AND Typo != 'Yes').
        # If the form/Typo conditions were meant to guard the empty-lemma case
        # as well, parentheses are missing -- TODO confirm.
        if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
            return
        elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
            return
        # Lemmatize negated verbs to their affirmative forms.
        # 不是 bùshì = not be
        # 沒有 没有 méiyǒu = not exist
        # 沒能 没能 méinéng = cannot
        # 未能 wèinéng = cannot
        # Lemmatize question verbs to their base forms.
        # 要不要 yàobùyào = do (you) want?
        # 有没有 yǒuméiyǒu = do (you) have?
        # Verbs that are derived from the copula and tagged as the copula need
        # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi).
        # 亦為 亦为 yìwèi = also
        # 則為 则为 zéwèi = then
        # 更為 更为 gèngwèi = more
        # 認為 认为 rènwéi = to think, to believe
        # 以為 以为 yǐwéi = to think, to believe
        # 以爲 以为 yǐwéi = to think, to believe
        if re.match(r'^(AUX|VERB)$', node.upos):
            m1 = re.match(r'^([不没沒未])(.+)$', node.form)
            m2 = re.match(r'^(.+)([不没沒未])\1$', node.form)
            m3 = re.search(r'([是爲為为])', node.form)
            if m1:
                # Negative prefix: the lemma is the rest of the form.
                node.lemma = m1.group(2)
                node.feats['Polarity'] = 'Neg'
            elif m2:
                # A-not-A question form: the lemma is the base verb.
                node.lemma = m2.group(1)
                node.feats['Mood'] = 'Int'
            elif m3:
                # Copula-derived verb: the lemma is the copula character.
                node.lemma = m3.group(1)
                # Normalize the traditional variant 爲 to 為.
                if node.lemma == '爲':
                    node.lemma = '為'
            # NOTE(review): an AUX/VERB matching none of the patterns above
            # keeps its current lemma (possibly empty) -- confirm intended.
        elif node.form in self.lemma:
            node.lemma = self.lemma[node.form]
        else:
            node.lemma = node.form
model_alias, online self._tool = None self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment + self.ranges, self.delete_nodes = ranges, delete_nodes @property def tool(self): @@ -134,44 +142,58 @@ def tool(self): if not self.model: if not self.model_alias: raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!') - self.model = KNOWN_MODELS[self.model_alias] - self._tool = UDPipe(model=self.model) + if self.online: + self.model = self.model_alias + else: + self.model = KNOWN_MODELS[self.model_alias] + if self.online: + self._tool = UDPipeOnline(model=self.model) + else: + if not UDPIPE_AVAILABLE: + raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1") + self._tool = UDPipe(model=self.model) return self._tool def process_document(self, doc): - tok, tag, par = self.tokenize, self.tag, self.parse + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: for tree in bundle: new_bundles.append(bundle) if self._should_process_tree(tree): + if self.delete_nodes: + for subroot in tree.children: + subroot.remove() if tok: - new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment, - tag=self.tag, parse=self.parse) + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, + tag=tag, parse=par, ranges=ranges) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' for i, new_tree in enumerate(new_trees[1:], 2): - new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") new_tree.zone = tree.zone new_bundle.add_tree(new_tree) new_bundles.append(new_bundle) - elif not tok and tag and par: - 
self.tool.tag_parse_tree(tree) - elif not tok and not tag and not par and self.resegment: + elif not tok and not reseg and (tag or par): + self.tool.tag_parse_tree(tree, tag=tag, parse=par) + elif not tok and reseg and not tag and not par: sentences = self.tool.segment_text(tree.text) if len(sentences) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' tree.text = sentences[0] for i, sentence in enumerate(sentences[1:], 2): - new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") new_tree = new_bundle.create_tree(zone=tree.zone) new_tree.text = sentence new_bundles.append(new_bundle) else: - raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}") doc.bundles = new_bundles ''' diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index df6aaabf..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -30,7 +30,7 @@ class Eval(Block): def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, coref_mention=None, coref_entity=None, empty_nodes=False, - expand_code=True, **kwargs): + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -38,6 +38,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle @@ -70,7 +71,7 @@ def process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or 
class JoinSentence(Block):
    """
    Joins a sentence with the preceding one. There are two ways how to indicate
    the sentences that this block should process.

    Method 1: Parameter sent_id provides the id of the sentence that should be
    merged with the preceding one. At most one sentence pair from the input will
    be merged, even if there are multiple sentences with the given id.

    Method 2: A MISC attribute can be specified that, if found, will trigger
    joining of the current sentence to the previous one. With this approach,
    multiple sentence pairs can be merged during one run.
    """

    def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs):
        """
        Args:
        sent_id: which sentence should be appended to the previous one
        misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id)
        misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining
        MISC attributes that have triggered sentence joining will be removed from their node.

        Raises:
        ValueError: if both sent_id and misc_name are given, or neither.
        """
        super().__init__(**kwargs)
        # Raise instead of logging.fatal(): logging.fatal() only logs a message
        # and does not stop execution, so a misconfigured block would run on.
        # (The previous error message also said 'misc_value' although the
        # parameter being checked is misc_name.)
        if misc_name:
            if sent_id:
                raise ValueError('Cannot combine misc_name with sent_id')
        elif not sent_id:
            raise ValueError('Missing parameter sent_id')
        self.sent_id = sent_id
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_document(self, document):
        previous_tree = None
        for bundle in document.bundles:
            # In general, a bundle may contain multiple trees in different zones.
            # In UD data, we always expect just one zone (labeled '') per bundle.
            # This code could be extended to join all zones but we do not try to do it at present.
            if len(bundle.trees) != 1:
                raise ValueError('Cannot process bundles that have less or more than 1 zone')
            if not bundle.has_tree(zone=''):
                raise ValueError('Cannot process bundles that do not have the zone with empty zone id')
            if self.misc_name:
                root = bundle.get_tree()
                # The MISC attribute we are looking for should logically occur
                # on the first node of the sentence but we can take it from any node.
                # If misc_value is None, any occurrence of the attribute triggers
                # the join; otherwise the value must match exactly.
                join_commands = [n for n in root.descendants
                                 if (n.misc[self.misc_name] and self.misc_value is None)
                                 or n.misc[self.misc_name] == self.misc_value]
                if join_commands:
                    if not previous_tree:
                        raise ValueError('Cannot join the first sentence as there is no previous sentence')
                    previous_tree.steal_nodes(root.descendants)
                    previous_tree.text = previous_tree.compute_text()
                    # Remove from the node the MISC attribute that triggered the joining.
                    for n in join_commands:
                        n.misc[self.misc_name] = ''
                    # Remove the current bundle. It will also update the numbers of the remaining bundles.
                    # NOTE(review): bundle.remove() mutates document.bundles while
                    # we are iterating over it; depending on udapi's implementation
                    # this may skip the bundle that follows -- TODO confirm.
                    bundle.remove()
                else:
                    previous_tree = root
            elif bundle.bundle_id == self.sent_id:
                if not previous_tree:
                    raise ValueError('Cannot join the first sentence as there is no previous sentence')
                root = bundle.get_tree()
                previous_tree.steal_nodes(root.descendants)
                previous_tree.text = previous_tree.compute_text()
                # Remove the current bundle. It will also update the numbers of the remaining bundles.
                bundle.remove()
                # We have found our sentence. No need to process the rest of the document.
                break
+ + `print_stats`: print the total number of marked nodes to stdout at process_end + + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.mark = mark + self.mark_attr = mark_attr self.node = node self.add = add + self.print_stats = print_stats + self._marked = 0 + self.empty = empty def process_node(self, node): if eval(self.node): - node.misc['Mark'] = self.mark + node.misc[self.mark_attr] = self.mark + self._marked += 1 elif not self.add: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] + + def process_empty_node(self, empty_node): + if self.empty: + self.process_node(empty_node) + + def process_end(self): + if self.print_stats: + print(f'util.Mark marked {self._marked} nodes') diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 3d183f57..e102ca9c 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -9,7 +9,8 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, print_stats=0, **kwargs): + mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False, + align=False, align_attr='Align', **kwargs): """Create the Mark block object. Params: gold_zone: Which of the zones should be treated as gold? @@ -17,17 +18,34 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc attributes: Which node attributes should be considered when searching for diffs? The tree topology, i.e. node parent is always considered. mark: What value should be used in `node.misc['Mark']` of the differing nodes? + mark_attr: use this MISC attribute name instead of "Mark". + Use mark_attr=0 to prevent marking diffs in MISC. add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, so that the marked_only option (e.g. 
via `udapy -TM`) prints only nodes marked by this block. print_stats: How many lines of statistics should be printed? -1 means all. + ignore_parent: ignore differences in dependency parents + align: store word alignment, possible values are False (no alignment stored, the default) + "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord, + "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and + "both", i.e. both from-pred and from-gold. + If only forms should be considered for inducing the word alignment, + you should use "util.MarkDiff attributes='form' ignore_parent=1 align=1". + Only one-to-one alignment is supported. + align_attr: use this MISC attribute name instead of "Align". """ super().__init__(**kwargs) self.gold_zone = gold_zone self.attrs = attributes.split(',') self.mark = mark + self.mark_attr = mark_attr self.add = add self.print_stats = print_stats + self.ignore_parent = ignore_parent + self.align = align + self.align_attr = align_attr self.stats = collections.Counter() + if not mark_attr and not align and not print_stats: + raise ValueError('mark_attr=0 does not make sense without align or print_stats') def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -35,17 +53,17 @@ def process_tree(self, tree): return if not self.add: for node in tree.descendants + gold_tree.descendants: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants # Make sure both pred and gold trees are marked, even if one has just deleted nodes. 
- if len(pred_nodes) != len(gold_nodes): - tree.add_comment('Mark = %s' % self.mark) - gold_tree.add_comment('Mark = %s' % self.mark) - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in pred_nodes] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_nodes] + if len(pred_nodes) != len(gold_nodes) and self.mark_attr: + tree.add_comment(f'{self.mark_attr} = {self.mark}') + gold_tree.add_comment(f'{self.mark_attr} = {self.mark}') + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes] matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False) diffs = list(matcher.get_opcodes()) @@ -55,18 +73,24 @@ def process_tree(self, tree): if edit in {'equal', 'replace'}: for i in range(pred_lo, pred_hi): alignment[i] = i - pred_lo + gold_lo + if self.align in ("both", "from-pred"): + pred_nodes[i].misc[self.align_attr] = i - pred_lo + gold_lo + 1 + if self.align in ("both", "from-gold"): + gold_nodes[i - pred_lo + gold_lo].misc[self.align_attr] = i + 1 for diff in diffs: edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: - p_node.misc['Mark'] = self.mark - g_node.misc['Mark'] = self.mark + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: self.stats['ONLY-PARENT-CHANGED'] += 1 + if self.mark_attr: + p_node.misc[self.mark_attr] = self.mark + g_node.misc[self.mark_attr] = self.mark else: - for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: - node.misc['Mark'] = self.mark + if self.mark_attr: + for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: + node.misc[self.mark_attr] = self.mark if self.print_stats: if edit == 'replace': # first n nodes are treated as aligned, the rest is 
"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes.

Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.
"""
from udapi.core.block import Block


class MarkMwtBugsAtNodes(Block):
    """Copy Bug from a multiword token's MISC to each of its member nodes.

    write.TextModeTrees marked_only=1 inspects node MISC only, so bugs
    recorded on the MWT line would otherwise be invisible when filtering
    trees with bugs. The same bug string is copied to all nodes in the MWT.
    """

    # TODO(review): consider copying ToDo attributes the same way
    # (open question carried over from the original implementation).

    def bug(self, node, bugstring):
        """Append *bugstring* to node.misc['Bug'] unless already present.

        Multiple bug notes are stored as a single '+'-separated value.
        """
        bugs = node.misc['Bug'].split('+') if node.misc['Bug'] else []
        if bugstring not in bugs:  # idiomatic form of `not bugstring in bugs`
            bugs.append(bugstring)
        node.misc['Bug'] = '+'.join(bugs)

    def process_node(self, node):
        """If the node belongs to an MWT carrying a Bug, mirror it onto the node."""
        mwt = node.multiword_token
        if mwt and mwt.misc['Bug']:
            self.bug(node, mwt.misc['Bug'])
"""util.Normalize normalizes the ordering of various attributes in CoNLL-U."""
from pathlib import Path

from udapi.core.block import Block


class Normalize(Block):
    """Normalize the ordering of attributes in the FEATS and MISC columns.

    The attribute-value pairs in FEATS must be sorted alphabetically
    (case-insensitive) per the UD guidelines; the same is recommended for MISC.
    Udapi sorts them automatically, but only when serializing, so this block
    just forces deserialization of node.feats and node.misc and clears the
    cached string, making Udapi re-sort them on output. Optionally it also
    renumbers sent_ids and empty-node ords.
    """

    def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False,
                 start_sent_id=1, sent_id_prefix="", sent_id_from_filename=False,
                 sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs):
        """
        Args:
            feats: normalize the ordering of FEATS. Default=True.
            misc: normalize the ordering of MISC. Default=True.
            sent_id: renumber sent_ids so they form a sequence of integers. Default=False.
            empty_node_ord: normalize ord attributes of empty nodes. Default=False.
            start_sent_id: the first sent_id number. Default=1.
            sent_id_prefix: a string prepended before the integer sent_id. Default="".
            sent_id_from_filename: prepend Path(doc.meta["loaded_from"]).stem
                before sent_id_prefix. Default=False.
            sent_id_reset_at_newdoc: reset the sent_id counter to 1 for each
                new document. Default=False.
            newdoc_from_filename: set newdoc to Path(doc.meta["loaded_from"]).stem.
                Default=False.
        """
        super().__init__(**kwargs)
        self.feats = feats
        self.misc = misc
        self.sent_id = sent_id
        self.empty_node_ord = empty_node_ord
        self.next_sent_id = start_sent_id
        self.sent_id_prefix = sent_id_prefix
        self.sent_id_from_filename = sent_id_from_filename
        self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc
        self.newdoc_from_filename = newdoc_from_filename
        if sent_id_reset_at_newdoc and not sent_id_from_filename:
            raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename")
        # Any of these parameters implies sent_id renumbering.
        if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename:
            self.sent_id = True

    # TODO: normalize also the order of standardized comments like text, sent_id,...

    def process_bundle(self, bundle):
        has_newdoc = any(tree.newdoc for tree in bundle.trees)
        if self.newdoc_from_filename and has_newdoc:
            newdoc_tree = next(tree for tree in bundle.trees if tree.newdoc)
            newdoc_tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem
        if self.sent_id:
            if self.sent_id_reset_at_newdoc and has_newdoc:
                self.next_sent_id = 1
            id_prefix = self.sent_id_prefix
            if self.sent_id_from_filename:
                id_prefix = Path(bundle.document.meta["loaded_from"]).stem + id_prefix
            bundle.bundle_id = f"{id_prefix}{self.next_sent_id}"
            self.next_sent_id += 1

        # Overriding process_bundle means we must apply the tree filter ourselves.
        for tree in bundle:
            if self._should_process_tree(tree):
                self.process_tree(tree)

    def process_tree(self, tree):
        if self.empty_node_ord:
            last_word_ord, empty_counter = 0, 0
            for node in tree.descendants_and_empty:
                if not node.is_empty():
                    empty_counter = 0
                    last_word_ord = node.ord
                    continue
                empty_counter += 1
                expected = f"{last_word_ord}.{empty_counter}"
                if str(node.ord) != expected:
                    # Changing an empty-node ord invalidates serialized DEPS
                    # references, so force deserialization of enhanced deps
                    # for all nodes in this sentence first.
                    for other in tree.descendants_and_empty:
                        other.deps
                    node.ord = expected
        for node in tree.descendants:
            self.process_node(node)

    def process_node(self, node):
        # Forcing deserialization and dropping the cached string makes Udapi
        # re-sort the attributes during the next serialization.
        if self.feats:
            node.feats._deserialize_if_empty()
            node.feats._string = None
        if self.misc:
            node.misc._deserialize_if_empty()
            node.misc._string = None
"""
Block util.SplitSentence will split a given sentence at a given token.
"""
import logging

from udapi.core.block import Block
from udapi.core.root import Root


class SplitSentence(Block):
    """Split a sentence into two (or more) at given token(s).

    If the sent_id of the current sentence matches the `sent_id` parameter,
    the sentence is split into two; the first token of the second sentence
    is given by `word_id`.

    Alternatively, a MISC attribute can be specified that triggers sentence
    splitting at the given token. With this approach, multiple sentence
    splits can be performed during one run.
    """

    def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs):
        """
        Args:
            sent_id: which sentence should be split (new ids will have A and B appended)
            word_id: which word should be the first word of the second sentence
                (tokens and words will be renumbered)
            misc_name: name of the MISC attribute that can trigger the split
                (cannot be combined with sent_id and word_id)
            misc_value: value of the MISC attribute to trigger the split; if not
                specified, then simple occurrence of the attribute with any value
                will cause the split. MISC attributes that have triggered sentence
                split will be removed from their node.
        """
        super().__init__(**kwargs)
        # NOTE(review): logging.fatal() only logs a CRITICAL message and does
        # NOT abort execution, so invalid parameter combinations fall through.
        # Consider raising ValueError instead — kept as-is to preserve behavior.
        if misc_name:
            if sent_id or word_id:
                logging.fatal('Cannot combine misc_value with sent_id or word_id')
        else:
            if not sent_id:
                logging.fatal('Missing parameter sent_id')
            if not word_id:
                logging.fatal('Missing parameter word_id')
        self.sent_id = sent_id
        self.word_id = word_id
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_document(self, document):
        for bundle_no, bundle in enumerate(document.bundles):
            # In general, a bundle may contain multiple trees in different zones.
            # In UD data, we always expect just one zone (labeled '') per bundle.
            # This code could be extended to split all zones but we do not try it
            # at present (the zones may be translations to other languages and it
            # is unlikely we would want to split each at the same position).
            if len(bundle.trees) != 1:
                logging.fatal('Cannot process bundles that have less or more than 1 zone')
            if not bundle.has_tree(zone=''):
                logging.fatal('Cannot process bundles that do not have the zone with empty zone id')
            if self.misc_name:
                root = bundle.get_tree()
                # BUGFIX: the original condition
                #   n.ord > 1 and n.misc[m] and self.misc_value == None or n.misc[m] == self.misc_value
                # parsed as `(A and B and C) or D` because `and` binds tighter
                # than `or`, so with misc_value set, even nodes with ord == 1
                # whose attribute equals misc_value became split points.
                # Parenthesize so a split point must be a non-initial node
                # carrying the attribute (with the requested value, if any).
                split_points = [n for n in root.descendants
                                if n.ord > 1 and n.misc[self.misc_name]
                                and (self.misc_value is None
                                     or n.misc[self.misc_name] == self.misc_value)]
                if split_points:
                    # Create as many new bundles as there are split points.
                    n_new = len(split_points)
                    current_bid = bundle.bundle_id
                    idletter = 'B'  # appended to bundle ids to distinguish the parts
                    for i in range(n_new):
                        new_bundle = document.create_bundle()
                        new_bundle.bundle_id = current_bid + idletter
                        new_root = Root(zone='')
                        new_bundle.add_tree(new_root)
                        # Identify nodes to move to the new bundle.
                        first_node_id = split_points[i].ord
                        if i < n_new - 1:
                            next_first_node_id = split_points[i + 1].ord
                            nodes_to_move = [n for n in root.descendants
                                             if first_node_id <= n.ord < next_first_node_id]
                        else:
                            nodes_to_move = [n for n in root.descendants
                                             if n.ord >= first_node_id]
                        new_root.steal_nodes(nodes_to_move)
                        self.make_zeros_roots(new_root)
                        new_root.text = new_root.compute_text()
                        # The new bundle was created at the end of the document;
                        # move it to the position right after the current bundle.
                        document.bundles.pop()
                        document.bundles.insert(bundle_no + i + 1, new_bundle)
                        idletter = chr(ord(idletter) + 1)
                        # Remove the MISC attribute that triggered the split.
                        split_points[i].misc[self.misc_name] = ''
                    # Update the id of the current bundle, fix its zero-dependents
                    # and recompute the sentence text.
                    bundle.bundle_id += 'A'
                    self.make_zeros_roots(root)
                    root.text = root.compute_text()
                    # Renumber the new bundles and all bundles after them.
                    updated_no = bundle_no + 1
                    for b in document.bundles[(bundle_no + 1):]:
                        b.number = updated_no
                        updated_no += 1
            elif bundle.bundle_id == self.sent_id:
                logging.info('Found!')
                root = bundle.get_tree()
                # TODO(review): word_id may arrive as a string from the udapy CLI;
                # confirm upstream coercion before relying on `>=` with int ords.
                nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id]
                if not nodes_to_move:
                    logging.fatal('No nodes to move to the new sentence; word_id may be out of range')
                # Create a new bundle at the end of the document, then move it
                # right after the current bundle and renumber the rest.
                new_bundle = document.create_bundle()
                new_bundle_no = bundle_no + 1
                document.bundles.pop()
                document.bundles.insert(new_bundle_no, new_bundle)
                updated_no = new_bundle_no
                for b in document.bundles[new_bundle_no:]:
                    b.number = updated_no
                    updated_no += 1
                new_bundle.bundle_id = bundle.bundle_id + 'B'
                bundle.bundle_id += 'A'
                new_root = Root(zone='')
                new_bundle.add_tree(new_root)
                new_root.steal_nodes(nodes_to_move)
                # steal_nodes() does not guarantee that nodes newly attached to
                # the artificial root get the 'root' deprel; fix both halves.
                self.make_zeros_roots(root)
                self.make_zeros_roots(new_root)
                # Update the sentence text attributes of both new sentences.
                root.text = root.compute_text()
                new_root.text = new_root.compute_text()
                # We have found our sentence; no need to process the rest.
                break

    def make_zeros_roots(self, root):
        """Ensure all children of the artificial root use the 'root' deprel.

        Warns if the newly segmented sentence ends up with more than one root.
        """
        n_root = 0
        for n in root.descendants:
            if n.parent.is_root():
                n.deprel = 'root'
                n_root += 1
        if n_root > 1:
            logging.warning('More than one 0:root relation in newly segmented sentence %s.'
                            % root.bundle.bundle_id)
% root.bundle.bundle_id) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 137c95e9..9920d0b6 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -13,6 +13,7 @@ def __init__(self, tsv=False, **kwargs): """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 self.tsv = tsv def process_tree(self, tree): @@ -22,13 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc or tree == tree.document[0].trees[0]: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): if self.tsv: - print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) else: print('%8d trees\n%8d words' % (self.trees, self.words)) if self.mwts: print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) if self.empty: print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index abe20963..ad647477 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -117,7 +117,9 @@ def process_tree(self, tree): # pylint: disable=too-many-branches if mwt and node._ord > last_mwt_id: print('\t'.join((mwt.ord_range, '_' if mwt.form is None else mwt.form, - '_\t_\t_\t_\t_\t_\t_', + '_\t_\t_', + '_' if mwt._feats is None else str(mwt.feats), + '_\t_\t_', '_' if mwt._misc is None else str(mwt.misc)))) last_mwt_id = mwt.words[-1]._ord @@ -134,10 +136,10 @@ def process_tree(self, tree): # pylint: disable=too-many-branches '_' if node._feats is None else str(node.feats), head, node.deprel, 
node.raw_deps, '_' if node._misc is None else str(node.misc)))) - # Empty sentences are not allowed in CoNLL-U, + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we will print an artificial node, so we can print the comments. - if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..1d5d4716 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,478 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference. + +When using lazy loading of documents (infinite scrolling), +modern browsers don't allow JavaScript to load files from a local file system +("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been +blocked by CORS policy: Cross origin requests are only supported for protocol schemes: +http, data, chrome, chrome-extension, https.") + +The recommended solution is to start a local web server, e.g. using + python -m http.server +and browse http://0.0.0.0:8000/my.html. 
+ +Non-recommended solution is to run + google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html +""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter +import udapi.block.write.html +import gzip +import sys +import os +import re + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +HTYPES = 'PROPN NOUN PRON VERB DET OTHER'.split() + +HEADER = ''' + +Udapi CorefUD viewer + + +''' + +CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} +.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} +i.empty {color: gray; border: 3px outset gray; padding: 1px;} 
+.sentence .singleton {border-style: dotted;} +.crossing:before {content: "!"; display: block; background: #ffd500;} +.active {border: 1px solid red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} +.sent_id {display: none; background: #ddd; border-radius: 3px;} +''' + +SCRIPT_BASE = ''' +function add_mention_listeners(mentions){ + mentions.click(function(e) { + let was_selected = $(this).hasClass("selected"); + $(".m").removeClass("selected"); + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + e.stopPropagation(); + }); + mentions.hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} + ); +} +add_mention_listeners($(".m")); + +window.onhashchange = function() { + $(".m").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + +async function load_doc(doc_num) { + loading_now = true; + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ + if (! load_fail_reported) { + load_fail_reported = true; + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." 
+ + " Run a web server 'python -m http.server'\\n" + + "error = " + error); + } + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + $('.eid').toggle($('#show-eid')[0].checked); + $('.etype').toggle($('#show-etype')[0].checked); + $('.sent_id').toggle($('#show-sent_id')[0].checked); + $('.showtree').toggle($('#show-trees')[0].checked); + $('.m').toggleClass('nocolor', ! $('#show-color')[0].checked); + $('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked); + $('.norm').toggle($('#show-norm')[0].checked); + $('.head').toggleClass('nobold', ! $('#show-heads')[0].checked); + $('.empty').toggle($('#show-empty')[0].checked); + $('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked); + $('.par').toggle($('#show-pars')[0].checked); + $('h1').toggle($('#show-docs')[0].checked); + $('.m').toggleClass('htype',$('#htype')[0].checked) + loading_now = false; +} + +var docs_loaded = 1; +var load_fail_reported = false; +var loading_now = false; +add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True + $(el).prepend('🆔' + el.dataset.id + ''); +} +function load_more() { + if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) { + docs_loaded += 1; + load_doc(docs_loaded); + } +} +$(window).scroll(load_more); +const resizeObserver = new ResizeObserver(entries =>load_more()); +resizeObserver.observe(document.body); +''' + +SCRIPT_SHOWTREE = ''' +function show_tree_in_tdiv(tdiv, doc_number, index){ + tdiv.treexView([docs_json[doc_number][index]]); + $("\n' + ) + + # The first ud_doc will be printed to the main html file. 
+ self.process_ud_doc(ud_docs[0], 1) + print('') # id=main + + # Other ud_docs will be printed into separate files (so they can be loaded lazily) + orig_stdout = sys.stdout + try: + for i, ud_doc in enumerate(ud_docs[1:], 2): + sys.stdout = gzip.open(f"{self.docs_dir}/doc{i}.html.gz", 'wt') + self.process_ud_doc(ud_doc, i) + sys.stdout.close() + finally: + sys.stdout = orig_stdout + + print(f'') + print('') + + def _start_subspan(self, subspan, crossing=False): + m = subspan.mention + e = m.entity + classes = f'{_dom_esc(e.eid)} {self._mention_ids[m]} {e.etype or "other"} m' + title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' + classes += f" {m.head.upos if m.head.upos in HTYPES else 'OTHER'}" + title += f'\nhead-upos={m.head.upos}' + if self.colors: + classes += f' {self._entity_colors[e]}' + if all(w.is_empty() for w in subspan.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + if crossing: + classes += ' crossing' + title += '\ncrossing' + if m.other: + title += f'\n{m.other}' + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{_dom_esc(e.eid)}" ' + # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"), + # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl". + if self.rtl: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + else: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + subspans = [] + for mention in mentions: + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) + + if tree.newdoc: + print(f'

{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: + print('
') + opened, prev_node_mention = [], True + rtl = ' dir="rtl"' if self.rtl else "" + print(f'

') + for node in nodes_and_empty: + if not prev_node_mention and subspans and subspans[-1].words[0] == node: + print('', end='') + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + self._start_subspan(subspan) + opened.append(subspan) + + if not opened and prev_node_mention: + print('', end='') + prev_node_mention = True if opened else False + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + if self.rtl: + print('', end='') + else: + print('', end='') + opened.pop() + + # Two mentions are crossing iff their spans have non-zero intersection, + # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2). + # Let's visualize this (simplified) as + # ......... + # i.e. let's split mention e2 into two subspans which are next to each other. + # Unfortunatelly, we cannot mark now both crossing mentions using html class "crossing" + # (opening tags are already printed), so we'll mark only the second part of the second mention. + endings = [x for x in opened if x.words[-1] == node] + if endings: + new_opened, brokens, found_crossing = [], [], False + for subspan in opened: + if subspan.words[-1] == node: + found_crossing = True + elif found_crossing: + brokens.append(subspan) + else: + new_opened.append(subspan) + opened = new_opened + print('' * (len(endings) + len(brokens)), end='') + for broken in brokens: + self._start_subspan(broken, True) + opened.append(subspan) + + if not node.no_space_after: + print(' ', end='') + + if not prev_node_mention: + print('', end='') + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + + +# id needs to be a valid DOM querySelector +# so it cannot contain [#./:] and maybe more, +# so let's substitute all [^\w\d-] to be on the safe side. +# DOM IDs cannot start with a digit, so prepend e.g. "n" if needed. +def _dom_esc(string): + if string[0].isdecimal(): + string = 'n' + string + return re.sub(r'[^\w\d-]', '_', string) + +def _id(node): + if node is None: + return 'null' + return _dom_esc(node.address()) + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,32 @@ def process_document(self, doc): print('\n') print('
') + + def print_doc_json(self, doc): + print('[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -101,24 +117,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + print(']') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable diff --git a/udapi/block/write/sentences.py b/udapi/block/write/sentences.py index 60eb6bec..70553d7d 100644 --- a/udapi/block/write/sentences.py +++ b/udapi/block/write/sentences.py @@ -3,13 +3,14 @@ class Sentences(BaseWriter): - """A writer of plain-text sentences (one per line). + """A writer of plain-text sentences (one sentence per line). 
Usage: udapy write.Sentences if_missing=empty < my.conllu > my.txt + udapy write.Sentences newdoc=1 newpar=1 < my.conllu > my.txt """ - def __init__(self, if_missing='detokenize', **kwargs): + def __init__(self, if_missing='detokenize', newdoc=None, newpar=None, **kwargs): """Create the Sentences writer block. Parameters: @@ -18,9 +19,21 @@ def __init__(self, if_missing='detokenize', **kwargs): * `empty`: print an empty line * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` * `fatal`: raise an exception + newdoc: What to do if `root.newdoc` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) + newpar: What to do if `root.newpar` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) """ super().__init__(**kwargs) self.if_missing = if_missing + self.newdoc = newdoc + self.newpar = newpar def process_tree(self, tree): + if self.newdoc and tree.newdoc and tree.bundle.number > 1: + print() + if self.newpar and tree.newpar and tree.bundle.number > 1: + print() print(tree.get_sentence(self.if_missing)) diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. + + Parameters: + if_missing: What to do if `root.text` is `None`? 
(default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. + * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n
    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index f3f6e007..a8a7ab3d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,4 +1,5 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys @@ -13,6 +14,7 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... @@ -21,7 +23,7 @@ class TextModeTrees(BaseWriter): - """An ASCII pretty printer of dependency trees. + r"""An ASCII pretty printer of dependency trees. .. code-block:: bash @@ -108,7 +110,7 @@ class TextModeTrees(BaseWriter): │ ╰─┶ boxer NOUN acl:relcl ╰─╼ . PUNCT punct - Some non-projective trees cannot be printed witout crossing edges. + Some non-projective trees cannot be printed without crossing edges. TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: ─┮ @@ -121,17 +123,17 @@ class TextModeTrees(BaseWriter): (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted - (by reveresing the background and foreground colors). + (by reversing the background and foreground colors). This block's method `process_tree` can be called on any node (not only root), which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. 
For use in LaTeX, you can insert the output of this block (without colors) - into \begin{verbatim}...\end{verbatim}, but you need to compile with pdflatex (xelatex not supported) - and you must add the following code into the preambule:: + into ``\begin{verbatim}...\end{verbatim}``, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preamble:: - \\usepackage{pmboxdraw} + \usepackage{pmboxdraw} \DeclareUnicodeCharacter{256D}{\textSFi} %╭ \DeclareUnicodeCharacter{2570}{\textSFii} %╰ @@ -142,41 +144,44 @@ class TextModeTrees(BaseWriter): def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, - mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, + print_mwt=False, mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, layout='classic', **kwargs): """Create new TextModeTrees block object. Args: - print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? - print_sentence: Print plain-text detokenized sentence on a line above each tree? - add_empty_line: Print an empty line after each tree? - indent: Number of characters to indent node depth in the tree for better readability. - minimize_cross: Minimize crossings of edges in non-projective trees? - Trees without crossings are subjectively more readable, but usually - in practice also "deeper", that is with higher maximal line length. - color: Print the node attribute with ANSI terminal colors? - Default = 'auto' which means that color output only if the output filehandle - is interactive (console). Each attribute is assigned a color (the mapping is - tested on black background terminals and can be changed only in source code). - If you plan to pipe the output (e.g. 
to "less -R") and you want the colors, - you need to set explicitly color=1, see the example in Synopsis. - attributes: A comma-separated list of node attributes which should be printed. Possible - values are ord, form, lemma, upos, xpos, feats, deprel, deps, misc. - print_undef_as: What should be printed instead of undefined attribute values (if any)? - print_doc_meta: Print `document.meta` metadata before each document? - print_comments: Print comments (other than sent_id and text)? - print_empty: Print empty nodes? - mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. - If `print_comments and re.search(r'^ %s = ' % mark, root.comment, re.M)` - the comment is highlighted. - Empty string means no highlighting. Default = 'ToDo|ToDoOrigText|Bug|Mark'. - marked_only: print only trees containing one or more marked nodes/comments. Default=False. - hints: use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes - or follows its parent. Default=True. If False, plain ├ is used in both cases. - layout: 'classic' (default) shows word attributes immediately next to each node, - 'compact' never print edges after (right to) words even in non-projectivities, - 'align-words' as 'compact' but all first attributes (forms by default) are aligned, - 'align' as 'align-words' but all attributes are aligned in columns. + print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? + print_text: Print plain-text detokenized sentence on a line above each tree? + add_empty_line: Print an empty line after each tree? + indent: Number of characters to indent node depth in the tree for better readability. + minimize_cross: Minimize crossings of edges in non-projective trees? + Trees without crossings are subjectively more readable, but usually + in practice also "deeper", that is with higher maximal line length. + color: Print the node attribute with ANSI terminal colors? 
+ Default = 'auto' which means that color output only if the output filehandle + is interactive (console). Each attribute is assigned a color (the mapping is + tested on black background terminals and can be changed only in source code). + If you plan to pipe the output (e.g. to "less -R") and you want the colors, + you need to set explicitly color=1, see the example in Synopsis. + attributes: A comma-separated list of node attributes which should be printed. Possible + values are ``ord``, ``form``, ``lemma``, ``upos``, ``xpos``, ``feats``, ``deprel``, ``deps``, ``misc``. + print_undef_as: What should be printed instead of undefined attribute values (if any)? + print_doc_meta: Print ``document.meta`` metadata before each document? + print_comments: Print comments (other than ``sent_id`` and ``text``)? + print_empty: Print empty nodes? Default=True + print_mwt: Print multi-word tokens? Default=False + mark: A regex pattern. If ``re.search(mark + '=', str(node.misc))`` matches, the node is highlighted. + If ``print_comments`` and ``re.search(r'^ %s = ' % mark, root.comment, re.M)`` matches, + the comment is highlighted. Empty string means no highlighting. + Default = ``'(ToDo|ToDoOrigText|Bug|Mark)'``. + marked_only: Print only trees containing one or more marked nodes/comments. Default ``False``. + hints: Use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes + or follows its parent. Default ``True``. If ``False``, plain ├ is used in both cases. 
+ layout: Tree layout style: + + - ``'classic'`` (default): shows word attributes immediately next to each node + - ``'compact'``: never print edges after (right to) words even in non-projectivities + - ``'align-words'``: like ``'compact'`` but all first attributes (forms by default) are aligned + - ``'align'``: like ``'align-words'`` but all attributes are aligned in columns """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -189,6 +194,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_doc_meta = print_doc_meta self.print_comments = print_comments self.print_empty = print_empty + self.print_mwt = print_mwt self.mark = mark self.marked_only = marked_only self.layout = layout @@ -243,21 +249,21 @@ def should_print_tree(self, root, allnodes): return False return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: - if root.is_root(): + if root.is_root() and not self.print_mwt: allnodes = [root] + root.descendants_and_empty else: - allnodes = root.descendants(add_self=1) + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] allnodes.extend(empty) allnodes.sort() else: - allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root, allnodes): + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + if not force_print and not self.should_print_tree(root, allnodes): return - self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} + self._index_of = {allnodes[i].ord_range if allnodes[i].is_mwt() else allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) self.lengths = [0] * len(allnodes) @@ -284,7 +290,7 @@ def process_tree(self, root): if self.layout == 'classic': self.add_node(idx, node) else: - 
if idx_node.parent is not node: + if idx_node.is_mwt() or idx_node.parent is not node: self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: precedes_parent = idx < self._index_of[node.ord] @@ -302,7 +308,7 @@ def process_tree(self, root): if self.layout == 'classic': for idx, node in enumerate(allnodes): - if node.is_empty(): + if node.is_empty() or node.is_mwt(): self.add_node(idx, node) else: columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] @@ -344,11 +350,16 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. + os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text @@ -356,7 +367,7 @@ def _add(self, idx, text): def add_node(self, idx, node): """Render a node with its attributes.""" - if not node.is_root(): + if node.is_mwt() or not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) marked = self.is_marked(node) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 9f9f6aa2..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. 
""" - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -35,9 +35,14 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? + zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). + whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -82,3 +87,27 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + # Don't print
    if no tree will be printed in this bundle. + marked_trees = [] + for tree in bundle: + if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + if self.whole_bundle: + marked_trees = bundle + print("") + for tree in marked_trees: + print("") + print("
    ") + self.process_tree(tree, force_print=True) + print("
    ") + else: + super().process_bundle(bundle) diff --git a/udapi/block/write/vislcg.py b/udapi/block/write/vislcg.py index 569b1056..acdf1e80 100644 --- a/udapi/block/write/vislcg.py +++ b/udapi/block/write/vislcg.py @@ -64,10 +64,7 @@ def process_tree(self, tree): # Print the line with forms and optional upos tags and feats. for token in tree.token_descendants: print('"<%s>"' % self._escape(token.form)) - try: - words = token.words - except AttributeError: - words = [token] + words = token.words print('\t' + self._node(words[0])) for nonfirst_mwt_word in words[1:]: print('\t\t' + self._node(nonfirst_mwt_word)) diff --git a/udapi/cli.py b/udapi/cli.py new file mode 100755 index 00000000..de55f8cb --- /dev/null +++ b/udapi/cli.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os +import gc +import sys +import atexit +import logging +import argparse + +from udapi.core.run import Run + +# Parse command line arguments. +argparser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + usage="udapy [optional_arguments] scenario", + epilog="See http://udapi.github.io", + description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" + "Examples of usage:\n" + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" + " udapy -T < sample.conllu | less -R\n" + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") +argparser.add_argument( + "-q", "--quiet", action="store_true", + help="Warning, info and debug messages are suppressed. 
Only fatal errors are reported.") +argparser.add_argument( + "-v", "--verbose", action="store_true", + help="Warning, info and debug messages are printed to the STDERR.") +argparser.add_argument( + "-s", "--save", action="store_true", + help="Add write.Conllu to the end of the scenario") +argparser.add_argument( + "-T", "--save_text_mode_trees", action="store_true", + help="Add write.TextModeTrees color=1 to the end of the scenario") +argparser.add_argument( + "-H", "--save_html", action="store_true", + help="Add write.TextModeTreesHtml color=1 to the end of the scenario") +argparser.add_argument( + "-A", "--save_all_attributes", action="store_true", + help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") +argparser.add_argument( + "-C", "--save_comments", action="store_true", + help="Add print_comments=1 (to be used after -T and -H)") +argparser.add_argument( + "-M", "--marked_only", action="store_true", + help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") +argparser.add_argument( + "-N", "--no_color", action="store_true", + help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") +argparser.add_argument( + "-X", "--extra", action="append", + help="Add a specified parameter (or a block name) to the end of the scenario\n" + "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") +argparser.add_argument( + "--gc", action="store_true", + help="By default, udapy disables Python garbage collection and at-exit cleanup\n" + "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" + "when processing many files and running out of memory, you can disable this\n" + "optimization (i.e. enable garbage collection) with 'udapy --gc'.") +argparser.add_argument( + 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") + + +# Process and provide the scenario. 
+def main(argv=None): + args = argparser.parse_args(argv) + + # Set the level of logs according to parameters. + if args.verbose: + level = logging.DEBUG + elif args.quiet: + level = logging.CRITICAL + else: + level = logging.INFO + + logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', + level=level) + + # Global flag to track if an unhandled exception occurred + _unhandled_exception_occurred = False + + def _custom_excepthook(exc_type, exc_value, traceback): + global _unhandled_exception_occurred + _unhandled_exception_occurred = True + + # Call the default excepthook to allow normal error reporting + sys.__excepthook__(exc_type, exc_value, traceback) + + # Override the default excepthook + sys.excepthook = _custom_excepthook + + # Disabling garbage collections makes the whole processing much faster. + # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup + # (atexit hooks are called in reversed order of their registration, + # so flushing stdio buffers etc. will be still done before the os._exit(0) call). + # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 + # Is it safe to disable GC? + # OS will free the memory allocated by this process after it ends anyway. + # The udapy wrapper is aimed for one-time tasks, not a long-running server, + # so in a typical case a document is loaded and almost no memory is freed before the end. + # Udapi documents have a many cyclic references, so running GC is quite slow. + if not args.gc: + gc.disable() + # When an exception/error has happened, udapy should exit with a non-zero exit code, + # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors). 
+ # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` + # because the Python has already exited the exception-handling block + # (the exception/error has been already reported and sys.exc_info()[0] is None). + # We thus keep record whether _unhandled_exception_occurred. + atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) + atexit.register(sys.stderr.flush) + if args.save: + args.scenario = args.scenario + ['write.Conllu'] + if args.save_text_mode_trees: + args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] + if args.save_html: + args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] + if args.save_all_attributes: + args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] + if args.save_comments: + args.scenario = args.scenario + ['print_comments=1'] + if args.marked_only: + args.scenario = args.scenario + ['marked_only=1'] + if args.no_color: + args.scenario = args.scenario + ['color=0'] + if args.extra: + args.scenario += args.extra + + runner = Run(args) + # udapy is often piped to head etc., e.g. 
+ # `seq 1000 | udapy -s read.Sentences | head` + # Let's prevent Python from reporting (with distracting stacktrace) + # "BrokenPipeError: [Errno 32] Broken pipe" + try: + runner.execute() + except BrokenPipeError: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 53a1129c..c3bcf918 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -28,8 +29,11 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. - # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. + # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it here, in the reader. # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, # which reads all the trees in a file at once, but it does not have access to the document instance, # it just returns a sequence of trees (which may be split into multiple documents if `bundles_per_doc` is set). 
@@ -93,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): @@ -111,43 +121,56 @@ def try_fast_load(self, document): if filehandle is None: self.finished = True return True - try: - trees = self.read_trees() - except NotImplementedError: - return False + logging.info(f"Reading {self.files.filename}") - document.meta['loaded_from'] = self.filename - document.meta['global.Entity'] = self._global_entity - if trees and trees[0].newdoc and trees[0].newdoc is not True: - document.meta["docname"] = trees[0].newdoc - - bundle, last_bundle_id = None, '' - for root in trees: - add_to_the_last_bundle = False - - if self.ignore_sent_id: - root._sent_id = None - elif root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - if self.zone != 'keep': - root.zone = self.zone - - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - self.next_filehandle() - if self.filehandle is None: - self.finished = True + while True: + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + 
document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + if root is None: + continue + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True return True # pylint: disable=too-many-branches,too-many-statements @@ -174,8 +197,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -185,19 +210,32 @@ def process_document(self, document): if filehandle is None: self.finished = True return + logging.info(f"Reading {self.files.filename}") trees_loaded = 0 while True: root = self.filtered_read_tree() if root is None: - if trees_loaded == 0 and self.files.has_next_file(): + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): filehandle = self.next_filehandle() + logging.info(f"Reading {self.files.filename}") continue self.finished = not 
self.files.has_next_file() break if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -216,6 +254,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -225,6 +266,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: @@ -277,6 +319,6 @@ def read_documents(self): docs = [] while not self.finished: doc = Document() - self.process_document(doc) + self.apply_on_document(doc) docs.append(doc) return docs diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..071ec124 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,8 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os +from pathlib import Path import udapi.core.coref from udapi.core.block import Block @@ -8,10 +10,10 @@ class BaseWriter(Block): - """Base class for all reader blocks.""" + """Base class 
for all writer blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +31,10 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep + self.path = path @property def filename(self): @@ -57,16 +63,28 @@ def before_process_document(self, document): docname = document.meta.get('docname', None) if docname is not None: logging.info('Writing to file %s.', docname) - sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(docname) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) - sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(docname) else: - logging.warning('overwrite=1 but document.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is 
None') else: sys.stdout = self.orig_stdout else: @@ -79,10 +97,13 @@ def before_process_document(self, document): sys.stdout = self.orig_stdout else: logging.info('Writing to file %s.', filename) - sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(filename) if old_filehandle not in (sys.stdout, self.orig_stdout): old_filehandle.close() + def _open(self, filename): + Path(filename).parent.mkdir(parents=True, exist_ok=True) + return open(filename, 'wt', encoding=self.encoding, newline=self.newline) def after_process_document(self, document): sys.stdout.flush() diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..d293df61 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -31,6 +46,11 @@ def process_node(self, _): """Process a UD node""" pass + @not_overridden + def process_empty_node(self, _): + """Process an empty node (in enhanced dependencies)""" + pass + @not_overridden def process_tree(self, tree): """Process a UD tree""" @@ -72,8 +92,9 @@ def process_document(self, document): p_bundle = not hasattr(self.process_bundle, 'is_not_overridden') p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') - if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + p_empty_node = not hasattr(self.process_empty_node, 'is_not_overridden') + if not any((p_entity, p_mention, p_bundle, p_tree, p_node, p_empty_node)): + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -83,10 +104,10 @@ def process_document(self, document): for mention in entity.mentions: self.process_coref_mention(mention) - if p_bundle or p_tree or p_node: + if p_bundle or p_tree or p_node or p_empty_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: @@ -95,8 +116,12 @@ def process_document(self, document): if p_tree: self.process_tree(tree) else: - for node in tree.descendants: - self.process_node(node) + if p_node: + for node in tree.descendants: + self.process_node(node) + if p_empty_node: + for empty_node in tree.empty_nodes: + self.process_empty_node(empty_node) @not_overridden def process_coref_entity(self, entity): diff --git 
a/udapi/core/bundle.py b/udapi/core/bundle.py index 110ed42c..0a637f01 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -39,9 +39,9 @@ def bundle_id(self, bundle_id): tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access def __str__(self): - if self.bundle_id is None: + if self._bundle_id is None: return 'bundle without id' - return "bundle id='%s'" % self.bundle_id + return f"bundle id='{self._bundle_id}'" def __iter__(self): return iter(self.trees) @@ -72,7 +72,7 @@ def has_tree(self, zone=''): def create_tree(self, zone=None): """Return the root of a newly added tree with a given zone.""" root = Root() - root.zone = zone + root._zone = zone self.add_tree(root) return root @@ -89,8 +89,12 @@ def check_zone(self, new_zone): def add_tree(self, root): """Add an existing tree to the bundle.""" if root.zone is None: - root.zone = '' + root._zone = '' self.check_zone(root.zone) + if self._bundle_id: + root._sent_id = self._bundle_id + if root.zone: + root._sent_id += '/' + root.zone root.bundle = self self.trees.append(root) doc_json = root.json.get('__doc__') @@ -107,8 +111,17 @@ def remove(self): def address(self): """Return bundle_id or '?' if missing.""" - return self.bundle_id if self.bundle_id is not None else '?' + return self._bundle_id if self._bundle_id is not None else '?' def draw(self, **kwargs): """Pretty print the trees using TextModeTrees.""" TextModeTrees(**kwargs).process_bundle(self) + + @property + def nodes(self): + """An iterator over all nodes (excluding empty nodes) in all trees in this bundle.""" + for tree in self: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). 
+ for node in tree.descendants: + yield node diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..aa27e6a7 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? @@ -246,6 +257,41 @@ def span(self): def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) + def __str__(self): + """String representation of the CorefMention object: Mention.""" + return f"Mention<{self._entity._eid}: {self._head}>" + + def remove(self): + for word in self._words: + word._mentions.remove(self) + self._entity._mentions.remove(self) + + +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + return self.mention < another.mention + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -263,7 +309,7 @@ def 
__init__(self, eid, etype=None): self.split_ante = [] def __lt__(self, another): - """Does this CorefEntity precedes (word-order wise) `another` entity? + """Does this CorefEntity precede (word-order wise) `another` entity? This method defines a total ordering of all entities by the first mention of each entity (see `CorefMention.__lt__`). @@ -294,8 +340,8 @@ def eid(self, new_eid): def eid_or_grp(self): root = self._mentions[0].head.root meta = root.document.meta - if 'GRP' in meta['global.Entity'] and meta['tree2docid']: - docid = meta['tree2docid'][root] + if 'GRP' in meta['global.Entity'] and meta['_tree2docid']: + docid = meta['_tree2docid'][root] if self._eid.startswith(docid): return self._eid.replace(docid, '', 1) else: @@ -349,6 +395,11 @@ def all_bridging(self): for b in m._bridging: yield b + def __str__(self): + """String representation of the CorefEntity object: Entity.""" + first_mention_head = self._mentions[0].head.form if self._mentions else "" + return f"Entity<{self._eid}: {first_mention_head}>" + # BridgingLink # Especially the relation should be mutable, so we cannot use @@ -514,7 +565,7 @@ def load_coref_from_misc(doc, strict=True): highest_doc_n += 1 docid = f"d{highest_doc_n}." tree2docid[tree] = docid - doc.meta['tree2docid'] = tree2docid + doc.meta['_tree2docid'] = tree2docid elif 'eid' not in global_entity: raise ValueError("No eid in global.Entity = " + global_entity) fields = global_entity.split('-') @@ -566,7 +617,15 @@ def load_coref_from_misc(doc, strict=True): last_word = mention.words[-1] if node.root is not last_word.root: # TODO cross-sentence mentions - raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}") + if strict: + raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}") + else: + logging.warning(f"Cross-sentence mentions not supported yet: {chunk} at {node}. 
Deleting.") + entity = mention.entity + mention.words = [] + entity._mentions.remove(mention) + if not entity._mentions: + del entities[entity.eid] for w in node.root.descendants_and_empty: if last_word.precedes(w): mention._words.append(w) @@ -578,7 +637,9 @@ def load_coref_from_misc(doc, strict=True): mention.head = mention.words[head_idx - 1] except IndexError as err: _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " - f"closed at {node} with words={mention.words}", 1) + f"closed at {node} with words={mention.words}", strict) + if not strict and head_idx > len(mention.words): + mention.head = mention.words[-1] if subspan_idx and subspan_idx == total_subspans: m = discontinuous_mentions[eid].pop() if m is not mention: @@ -598,7 +659,8 @@ def load_coref_from_misc(doc, strict=True): try: head_idx = int(value) except ValueError as err: - raise ValueError(f"Non-integer {value} as head index in {chunk} in {node}: {err}") + _error(f"Non-integer {value} as head index in {chunk} in {node}: {err}", strict) + head_idx = 1 elif name == 'other': if other: new_other = OtherDualDict(value) @@ -628,6 +690,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype @@ -682,14 +745,14 @@ def load_coref_from_misc(doc, strict=True): entities[ante_str] = CorefEntity(ante_str) entities[this_str].split_ante.append(entities[ante_str]) - for entity_name, mentions in unfinished_mentions.items(): - for mention in mentions: - logging.warning(f"Mention {entity_name} opened at {mention.head}, but not closed. Deleting.") + for eid, mentions in unfinished_mentions.items(): + for mention, head_idx in mentions: + logging.warning(f"Mention {eid} opened at {mention.head}, but not closed. 
Deleting.") entity = mention.entity mention.words = [] entity._mentions.remove(mention) if not entity._mentions: - del entities[name] + del entities[eid] # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), @@ -709,7 +772,7 @@ def store_coref_to_misc(doc): if not doc._eid_to_entity: return - tree2docid = doc.meta.get('tree2docid') + tree2docid = doc.meta.get('_tree2docid') global_entity = doc.meta.get('global.Entity') if not global_entity: global_entity = 'eid-etype-head-other' @@ -886,7 +949,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. """ if not nodes: diff --git a/udapi/core/document.py b/udapi/core/document.py index dcf146ea..5f2bdf0b 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -23,7 +23,7 @@ def __init__(self, filename=None, **kwargs): No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. You can pass additional parameters for `udapi.block.read.sentences` - (`ignore_empty_lines` and `rstrip`). + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). 
""" self.bundles = [] self._highest_bundle_id = 0 @@ -34,7 +34,7 @@ def __init__(self, filename=None, **kwargs): if filename.endswith(".conllu"): self.load_conllu(filename, **kwargs) elif filename.endswith(".txt"): - reader = SentencesReader(files=filename, **kwargs) + reader = SentencesReader(files=[filename], **kwargs) reader.apply_on_document(self) else: raise ValueError("Only *.conllu and *.txt are supported. Provided: " + filename) @@ -65,11 +65,11 @@ def create_bundle(self): def load_conllu(self, filename=None, **kwargs): """Load a document from a conllu-formatted file.""" - ConlluReader(files=filename, **kwargs).process_document(self) + ConlluReader(files=[filename], **kwargs).process_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" - ConlluWriter(files=filename).apply_on_document(self) + ConlluWriter(files=[filename]).apply_on_document(self) def from_conllu_string(self, string): """Load a document from a conllu-formatted string.""" @@ -96,7 +96,7 @@ def nodes(self): for bundle in self: for tree in bundle: # tree.descendants is slightly slower than tree._descendants, - # but it seems safer, see the comment in udapi.core.block.Block.process.process_tree(). + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). for node in tree.descendants: yield node @@ -159,9 +159,9 @@ def create_coref_entity(self, eid=None, etype=None): self._load_coref() if not eid: counter = 1 - while self._eid_to_entity.get(f'c{counter}'): + while self._eid_to_entity.get(f'e{counter}'): counter += 1 - eid = f'c{counter}' + eid = f'e{counter}' elif self._eid_to_entity.get(eid): raise ValueError("Entity with eid=%s already exists", eid) entity = udapi.core.coref.CorefEntity(eid, etype) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. 
For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) @@ -73,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 684adfaf..00ba935c 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -1,19 +1,38 @@ """MWT class represents a multi-word token.""" from udapi.core.dualdict import DualDict - +from udapi.core.feats import Feats class MWT(object): """Class for representing multi-word tokens in UD trees.""" - __slots__ = ['words', 'form', '_misc', 'root'] + __slots__ = ['words', 'form', '_feats', '_misc', 'root'] - def __init__(self, words=None, form=None, misc=None, root=None): + def __init__(self, words=None, form=None, feats=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form + self._feats = Feats(feats) if feats and feats != '_' else None self._misc = DualDict(misc) if misc and misc != '_' else None self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 + @property + def feats(self): + """Property `feats` in MWT should be used only for `Typo=Yes`. 
+ + See https://universaldependencies.org/changes.html#typos-in-multiword-tokens + However, Udapi does not enforce this restriction and mwt.feats works exactly the same as node.feats. + """ + if self._feats is None: + self._feats = Feats() + return self._feats + + @feats.setter + def feats(self, value): + if self._feats is None: + self._feats = Feats(value) + else: + self._feats.set_mapping(value) + @property def misc(self): """Property for MISC attributes stored as a `DualDict` object. @@ -47,6 +66,90 @@ def address(self): """Full (document-wide) id of the multi-word token.""" return self.root.address + '#' + self.ord_range + @staticmethod + def is_mwt(): + """Is this a multi-word token? + + Returns always True. + False is returned only by instances of the Node class. + """ + return True + + @property + def no_space_after(self): + """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`.""" + return self.misc["SpaceAfter"] == "No" + + @staticmethod + def is_empty(): + """Is this an Empty node? + + Returns always False because multi-word tokens cannot be empty nodes. + """ + return False + + @staticmethod + def is_leaf(): + """Is this a node/mwt without any children? + + Returns always True because multi-word tokens cannot have children. + """ + return True + + def _get_attr(self, name): # pylint: disable=too-many-return-statements + if name == 'form': + return self.form + if name == 'ord': + return self.ord_range + if name in ('edge', 'children', 'siblings', 'depth'): + return 0 + if name == 'feats_split': + return str(self.feats).split('|') + if name == 'misc_split': + return str(self.misc).split('|') + if name.startswith('feats['): + return self.feats[name[6:-1]] + if name.startswith('misc['): + return self.misc[name[5:-1]] + return '' + + def get_attrs(self, attrs, undefs=None, stringify=True): + """Return multiple attributes or pseudo-attributes, possibly substituting empty ones. 
+ + MWTs do not have children nor parents nor prev/next nodes, + so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing). + Other pseudo-attributes (e.g. dir) return always the string "". + The only relevant pseudo-attributes are + feats_split and misc_split: a list of name=value formatted strings. + The `ord` attribute returns actually `mwt.ord_range`. + + Args: + attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``. + undefs: A value to be used instead of None for empty (undefined) values. + stringify: Apply `str()` on each value (except for None) + """ + values = [] + for name in attrs: + nodes = [self] + if name[1] == '_': + nodes, name = [], name[2:] + for node in (n for n in nodes if n is not None): + if name in {'feats_split', 'misc_split'}: + values.extend(node._get_attr(name)) + else: + values.append(node._get_attr(name)) + + if undefs is not None: + values = [x if x is not None else undefs for x in values] + if stringify: + values = [str(x) if x is not None else None for x in values] + return values + + @property + def _ord(self): + self.words.sort() + return self.words[0]._ord + # TODO: node.remove() should check if the node is not part of any MWT # TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported # TODO: Make mwt._words private and provide a setter diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..c6a7a26a 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
@@ -244,7 +252,7 @@ def raw_deps(self): #if self._raw_deps is not None: # return self._raw_deps if self._deps: - self._raw_deps = '|'.join(f"{dep['parent']._ord}:{dep['deprel']}" for dep in self._deps) + self._raw_deps = '|'.join(f"{p}:{r}" for p, r in sorted(set((d['parent'].ord, d['deprel']) for d in self._deps))) return self._raw_deps @raw_deps.setter @@ -316,6 +324,8 @@ def parent(self, new_parent): # Check for None new_parent and cycles. if new_parent is None: raise ValueError(f'Cannot set None as parent: {self}') + if new_parent.is_empty(): + raise ValueError(f'Cannot set EmptyNode as parent in basic dependencies: {self}') if self is new_parent: raise CycleError('Cannot set a node as its own parent (cycle are forbidden): %s', self) if self._children and new_parent.is_descendant_of(self): @@ -496,7 +506,7 @@ def is_empty(): return False def remove(self, children=None): - """Delete this node and all its descendants. + """Delete this node (and all its descendants unlsess specified otherwise). Args: children: a string specifying what to do if the node has any children. @@ -506,6 +516,8 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). """ self._parent._children.remove(self) + + # If there are any children, do the action specified in the "children" parameter. if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -526,18 +538,51 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. 
if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + last_ord = 0 + for empty in self._root.empty_nodes: + if empty._ord > self._ord: + new_ord = round(empty._ord - 1, 1) + if new_ord <= last_ord: + new_ord = round(last_ord + 0.1, 1) + empty.ord = new_ord + last_ord = empty._ord else: + # Remember the position of empty nodes, so we can reorder them as well. + empty_follows = None + if self._root.empty_nodes: + will_be_removed = self if children and children.startswith('rehang') else self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.is_empty(): + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
+ if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty._ord + (empty._ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" all_nodes = self._root._descendants + empty_nodes = self._root.empty_nodes # Moving a single node can be faster than nodes_to_move = [self] if without_children or not self._children: @@ -548,14 +593,25 @@ def _shift_before_ord(self, reference_ord, without_children=False): all_nodes[i_ord - 1]._ord = i_ord all_nodes[reference_ord - 2] = self self._ord = reference_ord - 1 + for en in empty_nodes: + if en._ord > my_ord and en._ord < reference_ord: + en._ord -= 1 elif reference_ord < my_ord: for i_ord in range(my_ord, reference_ord, -1): all_nodes[i_ord - 1] = all_nodes[i_ord - 2] all_nodes[i_ord - 1]._ord = i_ord all_nodes[reference_ord - 1] = self self._ord = reference_ord + for en in empty_nodes: + # Empty nodes before the first overt token (ID=0.X) will be never moved this way. + # We cannot know whether the caller wanted to place the shifted node before or after them. + if en._ord < my_ord and en._ord > reference_ord: + en._ord += 1 + self._parent._children.sort() return + #TODO: Updating ords of empty nodes is implemented only for the simple case above, + # but it has to be implemented also for the complex case below! nodes_to_move = self.descendants(add_self=True) first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord @@ -579,6 +635,7 @@ def _shift_before_ord(self, reference_ord, without_children=False): for node in nodes_to_move: all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 + self._parent._children.sort() return # First, move a node from position src_ord to position trg_ord RIGHT-ward. 
@@ -612,6 +669,7 @@ def _shift_before_ord(self, reference_ord, without_children=False): for node in nodes_to_move: all_nodes[trg_ord - 1], node._ord = node, trg_ord trg_ord += 1 + self._parent._children.sort() def shift_after_node(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node after the reference_node.""" @@ -623,6 +681,8 @@ def shift_after_node(self, reference_node, without_children=False, skip_if_desce def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False): """Shift this node before the reference_node.""" + if reference_node.is_root(): + raise ValueError(f'Cannot shift a node before the root ({reference_node})') if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return @@ -651,6 +711,8 @@ def shift_before_subtree(self, reference_node, without_children=0, skip_if_desce Args: without_children: shift just this node without its subtree? """ + if reference_node.is_root(): + raise ValueError(f'Cannot shift a node before the root ({reference_node})') if not without_children and reference_node.is_descendant_of(self): if skip_if_descendant: return @@ -702,21 +764,21 @@ def is_leaf(self): def _get_attr(self, name): # pylint: disable=too-many-return-statements if name == 'dir': - if self._parent.is_root(): + if not self._parent or self._parent.is_root(): return 'root' return 'left' if self.precedes(self._parent) else 'right' if name == 'edge': - if self._parent.is_root(): + if not self._parent or self._parent.is_root(): return 0 return self._ord - self._parent._ord if name == 'children': return len(self._children) if name == 'siblings': - return len(self._parent._children) - 1 + return 0 if not self._parent else len(self._parent._children) - 1 if name == 'depth': value = 0 tmp = self - while not tmp.is_root(): + while tmp and not tmp.is_root(): tmp = tmp._parent value += 1 return value @@ -848,6 +910,18 @@ def multiword_token(self): """ return self._mwt 
+ @property + def words(self): + """Return one-item list with this node. + + This property is there for compatibility with udapi.core.mwt.MWT.words. + So that it is possible to use code such as: + for token in root.token_descendants: + words = token.words + ... + """ + return [self] + def is_nonprojective(self): """Is the node attached to its parent non-projectively? @@ -930,6 +1004,14 @@ def create_coref_entity(self, eid=None, etype=None, **kwargs): entity.create_mention(head=self, **kwargs) return entity + @staticmethod + def is_mwt(): + """Is this a multi-word token? + + Returns False for all Node instances. + True is returned only by instances of the MWT class. + """ + return False class CycleError(Exception): '''A cycle in the dependency tree detected (or would be created).''' @@ -981,6 +1063,19 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Attempts at changing the word order of EmptyNode result in NotImplemented exception.""" raise NotImplemented('Empty nodes cannot be re-order using shift* methods yet.') + def remove(self): + """Delete this empty node.""" + to_reorder = [e for e in self._root.empty_nodes if e._ord > self._ord and e._ord < self.ord+1] + for empty in to_reorder: + empty._ord = round(empty._ord - 0.1, 1) + try: + self._root.empty_nodes.remove(self) + except ValueError: + return # self may be an already deleted node e.g. if n.remove() called twice + for n in self._root.empty_nodes + self._root._descendants: + if n._deps: + n._deps = {(deprel, parent) for deprel, parent in n._deps if parent != self} + @functools.total_ordering class OrdTuple: """Class for the rare case of 9+ consecutive empty nodes, i.e. ords x.10, x.11 etc. 
@@ -1056,6 +1151,7 @@ class ListOfNodes(list): nodes = node.children nodes = node.children() nodes = node.children(add_self=True, following_only=True) + nodes = node.descendants(add_self=True, add_mwt=True) """ __slots__ = ('origin',) @@ -1069,16 +1165,28 @@ def __init__(self, iterable, origin): super().__init__(iterable) self.origin = origin - def __call__(self, add_self=False, following_only=False, preceding_only=False): + def __call__(self, add_self=False, following_only=False, preceding_only=False, add_mwt=False): """Returns a subset of nodes contained in this list as specified by the args.""" if add_self: self.append(self.origin) self.sort() + result = self if preceding_only: - return [x for x in self if x._ord <= self.origin._ord] + result = [x for x in result if x._ord <= self.origin._ord] if following_only: - return [x for x in self if x._ord >= self.origin._ord] - return self + result = [x for x in result if x._ord >= self.origin._ord] + if add_mwt: + new = [] + last_mwt_id = -1 + for node in result: + mwt = node.multiword_token + if mwt: + if node.ord > last_mwt_id: + last_mwt_id = mwt.words[-1].ord + new.append(mwt) + new.append(node) + result = new + return result def find_minimal_common_treelet(*args): diff --git a/udapi/core/root.py b/udapi/core/root.py index 0132566a..15f31e58 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,6 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' + if self._bundle is not None: + self._sent_id = self._bundle.address() + slashzone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone + else: + self._sent_id = '?' + slashzone @property def parent(self): @@ -137,7 +144,7 @@ def remove(self, children=None): The default (None) is to delete them (and all their descendants). `warn` means to issue a warning. 
""" - if children is not None and self.children: + if children is not None and self._children: logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) self.bundle.trees = [root for root in self.bundle.trees if root != self] @@ -160,17 +167,28 @@ def create_empty_child(self, **kwargs): return new_node # TODO document whether misc is a string or dict or it can be both - def create_multiword_token(self, words=None, form=None, misc=None): + def create_multiword_token(self, words=None, form=None, feats=None, misc=None): """Create and return a new multi-word token (MWT) in this tree. The new MWT can be optionally initialized using the following args. Args: words: a list of nodes which are part of the new MWT form: string representing the surface form of the new MWT - misc: misc attribute of the new MWT + misc: FEATS attribute of the new MWT (only `Typo=Yes` allowed there in UD guidelines) + misc: MISC attribute of the new MWT """ - mwt = MWT(words, form, misc, root=self) + # Nested or overlapping MWTs are not allowed in CoNLL-U, + # so first remove all previous MWTs containing any of words. + for w in words: + if w.multiword_token: + w.multiword_token.remove() + # Now, create the new MWT. 
+ mwt = MWT(words, form, feats, misc, root=self) self._mwts.append(mwt) + if words[-1].misc["SpaceAfter"] == "No": + mwt.misc["SpaceAfter"] = "No" + for word in words: + word.misc["SpaceAfter"] = "" return mwt @property @@ -261,10 +279,10 @@ def steal_nodes(self, nodes): node.ord = new_ord node._root = self if not whole_tree: - for child in [n for n in node.children if n not in nodes]: + for child in [n for n in node._children if n not in nodes]: child._parent = old_root - old_root._children = sorted(old_root.children + [child]) - node._children = [n for n in node.children if n in nodes] + old_root._children = sorted(old_root._children + [child]) + node._children = [n for n in node._children if n in nodes] if node.parent == old_root or (not whole_tree and node.parent not in nodes): node.parent._children = [n for n in node.parent._children if n != node] node._parent = self @@ -283,3 +301,39 @@ def steal_nodes(self, nodes): self.create_multiword_token(words=words, form=mwt.form, misc=mwt.misc) self._descendants += nodes # pylint: enable=protected-access + + def flatten(self, deprel='root'): + """Flatten the tree (i.e. attach all nodes to the root) and reset all deprels. + + This is equivalent to + for node in root.descendants: + node.parent = root + node.deprel = 'root' + but it is faster. 
+ """ + self._children = self._descendants[:] + for node in self._children: + node._parent = self + node._children.clear() + + @property + def prev_tree(self): + """Return the previous tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if num == 1: + return None + return doc.bundles[num - 2].get_tree(zone=self._zone) + + @property + def next_tree(self): + """Return the next tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if len(doc.bundles) <= num: + return None + return doc.bundles[num].get_tree(zone=self._zone) diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..6453641c 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,30 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + try: # ignore modules with compilation errors + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bnames = [c for c in dir(module) if c.lower() == sname] + if bnames: + blocks.append(f"{pname}.{bnames[0]}") + except: + pass + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
@@ -75,9 +99,9 @@ def _import_blocks(block_names, block_args): :param block_args: A list of block arguments to be passed to block constructor. :return: A list of initialized objects. :rtype: list - """ blocks = [] + namespace = {} # Create a namespace dictionary to store imported classes for (block_id, block_name) in enumerate(block_names): # Importing module dynamically. @@ -91,17 +115,28 @@ def _import_blocks(block_names, block_args): try: command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) - exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + exec(command, namespace) # Pass namespace as globals + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + package_blocks = _blocks_in_a_package(package_name) + if not package_blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available blocks in {package_name} are:\n" + + "\n".join(package_blocks)) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. - kwargs = block_args[block_id] # pylint: disable=unused-variable + kwargs = block_args[block_id] + namespace['kwargs'] = kwargs # Add kwargs to the namespace command = "b%s(**kwargs)" % block_id logging.debug("Trying to evaluate this: %s", command) - new_block_instance = eval(command) # pylint: disable=eval-used - blocks.append((block_name, new_block_instance)) + new_block_instance = eval(command, namespace) # Pass namespace as globals + args = ' '.join(f"{k}={v}" for k,v in kwargs.items()) + blocks.append((block_name, new_block_instance, args)) return blocks @@ -132,12 +167,15 @@ def execute(self): # Import blocks (classes) and construct block instances. 
blocks = _import_blocks(block_names, block_args) + return self.run_blocks(blocks) + + def run_blocks(self, blocks): # Initialize blocks (process_start). - for bname, block in blocks: + for _, block, _ in blocks: block.process_start() readers = [] - for bname, block in blocks: + for _, block, _ in blocks: try: block.finished # pylint: disable=pointless-statement readers.append(block) @@ -147,15 +185,15 @@ def execute(self): logging.info('No reader specified, using read.Conllu') conllu_reader = Conllu() readers = [conllu_reader] - blocks = [('read.Conllu', conllu_reader)] + blocks + blocks = [('read.Conllu', conllu_reader, {})] + blocks # Apply blocks on the data. finished = False while not finished: document = Document() logging.info(" ---- ROUND ----") - for bname, block in blocks: - logging.info(f"Executing block {bname}") + for bname, block, args in blocks: + logging.info(f"Executing block {bname} {args}") block.apply_on_document(document) finished = True @@ -164,9 +202,12 @@ def execute(self): finished = finished and reader.finished # 6. close blocks (process_end) - for bname, block in blocks: + for _, block, _ in blocks: block.process_end() + # Some users may use the block instances (e.g. to retrieve some variables). 
+ return blocks + # TODO: better implementation, included Scen def scenario_string(self): """Return the scenario string.""" diff --git a/udapi/core/tests/test_document.py b/udapi/core/tests/test_document.py index 66363ca9..28283dda 100755 --- a/udapi/core/tests/test_document.py +++ b/udapi/core/tests/test_document.py @@ -9,12 +9,15 @@ class TestDocument(unittest.TestCase): def test_init(self): doc = Document() - def test_iterator(self): + def test_ids(self): doc = Document() - doc.bundles = ['a', 'b', 'c'] - for bundle in doc: - print(bundle) - + bundle1 = doc.create_bundle() + bundle2 = doc.create_bundle() + self.assertEqual(bundle1.address(), "1") + self.assertEqual(bundle2.address(), "2") + self.assertEqual([b.bundle_id for b in doc], ["1", "2"]) + tree1 = bundle1.create_tree() + self.assertEqual(tree1.address(), "1") if __name__ == "__main__": unittest.main() diff --git a/udapi/core/tests/test_enhdeps.py b/udapi/core/tests/test_enhdeps.py index 3f473bf3..53a74389 100644 --- a/udapi/core/tests/test_enhdeps.py +++ b/udapi/core/tests/test_enhdeps.py @@ -57,7 +57,7 @@ def test_create_deps2empty(self): e.deps.append({'parent': h, 'deprel':'dep:e2h'}) d.deps.append({'parent': e, 'deprel': 'dep:d2e'}) self.assertEqual("2:dep:e2h", e.raw_deps, ) - self.assertEqual("5:conj|3.1:dep:d2e", d.raw_deps) + self.assertEqual("3.1:dep:d2e|5:conj", d.raw_deps) self.assertEqual(self.tree.descendants_and_empty, self.nodes[:3] + [e] + self.nodes[3:]) diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..f2b64a3d 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -61,6 +61,9 @@ def test_topology(self): nodes[0].shift_after_node(nodes[1]) self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6]) self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6]) + self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 4]) + nodes[3].shift_before_node(nodes[2]) + 
self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 6]) def test_draw(self): """Test the draw() method, which uses udapi.block.write.textmodetrees.""" @@ -119,7 +122,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +148,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. 
@@ -188,5 +214,56 @@ def test_deps_setter(self): self.assertEqual(nodes[0].raw_deps, '2:test') + def test_empty_nodes(self): + """Test creation of empty nodes and how their ord is changed when removing nodes.""" + root = Root() + for i in range(3): + root.create_child(form=f'node{i+1}') + + n1, n2, n3 = root.descendants() + n3.parent = n2 + e1 = n1.create_empty_child('dep', after=False, form='e1') + e2 = n1.create_empty_child('dep', after=False, form='e2') + e3 = n1.create_empty_child('dep', after=True, form='e3') + e4 = n1.create_empty_child('dep', after=True, form='e4') + e5 = n2.create_empty_child('dep', after=False, form='e5') + e6 = n1.create_empty_child('dep', after=True, form='e6') + + self.assertEqual(root.empty_nodes, [e1, e2, e3, e4, e5, e6]) + self.assertEqual(root.descendants_and_empty, [e1, e2, n1, e3, e4, e5, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 1, 1.1, 1.2, 1.3, 1.4, 2, 3]) + e5.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, n1, e3, e4, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 1, 1.1, 1.2, 1.3, 2, 3]) + n1.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, n2, n3]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]) + e7 = n3.create_empty_child('dep', after=True, form='e7') + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, n2, n3, e7]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 2.1]) + n2.remove() + self.assertEqual(root.descendants_and_empty, [e1, e2, e3, e4, e6, e7]) + self.assertEqual([n.ord for n in root.descendants_and_empty], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + + def test_enh_deps_and_reordering(self): + """Test reordering of node ord in enhanced deps when reordering/removing nodes.""" + root = Root() + for i in range(3): + root.create_child(form=f'node{i+1}') + + n1, n2, n3 = 
root.descendants() + n1.raw_deps = '2:nsubj|3:obj' + self.assertEqual(n1.raw_deps, '2:nsubj|3:obj') + self.assertEqual(n1.deps, [{'parent': n2, 'deprel': 'nsubj'}, {'parent': n3, 'deprel': 'obj'}]) + n2.shift_after_node(n3) + self.assertEqual(n1.raw_deps, '2:obj|3:nsubj') + # TODO only node.raw_deps are currently guaranteed to return the deps sorted, not node.deps + #self.assertEqual(n1.deps, [{'parent': n3, 'deprel': 'obj'}, {'parent': n2, 'deprel': 'nsubj'}]) + # TODO: after removing a node, all deps should be updated + #n2.remove() + #self.assertEqual(n1.raw_deps, '2:nsubj') + #self.assertEqual(n1.deps, [{'parent': n3, 'deprel': 'obj'}]) + + if __name__ == "__main__": unittest.main() diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index 18f6b2ca..83e289a2 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -22,8 +22,10 @@ def __init__(self, model): self.conllu_reader = ConlluReader() self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) - def tag_parse_tree(self, root): + def tag_parse_tree(self, root, tag=True, parse=True): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + if not tag and not parse: + raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.') descendants = root.descendants if not descendants: return @@ -34,11 +36,15 @@ def tag_parse_tree(self, root): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = self.conllu_reader.read_tree() - nodes = [root] + descendants + attrs = 'upos xpos lemma feats'.split() if tag else [] + if parse: + attrs.append('deprel') + root.flatten() for parsed_node in parsed_root.descendants: - node = nodes[parsed_node.ord] - node.parent = nodes[parsed_node.parent.ord] - for attr in 'upos xpos lemma feats deprel'.split(): + node = descendants[parsed_node.ord - 1] + if parse: + node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root + for attr in 
attrs: setattr(node, attr, getattr(parsed_node, attr)) # TODO: benchmark which solution is the fastest one. E.g. we could also do @@ -47,11 +53,13 @@ def tag_parse_tree(self, root): # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants - def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False): """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. If resegment=True, the returned list of Udapi trees may contain multiple trees. """ + if ranges: + raise ValueError('ranges=True is implemented only in the REST API version (add "online=1" to the udpipe block)') if root.children: raise ValueError('Tree already contained nodes before tokenization') diff --git a/udapi/tool/udpipeonline.py b/udapi/tool/udpipeonline.py new file mode 100644 index 00000000..ced96d56 --- /dev/null +++ b/udapi/tool/udpipeonline.py @@ -0,0 +1,198 @@ +"""Wrapper for UDPipe online web service.""" +import io +import sys +import email.mime.multipart +import email.mime.nonmultipart +import email.policy +import json +import os +import sys +import urllib.error +import urllib.parse +import urllib.request + +from udapi.block.read.conllu import Conllu as ConlluReader +from udapi.core.root import Root + +class UDPipeOnline: + """Wrapper for UDPipe online web service.""" + + def __init__(self, model, server="https://lindat.mff.cuni.cz/services/udpipe/api"): + """Create the UDPipeOnline tool object.""" + self.model = model + self.server = server + + def list_models(self): + with urllib.request.urlopen(self.server + "/models") as request: + response = json.loads(request.read()) + return list(response["models"].keys()) + + def perform_request(self, params, method="process"): + if not params: + request_headers, request_data = {}, None + else: + message = email.mime.multipart.MIMEMultipart("form-data", 
policy=email.policy.HTTP) + + for name, value in params.items(): + payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain") + payload.add_header("Content-Disposition", "form-data; name=\"{}\"".format(name)) + payload.add_header("Content-Transfer-Encoding", "8bit") + payload.set_payload(value, charset="utf-8") + message.attach(payload) + + request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1] + request_headers = {"Content-Type": message["Content-Type"]} + + try: + with urllib.request.urlopen(urllib.request.Request( + url=f"{self.server}/{method}", headers=request_headers, data=request_data + )) as request: + response = json.loads(request.read()) + except urllib.error.HTTPError as e: + print("An exception was raised during UDPipe 'process' REST request.\n" + "The service returned the following error:\n" + " {}".format(e.fp.read().decode("utf-8")), file=sys.stderr) + raise + except json.JSONDecodeError as e: + print("Cannot parse the JSON response of UDPipe 'process' REST request.\n" + " {}".format(e.msg), file=sys.stderr) + raise + + if "model" not in response or "result" not in response: + raise ValueError("Cannot parse the UDPipe 'process' REST request response.") + + return response["result"] + + def perform_request_urlencoded(self, params, method="process"): + """Perform a request using application/x-www-form-urlencoded to preserve LF newlines. + + This avoids CRLF normalization done by the email MIME serializer, ensuring that + the content of the 'data' field retains Unix LF ("\n") exactly as provided. 
+ """ + request_data = urllib.parse.urlencode(params).encode("utf-8") + request_headers = {"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"} + + try: + with urllib.request.urlopen(urllib.request.Request( + url=f"{self.server}/{method}", headers=request_headers, data=request_data + )) as request: + response = json.loads(request.read()) + except urllib.error.HTTPError as e: + print("An exception was raised during UDPipe '{}' REST request.\n" + "The service returned the following error:\n" + " {}".format(method, e.fp.read().decode("utf-8")), file=sys.stderr) + raise + except json.JSONDecodeError as e: + print("Cannot parse the JSON response of UDPipe '{}' REST request.\n" + " {}".format(method, e.msg), file=sys.stderr) + raise + + if "model" not in response or "result" not in response: + raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method)) + + return response["result"] + + def tag_parse_tree(self, root, tag=True, parse=True): + """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + if not tag and not parse: + raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.') + descendants = root.descendants + if not descendants: + return + in_data = " ".join([n.form for n in descendants]) + params = {"model": self.model, "data": in_data, "input":"horizontal", "tagger":""} + attrs = 'upos xpos lemma feats'.split() if tag else [] + if parse: + params["parser"] = "" + attrs.append('deprel') + + out_data = self.perform_request_urlencoded(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + parsed_root = conllu_reader.read_tree() + if parse: + root.flatten() + for parsed_node in parsed_root.descendants: + node = descendants[parsed_node.ord - 1] + if parse: + node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root + for attr in attrs: + setattr(node, attr, getattr(parsed_node, 
attr)) + + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. + + If resegment=True, the returned list of Udapi trees may contain multiple trees. + If ranges=True, `node.misc[TokenRange]` of each token will contain character-level 0-based ranges, e.g. `0:2`. + """ + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + if root.children: + raise ValueError('Tree already contained nodes before tokenization') + + # Tokenize and possibly segment the input text + params = {"model": self.model, "data": root.text, "tokenizer":"" if resegment else "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if ranges: + params["tokenizer"] = "presegmented;ranges" if resegment else "ranges" + out_data = self.perform_request_urlencoded(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + # The input "root" object must be the first item in "trees". 
+ for attr in ('_children', '_descendants', '_mwts', 'text', 'comment'): + setattr(root, attr, getattr(trees[0], attr)) + for node in root._children: + node._parent = root + for node in root._descendants: + node._root = root + trees[0] = root + return trees + + def segment_text(self, text): + """Segment the provided text into sentences returned as a Python list.""" + params = {"model": self.model, "data": text, "tokenizer":"", "output": "plaintext=normalized_spaces"} + return self.perform_request_urlencoded(params=params).rstrip().split("\n") + + def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=False, ranges=False): + """Delete all existing bundles and substitute them with those parsed by UDPipe.""" + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + params = {"model": self.model, "tokenizer": "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if resegment: + params["tokenizer"] = "" + if ranges: + params["tokenizer"] = "ranges" if resegment else "presegmented;ranges" + + #in_trees = [] + #for bundle in doc.bundles: + # assert(len(bundle.trees) == 1) + # in_trees.append(bundle.trees[0]) + if tokenize: + params["data"] = "\n".join(root.text for root in doc.trees) + "\n" + else: + params["input"] = "horizontal" + params["data"] = "\n".join(" ".join([n.form for n in root.descendants]) for root in doc.trees) + "\n" + + out_data = self.perform_request_urlencoded(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + bundles = list(reversed(doc.bundles)) + for tree in trees: + if bundles: + bundle = bundles.pop() + # TODO is this safe? + bundle.trees = [] + else: + bundle = doc.create_bundle() + bundle.add_tree(tree)