diff --git a/.circleci/config.yml b/.circleci/config.yml index 988f321d..9530d5c7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,13 +15,16 @@ orbs: # See: https://circleci.com/docs/2.0/configuration-reference/#jobs jobs: build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + parameters: + python-version: + type: string # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container # Change the version below to your required version of python docker: - - image: cimg/python:3.9 + - image: cimg/python:<< parameters.python-version >> # Checkout the code as the first step. This is a dedicated CircleCI step. # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. @@ -31,15 +34,12 @@ jobs: - checkout - python/install-packages: pkg-manager: pip - # app-dir: ~/project/package-directory/ # If you're requirements.txt isn't in the root directory. - # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. 
- run: name: Install Udapi command: pip install ".[test]" - run: mkdir -p test-results - run: name: Run pytest tests - # This assumes pytest is installed via the install-package step above command: pytest --junitxml=test-results/junit.xml -o junit_family=legacy - store_test_results: path: test-results @@ -54,7 +54,9 @@ jobs: # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows workflows: - sample: # This is the name of the workflow, feel free to change it to better match your workflow. - # Inside the workflow, you define the jobs you want to run. + test-matrix: jobs: - - build-and-test + - build-and-test: + matrix: + parameters: + python-version: ["3.9", "3.11", "3.13"] diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..0285eddb --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,70 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. 
+ python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. + # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + url: https://pypi.org/p/udapi + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ diff --git a/.gitignore b/.gitignore index a75e7c05..adc7bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.cache .idea +*.egg-info/ *.pyc -.cache +dist/ diff --git a/CHANGES.txt b/CHANGES.txt index 67ced748..98e26605 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,20 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. 
+0.5.1 2025-11-05 + - make udapy compatible with Python 3.13 + +0.5.0 2025-10-18 + - added mwt.feats + - added root.prev_tree and root.next_tree + - .github/workflows/python-publish.yml + - edits by Dan Zeman in block.ud.* + +0.4.0 2025-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + 0.3.0 2022-04-06 - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) - edits by Dan Zeman in block.ud.* diff --git a/README.md b/README.md index 0b41297f..36465c78 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,24 @@ Python framework for processing Universal Dependencies data [](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.6 or higher. -- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, - make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)) - and install UDPipe with `pip3 install --user --upgrade ufal.udpipe`. +- You need Python 3.9 or higher. +- It is recommended to install Udapi in a Python virtual environment. +- If you need the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser (to be used from Udapi) + install it (with `pip install --upgrade ufal.udpipe`). ## Install Udapi for developers -Let's clone the git repo to `~/udapi-python/`, install dependencies -and setup `$PATH` and `$PYTHONPATH` accordingly. +Let's clone the git repo e.g. to `~/udapi-python/` and make an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) ```bash cd git clone https://github.com/udapi/udapi-python.git -pip3 install --user -r udapi-python/requirements.txt -echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc -echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc -echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc -source ~/.bashrc # or open new bash +cd udapi-python +pip install -e . 
``` ## Install Udapi for users This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. ``` -pip3 install --user --upgrade udapi +pip install --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. diff --git a/bin/udapy b/bin/udapy index 528e3577..83c7a6f2 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,116 +1,7 @@ #!/usr/bin/env python3 -import os -import gc +"""Thin wrapper for backward compatibility. Calls udapi.cli.main().""" import sys -import atexit -import logging -import argparse +from udapi.cli import main -from udapi.core.run import Run - -# Parse command line arguments. -argparser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter, - usage="udapy [optional_arguments] scenario", - epilog="See http://udapi.github.io", - description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" - "Examples of usage:\n" - " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" - " udapy -T < sample.conllu | less -R\n" - " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") -argparser.add_argument( - "-q", "--quiet", action="store_true", - help="Warning, info and debug messages are suppressed. 
Only fatal errors are reported.") -argparser.add_argument( - "-v", "--verbose", action="store_true", - help="Warning, info and debug messages are printed to the STDERR.") -argparser.add_argument( - "-s", "--save", action="store_true", - help="Add write.Conllu to the end of the scenario") -argparser.add_argument( - "-T", "--save_text_mode_trees", action="store_true", - help="Add write.TextModeTrees color=1 to the end of the scenario") -argparser.add_argument( - "-H", "--save_html", action="store_true", - help="Add write.TextModeTreesHtml color=1 to the end of the scenario") -argparser.add_argument( - "-A", "--save_all_attributes", action="store_true", - help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") -argparser.add_argument( - "-C", "--save_comments", action="store_true", - help="Add print_comments=1 (to be used after -T and -H)") -argparser.add_argument( - "-M", "--marked_only", action="store_true", - help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") -argparser.add_argument( - "-N", "--no_color", action="store_true", - help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") -argparser.add_argument( - "-X", "--extra", action="append", - help="Add a specified parameter (or a block name) to the end of the scenario\n" - "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") -argparser.add_argument( - "--gc", action="store_true", - help="By default, udapy disables Python garbage collection and at-exit cleanup\n" - "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" - "when processing many files and running out of memory, you can disable this\n" - "optimization (i.e. enable garbage collection) with 'udapy --gc'.") -argparser.add_argument( - 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") - -args = argparser.parse_args() - -# Set the level of logs according to parameters. 
-if args.verbose: - level = logging.DEBUG -elif args.quiet: - level = logging.CRITICAL -else: - level = logging.INFO - -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', - level=level) - -# Process and provide the scenario. if __name__ == "__main__": - - # Disabling garbage collections makes the whole processing much faster. - # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup - # (atexit hooks are called in reversed order of their registration, - # so flushing stdio buffers etc. will be still done before the os._exit(0) call). - # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 - # Is it safe to disable GC? - # OS will free the memory allocated by this process after it ends anyway. - # The udapy wrapper is aimed for one-time tasks, not a long-running server, - # so in a typical case a document is loaded and almost no memory is freed before the end. - # Udapi documents have a many cyclic references, so running GC is quite slow. - if not args.gc: - gc.disable() - atexit.register(os._exit, 0) - atexit.register(sys.stderr.flush) - if args.save: - args.scenario = args.scenario + ['write.Conllu'] - if args.save_text_mode_trees: - args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] - if args.save_html: - args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] - if args.save_all_attributes: - args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] - if args.save_comments: - args.scenario = args.scenario + ['print_comments=1'] - if args.marked_only: - args.scenario = args.scenario + ['marked_only=1'] - if args.no_color: - args.scenario = args.scenario + ['color=0'] - if args.extra: - args.scenario += args.extra - - runner = Run(args) - # udapy is often piped to head etc., e.g. 
- # `seq 1000 | udapy -s read.Sentences | head` - # Let's prevent Python from reporting (with distracting stacktrace) - # "BrokenPipeError: [Errno 32] Broken pipe" - try: - runner.execute() - except BrokenPipeError: - pass + sys.exit(main()) diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain path to udapi-python. +py %~dp$PATH:0\udapy %* diff --git a/pyproject.toml b/pyproject.toml index 374b58cb..18d5c717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,36 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel" -] +requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" + +[project] +name = "udapi" +version = "0.5.2" +description = "Python framework for processing Universal Dependencies data" +readme = "README.md" +requires-python = ">=3.9" +license = "GPL-3.0-or-later" +authors = [ + {name = "Martin Popel", email = "popel@ufal.mff.cuni.cz"} +] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "colorama", + "termcolor", +] + +[project.urls] +Homepage = "https://github.com/udapi/udapi-python" + +[project.optional-dependencies] +test = ["pytest"] +udpipe = ["ufal.udpipe"] + +[project.scripts] +udapy = "udapi.cli:main" + +[tool.setuptools] +packages = {find = {}} +include-package-data = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a14145ab..00000000 --- a/setup.cfg +++ /dev/null @@ -1,29 +0,0 @@ -[metadata] -name = udapi -version = 0.3.0 -author = Martin Popel -author_email = popel@ufal.mff.cuni.cz -description = Python framework for processing Universal Dependencies data -long_description = file: README.md 
-long_description_content_type = text/markdown -url = https://github.com/udapi/udapi-python -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+) - Operating System :: OS Independent - -[options] -packages = find: -python_requires = >=3.6 -include_package_data = True -scripts = - bin/udapy -install_requires = - colorama - termcolor - -[options.extras_require] -test = - pytest - - diff --git a/setup.py b/setup.py deleted file mode 100644 index 7f1a1763..00000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -from setuptools import setup - -if __name__ == "__main__": - setup() diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..d27ff8c4 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..86d975b6 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py index 4e68e8dd..5aaf94e7 100644 --- a/udapi/block/corefud/delete.py +++ b/udapi/block/corefud/delete.py @@ -25,19 +25,15 @@ def is_root_reachable_by_deps(self, node, parents_to_ignore=None): proc_node, path = stack.pop() # root is reachable if proc_node == node.root: - break + return True # path forms a cycle, the root cannot be reached through this branch - if proc_node in path: - continue - for dep in proc_node.deps: - # the root cannot be reached through ignored nodes - if dep['parent'] in parents_to_ignore: - continue - # process the parent recursively - stack.append((dep['parent'], path + [proc_node])) - else: - return False - return True + if proc_node not in path: + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] not in parents_to_ignore: + # process the parent recursively + stack.append((dep['parent'], path + 
[proc_node])) + return False def _deps_ignore_nodes(self, node, parents_to_ignore): """ Retrieve deps from the node, recursively ignoring specified parents. @@ -46,18 +42,16 @@ def _deps_ignore_nodes(self, node, parents_to_ignore): stack = [(node, [])] while stack: proc_node, skipped_nodes = stack.pop() - # if there is a cycle of skipped nodes, ground the subtree to the root - if proc_node in skipped_nodes: - newdeps.append({'parent': node.root, 'deprel': 'root'}) - continue - for dep in proc_node.deps: - # keep deps with a parent that shouldn't be ignored - if not dep['parent'] in parents_to_ignore: - newdeps.append(dep) - continue - # process the ignored parent recursively - stack.append((dep['parent'], skipped_nodes + [proc_node])) - return newdeps + if proc_node not in skipped_nodes: + for dep in proc_node.deps: + if dep['parent'] in parents_to_ignore: + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + else: + # keep deps with a parent that shouldn't be ignored + newdeps.append(dep) + # If no newdeps were found (because of a cycle), return the root. + return newdeps if newdeps else [{'parent': node.root, 'deprel': 'root'}] def process_document(self, doc): # This block should work both with coreference loaded (deserialized) and not. @@ -67,17 +61,14 @@ def process_document(self, doc): if self.empty: for node in root.descendants: # process only the nodes dependent on empty nodes - if not '.' in node.raw_deps: - continue - # just remove empty parents if the root remains reachable - if self.is_root_reachable_by_deps(node, root.empty_nodes): - node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] - # otherwise propagate to non-empty ancestors - else: - newdeps = self._deps_ignore_nodes(node, root.empty_nodes) - newdeps_sorted = sorted(set((dep['parent'].ord, dep['deprel']) for dep in newdeps)) - node.raw_deps = '|'.join(f"{p}:{r}" for p, r in newdeps_sorted) - + if '.' 
in node.raw_deps: + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + node.deps = self._deps_ignore_nodes(node, root.empty_nodes) + # This needs to be done even if '.' not in node.raw_deps. if '.' in node.misc['Functor'].split(':')[0]: del node.misc['Functor'] root.empty_nodes = [] diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py old mode 100755 new mode 100644 index 9baeca98..bc8e6504 --- a/udapi/block/corefud/fixparentheses.py +++ b/udapi/block/corefud/fixparentheses.py @@ -1,31 +1,31 @@ -from udapi.core.block import Block - - -class FixParentheses(Block): - """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). - If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" - - def __init__(self, mark=True, **kwargs): - super().__init__(**kwargs) - self.mark = mark - - def process_coref_mention(self, mention): - words = [word.lemma for word in mention.words] - pairs = ['()', '[]', '{}'] - for pair in pairs: - if pair[0] in words: - if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: - if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ - mention.words[-1].next_node.lemma == pair[1]: - next_node = mention.words[-1].next_node - mention.words.append(next_node) - if self.mark: - next_node.misc['Mark'] = 1 - - elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: - if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ - and mention.words[0].prev_node.lemma == pair[0]: - prev_node = mention.words[0].prev_node - mention.words.append(prev_node) - if self.mark: - 
prev_node.misc['Mark'] = 1 +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). + If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py index 3f9f9bb3..08296531 100644 --- a/udapi/block/corefud/link2cluster.py +++ b/udapi/block/corefud/link2cluster.py @@ -2,17 +2,66 @@ from udapi.core.block import Block class Link2Cluster(Block): - """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.""" + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. 
- def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs): + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). + delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. 
+ """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): super().__init__(**kwargs) self.id_attr = id_attr self.ante_attr = ante_attr self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention def process_document(self, doc): id2node = {} links = [] + bridges = [] for node in doc.nodes_and_empty: this_id = node.misc[self.id_attr] if this_id != '': @@ -26,6 +75,16 @@ def process_document(self, doc): if self.delete_orig_attrs: for attr in (self.id_attr, self.ante_attr): del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] # It seems faster&simpler to process the links in any 
order and implement entity merging, # rather than trying to sort the links so that no entity merging is needed. @@ -36,14 +95,9 @@ def process_document(self, doc): ante_node, this_node = id2node[ante_id], id2node[this_id] if not this_node.coref_mentions and not ante_node.coref_mentions: # None of the nodes is part of any mention/entity. Let's create them. - entity = this_node.root.document.create_coref_entity() - m_ante = entity.create_mention(head=ante_node, words=[ante_node]) - m_this = entity.create_mention(head=this_node, words=[this_node]) - for node, mention in ((ante_node, m_ante), (this_node, m_this)): - if node.misc['information-status']: - mention.other['infstat'] = node.misc['information-status'] - if self.delete_orig_attrs: - del node.misc['information-status'] + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) elif this_node.coref_mentions and ante_node.coref_mentions: # Both of the nodes are part of mentions in different entities. # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). @@ -59,6 +113,25 @@ def process_document(self, doc): else: # Only one of the nodes is part of an entity. Let's add the second one to this entity. 
if ante_node.coref_mentions: - ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node]) + self._new_mention(ante_node.coref_entities[0], this_node) else: - this_node.coref_entities[0].create_mention(head=ante_node, words=[ante_node]) + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. 
+ + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. 
+ nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB 
= ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue + if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index e05815a6..527159e9 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -1,19 +1,27 @@ from udapi.core.block import Block from collections import Counter +import re class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entities=True, - report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, report_words_per_doc=False, report_entity_range=False, + selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', exclude_singletons=False, exclude_nonsingletons=False, style='human', - per_doc=False, max_rows_per_page=50, **kwargs): + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + highlight_docnames=None, + **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max self.e_len_max = e_len_max + self.report_basics = report_basics self.report_mentions = report_mentions self.report_entities = report_entities self.report_details = report_details + self.report_words_per_doc = report_words_per_doc + self.report_entity_range = report_entity_range self.exclude_singletons = exclude_singletons self.exclude_nonsingletons = exclude_nonsingletons self.style = style @@ -21,6 +29,11 @@ def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entiti raise ValueError(f'Unknown style {style}') self.per_doc = per_doc self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc filename'.split(): + raise ValueError(f'Unknown style {style}') + self.docname 
= docname + self.docname_len = docname_len + self.highlight_docnames = highlight_docnames self._header_printed = False self._lines_printed = None @@ -33,10 +46,17 @@ def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entiti self.longest_entity = 0 self.m_words = 0 self.selected_upos = None if selected_upos == 'all' else selected_upos.split() + self.entity_ranges = [] def process_document(self, doc): self.total_nodes += len(list(doc.nodes)) self.counter['documents'] += 1 + node2docord, current_docord = {}, 0 + if self.report_entity_range: + for node in doc.nodes_and_empty: + node2docord[node] = current_docord + current_docord += 1 + for entity in doc.coref_entities: len_mentions = len(entity.mentions) if len_mentions == 1: @@ -45,6 +65,8 @@ def process_document(self, doc): continue elif len_mentions > 1 and self.exclude_nonsingletons: continue + if self.report_entity_range: + self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head]) self.longest_entity = max(len_mentions, self.longest_entity) self.counter['c_total_len'] += len_mentions self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 @@ -75,6 +97,18 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 self.counter['m_nontreelet'] += 1 if heads > 1 else 0 + if self.report_basics: + doc_words = 0 + for tree in doc.trees: + self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) + if tree.newdoc: + self.counter['newdocs'] += 1 + if doc_words > self.counter['max_words_per_doc']: + self.counter['max_words_per_doc'] = doc_words + doc_words = 0 + doc_words += len(tree.descendants) def after_process_document(self, doc): if self.per_doc: @@ -87,6 +121,7 @@ def after_process_document(self, doc): self.longest_mention = 0 self.longest_entity = 0 self.m_words = 0 + self.entity_ranges = [] def process_end(self, skip=True, doc=None): 
if not self._lines_printed: @@ -97,7 +132,12 @@ def process_end(self, skip=True, doc=None): self.print_footer() return else: - print(f"{doc[0].trees[0].newdoc:15}", end='&' if self.style.startswith('tex') else '\n') + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + if self.style.startswith('tex'): + if self.highlight_docnames and re.search(self.highlight_docnames, docname): + docname = r"\NEW " + docname + docname = docname.replace('_', r'\_') + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') elif self.style.startswith('tex-'): print(f"{self.counter['documents']:4} documents &") self._lines_printed += 1 @@ -107,11 +147,23 @@ def process_end(self, skip=True, doc=None): total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns =[ ] + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:6,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:9,}"), + ('empty', f"{self.counter['empty']:7,}"),] + if self.report_words_per_doc: + columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"), + ('words/doc', f"{self.counter['words']/self.counter['newdocs']:7,.0f}"),] if self.report_entities: columns += [('entities', f"{self.entities:7,}"), ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), ('longest_entity', f"{self.longest_entity:6}"), ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] + if self.report_entity_range: + self.entity_ranges.sort() + percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0 + columns += [('entity_range_95percentile', f"{percentile:6,}"),] for i in range(1, self.e_len_max + 1): percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) @@ -136,7 +188,7 @@ def process_end(self, skip=True, doc=None): 
columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) if self.style.startswith('tex'): - print(" & ".join(c[1] for c in columns), end=" \\\\\n") + print(" &".join(c[1] for c in columns), end=" \\\\\n") elif self.style == 'human': for c in columns: print(f"{c[0]:>15} = {c[1].strip():>10}") @@ -153,15 +205,34 @@ def print_header(self): if self._lines_printed is None: print(r'\documentclass[multi=mypage]{standalone}') print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}') print(r'\title{Udapi coreference statistics}') print(r'\begin{document}') print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') - lines = [r'\begin{mypage}\begin{tabular}{@{}l ', " "*15, ("document" if self.per_doc else "dataset ") + " "*7, " "*15] + lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{text size} ' + lines[2] += r'& \MC{4}{total number of} ' + lines[3] += r'& docs & sents & words &empty n.' + if self.report_words_per_doc: + lines[0] += "rr " + lines[1] += r'& & ' + lines[2] += r'&\MC{2}{words/doc}' + lines[3] += r'& max & avg ' if self.report_entities: lines[0] += "rrrr " - lines[1] += r'& \MC{4}{entities} ' - lines[2] += r'& total & per 1k & \MC{2}{length} ' - lines[3] += r'& count & words & max & avg. 
' + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.report_entity_range: + lines[0] += "r " + lines[1] += r'& ' + lines[2] += r'& range ' + lines[3] += r'& p95 ' if self.e_len_max: for i in range(1, self.e_len_max + 1): lines[0] += "r" @@ -171,9 +242,9 @@ def print_header(self): lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' if self.report_mentions: lines[0] += "rrrr " - lines[1] += r'& \MC{4}{mentions} ' - lines[2] += r'& total & per 1k & \MC{2}{length} ' - lines[3] += r'& count & words & max & avg. ' + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' if self.m_len_max: for i in range(0, self.m_len_max + 1): lines[0] += "r" @@ -199,10 +270,18 @@ def print_header(self): lines[1] += r'\\' lines[2] += r'\\' lines[3] += r'\\\midrule' - if self.report_entities: + if self.report_basics: + lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}' + lines[2] += r'\cmidrule(lr){2-5}' last_col += 4 - lines[1] += r'\cmidrule(lr){2-5}' - lines[2] += r'\cmidrule(lr){4-5}' + if self.report_words_per_doc: + lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}' + last_col += 2 + if self.report_entities: + _cols = 5 if self.report_entity_range else 5 + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += _cols if self.e_len_max: last_col += self.e_len_max lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' @@ -221,6 +300,6 @@ def print_header(self): def print_footer(self, end_doc=True): if not self.style.startswith('tex-'): return - print(r'\bottomrule\end{tabular}\end{mypage}') + print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}') if self.style == 'tex-doc' and end_doc: print(r'\end{document}') diff --git 
a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index ca5510e4..e4889770 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -126,7 +126,7 @@ def process_tree(self, tree): i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) @@ -156,6 +156,13 @@ def process_tree(self, tree): self._pred[x] += 1 self._total[x] += 1 + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and adposition. +""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? 
+ 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 'dovnitř+Gen': 'Ill', + 'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? 
+ 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným + 'místo+Loc': 'Sbs', # annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 
'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 'v_jméno+Gen': 'Pur', + 'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', 
+ 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 'kontra+Nom': 'Adv', + 'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 
'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'zda': 'Cnd', + # coordinators + 'a': 'Conj', + 'i': 'Conj', + 'ani': 'Nnor', + 'nebo': 'Disj', + 'či': 'Disj', + 'ale': 'Advs', + 'avšak': 'Advs', + 'však': 'Advs', + 'nýbrž': 'Advs', + 'neboť': 'Reas', + 'tedy': 'Cnsq', + 'tak': 'Cnsq' + } + + def process_node(self, node): + """ + Derives a case value from preposition and morphological case. Stores it + as MSFCase in MISC. + """ + # Do not do anything for function words. + # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes) + # because their Case is only agreement feature inherited from the head noun. + if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']: + node.misc['MSFFunc'] = 'Yes' + return + elif node.udeprel in ['det', 'amod']: + node.misc['MSFFunc'] = 'No' + return + else: + node.misc['MSFFunc'] = 'No' + # Get all case markers (adpositions) attached to the current node. + adpositions = [] + for c in node.children: + if c.udeprel == 'case': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + adpositions.append(lemma) + # We assume that all features were copied from FEATS to MISC in mwe.MsfInit. + # They may have been further processed there, so we take the input from there. + msfcase = node.misc['MSFCase'] + if adpositions: + adpostring = '_'.join(adpositions) + caseadpostring = adpostring + '+' + msfcase + if caseadpostring in self.adposmap: + msfcase = self.adposmap[caseadpostring] + else: + logging.warn(f"No Case value found for '{caseadpostring}'.") + msfcase = caseadpostring + # Omer wants to collect cases from both adpositions and subordinators + # but we will consider subordinators only if we do not have any case + # from morphology or adpositions. 
+ if not msfcase: + subordinators = [] + for c in node.children: + if c.udeprel == 'mark': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + subordinators.append(lemma) + if subordinators: + subordstring = '_'.join(subordinators) + if subordstring in self.adposmap: + msfcase = self.adposmap[subordstring] + # To lump coordinators with all the above makes even less sense but for + # the moment we do it. + if not msfcase: + coordinators = [] + for c in node.children: + if c.udeprel == 'cc': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + coordinators.append(lemma) + if coordinators: + coordstring = '_'.join(coordinators) + if coordstring in self.adposmap: + msfcase = self.adposmap[coordstring] + node.misc['MSFCase'] = msfcase diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py new file mode 100644 index 00000000..fbdf73e5 --- /dev/null +++ b/udapi/block/msf/createabstract.py @@ -0,0 +1,45 @@ +""" +Morphosyntactic features (UniDive): +Create abstract nodes representing dropped arguments of predicates (if verbal +morphology signals that the subject is third person singular, and there is no +subject node, create an abstract node and copy the features there). +""" +from udapi.core.block import Block +import re + +class CreateAbstract(Block): + + def process_node(self, node): + """ + If a node has MSFVerbForm=Fin and at least one of the agreement features + MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these + features characterize the subject (this block is not suitable for languages + with polypersonal agreement). Check that the subject is present. 
If not, + create an abstract node to represent it. + """ + if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]): + # Current node is a finite predicate. Does it have a subject? If not, create an abstract one. + if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]): + # There could already be an abstract subject. We have to look for it in the enhanced graph. + if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]): + # Create an abstract subject. + subject = node.create_empty_child('nsubj') + subject.upos = 'PRON' + subject.feats['PronType'] = 'Prs' + subject.misc['MSFPronType'] = 'Prs' + subject.feats['Case'] = 'Nom' + subject.misc['MSFCase'] = 'Nom' + for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']: + msf = 'MSF' + f + if node.misc[msf]: + subject.feats[f] = node.misc[msf] + subject.misc[msf] = node.misc[msf] + subject.misc['MSFFunc'] = 'No' + # Regardless of whether it had a subject or not, the agreement features + # should be removed from the verb. + ###!!! We also may want to check if the pre-existing subject has all the features. + node.misc['MSFNumber'] = '' + node.misc['MSFPerson'] = '' + node.misc['MSFGender'] = '' + node.misc['MSFAnimacy'] = '' + node.misc['MSFPolite'] = '' diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py new file mode 100644 index 00000000..ceca12af --- /dev/null +++ b/udapi/block/msf/init.py @@ -0,0 +1,53 @@ +""" +Morphosyntactic features (UniDive): +Initialization. Copies features from FEATS as MSF* attributes to MISC. +""" +from udapi.core.block import Block +import re + +class Init(Block): + + + def process_node(self, node): + """ + For every feature in FEATS, creates its MSF* counterpart in MISC. + """ + for f in node.feats: + # Only selected features will be copied. Certain features are not + # interesting for the morphosyntactic annotation. 
+ if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']: + node.misc['MSF'+f] = node.feats[f] + # We are particularly interested in the Case feature but some nominals + # lack it (e.g. acronyms or numbers). If there is a preposition, it may + # indicate the expected case of the nominal. + if not node.feats['Case']: + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + node.misc['MSFCase'] = adpositions[0].feats['Case'] + # If we did not find a preposition to help us, we may be able to read + # the case off an adjectival modifier or determiner. + if not node.misc['MSFCase']: + modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']] + if modifiers: + node.misc['MSFCase'] = modifiers[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if not node.misc['MSFCase']: + if node.udeprel == 'nsubj': + node.misc['MSFCase'] = 'Nom' + elif node.udeprel == 'obj': + node.misc['MSFCase'] = 'Acc' + # If the node contains Phrase features in MISC (periphrastic verb forms + # detected by Lenka's code), replace the MS features with them. + phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)] + for pf in phrasefeatures: + msf = pf + if msf == 'PhraseForm': + msf = 'MSFVerbForm' + else: + msf = re.sub(r"Phrase", 'MSF', pf) + node.misc[msf] = node.misc[pf] + node.misc[pf] = '' diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py new file mode 100644 index 00000000..22f68c9d --- /dev/null +++ b/udapi/block/msf/numphrase.py @@ -0,0 +1,36 @@ +""" +Morphosyntactic features (UniDive): +Case in Number Phrases like 'pět mužů' (five men) in Czech. 
+""" +from udapi.core.block import Block + +class NumPhrase(Block): + + + def process_node(self, node): + """ + Nouns with a 'nummod:gov' dependent are morphologically in genitive, + but the case of the whole phrase (number + counted noun) is different, + probably nominative or accusative. + """ + quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']] + current_case = node.misc['MSFCase'] + if (current_case == 'Gen' or current_case == '') and quantifiers: + quantifier_case = quantifiers[0].misc['MSFCase'] + # The quantifier may lack the case feature (e.g. numbers expressed by digits) + # but we may be able to guess it from a preposition or other factors. + if quantifier_case == '': + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + quantifier_case = adpositions[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if quantifier_case == '': + if node.udeprel == 'nsubj': + quantifier_case = 'Nom' + elif node.udeprel == 'obj': + quantifier_case = 'Acc' + node.misc['MSFCase'] = quantifier_case diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py new file mode 100644 index 00000000..cf5a8f81 --- /dev/null +++ b/udapi/block/msf/phrase.py @@ -0,0 +1,168 @@ +""" +Morphosyntactic features (UniDive): +An abstract block as a base for derivation of blocks that discover periphrastic +verb forms and save them as Phrase features in MISC. This block provides the +methods that save the features in MISC. It is based on the Writer module by +Lenka Krippnerová. 
+""" +from udapi.core.block import Block +import logging + +class Phrase(Block): + + def __init__(self, feature_prefix='CW', **kwargs): + """ + Parameters: + feature_prefix (string) - The prefix of phrase features (e. g. 'CW', 'Phrase'), default is 'CG' + """ + super().__init__(**kwargs) + self.feature_prefix = feature_prefix + + self.dictionary = { + 'person': f'{feature_prefix}Person', + 'number': f'{feature_prefix}Number', + 'mood': f'{feature_prefix}Mood', + 'tense': f'{feature_prefix}Tense', + 'voice': f'{feature_prefix}Voice', + 'aspect':f'{feature_prefix}Aspect', + 'form': f'{feature_prefix}Form', + 'reflex': f'{feature_prefix}Reflex', + 'polarity': f'{feature_prefix}Polarity', + 'gender': f'{feature_prefix}Gender', + 'animacy': f'{feature_prefix}Animacy', + 'ords': feature_prefix, + 'expl': f'{feature_prefix}Expl', + 'analytic': 'Analytic', + } + + # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation + # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. + self.negation_fixed = { + # Belarusian + 'ні' : ['раз'], + 'ня' : ['толькі'], + + # Upper Sorbian + 'nic' : ['naposledku'], + + # Polish + 'nie' : ['mało'], + + # Pomak + 'néma' : ['kak'], + + # Slovenian + 'ne' : ['le'], + + # Russian and Old East Slavic + 'не' : ['то', 'токмо'], + 'ни' : ['в', 'раз', 'шатко'], + 'нет' : ['нет'] + } + + def process_node(self, node): + """ + Override this in a derived class! 
+ """ + logging.fatal('process_node() not implemented.') + + + + def write_node_info(self, node, + tense = None, + person = None, + number = None, + mood = None, + voice = None, + form = None, + reflex = None, + polarity = None, + ords = None, + gender = None, + animacy = None, + aspect = None, + expl=None, + analytic=None): + arguments = locals() + del arguments['self'] # delete self and node from arguments, + del arguments['node'] # we want only grammatical categories + for key,val in arguments.items(): + if val != None: + node.misc[self.dictionary[key]] = val + + def has_fixed_children(self, node): + """ + Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed. + """ + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + + if fixed_children: + if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []): + return True + return False + + def get_polarity(self, nodes): + """ + Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes. + Returns an empty string if there are zero or more than one such nodes. + """ + neg_count = 0 + for node in nodes: + if node.feats['Polarity'] == 'Neg': + neg_count += 1 + + if neg_count == 1: + return 'Neg' + + # neg_count can be zero or two, in either case we want to return an empty string so that the PhrasePolarity attribute is not generated + else: + return '' + + def get_negative_particles(self, nodes): + """ + Returns a list of all negative particles found among the children + of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed. 
+ """ + neg_particles = [] + for node in nodes: + neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)] + if neg: + neg_particles += neg + return neg_particles + + + def get_is_reflex(self,node,refl): + if node.feats['Voice'] == 'Mid': + return 'Yes' + if len(refl) == 0: + return node.feats['Reflex'] + return 'Yes' + + def get_expl_type(self,node, refl): + if node.feats['Voice'] == 'Mid': + return 'Pv' + if not refl: + return '' + if refl[0].deprel == 'expl': + return 'Pv' + return refl[0].deprel.split(':')[1].capitalize() + + def is_expl_pass(self,refl): + if len(refl) == 0: + return False + return refl[0].deprel == 'expl:pass' + + def get_voice(self,node,refl): + voice = node.feats['Voice'] + if self.is_expl_pass(refl): + return 'Pass' + return voice + + def get_analytic_bool(self,node): + auxes = [x for x in node.children if x.udeprel == 'aux'] + + if auxes: + return 'Yes' + else: + return 'No' + diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py new file mode 100644 index 00000000..e169a2de --- /dev/null +++ b/udapi/block/msf/removefunc.py @@ -0,0 +1,17 @@ +""" +Morphosyntactic features (UniDive): +Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes). +""" +from udapi.core.block import Block + +class RemoveFunc(Block): + + + def process_node(self, node): + """ + Removes MSF* features if MSFFunc=Yes. 
+ """ + if node.misc['MSFFunc'] == 'Yes': + msfeats = [x for x in node.misc if x.startswith('MSF')] + for msf in msfeats: + node.misc[msf] = '' diff --git a/udapi/block/msf/romance/preprocessor.py b/udapi/block/msf/romance/preprocessor.py new file mode 100644 index 00000000..ad7aec1e --- /dev/null +++ b/udapi/block/msf/romance/preprocessor.py @@ -0,0 +1,20 @@ +from udapi.core.block import Block + +class Preprocessor(Block): + + + def process_node(self, node): + + # In Porttinari treebank, the negative adverb não is not marked with feat Polarity=Neg + if node.lemma == 'não' and node.upos == 'ADV': + node.feats['Polarity'] = 'Neg' + + if node.upos == 'ADV' and node.feats['PronType'] == 'Neg': + node.feats['PronType'] = '' + node.feats['Polarity'] = 'Neg' + + # In Romanian RRT treebank, there is no annotation of the voice feature + # Automatically assign passive voice + pass_auxes = [x for x in node.children if x.deprel == 'aux:pass'] + if pass_auxes: + node.feats['Voice'] = 'Pass' \ No newline at end of file diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py new file mode 100644 index 00000000..ed05fa89 --- /dev/null +++ b/udapi/block/msf/romance/romance.py @@ -0,0 +1,965 @@ +import udapi.block.msf.phrase +from enum import Enum + +AUXES_HAVE = ['ter', 'haber', 'avere'] +AUXES_BE = ['estar', 'essere'] +MODALS = ['poder', 'deber', 'querer', 'saber', # Spanish + Portuguese + 'potere', 'dovere', 'volere', 'sapere'] # Italian + +class Aspect(str, Enum): + ANT = 'Ant' + IMP = 'Imp' + IMPPROG = 'ImpProg' + PERF = 'Perf' + PERFPROG = 'PerfProg' + PROG = 'Prog' + PQP = 'Pqp' + PQPPROG = 'PqpProg' + +class Tense(str, Enum): + FUT = 'Fut' + FUTFUT = 'FutFut' + PAST = 'Past' + PASTFUT = 'PastFut' + PASTPRES = 'PastPres' + PRES = 'Pres' + +class Romance(udapi.block.msf.phrase.Phrase): + + def __init__(self, neg=True, **kwargs): + """ + Parameters: + neg (bool) - If True, process negation and generate the PhrasePolarity=Neg attribute. 
+ feature_prefix (string) - The prefix of phrase features (e. g. 'CG', 'Phrase'), default is 'CG' + """ + super().__init__(**kwargs) + self.neg = neg + + def process_node(self, node): + + if node.misc[self.feature_prefix] != '': + return + + cop = [x for x in node.children if x.udeprel == 'cop'] + + # only expl or expl:pv, no expl:impers or expl:pass + refl = [x for x in node.children if (x.lemma == 'se' or x.lemma == 'soi') and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass'] + + if refl: + expl='Pv' + else: + expl=None + + if cop: + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modal verbs among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + adp = [x for x in node.children if x.upos == 'ADP'] + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(cop[0], auxes, expl, polarity, phrase_ords, node) + else: + # no auxiliaries, only cop + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_copulas(node, cop, expl, polarity, phrase_ords) + return + + if node.upos == 'VERB': #TODO maybe add "or node.feats['VerbForm'] == 'Part'"? 
+ + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modals among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # infinitive with a subject is a subjunctive + subj = [x for x in node.children if x.udeprel == 'subj'] + if node.feats['VerbForm'] == 'Inf' and subj: + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + mood='Sub', + form='Fin', + tense=Tense.FUT.value, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + analytic=self.get_analytic_bool(node), + ords=[node.ord] + ) + return + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if not auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node) + + + else: + # no passive auxiliaries + if not aux_pass: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(node, auxes, expl, polarity, phrase_ords, node) + + # head verb has only passive auxiliary and no more other auxiliaries + elif not auxes_without_pass: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + 
if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # TODO phrase-level features are currently determined based on the first passive auxiliary, but it can happen that there are more than one passive auxiliary + self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node) + + # head verb has passive auxiliary and also other auxiliaries + else: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, expl, polarity, phrase_ords, node) + + def find_auxes_and_neg(self, node): + """ + Find all auxiliaries and negative adverbials among node.children and classifies them. + + Parameters: + node (udapi.core.node.Node): head word, look for auxiliaries in its children + + Returns: + tuple: a classification of auxiliaries consisting of: + - auxiliaries directly modifying the node, + - negative adverbs modifying the node, + - modal verbs, + - auxiliaries modifying a modal verb, + - negative adverbs modifying a modal verb. 
+ """ + + node_auxes = [] + node_neg = [] + modals = [] + modal_auxes = [] + modal_neg = [] + + for child in node.children: + if child.udeprel == 'aux': + if child.lemma in MODALS: + modals.append(child) + modal_auxes = node_auxes # auxiliaries found so far are assumed to modify the modal verb (they come before it) + node_auxes = [] + + modal_neg = node_neg + node_neg = [] + + else: + node_auxes.append(child) + + elif child.upos == 'ADV' and child.feats['Polarity'] == 'Neg': + node_neg.append(child) + + return node_auxes, node_neg, modals, modal_auxes, modal_neg + + def process_modal_verbs(self, modals, modal_auxes, modal_neg): + """ + Annotates modal verb forms with the Phrase* attributes. + The modal verbs are kept as a single verb form, without including the infinitive of the content word. + + Parameters: + modals (list): all modal verbs among the children of the head content verb (currently assumes there is only one.) + modal_auxes (list): auxiliaries of the modal verb(s) + modal_neg (list): negative adverbs of the modal verb(s) + + """ + if not modal_auxes: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [n.ord for n in modal_neg] + phrase_ords.sort() + + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0]) + self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0]) + + else: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + [n.ord for n in modal_neg] + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0]) + + def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node): + aspect = '' + tense = node.feats['Tense'] + + # phrase already annotated + if 
head_node.misc[self.feature_prefix] != '': + return + + xcomps = [x for x in node.children if x.udeprel == 'xcomp'] + if node.lemma in ['ir', 'aller', 'estar', 'ter'] and node.upos == 'VERB' and xcomps: + node.misc['PeriAux'] = 'Yes' + + voice = node.feats['Voice'] + auxes = [x for x in xcomps[0].children if x.udeprel == 'aux'] + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # European Portuguese: estar + a + Inf + if node.lemma == 'estar': + + if node.feats['Tense'] == 'Pres': + tense=Tense.PRES.value + aspect =Aspect.PROG.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + elif node.lemma == 'ter' and len(xcomps) > 1: + tense=Tense.PAST.value + aspect=Aspect.PROG.value + xcomps[0].misc['PeriAux'] = 'Yes' + + elif node.feats['Tense'] == 'Pres': + tense=Tense.FUT.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + if auxes_without_pass: + if auxes[0].lemma == 'estar': + aspect += 'Prog' + if auxes[0].lemma == 'haber': + aspect += 'Perf' + + + + adp_a = [x for x in xcomps[-1].children if x.lemma == 'a' and x.udeprel == 'mark'] + cop = [x for x in xcomps[0].children if x.udeprel == 'cop'] + phrase_ords = [node.ord] + [x.ord for x in xcomps] + [x.ord for x in auxes] + [x.ord for x in cop] + if adp_a: + phrase_ords += [x.ord for x in adp_a] + + if aux_pass: + voice='Pass' + + phrase_ords.sort() + + self.write_node_info(xcomps[-1], + tense = tense, + number = node.feats['Number'], + person = node.feats['Person'], + aspect = aspect, + mood = 
node.feats['Mood'], + form = 'Fin', + voice=voice, + expl = expl, + polarity = polarity, + analytic='Yes', + ords=phrase_ords) + return + + def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node): + """ + Annotate simple verb forms or passive verb forms that contain only a passive auxiliary. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words of the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. + """ + + if node.misc['PeriAux'] != '': + return + + # Portuguese + # presente -> PhraseTense=Pres, PhraseAspect='' + # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro simple -> PhraseTense=Fut, PhraseAspect='' + + # Italian + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro semplice -> PhraseTense=Fut, PhraseAspect='' + + aspect = '' + tense = node.feats['Tense'] + form = node.feats['VerbForm'] + + if node.feats['Mood'] == 'Ind': + + # Portuguese + # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf + + # Spanish + # pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # pass remoto -> PhraseTense=Past, PhraseAspect=Perf + elif node.feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + + # Portuguese + # pretérito mais que perfeito 
simples -> PhraseTense=Past, PhraseAspect=Pqp + elif node.feats['Tense'] == 'Pqp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + else: + # viitorul popular/colocvial (intentional future) -> PhraseTense=Fut, PhraseAspect='' + o = [x for x in node.children if x.lemma == 'o' and x.upos == 'PART'] + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + + if o and sa: + tense = Tense.FUT.value + phrase_ords.append(o[0].ord) + phrase_ords.append(sa[0].ord) + + phrase_ords.sort() + + + + # Portuguese + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks? + + # Italian + # Congiuntivo presente -> PhraseTense=Pres, PhraseAspect='' + if node.feats['Mood'] == 'Sub': + + if node.feats['Tense'] == 'Past': + aspect=Aspect.IMP.value + + # Portuguese + # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + if node.feats['Mood'] == 'Cnd': + aspect='' + tense=Tense.PRES.value + + adp_en = [x for x in head_node.children if x.upos == 'ADP' and x.lemma == 'en' and x.udeprel == 'mark'] + if node.feats['VerbForm'] == 'Part' and adp_en: + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form = 'Ger' + + + self.write_node_info(head_node, + person=node.feats['Person'], + aspect=aspect, + 
number=node.feats['Number'], + mood=node.feats['Mood'], + form=form, + tense=tense, + gender=head_node.feats['Gender'], + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic=self.get_analytic_bool(head_node), + ords=phrase_ords + ) + + def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_ords, head_node): + """ + Annotate periphrastic verb forms with the Phrase* attributes. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + auxes (list[udapi.core.node.Node]): All auxiliaries except the passive auxiliaries. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. 
+ """ + + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + if len(auxes) == 1: + # Cnd + if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'): + + # Portuguese + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Portuguese + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # Antepospretérito -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale passato -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + polarity=polarity, + voice=head_node.feats['Voice'], + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + 
tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if node.feats['VerbForm'] == 'Ger': + + # Portuguese + Spanish + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Portuguese + Spanish + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # Portuguese + Spanish + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Auxiliary 'ter' / 'haber' / 'avere' / 'essere' followed by a participle + if node.feats['VerbForm'] == 'Part': + + # Portuguese + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + + # Spanish + # Futuro compuesto antefuturo -> PhraseTense=Fut, PhraseAspect=Perf + + # Italian + # Futuro anteriore -> 
PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + form='Fin' + mood=auxes[0].feats['Mood'] + + adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark'] + if auxes[0].feats['VerbForm'] == 'Part' and adp_en: + tense=Tense.PAST.value + aspect='' + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form='Ger' + + + # Romanian + # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].lemma == 'avea': + tense = Tense.PAST.value + aspect = Aspect.PERF.value + form = 'Fin' + + # Spanish + # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Pres': + + # Portuguese + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub': + tense = Tense.PASTPRES.value + + # subjonctive mood not annotated in Romanian data + if auxes[0].lemma == 'fi': + mood='Sub' + else: + tense=Tense.PAST.value + + # Portuguese + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + + # Spanish + # pretérito pluscuamperfecto -> PhraseTense=Past, PhraseAspect=Pqp + + # Italian + # Trapassato prossimo -> PhraseTense=Past, PhraseAspect=Pqp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # Spanish + # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant + + # Italian + # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant + + # French + # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant + elif auxes[0].feats['Tense'] 
== 'Past': + tense=Tense.PAST.value + aspect = Aspect.ANT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=mood, + aspect=aspect, + form=form, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # auxiliary 'ir' or 'vrea' followed by infinitive + if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf': + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].lemma == 'vrea': + tense = Tense.FUT.value + aspect = '' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect='' + if auxes[0].lemma == 'avea' and node.feats['VerbForm'] == 'Inf': + tense=Tense.PRES.value + aspect='' + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + 
polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect='' + # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect='' + if node.feats['VerbForm'] == 'Fin': + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + if sa: + phrase_ords.append(sa[0].ord) + phrase_ords.sort() + + tense=Tense.FUT.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=head_node.feats['Number'], + person=head_node.feats['Person'], + mood=head_node.feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + elif len(auxes) == 2: + # Romanian + # viitor anterior -> PhraseTense=Fut, PhraseAsoect=Perf + if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PERF.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ perfect -> PhraseTense=Past + if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + aspect='', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Portuguese + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] 
== 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar + if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter cond + estar pp + gerund -> PhraseTense=Past, 
PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + aspect=Aspect.PQPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, expl, polarity, phrase_ords): + """ + Annotate non-verbal predicates with copula using the Phrase* attributes. + + This method is specialized for non-periphrastic copulas. + If any auxiliaries are present, process_periphrastic_verb_forms() is called instead. + + Parameters + node (udapi.core.node.Node): The non-verbal predicate that should receive the Phrase* attributes, i.e., the head of the phrase. + cop (list[udapi.core.node.Node]): The copula nodes. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. 
+ """ + + # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb) + self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node) + + # adjust PhraseAspect based on the lemma of the copula + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + node.misc['PeriAspect'] = Aspect.PERF.value + elif cop[0].lemma == 'estar': + node.misc['PeriAspect'] = Aspect.IMP.value \ No newline at end of file diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..9d15418f --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,97 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' 
in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + person = '3' + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + for cop_verb in cop: + if cop_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=person, + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..32714630 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,94 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl), + analytic=self.get_analytic_bool(node) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass', + analytic=self.get_analytic_bool(node) + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = 
"""
Morphosyntactic features (UniDive, Lenka Krippnerová):
This block detects future tense forms in Slavic languages and saves their
features as Phrase* attributes in MISC of their head word.
"""

import udapi.block.msf.phrase

class Future(udapi.block.msf.phrase.Phrase):

    def process_node(self, node):
        """Detect a future-tense phrase headed by `node` and write Phrase* MISC attributes."""
        # Future tense in Serbian and Croatian: a present-tense form of the
        # auxiliary 'hteti'/'htjeti' combines with the content word.
        aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')]
        if node.upos != 'AUX' and aux:
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']
            aux_other = [x for x in node.children if x.udeprel == 'aux']  # adding aux for passive voice
            # Bug fix: match on udeprel (the original used deprel), so that
            # subtyped copula relations are caught, consistently with the
            # other blocks of this family.
            cop = [x for x in node.children if x.udeprel == 'cop']

            phrase_nodes = [node] + refl + aux_other + cop
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()


            if not cop:
                self.write_node_info(node,
                    tense='Fut',
                    person=aux[0].feats['Person'],
                    number=aux[0].feats['Number'],
                    mood='Ind',
                    voice=node.feats['Voice'],
                    aspect=node.feats['Aspect'],  # Serbian and Croatian do not annotate aspect
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
            else:
                # Copular phrase: adpositions of the predicate join the phrase.
                prep = [x for x in node.children if x.upos == 'ADP']
                phrase_nodes += prep
                phrase_ords += [x.ord for x in prep]
                phrase_ords.sort()

                self.write_node_info(node,
                    tense='Fut',
                    person=aux[0].feats['Person'],
                    number=aux[0].feats['Number'],
                    mood='Ind',
                    voice=node.feats['Voice'],
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )

            return

        # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense;
        # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense.
        aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще']

        if node.feats['Tense'] == 'Pres' and aux:
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl + aux
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense='Fut',
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood='Ind',
                voice=node.feats['Voice'],
                aspect=node.feats['Aspect'],
                form='Fin',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
            return

        # Future tense of perfective verbs.
        # Upper Sorbian forms the future tense this way, but Aspect is not
        # annotated in the data.  In some languages (e.g. Russian) these verbs
        # carry Tense=Fut, in others (e.g. Czech) Tense=Pres.
        if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense='Fut',
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood='Ind',
                voice=self.get_voice(node, refl),
                form='Fin',
                aspect='Perf',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
            return


        # Future tense of imperfective verbs and the passive voice.
        # In some languages the content verb is an infinitive, in others an
        # l-participle.  The node.upos == 'ADJ' case covers the passive: the
        # n-participle is tagged ADJ, but its auxiliary is aux, not cop.
        if node.upos == 'VERB' or node.upos == 'ADJ':

            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut']

            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + aux + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            if aux:
                auxVerb = aux[0]
                self.write_node_info(node,
                    tense='Fut',
                    person=auxVerb.feats['Person'],
                    number=auxVerb.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node, refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
                return

            # Simple (synthetic) future tense - e.g. in Serbian the future can
            # merge the content verb and the auxiliary into one word, and
            # Czech has forms like 'pojede', 'půjdeme'.
            if not aux and node.feats['Tense'] == 'Fut':

                self.write_node_info(node,
                    tense='Fut',
                    person=node.feats['Person'],
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node, refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
                return


        # Copular future: non-verbal predicate with a future-tense copula.
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut']
        if cop:
            copVerb = cop[0]
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Ind']
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + cop + aux + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=copVerb.feats['Aspect'],
                tense='Fut',
                person=copVerb.feats['Person'],
                number=copVerb.feats['Number'],
                mood='Ind',
                form='Fin',
                voice=self.get_voice(copVerb, refl),
                polarity=self.get_polarity(phrase_nodes),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
+""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in 
"""
Morphosyntactic features (UniDive, Lenka Krippnerová):
This block detects infinitive verb forms in Slavic languages and saves their
features as Phrase* attributes in MISC of their head word.
"""

import udapi.block.msf.phrase

class Infinitive(udapi.block.msf.phrase.Phrase):

    def process_node(self, node):
        """Detect infinitive (and Slovenian supine) phrases headed by `node`."""
        # Bare infinitive used as the predicate.
        if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB':
            aux = [x for x in node.children if x.udeprel == 'aux']
            # The list of auxiliaries must be empty - we do not want to mark
            # infinitives which are part of another periphrastic phrase (for
            # example the infinitive inside the Czech future tense).
            if not aux:
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                # Bug fix: the original had `phrase_nodes == neg`, a no-op
                # comparison, so negative particles were never added to the
                # phrase (wrong ords and polarity for negated infinitives).
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    aspect=node.feats['Aspect'],
                    voice=self.get_voice(node, refl),
                    form='Inf',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
                return

        # Passive infinitive: n-participle tagged ADJ plus an infinitive aux.
        if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass':
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf']
            # Any non-infinitive auxiliary means a larger periphrastic form.
            aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 'Inf']
            if aux and not aux_forb:
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + aux + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    aspect=node.feats['Aspect'],
                    voice='Pass',
                    form='Inf',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    number=node.feats['Number'],
                    analytic=self.get_analytic_bool(node)
                )
                return


        # Copular infinitive: non-verbal predicate with an infinitive copula.
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf']
        aux_forb = [x for x in node.children if x.udeprel == 'aux']
        if cop and not aux_forb:
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + cop + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=cop[0].feats['Aspect'],
                voice=self.get_voice(cop[0], refl),
                form='Inf',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )

        # Slovenian has a rare verb form called supine, used instead of the
        # infinitive as the argument of motion verbs.
        if node.feats['VerbForm'] == 'Sup':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=node.feats['Aspect'],
                voice='Act',
                form='Sup',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
"""
Morphosyntactic features (UniDive, Lenka Krippnerová):
This block detects past tense forms in Slavic languages and saves their
features as Phrase* attributes in MISC of their head word.
"""

import udapi.block.msf.phrase

class Past(udapi.block.msf.phrase.Phrase):

    def get_person_for_langs_with_simple_past(self, node, person):
        """
        Returns the person which is known from the subject; languages with the
        simple past tense (e.g. Russian) do not express person in these verb forms.
        If the person were not taken from the subject, the third person would be
        filled in automatically due to languages with a compound past but simple
        forms for the third person (e.g. Czech).
        """
        subj = [x for x in node.children if x.udeprel == 'nsubj']
        if subj:
            subj = subj[0]
            if subj.feats['Person'] != '':
                person = subj.feats['Person']
        return person

    def process_node(self, node):

        # 'Imp' (imperfect) and 'Pqp' (pluperfect) count as past tenses, too.
        past_tenses = ['Past', 'Imp', 'Pqp']
        cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)]

        # There is person 0 in Polish and Ukrainian which is for impersonal statements.
        # In Polish, verbs with Person=0 also have Tense=Past; in Ukrainian the
        # tense is not specified.
        # NOTE(review): this branch does not return, so a Person=0 VERB node may
        # also be matched by the compound-past branch below — confirm intended.
        if node.feats['Person'] == '0':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense=node.feats['Tense'],
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood=node.feats['Mood'],
                # In Polish, impersonal statements are annotated with Voice=Act.
                # In Ukrainian, the Voice feature is missing; therefore, we
                # decided to annotate these phrases with PhraseVoice=Act.
                voice='Act',
                aspect=node.feats['Aspect'],
                form=node.feats['VerbForm'],
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                ords=phrase_ords,
                gender=node.feats['Gender'],
                animacy=node.feats['Animacy'],
                analytic=self.get_analytic_bool(node)
                )

        # Compound past tense: l-participle plus (possibly zero) auxiliaries.
        if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass':
            # Present-tense (or tenseless) auxiliaries of the compound past.
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in ['Pres', '']]
            # Past-tense auxiliaries indicate a pluperfect.
            aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses]
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + aux + refl + aux_pqp
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            # We don't want to mark l-participles in the conditional as past tense.
            aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj']
            if not aux_cnd:
                if aux:
                    person = aux[0].feats['Person']

                elif not aux:
                    # No auxiliary: e.g. the Czech 3rd person has no auxiliary.
                    person = '3'

                if aux_pqp:
                    person = aux_pqp[0].feats['Person']

                # In Slovenian, the participles are not annotated as Tense=Past
                # (the Tense feature is missing), but in Bulgarian there are
                # cases where the participles are annotated as Tense=Imp.
                tense = 'Past'
                if node.feats['Tense'] == 'Imp':
                    tense = 'Imp'
                if node.feats['Tense'] == 'Pqp':
                    tense = 'Pqp'

                self.write_node_info(node,
                    tense=tense,
                    person=person,
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                    )


        # The past tense of some Slavic languages is formed only by a verb
        # without an auxiliary verb (e.g. Polish), or by the imperfect (a
        # special case of the past tense), e.g. in Bulgarian or Croatian.
        elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv':

            # The past tense is formed only by a content verb, not with an auxiliary.
            aux_forb = [x for x in node.children if x.udeprel == 'aux']

            if not aux_forb:

                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    tense=node.feats['Tense'],
                    person=node.feats['Person'],
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form=node.feats['VerbForm'],
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                    )



        # Passive past: n-participle tagged ADJ with a past-tense auxiliary.
        elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop:
            aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)]
            # We don't want to mark l-participles in the conditional as past tense.
            aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd']
            if not aux_cnd:
                if aux_past_tense:
                    # E.g. the auxiliary 'jsem' in the Czech phrase 'byl jsem přinucen'.
                    aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres']

                    phrase_nodes = [node] + aux_past_tense + aux_pres_tense
                    neg = self.get_negative_particles(phrase_nodes)
                    phrase_nodes += neg

                    phrase_ords = [x.ord for x in phrase_nodes]
                    phrase_ords.sort()

                    person = '3'
                    if aux_pres_tense:
                        person = aux_pres_tense[0].feats['Person']
                    person = self.get_person_for_langs_with_simple_past(node, person)

                    self.write_node_info(node,
                        tense=aux_past_tense[0].feats['Tense'],
                        person=person,
                        number=aux_past_tense[0].feats['Number'],
                        mood='Ind',
                        voice='Pass',
                        form='Fin',
                        aspect=node.feats['Aspect'],
                        polarity=self.get_polarity(phrase_nodes),
                        ords=phrase_ords,
                        gender=node.feats['Gender'],
                        animacy=node.feats['Animacy'],
                        analytic=self.get_analytic_bool(node)
                        )

        # Copular past: non-verbal predicate with a past-tense copula.
        else:
            # We don't want to mark l-participles in the conditional as past tense.
            aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd']
            if cop and not aux_cnd:
                # NOTE(review): despite the name, this list collects the
                # present-tense auxiliaries accompanying the past copula.
                aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres']
                prep = [x for x in node.children if x.upos == 'ADP']
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + aux_past_tense + cop + prep + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                person = '3'
                if aux_past_tense:
                    person = aux_past_tense[0].feats['Person']

                # In ru, be, uk, the person is not expressed in the past tense
                # and the VerbForm is Fin, not Part.
                if cop[0].feats['VerbForm'] == 'Fin':
                    person = ''

                self.write_node_info(node,
                    aspect=cop[0].feats['Aspect'],
                    tense=cop[0].feats['Tense'],
                    person=person,
                    number=cop[0].feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(cop[0], refl),
                    form='Fin',
                    expl=self.get_expl_type(node,refl),
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    gender=cop[0].feats['Gender'],
                    animacy=cop[0].feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                    )
"""
Morphosyntactic features (UniDive, Lenka Krippnerová):
This block serves as a preprocessor for Slavic languages before the other blocks
are applied to detect periphrastic verb forms. It improves harmonization of
annotations across the treebanks by addressing some known divergences.
"""

from udapi.core.block import Block

class Preprocessor(Block):

    def process_node(self,node):

        # in Ukrainian the active verb forms are not marked as Voice=Act
        if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '':
            node.feats['Voice'] = 'Act'

        # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ
        # we change the UPOS to ADJ when a participle expresses case
        #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '':
        #    node.upos = 'ADJ'

        # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not, as in the other Slavic languages, feats['Mood'] == 'Cnd'
        if node.deprel == 'aux:cnd':
            node.feats['Mood'] = 'Cnd'

        # unify polarities - some languages mark only Neg (Russian), some mark both Neg and Pos (Czech)
        if node.feats['Polarity'] == 'Pos':
            node.feats['Polarity'] = ''

        # In Ukrainian, there is no explicit annotation of reflexive verbs.
        # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly.
        # We add the feature Voice=Mid to reflexive verbs.
        if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')):
            node.feats['Voice'] = 'Mid'

        # Macedonian forms the future tense with the auxiliary word ќе, on which
        # nothing marks its participation in the future tense; Bulgarian does
        # the same with the auxiliary word ще.  Mark such auxiliaries as future.
        if node.feats['Tense'] == 'Pres':
            aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще']
            if len(aux) == 1:
                aux[0].feats['Tense'] = 'Fut'

        # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender
        if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc':
            subj = [x for x in node.children if x.udeprel == 'nsubj']

            # for relative pronouns, only one gender is indicated
            if len(subj) == 1:
                conj = [x for x in subj[0].children if x.deprel == 'conj']
                if len(conj) == 0:
                    node.feats['Gender'] = subj[0].feats['Gender']
                    node.feats['Number'] = subj[0].feats['Number']

        # participles in passive are sometimes annotated as VERB, sometimes as ADJ
        #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass':
        #    node.upos = 'ADJ'

        # there are cases where the node has deprel == 'expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. the Macedonian treebank)
        # we add the Reflex=Yes feature
        if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers':
            node.feats['Reflex'] = 'Yes'

        # fixing the mistake in the Macedonian treebank (mk_mtb-ud-test.conllu): in sent_id=other0010, the personal pronoun 'ми' is marked as expl:pv, it should be iobj
        if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs':
            node.deprel = ''
            node.udeprel = 'iobj'

        # in Old Church Slavonic, there is the feature Mood=Sub, but this is a notation for the conditional mood
        if node.feats['Mood'] == 'Sub':
            node.feats['Mood'] = 'Cnd'

        # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation
        if node.feats['VerbForm'] == 'Inf':
            node.feats['Tense'] = ''

        # in the Russian SynTagRus corpus, the negative particles have no Polarity=Neg feature
        if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod':
            node.feats['Polarity'] = 'Neg'

        # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech,
        # but there are many verbs with no Aspect value, so the problem is still there
+""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if 
x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py new file mode 100644 index 00000000..4f5fc877 --- /dev/null +++ b/udapi/block/read/addbratann.py @@ -0,0 +1,230 @@ +"""Add Brat coreference annotation from *.ann files. + +So far, tested on French LitBank data only. + +T12 HIST 362 366 qui +T13 HIST 349 362 une aventure +R1431 Coreference Arg1:T12 Arg2:T13 + +""" + +from udapi.core.block import Block +from udapi.core.files import Files +import logging +from bisect import bisect_left +import networkx as nx + +def _m(range_s, range_e, offset): + return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" + +class AddBratAnn(Block): + + def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True, + coref_attr="R", no_type_value='_Unsorted_', + **kwargs): + """Args: + files: file names with the coreference annotations (*.ann) + offset: what number to substract from the chatacter indices in the ann files + detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset + """ + super().__init__(**kwargs) + self.zone = zone + self.files = Files(filenames=files) + self.offset = offset + self.detect_bom = detect_bom + self.keep_mention_id = keep_mention_id + self.coref_attr = coref_attr + self.no_type_value = no_type_value + + def 
process_document(self, document): + + # Read all the important info from the *.ann file. + mentions, attrs, split_ante, clusters = {}, [], [], [] + ann_filehandle = self.files.next_filehandle() + offset = self.offset + if self.detect_bom: + txt_filename = self.files.filename.replace("ann", "txt") + with open(txt_filename, 'rb') as txt_fh: + raw_bytes = txt_fh.read(3) + if raw_bytes == b'\xef\xbb\xbf': + offset += 1 + + for line in ann_filehandle: + line = line.rstrip('\n') + if not "\t" in line: + logging.warning(f"Unexpected line without tabs: {line}") + elif line.startswith("T"): + # T13 HIST 349 362 une aventure + try: + mention_id, type_and_range, form = line.split("\t") + # Usually range are two numbers, but can be more, e.g. type_and_range="Abstract 605 653;654 703" + # Let's take the first and last number only.´ + parts = type_and_range.split() + ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1]) + + # If form ends with spaces, remove them and adjust range_e + stripped_form = form.rstrip(" ") + if form != stripped_form: + num_spaces = len(form) - len(stripped_form) + logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})") + form = stripped_form + range_e = range_e - num_spaces + + + mentions[mention_id] = [ne_type, range_s, range_e, form] + if self.keep_mention_id: + attrs.append(["mention_id", mention_id, mention_id]) + except Exception as e: + logging.warning(f"Unexpected mention line: {line}\n{e}") + elif line.startswith(self.coref_attr): + try: + cor_attr, mention_ids = line.rstrip().split("\t") + parts = mention_ids.split() + assert(parts[0] == "Coreference") + except Exception as e: + logging.warning(f"Unexpected coref line: '{line}'\n{e}") + clusters.append([p.split(":")[1] for p in parts[1:]]) + elif line.startswith("#"): + pass # Let's ignore annotators' comments + else: + logging.warning(f"Unexpected line in 
{self.files.filename}:\n{line}") + + # Some Brat ann files use link-based representation, e.g. + # R123 Coreference Arg1:T11 Arg2:T13 + # R124 Coreference Arg1:T12 Arg2:T14 + # R125 Coreference Arg1:T13 Arg2:T14 + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). + # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]] + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]] + # Note that if creating entities for link, in their original order, + # R123 and R125 would result in creating two entities and when hitting R125 + # we would need to merge them, i.e. delete one of them and move their mentions to the other. + # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components. + coref_graph = nx.Graph() + for mention_ids in clusters: + coref_graph.add_node(mention_ids[0]) + for mention_id in mention_ids[1:]: + coref_graph.add_node(mention_id) + coref_graph.add_edge(mention_id, mention_ids[0]) + clusters = [list(component) for component in nx.connected_components(coref_graph)] + + # Create entity objects for non-singletons. + entity_map = {} + for mention_ids in clusters: + etype, etype_index = None, 0 + for index, m_id in enumerate(mention_ids): + if mentions[m_id][0] == self.no_type_value: + pass + elif etype is None: + etype, etype_index = mentions[m_id][0], index + elif etype != mentions[m_id][0]: + logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.") + if etype is None: + etype = "other" + entity = document.create_coref_entity(etype=etype) + for m_id in mention_ids: + if m_id in entity_map: + logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}") + else: + entity_map[m_id] = entity + + # Collect TokenRange (as pre-filled by UDPipe) for each token. 
+ tokens, starts, ends = [], [], [] + for tree in document.trees: + for token in tree.token_descendants: + tokens.append(token) + range_s, range_e = token.misc["TokenRange"].split(":") + starts.append(int(range_s)) + ends.append(int(range_e)) + + # Create mention objects. + mention_map = {} + for mention_id, mention_values in mentions.items(): + + # Find Udapi tokens for each mention. + ne_type, range_s, range_e, form = mention_values + index_s = bisect_left(starts, range_s - offset) + if starts[index_s] != range_s - offset and index_s > 0: + index_s -= 1 + index_e = bisect_left(ends, range_e - offset) + mtokens = tokens[index_s : index_e+1] + token_s, token_e = tokens[index_s], tokens[index_e] + + # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries. + # If the start token is a multi-word token (MWT), + # we can still try to find the proper word within the MWT. + ok_s, ok_e = True, True + if starts[index_s] != range_s - offset: + ok_s = False + if token_s.is_mwt(): + mtokens.pop(0) + first_form = form.split()[0] + new_start = ends[index_s] + for w in reversed(token_s.words): + mtokens = [w] + mtokens + new_start -= len(w.form) + if w.form == first_form or new_start < range_s - offset: + ok_s = True + break + + # similarly for the end token + if ends[index_e] != range_e - offset: + ok_e = False + if token_e.is_mwt(): + mtokens.pop() + last_form = form.split()[-1] + new_end = starts[index_e] + for w in token_e.words: + mtokens.append(w) + new_end += len(w.form) + if w.form == last_form or new_end > range_e - offset: + ok_e = True + break + + if not ok_s or not ok_e: + logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})" + f" crosses token boundaries: {token_s.misc} ({token_s.form}) " + f".. {token_e.misc} ({token_e.form})") + + # Project tokens (including MWTs) to words and check forms match. 
+ words, udapi_form = [], "" + for token in mtokens: + words += token.words + udapi_form += token.form + if not token.no_space_after: + udapi_form += " " + udapi_form = udapi_form.rstrip() + if form != udapi_form: + logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'") + + # Make sure all words of the mention are in the same sentence. + root = words[0].root + mwords = [words[0]] + for word in words[1:]: + if word.root is root: + mwords.append(word) + else: + logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.") + + # Create entities for singletons + if mention_id not in entity_map: + entity_map[mention_id] = document.create_coref_entity(etype=ne_type) + + # Create the Udapi mention object + mention = entity_map[mention_id].create_mention(words=mwords) + mention_map[mention_id] = mention + + # Fill-in the additional mention attributes. + for attr_name, mention_id, attr_value in attrs: + if mention_id in mention_map: + mention_map[mention_id].other[attr_name] = attr_value + + # Fill-in split antecedents + for arg1, arg2 in split_ante: + if arg1 in entity_map and arg2 in entity_map: + if entity_map[arg1] in entity_map[arg2].split_ante: + logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})") + else: + entity_map[arg2].split_ante.append(entity_map[arg1]) + else: + logging.warning(f"{arg1} or {arg2} not indexed in entity_map") diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py index 040174be..4d0b7771 100644 --- a/udapi/block/read/addtext.py +++ b/udapi/block/read/addtext.py @@ -32,7 +32,7 @@ def process_document(self, document): self.finished = True return text = ''.join(self.filehandle.readlines()) - i, end, was_newpar = 0, len(text), True + i, end, was_newpar = 0, len(text)-1, True while i <= end and text[i].isspace(): i += 1 diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py index 
f64cd9ff..d0aef1ee 100644 --- a/udapi/block/read/conll.py +++ b/udapi/block/read/conll.py @@ -79,22 +79,24 @@ def parse_node_line(self, line, root, nodes, parents, mwts): # but it allows for arbitrary columns node = root.create_child() for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] if attribute_name == 'head': try: - parents.append(int(fields[n_attribute])) + parents.append(int(value)) except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': + if not self.strict and value == '_': if self.empty_parent == 'warn': logging.warning("Empty parent/head index in '%s'", line) parents.append(0) else: raise exception elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_' and fields[n_attribute] != '_': - setattr(node, attribute_name, fields[n_attribute]) + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, value) nodes.append(node) @@ -134,11 +136,10 @@ def read_tree_from_lines(self, lines): if node is parent: if self.fix_cycles: logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py index f4b73dc8..2adbd00f 100644 --- a/udapi/block/read/conll2012.py +++ b/udapi/block/read/conll2012.py @@ -18,7 +18,7 @@ class Conll2012(udapi.block.read.conllu.Conllu): """A reader of the Conll2012 files.""" - def __init__(self, 
attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs): + def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs): """Create the Conll2012 reader object. Args: @@ -29,10 +29,15 @@ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwar word-order number/index (usualy called ID). For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'. For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'. + For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'. + emptyval: a symbol that represents an empty value, especially in the coref column + (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2) + For FantasyCoref, use emptyval='-'. """ super().__init__(**kwargs) self.node_attributes = attributes.split(',') self._docname = 'd' + self.emptyval = emptyval def parse_comment_line(self, line, root): if line.startswith("#end document"): @@ -40,7 +45,7 @@ def parse_comment_line(self, line, root): match = RE_BEGIN.match(line) if match: docname = match.group(1) - # LitBank uses e.g. + # LitBank and FantasyCoref use e.g. # #begin document (1023_bleak_house_brat); part 0 if docname.startswith('(') and docname.endswith(');'): docname = docname[1:-2] @@ -51,6 +56,9 @@ def parse_comment_line(self, line, root): # Corref-PT-SemEval uses e.g. # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml docname = docname.replace('.txt', '').replace('.xml', '') + # FantasyCoref may use parentheses within the document ID e.g. 
+ # #begin document (051_Fundevogel_(Bird-foundling)); part 000 + docname = docname.replace('(', '').replace(')', '') root.newdoc = docname self._global_entity = 'eid-etype-head-other' @@ -72,6 +80,8 @@ def parse_node_line(self, line, root, nodes): for (n_attribute, attribute_name) in enumerate(self.node_attributes): value = fields[n_attribute] if attribute_name == 'docname': + # FantasyCoref may use parentheses within the document ID + value = value.replace('(', '').replace(')', '') if value != self._docname: logging.warning(f"Document name mismatch {value} != {self._docname}") @@ -83,7 +93,7 @@ def parse_node_line(self, line, root, nodes): logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") elif attribute_name == 'coref': - if value and value != '_': + if value and value != self.emptyval: # LitBank always separates chunks by a vertical bar, e.g. (13)|10) # Summ-it++v2 does not, e.g. (13)10) if '|' in value: diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bb76bfee..e19cd676 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -27,7 +27,7 @@ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs strict: raise an exception if errors found (default=False, i.e. a robust mode) empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. 
- fix_cycles: fix cycles by attaching a node in the cycle to the root + fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range """ super().__init__(**kwargs) self.strict = strict @@ -193,15 +193,18 @@ def read_tree_from_lines(self, lines): try: parent = nodes[parents[node_ord]] except IndexError: - raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if self.fix_cycles: + logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}") + parent = root + else: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) if node is parent: if self.fix_cycles: - logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - node._parent = root - root._children.append(node) + logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: @@ -223,6 +226,6 @@ def read_tree_from_lines(self, lines): logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}") raise words = nodes[int(range_start):int(range_end) + 1] - root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9]) return root diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. + +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). + +This code has been only tested on Hungarian KorKor files for CorefUD so far. +However, in the end, it is not used there (xtsv files are used instead conllup). 
+""" +import logging +import re + +import udapi.block.read.conll +from udapi.core.root import Root +from udapi.core.node import Node + +RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') +COLUMN_MAP = { + 'ID': 'ord', +} +NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + +class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overriden). + Default='autodetect' which means the column names will be loaded from the global.columns header. + For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). 
+ """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py index 0213bdcb..161b6b6e 100644 --- a/udapi/block/read/text.py +++ b/udapi/block/read/text.py @@ -16,9 +16,17 @@ class Text(BaseReader): so that `udpipe.Base` keeps these characters in `SpacesAfter`. As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. + empty_line: how empty lines are handled. Default 'new_sentence' preserves + the current behaviour (empty lines mark sentence boundaries). Use + 'keep' to read the entire file content into a single sentence (tree), including + empty lines. 
Use 'newpar' to behave like 'new_sentence' but also set + `root.newpar = True` on each sentence. """ - def __init__(self, rstrip='\r\n ', **kwargs): + def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs): + if empty_line not in {'new_sentence', 'keep', 'newpar'}: + raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'") self.rstrip = rstrip + self.empty_line = empty_line super().__init__(**kwargs) @staticmethod @@ -32,6 +40,13 @@ def is_multizone_reader(): def read_tree(self, document=None): if self.filehandle is None: return None + if self.empty_line == 'keep': + content = self.filehandle.read() + if content == '': + return None + root = Root() + root.text = content + return root lines = [] line = None while True: @@ -54,4 +69,6 @@ def read_tree(self, document=None): root = Root() root.text = " ".join(lines) + if self.empty_line == 'newpar': + root.newpar = True return root diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 996f4dc9..e7eb3989 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -86,7 +86,7 @@ def process_node(self, node): mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes nodes = mwt_words - mwt = node.root.create_multiword_token(nodes, mwt_form, mwt_misc) + mwt = node.root.create_multiword_token(words=nodes, form=mwt_form, misc=mwt_misc) self.postprocess_mwt(mwt) def multiword_analysis(self, node): diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py index ad093e1c..a4b359ff 100644 --- a/udapi/block/ud/ar/fixedeprels.py +++ b/udapi/block/ud/ar/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Arabic.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -21,10 +20,40 @@ class FixEdeprels(Block): 'مِثلَ': [], 'لِأَنَّ': [], 'كَمَا': [], - 'فِي_حِينَ': [], +# 'فِي_حِينَ': [], 'فَ': [] } + # Reduction and normalization of prepositions and conjunctions, 
including + # the derived and compound ones. The Latin transliterations are not really + # needed in the process. We include them here as documentation, but also + # to help the poor editor with rendering the lines. Ideally, each line + # should have left-to-right text at both the beginning and end. + substitution = [ + {'target': ('min:gen', 'مِن:gen'), + 'sources': + [('ibtida min', 'اِبتِدَاء_مِن')] + }, + {'target': ('ʾiṯra:gen', 'إِثرَ:gen'), # ʾiṯra = right after + 'sources': + [('ʾiṯra', 'إِثرَ')] + }, + {'target': ('ʾaṯnāʾa:gen', 'أَثنَاءَ:gen'), # ʾaṯnāʾa = during + 'sources': + [('ʾaṯnāʾa', 'أَثنَاءَ')] + }, + {'target': ('ʾiḏ', 'إِذ'), # ʾiḏ = because + 'sources': + [('ʾiḏ', 'إِذ'), + ('ʾiḏ ʾanna', 'إِذ_أَنَّ')] + }, + {'target': ('ʾiḏā', 'إِذَا'), # ʾiḏā = if + 'sources': + [('ʾiḏā', 'إِذَا'), + ('ʾiḏā', 'إِذًا')] + }, + ] + # Secondary prepositions sometimes have the lemma of the original part of # speech. We want the grammaticalized form instead. List even those that # will have the same lexical form, as we also want to check the morphological @@ -137,6 +166,7 @@ class FixEdeprels(Block): 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as + 'بِ_صُورَة': 'بِ:gen', 'بِ_عَكس': 'بِ:gen', 'بِ_عَلَى': 'بِ:gen', 'بِ_عَن': 'بِ:gen', @@ -197,6 +227,7 @@ class FixEdeprels(Block): 'بَينَ': 'بَينَ:gen', # bayna = between 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala 'بينا': 'بَينَ:gen', # bayna = between + 'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between 'بَينَمَا': 'بَينَ:gen', 'بَينَمَا_لَم': 'بَينَ:gen', 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing @@ -217,14 +248,17 @@ class FixEdeprels(Block): 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X 'حَولَ': 'حَولَ:gen', # ḥawla = about 'حولما_إِذَا': 'إِذَا', + 'حَولَ_مَا_إِذَا': 'إِذَا', 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning 'حَيثُ': 'حَيثُ', # remove 
morphological case; ḥayṯu = where (SCONJ, not ADV) 'حِينَمَا': 'فِي_حِينِ', # during 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind - 'دَاخِل': 'دَاخِلَ:gen', # dāḫila = inside of - 'دَاخِلَ': 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِل': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِلَ': + 'دَاخِلَ:gen', # dāḫila = inside of 'دُونَ': 'دُونَ:gen', # dūna = without 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without @@ -246,9 +280,12 @@ class FixEdeprels(Block): 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among - 'طَالَمَا': 'طَالَمَا', # ṭālamā = as long as - 'طالَما': 'طَالَمَا:gen', - 'طَالَمَا_أَنَّ': 'طَالَمَا', # ṭālamā = as long as + 'طَالَمَا': + 'طَالَمَا', # ṭālamā = as long as + 'طالَما': + 'طَالَمَا', # ṭālamā = as long as + 'طَالَمَا_أَنَّ': + 'طَالَمَا', # ṭālamā = as long as 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during 'عبر': 'عَبرَ:gen', @@ -266,18 +303,32 @@ class FixEdeprels(Block): 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless - 'عَلَى_الفور': 'عَلَى:gen', - 'عَلَى_إِلَى': 'عَلَى:gen', - 'عَلَى_أَن': 'عَلَى:gen', # ʿalā = on - 'عَلَى_أَنَّ': 'عَلَى:gen', # ʿalā = on - 'عَلَى_أَن_بِ': 'عَلَى:gen', # ʿalā = on - 'عَلَى_أَنَّ_مِن_شَأن': 'عَلَى:gen', # ʿalā = on - 'عَلَى_أَنَّ_هُوَ': 'عَلَى:gen', # ʿalā = on - 'عَلَى_أَنَّ_هُوَ_لَدَى': 'عَلَى:gen', # ʿalā = on - 'عَلَى_بِ': 'عَلَى:gen', - 'عَلَى_بِ_فِي': 'عَلَى:gen', - 'عَلَى_بَينَ': 'عَلَى:gen', - 'عَلَى_حَدّ': 'عَلَى:gen', + 'عَلَى_الفور': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_إِلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ': + 
'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_عَلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_مِن_شَأن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ_لَدَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ_فِي': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بَينَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حَدّ': + 'عَلَى:gen', # ʿalā = on 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on 'عَلَى_حَولَ': 'عَلَى:gen', @@ -345,14 +396,20 @@ class FixEdeprels(Block): 'فِي_حَقّ': 'فِي:gen', 'فِي_حُكم': 'فِي:gen', 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X - 'فِي_حِين': 'فِي_حِينِ', # fī ḥīni = while - 'فِي_حِينَ': 'فِي_حِينِ', # fī ḥīni = while - 'فِي_حِين_أَنَّ': 'فِي_حِينِ', + 'فِي_حِين': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِين_أَنَّ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ_أَنَّ_هُوَ': + 'فِي_حِينِ', # fī ḥīni = while 'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion 'فِي_خِلَالَ': 'فِي:gen', - 'فِي_دَاخِل': 'دَاخِل:gen', + 'فِي_دَاخِل': + 'دَاخِلَ:gen', 'فِي_دَاخِلَ': 'فِي:gen', 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to 'فِي_سِيَاق': 'فِي:gen', @@ -398,6 +455,7 @@ class FixEdeprels(Block): 'كَمَا': 'كَمَا', # remove morphological case; kamā = as 'كَي': 'لِكَي', # kay = in order to 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 'لِ:gen', 'لِ': 'لِ:gen', # li = to 'لِ_أَجَلّ': 'لِ:gen', 'لِ_إِلَى': 'لِ:gen', @@ -453,6 +511,7 @@ class FixEdeprels(Block): 'ما_دَام': 'مِمَّا', 'مادامت': 'مِمَّا', 'مَالَم': 'مَالَم', # mālam = unless + 'مَا_إِذَا': 'إِذَا', 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like 'مِثلَمَا': 'مِثلَ', # miṯla = like 'مَعَ': 'مَعَ:gen', # maʿa = with @@ 
-526,6 +585,7 @@ class FixEdeprels(Block): 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle 'وِفقَ': 'وِفقَ:gen', # wifqa = according to + 'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to 'ولو': 'إِذَا', # walaw = even if 'ولو_أَنَّ': 'إِذَا' # walaw = even if } @@ -543,46 +603,82 @@ def copy_case_from_adposition(self, node, adposition): else: return None - def process_node(self, node): + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). + cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + assert(bdeprel[-1] != ':') + edeprel = bdeprel + if cdeprel: + assert(cdeprel[0] != ':') + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): """ Occasionally the edeprels automatically derived from the Czech basic trees do not match the whitelist. For example, the noun is an abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. """ - for edep in node.deps: - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) - if m: - bdeprel = m.group(1) - solved = False - # Arabic clauses often start with وَ wa "and", which does not add - # much to the meaning but sometimes gets included in the enhanced - # case label. Remove it if there are more informative subsequent - # morphs. 
- edep['deprel'] = re.sub(r':وَ_', r':', edep['deprel']) - edep['deprel'] = re.sub(r':وَ:', r':', edep['deprel']) - edep['deprel'] = re.sub(r':وَ$', r'', edep['deprel']) - # If one of the following expressions occurs followed by another preposition - # or by morphological case, remove the additional case marking. For example, - # 'jako_v' becomes just 'jako'. - for x in self.outermost: - exceptions = self.outermost[x] - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and m.group(2) and not x+m.group(2) in exceptions: - edep['deprel'] = m.group(1)+':'+x - solved = True - break - if solved: + for node in tree.descendants_and_empty: + for edep in node.deps: + if edep['deprel'] == 'advcl:pred:إِذَن' or edep['deprel'] == 'advcl:pred:كدا' or edep['deprel'] == 'advcl:pred:لكن': + edep['deprel'] = 'advcl:pred' continue - for x in self.unambiguous: - # All secondary prepositions have only one fixed morphological case - # they appear with, so we can replace whatever case we encounter with the correct one. - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) - if m: - edep['deprel'] = m.group(1)+':'+self.unambiguous[x] - solved = True - break - if solved: + if edep['deprel'] == 'nmod:بِأَسْرِ:gen': + edep['deprel'] = 'nmod' continue + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) + if m: + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Arabic clauses often start with وَ wa "and", which does not add + # much to the meaning but sometimes gets included in the enhanced + # case label. Remove it if there are more informative subsequent + # morphs. 
+ cdeprel = re.sub(r'^وَ_', r'', cdeprel) + cdeprel = re.sub(r'^وَ:', r'', cdeprel) + cdeprel = re.sub(r'^وَ$', r'', cdeprel) + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # Split preposition from morphological case (if any), normalize + # the preposition and add the fixed morphological case where + # applicable. + m = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel) + adposition = m.group(1) if m else cdeprel + if adposition in self.unambiguous: + cdeprel = self.unambiguous[adposition] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 4c203ddc..a690c95b 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -1,17 +1,30 @@ """Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" import udapi.block.ud.addmwt +import re +import logging +# Define static rules for 'aby', 'kdyby' and similar forms. 
MWTS = { - 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, - 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ 
Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. + 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, } for v in MWTS.values(): v['upos'] = 'SCONJ AUX' @@ -25,23 +38,52 @@ person = '1' elif 'Person=2' in v['feats']: person = '2' - v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) v['deprel'] = '* aux' v['lemma'] = v['form'].split()[0] + ' být' v['main'] = 0 v['shape'] = 'siblings' +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. # nač -> na + co -for prep in 'na za o'.split(): +for prep in 'na o za'.split(): MWTS[prep + 'č'] = { 'form': prep + ' co', 'lemma': prep + ' co', 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } +# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!) +# A few additional instances were found in older texts, too (e.g. 16th century). +# We must do it separately, as the preposition is vocalized. 
+MWTS['seč'] = { + 'form': 'se' + ' co', + 'lemma': 's' + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RV--4---------- PQ--4----------', + 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', +} + +# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below). +MWTS['toliť'] = { + 'form': 'to li ť', + 'lemma': 'ten li ť', + 'upos': 'DET SCONJ PART', + 'xpos': '* J,------------- TT-------------', + 'feats': '* _ _', + 'deprel': '* mark discourse', + 'main': 0, + 'shape': 'siblings' +} + class AddMwt(udapi.block.ud.addmwt.AddMwt): @@ -49,25 +91,153 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis - - # There is no VerbType=verbconj in the UD_Czech data. - # The purpose of this rule is rather to show that - # it is possible to write such "dynamic" rules - # (which cannot be included in static MWTS). - if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': - return { - 'form': node.form.lower()[:-1] + ' neboť', - 'lemma': '* neboť', - 'upos': '* CCONJ', - 'xpos': 'Vt-S---3P-NA--2 J^-------------', - 'feats': '* _', - 'deprel': '* cc', - 'main': 0, - 'shape': 'subtree', - } + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) 
+ if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'jest': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jest', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---3P-AAI-2', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # dajžto = dajž + to + if subtokens[1] == 'to': + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ten', + 'upos': '* DET', + 'xpos': '* PDNS4----------', + 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason si that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. 
+ if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } return None def postprocess_mwt(self, mwt): diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index f2f76b4b..4e2be633 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Czech.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -12,18 +11,25 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { + 'aby': [], 'ač': [], 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... 
ale možná ne když je doprovázeno předložkou + 'ať': [], 'byť': [], 'i_když': [], 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], 'protože': [], 'takže': [], - 'třebaže': [] + 'třebaže': [], + 'že': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -32,35 +38,54 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") 'abi': 'aby', 'aby_na': 'na:loc', 'ačkoliv': 'ačkoli', 'ať': 'ať', # remove morphological case 'ať_forma': 'formou:gen', + 'ať_jako': 'jako', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', 'ať_z_strana': 'ze_strany:gen', 'až_do': 'do:gen', 'až_o': 'o:acc', 'během': 'během:gen', 'bez': 'bez:gen', 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! 
a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) 'bez_zřetel_k': 'bez_zřetele_k:dat', 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', 'cesta': 'cestou:gen', + 'co_jako': 'jako', + 'coby': 'coby', # remove morphological case 'daleko': 'nedaleko:gen', 'daleko_od': 'od:gen', 'dík': 'díky:dat', 'díky': 'díky:dat', 'dle': 'dle:gen', 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', 'do_k': 'k:dat', 'do_oblast': 'do_oblasti:gen', 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', 'jak_aby': 'jak', 'jak_ad': 'jak', 'jakkoliv': 'jakkoli', @@ -68,33 +93,52 @@ class FixEdeprels(Block): 'jako_kupříkladu': 'jako', 'jakoby': 'jako', 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', 'k': 'k:dat', 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', 'kdykoliv': 'kdykoli', 'kol': 'kolem:gen', 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', 'konec': 'koncem:gen', 'krom': 'kromě:gen', 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', 'liž': 'li', 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', 'na_báze': 'na_bázi:gen', 'na_čelo': 'na_čele:gen', 'na_mimo': 'na:loc', # na kurtě i mimo něj 'na_než': 'na:acc', # na víc než čtyři a půl kilometru 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is same 'na_podklad': 'na_podkladě:gen', 'na_rozdíl_od': 'na_rozdíl_od:gen', + 
'na_strana': 'na_straně:gen', + 'na_účet': 'na_účet:gen', 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', 'na_základ': 'na_základě:gen', 'na_základna': 'na_základně:gen', 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... 'namísto': 'namísto:gen', 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', 'narozdíl_od': 'na_rozdíl_od:gen', 'následek': 'následkem:gen', 'navzdory': 'navzdory:dat', @@ -104,39 +148,58 @@ class FixEdeprels(Block): 'o_jako': 'jako', 'o_o': 'o:acc', 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče 'ohledně': 'ohledně:gen', 'okolo': 'okolo:gen', 'oproti': 'oproti:dat', 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', 'po_vzor': 'po_vzoru:gen', 'poblíž': 'poblíž:gen', 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', 'podle': 'podle:gen', 'pomoc': 'pomocí:gen', 'pomocí': 'pomocí:gen', 'postup': 'postupem:gen', 'pouze_v': 'v:loc', 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', 'prostřednictví': 'prostřednictvím:gen', 'prostřednictvím': 'prostřednictvím:gen', 'proti': 'proti:dat', + 'proto_aby': 'aby', 'protože': 'protože', # remove morphological case 'před_během': 'během:gen', # před a během utkání 'před_po': 'po:loc', # před a po vyloučení Schindlera 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error 'přestože': 'přestože', # remove morphological case 'při': 'při:loc', + 'při_pro': 'při:loc', 
'při_příležitost': 'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X 's_ohled_k': 's_ohledem_k:dat', 's_ohled_na': 's_ohledem_na:acc', 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', 's_přihlédnutí_k': 's_přihlédnutím_k:dat', 's_přihlédnutí_na': 's_přihlédnutím_na:acc', 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', 's_vyloučení': 's_vyloučením:gen', 's_zřetel_k': 'se_zřetelem_k:dat', 's_zřetel_na': 'se_zřetelem_na:acc', @@ -146,20 +209,29 @@ class FixEdeprels(Block): 'směr_k': 'směrem_k:dat', 'směr_na': 'směrem_na:acc', 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', 'společně_s': 'společně_s:ins', 'spolu': 'spolu_s:ins', 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', 'takže': 'takže', # remove morphological case 'takže_a': 'takže', 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', 'u': 'u:gen', 'u_příležitost': 'u_příležitosti:gen', 'uprostřed': 'uprostřed:gen', 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', 'v_čelo': 'v_čele:gen', 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', 'v_dohoda_s': 'v_dohodě_s:ins', 'v_duch': 'v_duchu:gen', 'v_důsledek': 'v_důsledku:gen', @@ -170,12 +242,14 @@ class FixEdeprels(Block): 'v_konfrontace_s': 'v_konfrontaci_s:ins', 'v_kontext_s': 'v_kontextu_s:ins', 'v_na': 'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', 'v_oblast': 'v_oblasti:gen', 'v_oblast_s': 's:ins', 'v_obor': 'v_oboru:gen', 'v_otázka': 'v_otázce:gen', 'v_podoba': 'v_podobě:gen', 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', 'v_proces': 'v_procesu:gen', 'v_prospěch': 've_prospěch:gen', 'v_protiklad_k': 'v_protikladu_k:dat', @@ -183,27 +257,34 @@ class 
FixEdeprels(Block): 'v_případ': 'v_případě:gen', 'v_případ_že': 'v_případě_že', 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', 'v_rozpor_s': 'v_rozporu_s:ins', 'v_řada': 'v_řadě:gen', 'v_shoda_s': 've_shodě_s:ins', 'v_služba': 've_službách:gen', 'v_směr': 've_směru:gen', 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc 'v_smysl': 've_smyslu:gen', 'v_součinnost_s': 'v_součinnosti_s:ins', 'v_souhlas_s': 'v_souhlasu_s:ins', 'v_soulad_s': 'v_souladu_s:ins', 'v_souvislost_s': 'v_souvislosti_s:ins', 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', 'v_spojený_s': 've_spojení_s:ins', 'v_spojitost_s': 've_spojitosti_s:ins', 'v_spolupráce_s': 've_spolupráci_s:ins', 'v_s_spolupráce': 've_spolupráci_s:ins', 'v_srovnání_s': 've_srovnání_s:ins', 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', 'v_věc': 've_věci:gen', 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', 'v_zájem': 'v_zájmu:gen', 'v_záležitost': 'v_záležitosti:gen', 'v_závěr': 'v_závěru:gen', @@ -212,9 +293,12 @@ class FixEdeprels(Block): 'v_znamení': 've_znamení:gen', 'včetně': 'včetně:gen', 'vedle': 'vedle:gen', + 'versus': 'versus:nom', 'vina': 'vinou:gen', 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', 'vzhledem': 'vzhledem_k:dat', 'vzhledem_k': 'vzhledem_k:dat', 'z': 'z:gen', @@ -225,6 +309,7 @@ class FixEdeprels(Block): 'z_strana': 'ze_strany:gen', 'z_nedostatek': 'z_nedostatku:gen', 'z_titul': 'z_titulu:gen', + 'z_začátek': 'ze_začátku:gen', 'za_pomoc': 'za_pomoci:gen', 'za_účast': 'za_účasti:gen', 'za_účel': 'za_účelem:gen', @@ -253,261 +338,333 @@ def copy_case_from_adposition(self, node, adposition): else: return None - def process_node(self, node): + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the 
basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). + cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + edeprel = bdeprel + if cdeprel: + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): """ Occasionally the edeprels automatically derived from the Czech basic trees do not match the whitelist. For example, the noun is an abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. """ - for edep in node.deps: - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) - if m: - bdeprel = m.group(1) - solved = False - # Issues caused by errors in the original annotation must be fixed early. - # Especially if acl|advcl occurs with a preposition that unambiguously - # receives a morphological case in the subsequent steps, and then gets - # flagged as solved. - edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! 
- edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' - edep['deprel'] = re.sub(r'^acl:na_způsob(?::gen)?$', r'nmod:na_způsob:gen', edep['deprel']) # 'střídmost na způsob Masarykova "jez dopolosyta"' - edep['deprel'] = re.sub(r'^acl:od(?::gen)?$', r'nmod:od:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:od(?::gen)?$', r'obl:od:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! - edep['deprel'] = re.sub(r'^advcl:podle(?::gen)?$', r'obl:podle:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'obl:pro:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) - # Removing 'až' must be done early. The remainder may be 'počátek' - # and we will want to convert it to 'počátkem:gen'. - edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) - # If one of the following expressions occurs followed by another preposition - # or by morphological case, remove the additional case marking. For example, - # 'jako_v' becomes just 'jako'. - for x in self.outermost: - exceptions = self.outermost[x] - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and m.group(2) and not x+m.group(2) in exceptions: - edep['deprel'] = m.group(1)+':'+x - solved = True - break - if solved: - continue - for x in self.unambiguous: - # All secondary prepositions have only one fixed morphological case - # they appear with, so we can replace whatever case we encounter with the correct one. 
- m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) - if m: - edep['deprel'] = m.group(1)+':'+self.unambiguous[x] - solved = True - break - if solved: - continue - # The following prepositions have more than one morphological case - # available. Thanks to the Case feature on prepositions, we can - # identify the correct one. - m = re.match(r'^(obl(?::arg)?|nmod):(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + for node in tree.descendants_and_empty: + for edep in node.deps: + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) if m: - adpcase = self.copy_case_from_adposition(node, m.group(2)) - if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): - edep['deprel'] = m.group(1)+':'+adpcase + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + if re.match(r'advcl', bdeprel): + # The following advcl should in fact be obl. + if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'do:gen' + elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'k:dat' + elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych' + bdeprel = 'obl' + cdeprel = 'místo:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! 
+ bdeprel = 'obl' + cdeprel = 'od:gen' + elif re.fullmatch(r'podle(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'podle:gen' + elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu. + bdeprel = 'obl' + cdeprel = 's:ins' + elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'v_duchu:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'obl' + cdeprel = 'v:loc' + # byl by pro, abychom... ###!!! Opravit i konverzi stromu. + elif re.fullmatch(r'pro(?::acc)?', cdeprel): + cdeprel = 'aby' + elif re.match(r'acl', bdeprel): + # The following acl should in fact be nmod. + if re.fullmatch(r'k(?::dat)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'k:dat' + elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"' + bdeprel = 'nmod' + cdeprel = 'na_způsob:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'od:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'nmod' + cdeprel = 'v:loc' + else: # bdeprel is 'obl' or 'nmod' + # The following subordinators should be removed if they occur with nominals. + if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + cdeprel = '' + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. + elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel): + cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel) + elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září + cdeprel = 'gen' + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. 
+ for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: continue - if re.match(r'^(acl|advcl):', edep['deprel']): - # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). - edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) - edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating - edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) - if edep['deprel'] == 'acl:v' and node.form == 'patře': - edep['deprel'] = 'nmod:v:loc' - node.deprel = 'nmod' - node.lemma = 'patro' - node.upos = 'NOUN' - node.xpos = 'NNNS6-----A----' - node.feats['Aspect'] = '' - node.feats['Gender'] = 'Neut' - node.feats['Tense'] = '' - node.feats['VerbForm'] = '' - node.feats['Voice'] = '' - elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): - if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': - # This is a same-case noun-noun modifier, which just happens to be in the locative. 
- # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has - # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. - edep['deprel'] = 'nmod' - elif edep['deprel'] == 'obl:loc': - # Annotation error. The first occurrence in PDT dev: - # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' - # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. - # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. - edep['deprel'] = 'obl:v:loc' - elif edep['deprel'] == 'obl:arg:loc': - # Annotation error. The first occurrence in PDT dev: - edep['deprel'] = 'obl:arg:na:loc' - elif edep['deprel'] == 'nmod:loc': - # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': - # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? - # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. - edep['deprel'] = 'obl' - elif edep['deprel'] == 'nmod:voc': - # 'v 8. čísle tiskoviny Ty rudá krávo' - edep['deprel'] = 'nmod:nom' - elif edep['deprel'] == 'nmod:co:nom': - # Annotation error: 'kompatibilní znamená tolik co slučitelný' - # 'co' should be relative pronoun rather than subordinating conjunction. 
- edep['deprel'] = 'acl:relcl' - node.deprel = 'acl:relcl' - elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): - edep['deprel'] = 'advcl:li' - elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): - edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): - edep['deprel'] += ':acc' - elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): - edep['deprel'] += ':gen' - elif re.match(r'^obl:místo_za:acc$', edep['deprel']): - # 'chytají krávu místo za rohy spíše za ocas' - # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. - for c in node.children: - if c.form == 'místo': - c.upos = 'ADV' - c.deprel = 'cc' - edep['deprel'] = 'obl:za:acc' - elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) - elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): - edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) - # The case is unknown. We need 'acc' or 'loc'. - # The locative is probably more frequent but it is not so likely with every noun. - # If there is an nummod:gov child, it must be accusative and not locative. - # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) 
- if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:arg:na_konec$', edep['deprel']): - # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' - edep['deprel'] = 'obl:arg:na:acc' - elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel) + if m: + cdeprel = self.unambiguous[x] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if re.match(r'(obl|nmod)', bdeprel): + m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(1)) + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): + cdeprel = adpcase + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + ###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well. + if re.match(r'^(acl|advcl):', edep['deprel']): + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. 
+ edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): edep['deprel'] += ':acc' - else: + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. 
+ for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. 
+ if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): - # Annotation error. 
- if node.form == 's': - ohled = node.next_node - na = ohled.next_node - noun = na.next_node - self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') - self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') - self.set_basic_and_enhanced(node, noun, 'case', 'case') - elif re.match(r'^nmod:pára:nom$', edep['deprel']): - # Annotation error: 'par excellence'. - edep['deprel'] = 'nmod' - for c in node.children: - if c.udeprel == 'case' and c.form.lower() == 'par': - c.lemma = 'par' - c.upos = 'ADP' - c.xpos = 'RR--X----------' - c.feats['Case'] = '' - c.feats['Gender'] = '' - c.feats['Number'] = '' - c.feats['Polarity'] = '' - c.feats['AdpType'] = 'Prep' - elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: - edep['deprel'] += ':loc' - elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): - if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: - edep['deprel'] += ':acc' - else: + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): - # Accusative would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): - # Genitive would be possible but unlikely. - edep['deprel'] += ':ins' - elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': - # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. - # Find the content nominal. 
- cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] - vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] - if len(cnouns) > 0 and len(vs) > 0: - cnoun = cnouns[0] - v = vs[0] - self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') - self.set_basic_and_enhanced(v, cnoun, 'case', 'case') - self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') - elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): - # ':nom' occurs in 'karneval v Rio de Janeiro' - edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) - if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. + cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. 
+ edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. edep['deprel'] += ':acc' else: - edep['deprel'] += ':loc' - elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): - # There is just one occurrence and it is an error: - # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' - # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. - edep['deprel'] = 'obl:s:ins' - elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): - # Instrumental would be possible but unlikely. - edep['deprel'] += ':acc' - else: - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', 
edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) - edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) - edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + 
edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! 
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) def set_basic_and_enhanced(self, node, parent, deprel, edeprel): ''' diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py new file mode 100644 index 00000000..7fcb0e12 --- /dev/null +++ b/udapi/block/ud/cs/fixmorpho.py @@ -0,0 +1,471 @@ +""" +A Czech-specific block to fix lemmas, UPOS and morphological features in UD. +It should increase consistency across the Czech treebanks. It focuses on +individual closed-class verbs (such as the auxiliary "být") or on entire classes +of words (e.g. whether or not nouns should have the Polarity feature). It was +created as part of the Hičkok project (while importing nineteenth-century Czech +data) but it should be applicable on any other Czech treebank. +""" +from udapi.core.block import Block +import logging +import re + +class FixMorpho(Block): + + def process_node(self, node): + # Do not touch words marked as Foreign or Typo. They may not behave the + # way we expect in Czech data. 
+ if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes': + return + #---------------------------------------------------------------------- + # NOUNS, PROPER NOUNS, AND ADJECTIVES + #---------------------------------------------------------------------- + # Nouns do not have polarity but the Prague-style tagsets may mark it. + if node.upos in ['NOUN', 'PROPN']: + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + elif node.feats['Polarity'] == 'Neg': + logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).') + # For some nouns, there is disagreement in whether to tag and lemmatize + # them as proper nouns. We must be careful and not add too many to this + # rule, as many of them could be used as surnames and then they should + # be PROPN. + if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()): + node.lemma = node.lemma.lower() + node.upos = 'NOUN' + # Lemmatization. + if node.upos == 'NOUN' and node.lemma == 'zem': + node.lemma = 'země' + if node.upos == 'ADJ': + # Adjectives should be lemmatized to lowercase even if they are part of + # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized + # to "malý". Exception: Possessive adjectives derived from personal + # names, e.g., "Karlův". + if node.feats['Poss'] != 'Yes': + node.lemma = node.lemma.lower() + # Short forms of adjectives are rare in Modern Czech and uninflected + # (they are used as predicates), so they lack the Case feature. But + # they were inflected for Case in the past, so it is better to add + # Case=Nom for consistency. 
+ if node.feats['Variant'] == 'Short' and node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + #---------------------------------------------------------------------- + # PRONOUNS AND DETERMINERS + #---------------------------------------------------------------------- + # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form. + if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()): + node.feats['Variant'] = 'Short' + # Forms of "my" should be lemmatized as "já". + if node.upos == 'PRON' and node.lemma == 'my': + node.lemma = 'já' + # Forms of "vy" should be lemmatized as "ty". + if node.upos == 'PRON' and node.lemma == 'vy': + node.lemma = 'ty' + # Forms of "oni" should be lemmatized as "on" and cases that allow + # a preposition should have PrepCase. + if node.upos == 'PRON' and node.lemma in ['on', 'oni']: + node.lemma = 'on' + if node.feats['Case'] not in ['Nom', 'Voc']: + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + elif re.match(r'[nň]', node.form.lower()): + node.feats['PrepCase'] = 'Pre' + # In 19th century data, the grammaticalized usages of "se", "si" are + # tagged as PART (rather than a reflexive PRON, which is the standard). + # Even if it already was tagged PRON, some features may have to be added. + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: + node.lemma = 'se' + node.upos = 'PRON' + node.feats['PronType'] = 'Prs' + node.feats['Reflex'] = 'Yes' + if node.form.lower() == 'se': + # Occasionally "se" can be genitive: "z prudkého do se dorážení". + if not node.feats['Case'] == 'Gen': + node.feats['Case'] = 'Acc' + else: + node.feats['Case'] = 'Dat' + node.feats['Variant'] = 'Short' + # As the genitive/accusative form of "on", "jeho" should have PrepCase. 
+ if node.upos == 'PRON' and node.form.lower() == 'jeho': + node.feats['PrepCase'] = 'Npr' + # Possessive pronouns have Person, Gender[psor] and Number[psor]. + # Although it is questionable, plural possessors are lemmatized to singular + # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj". + # Some source corpora lack Person and [psor] features, others do not respect + # the lemmatization rule, so in the end we have to look at the forms; but + # there are potentially many variants, especially in old texts. + if node.upos == 'DET' and node.feats['Poss'] == 'Yes': + if node.form.lower().startswith('m'): + # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('t'): + # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('n'): + # náš našeho našemu našem naším naše naší naši našich našim našimi + node.lemma = 'můj' + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower().startswith('v'): + # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi + node.lemma = 'tvůj' + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower() == 'jeho': + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'jehož', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + 
node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jejíž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jich|jejich', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichž|jejichž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichžto|jejichžto', node.form.lower()): + node.lemma = 'jehožto' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif node.lemma == 'čí': + node.feats['Poss'] = 'Yes' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Reflexive possessive pronoun should not forget the Reflex=Yes feature. + if node.upos == 'DET' and node.lemma == 'svůj': + node.feats['Reflex'] = 'Yes' + # Demonstrative, interrogative, relative, negative, total and indefinite + # pronouns (or determiners, because some of them get the DET tag). + if node.upos in ['PRON', 'DET']: + # Relative pronoun "jenž" should be PRON, not DET + # (it inflects for Gender but it can never be used as congruent attribute). + if re.fullmatch(r'(jenž|jenžto)', node.lemma): + node.upos = 'PRON' + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + else: + node.feats['PrepCase'] = 'Pre' + # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above) + # (it inflects for Gender but it can never be used as congruent attribute). + elif node.form.lower() in ['ješto', 'ježto']: + node.lemma = 'jenžto' + node.upos = 'PRON' + node.feats['PrepCase'] = 'Npr' + # Relative pronoun "an" is PRON (not DET). + elif node.lemma == 'an': + node.upos = 'PRON' + node.feats['PronType'] = 'Rel' + # Pronoun "kdo" is PRON (not DET). 
+ elif node.lemma == 'kdo': + node.lemma = 'kdo' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "kdož" is PRON (not DET). + elif node.lemma == 'kdož': + node.lemma = 'kdož' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "někdo", "kdosi" is PRON (not DET). + elif re.fullmatch(r'(kdosi|někdo)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "nikdo" is PRON (not DET). + elif node.lemma == 'nikdo': + node.lemma = 'nikdo' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "co" is PRON (not DET). + elif node.lemma == 'co': + node.lemma = 'co' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "což" is PRON (not DET). 
+ elif node.lemma in ['což', 'cože']: + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "něco" is PRON (not DET). + elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "nic" is PRON (not DET). + elif node.lemma == 'nic': + node.lemma = 'nic' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "týž" is DET and PronType=Dem. + elif re.fullmatch(r'(tentýž|týž)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + # Pronoun "každý" is DET and PronType=Tot. + elif node.lemma == 'každý': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot. + elif node.form.lower() == 'vše': + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif node.lemma == 'všechen': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()): + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp. 
+ elif node.lemma in ['sám', 'samý']: + node.lemma = 'samý' + node.upos = 'DET' + node.feats['PronType'] = 'Emp' + node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else '' + #---------------------------------------------------------------------- + # PRONOMINAL NUMERALS AND ADVERBS + #---------------------------------------------------------------------- + # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot. + if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba': + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['NumForm'] = 'Word' + node.feats['PronType'] = 'Tot' + # Pronominal cardinal numerals should be DET, not NUM. + if node.upos == 'NUM': + if re.fullmatch(r'(mnoho|málo|několik)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Ind' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho? + elif re.fullmatch(r'(toliko?)', node.lemma): + node.lemma = 'tolik' + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kolik)', node.lemma): + node.upos = 'DET' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + if node.upos in ['ADV', 'NUM']: + if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Ind' + elif re.fullmatch(r'(tolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Dem' + elif re.fullmatch(r'(kolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Pronominal adverbs have PronType but most of them do not have Degree + # and Polarity. 
+ if node.upos == 'ADV': + if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma): + node.feats['PronType'] = 'Dem' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma): + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kdežto)', node.lemma): + node.feats['PronType'] = 'Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma): + node.feats['PronType'] = 'Ind' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma): + node.feats['PronType'] = 'Neg' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + # Total pronominals can be negated ("nevždy"). Then they get Degree, too. + elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma): + node.feats['PronType'] = 'Tot' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # VERBS AND AUXILIARIES + #---------------------------------------------------------------------- + # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless + # of the fact that it can participate in purely existential constructions + # where it no longer acts as a copula. Czech tagsets typically do not + # distinguish AUX from VERB, which means that converted data may have to + # be fixed. 
+ if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']: + node.upos = 'AUX' + if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()): + node.upos = 'ADV' + node.lemma = 'lze' # not 'nelze' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + node.feats['Aspect'] = '' + node.feats['Mood'] = '' + node.feats['Tense'] = '' + node.feats['Person'] = '' + node.feats['Number'] = '' + node.feats['Degree'] = 'Pos' + if node.upos in ['VERB', 'AUX']: + # Most non-passive verb forms have Voice=Act, and infinitives should + # have it, too. Passive infinitives are always periphrastic. + # (This is not done in the PDT tagset, but we should add it.) + if node.feats['VerbForm'] == 'Inf': + node.feats['Voice'] = 'Act' + # Same for imperatives. + elif node.feats['Mood'] == 'Imp': + node.feats['Voice'] = 'Act' + # Some verbs lack the Aspect feature although they are not biaspectual. + if node.feats['Aspect'] == '': + if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma): + node.feats['Aspect'] = 'Perf' + # We must look at word form to distinguish imperfective "stát" from perfective "stát se". + elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Perf' + # Present forms of perfective verbs normally have Tense=Pres despite + # meaning future. 
However, a few imperfective verbs have a separate + # future form (distinct from present form), which gets Tense=Fut + # despite inflecting similarly to present forms. + if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()): + node.feats['Tense'] = 'Fut' + # Passive participles (including the short forms) should be ADJ, not VERB. + # But they keep the verbal features of VerbForm, Voice, Aspect. + if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + # But now we need an adjectival lemma. + ###!!! Bohužel to občas zahodí normalizaci, kterou tam Martinův tým zavedl ručně, např. "rozhřita" mělo lemma "rozehřát", ale já teď místo "rozehřátý" vyrobím "rozhřitý". + ###!!! odepříno - odepříný místo odepřený + ###!!! dovolíno - dovolíný místo dovolený + ###!!! vyslyšána - vyslyšaný místo vyslyšený + ###!!! obmezený místo omezený, oslyšaný místo oslyšený + node.misc['LDeriv'] = node.lemma + node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower()) + node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán + if node.feats['Polarity'] == 'Neg': + node.lemma = re.sub(r'^ne', '', node.lemma) + if node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + node.feats['Variant'] = 'Short' + #---------------------------------------------------------------------- + # ADVERBS + #---------------------------------------------------------------------- + # Words that indicate the speaker's attitude are tagged ADV in UD, + # although the Czech tagsets often treat them as particles. 
+ if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma): + node.upos = 'ADV' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + node.misc['CzechParticle'] = 'Yes' + # Adverb "brzo" should be lemmatized as "brzy". + if node.upos == 'ADV' and node.form.lower() == 'brzo': + node.lemma = 'brzy' + if node.upos == 'ADV' and node.form.lower() == 'teprv': + node.lemma = 'teprve' + # All non-pronominal adverbs (and also some pronominal ones) should + # have Degree and Polarity. At least for now we also exclude adverbial + # numerals, e.g. "jednou" – "nejednou". + if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '': + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + if node.feats['Polarity'] == '': + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # PREPOSITIONS + #---------------------------------------------------------------------- + # Preposition "u" may combine with Case=Loc|Acc in old texts, and then + # it functions as a vocalized counterpart of "v". Nevertheless, we always + # lemmatize it as "u" and thus AdpType is Prep, not Voc. + if node.upos == 'ADP' and node.form.lower() == 'u': + node.lemma = 'u' + node.feats['AdpType'] = 'Prep' + #---------------------------------------------------------------------- + # CONJUNCTIONS + #---------------------------------------------------------------------- + # As a conjunction (and not particle/adverb), "ani" is coordinating and + # not subordinating. 
+ if node.upos == 'SCONJ' and node.lemma == 'ani': + node.upos = 'CCONJ' + if node.upos == 'CCONJ' and node.lemma == 'nebť': + node.lemma = 'neboť' + #---------------------------------------------------------------------- + # PARTICLES (other than those already grabbed above) + #---------------------------------------------------------------------- + # "jako" should be SCONJ but 19th century data have it as PART. + if node.upos == 'PART': + if node.lemma == 'jako': + node.upos = 'SCONJ' + elif node.lemma == 'ti': + node.lemma = 'ť' diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 7e1f8ffb..da9f5bda 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -7,7 +7,6 @@ Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs -import logging import re class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): @@ -30,7 +29,7 @@ def process_node(self, node): pass # NOUNS ################################################################ elif node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) if node.feats['VerbForm'] == 'Vnoun': # verbal nouns: bytí, dělání, ... 
self.check_allowed_features(node, { @@ -38,8 +37,9 @@ def process_node(self, node): 'Gender': ['Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes'] + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] }) elif node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) @@ -48,18 +48,20 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) self.check_allowed_features(node, { @@ -67,17 +69,17 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 
'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': if node.feats['Poss'] == 'Yes': # possessive adjectives @@ -90,8 +92,10 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { @@ -100,31 +104,42 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 
'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives self.check_required_features(node, ['VerbForm', 'Voice']) - if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -136,9 +151,12 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -149,10 +167,13 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -162,10 +183,14 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -174,29 +199,12 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], - 'Foreign': ['Yes']}) - elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 
'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: # regular adjectives + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: # regular adjectives, including short forms if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -206,7 +214,10 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -215,7 +226,10 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -235,16 +249,19 @@ def process_node(self, node): 'PronType': ['Prs'], 'Person': ['3'] }) - elif node.feats['Variant'] == 'Short': # ho, mu - # The short (clitic) forms do not have PrepCase. - self.check_adjective_like(node, ['PronType', 'Person'], { + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). 
+ self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { 'PronType': ['Prs'], 'Person': ['3'], 'Variant': ['Short'] }) else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně # Mostly only two gender groups and no animacy: - # Masc,Neut ... jeho, jemu, jej, něm, jím + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím # Fem ... jí, ji, ní # Neut ... je # No gender in dual and plural: @@ -264,18 +281,22 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] }) - elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo # There is no Number. Někdo and nikdo behave like singular; - # kdo is by default singular as well but it also occurs as a subject - # of plural verbs. + # kdo is by default singular as well but it also occurs as subject + # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni"). + # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody) + # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kdo to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Gender': ['Masc'], 'Animacy': ['Anim'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) - elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma): # Although these pronouns behave by default as neuter singular, # no Gender and Number is annotated. However, quite unusually, # there is Animacy=Inan without Gender. 
@@ -284,9 +305,11 @@ def process_node(self, node): ###!!! animacy. For now, let's at least make animacy an optional ###!!! feature (I see that we already do not fill it in the Old ###!!! Czech data). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, co to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Animacy': ['Inan'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) @@ -294,6 +317,9 @@ def process_node(self, node): # Unlike 'jenžto', this relative pronoun does not inflect, it # always occurs in a nominative position, but the context can # be any gender and number. + # Update from the Hičkok project: 'ješto' is lemmatized to + # 'jenžto' (see below), meaning that this branch should not be + # needed for the new data. self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { 'PronType': ['Rel'], @@ -312,10 +338,24 @@ def process_node(self, node): # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even # in the nominative, although there is no prepositional counter- # part (but similarly the locative has no prepositionless form). - self.check_adjective_like(node, ['PronType', 'PrepCase'], { - 'PronType': ['Rel'], - 'PrepCase': ['Npr', 'Pre'] - }) + # Update from the Hičkok project: In Old Czech, both 'jenž' and + # 'jenžto' (or its variant 'ješto') can be used uninflected, + # accompanied by a resumptive pronoun which provides the inflection. + # In this case, the Hičkok data will not annotate Gender, Animacy, + # Number and Case of the relative pronoun. Therefore, we require + # the full set of features if any of them is present; otherwise, + # we only expect PronType and PrepCase. 
+ if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) else: # What remains is the relative pronoun 'an'. It behaves similarly # to 'jenž' but it does not have the PrepCase feature and it @@ -334,6 +374,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'PronType': ['Rel'], 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom'] }) @@ -341,45 +382,131 @@ def process_node(self, node): elif node.upos == 'DET': # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. 
- if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut'] + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). + }) + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. + # Normally determiners do not change j->n after prepositions but we + # have an example in Old Czech (štěpové zlatí, na nichžto větviech...) 
+ elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). }) - elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # Feminine personal possessive determiner. + elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - # Congruent gender is annotated only in singular. Masculine and - # neuter are merged even in nominative. Feminine singular does - # not distinguish case in PDT but we need it in Old Czech at - # least for 'jejiej'. 
- if node.feats['Number'] == 'Sing': + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Rel'], 'Poss': ['Yes'], - 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Number': ['Dual', 'Plur'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 
'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) - elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma): if node.feats['Reflex'] == 'Yes': self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { 'PronType': ['Prs'], @@ -393,11 +520,46 @@ def process_node(self, node): 'Person': ['1', '2'], 'Number[psor]': ['Sing', 'Plur'] }) - elif re.match(r'^(samý)$', node.lemma): + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc. + self.check_adjective_like(node, ['PronType', 'NumType'], { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Ord', 'Mult'] + }) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. 
+ if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). + self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) else: - self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot']}) + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -408,11 +570,28 @@ def process_node(self, node): 'NumForm': ['Digit', 'Roman'] }) else: + if node.feats['NumType'] == 'Sets': + # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery' + # Number should perhaps be only Plur because the counted noun will be Plur. + # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny', + # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy). 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Sets'], + 'PronType': ['Tot'], # for 'oboje' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. # 'pět' and more have Number=Plur, Case: pět, pěti. - if node.lemma == 'jeden': + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). 
+ elif node.lemma == 'jeden': self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], @@ -427,6 +606,7 @@ def process_node(self, node): if self.pdt20: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm 'Number': ['Dual', 'Plur'], @@ -435,31 +615,73 @@ def process_node(self, node): else: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + elif re.match(r'^(dvé|obé)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'obé' + 'NumForm': ['Word'], + 'Gender': ['Neut'], + 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! 
It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], - 'Number': ['Plur'], + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) # VERBS AND AUXILIARIES ################################################ - elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: # There is no voice. For some reason, PDT does not annotate that # the infinitive form is active (while a passive infinitive is # a combination of the infinitive with a passive participle). 
self.check_required_features(node, ['Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Inf'], + 'VerbForm': ['Inf', 'Sup'], 'Polarity': ['Pos', 'Neg'] }) elif node.feats['VerbForm'] == 'Fin': @@ -467,24 +689,46 @@ def process_node(self, node): # imperatives (although passive imperatives are a combination # of the active imperative and a passive participle). It is # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). if node.feats['Mood'] == 'Cnd': - self.check_required_features(node, ['Mood', 'Person']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Cnd'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person - }) + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) elif node.feats['Mood'] == 'Imp': self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) 'Person': ['1', '2', '3'], # 
3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) 'Number': ['Sing', 'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] }) else: # indicative self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) @@ -497,7 +741,8 @@ def process_node(self, node): 'Person': ['1', '2', '3'], 'Number': ['Sing', 'Dual', 'Plur'], 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Emph': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB if node.feats['Gender'] == 'Masc': @@ -524,43 +769,136 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'] }) else: # converb - self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Conv'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy - 'Polarity': ['Pos', 'Neg'] - }) + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. 
+ if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] - }) - elif node.feats['Degree'] != '': - # Adverbs that are compared can also be negated. - self.check_required_features(node, ['Degree', 'Polarity']) + if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). 
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_allowed_features(node, { - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) 
+ self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': self.check_required_features(node, ['AdpType', 'Case']) self.check_allowed_features(node, { 'AdpType': ['Prep', 'Voc'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." = "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] }) # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) self.check_allowed_features(node, {}) def check_adjective_like(self, node, r0, a0): @@ -575,7 +913,7 @@ def check_adjective_like(self, node, r0, a0): caller in parameters r0 (list) and a0 (dict). 
""" required_features = [] - allowed_featurs = {} + allowed_features = {} full_set = node.upos == 'ADJ' or not self.pdt20 if full_set: # Even in the full set, animacy is only distinguished for the diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py index 781e7586..a2e4439c 100644 --- a/udapi/block/ud/fixadvmodbyupos.py +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -29,10 +29,75 @@ def process_node(self, node): node.deprel = 'discourse' else: node.deprel = 'dep' - ###!!! The following are not advmod so they should probably have their own block or this block should have a different name. + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. elif node.udeprel == 'expl': if node.upos == 'AUX': node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' elif node.udeprel == 'mark': - if node.upos == 'PRON': + if node.upos in ['PRON', 'DET']: node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.upos == 'NOUN': + node.deprel = 'dep' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'NUM': + node.deprel 
= 'nummod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' + elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 'dislocated', 'expl', 'nmod']: + node.deprel = 'nmod' diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py new file mode 100644 index 00000000..485b85f0 --- /dev/null +++ b/udapi/block/ud/fixmultiobjects.py @@ -0,0 +1,47 @@ +""" +Block ud.FixMultiObjects will ensure that no node has more than one (direct) object child. +""" +from udapi.core.block import Block + + +class FixMultiObjects(Block): + """ + Make sure there is at most one object. + """ + + def process_node(self, node): + objects = [x for x in node.children if x.udeprel == 'obj'] + if len(objects) > 1: + subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']] + # Some heuristics that could work in AnCora: + # If all objects are after the verb, keep the one that is closest to the verb. 
+ if objects[0].ord > node.ord: + objects = objects[1:] + for o in objects: + o.deprel = 'obl:arg' + o.deps[0]['deprel'] = 'obl:arg' + elif objects[-1].ord < node.ord: + objects = objects[:-1] + for o in objects: + o.deprel = 'dislocated' + o.deps[0]['deprel'] = 'dislocated' + # ho experimenta tot + elif objects[-1].lemma in ['tot', 'todo']: + objects[-1].parent = objects[0] + objects[-1].deprel = 'nmod' + objects[-1].deps[0]['parent'] = objects[0] + objects[-1].deps[0]['deprel'] = 'nmod' + # X se llama Y + elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']: + objects[-1].deprel = 'xcomp' + objects[-1].deps[0]['deprel'] = 'xcomp' + elif len(subjects) == 0: + objects[0].deprel = 'nsubj' + objects[0].deps[0]['deprel'] = 'nsubj' + else: + objects[0].deprel = 'dislocated' + objects[0].deps[0]['deprel'] = 'dislocated' + # For the moment, we take the dummiest approach possible: The first object survives and all others are forced to a different deprel. + #objects = objects[1:] + #for o in objects: + # o.deprel = 'iobj' diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ecc5f0bd..f4d9a1ec 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -2,7 +2,6 @@ but they should be treated as normal verbs (with secondary predication) instead.""" from udapi.core.block import Block -import logging import re class FixPseudoCop(Block): diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b --- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. 
+""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. Change their deprel to 'parataxis'. + - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py index d328212d..4ea23d06 100644 --- a/udapi/block/ud/id/fixgsd.py +++ b/udapi/block/ud/id/fixgsd.py @@ -242,7 +242,7 @@ def merge_reduplication(self, node): hyph.remove() node.remove() first.misc['SpaceAfter'] = '' - mwt = root.create_multiword_token([first, second], first.form + second.form, mwtmisc) + mwt = root.create_multiword_token([first, second], form=first.form + second.form, misc=mwtmisc) else: first.form = first.form + '-' + node.form if node.no_space_after: @@ -288,7 +288,7 @@ def merge_reduplication(self, node): prefix.remove() hyph.remove() stem.misc['SpaceAfter'] = '' - mwt = root.create_multiword_token([stem, second], stem.form + second.form, mwtmisc) + mwt = root.create_multiword_token([stem, second], form=stem.form + second.form, misc=mwtmisc) else: stem.form = prefix.form + '-' + stem.form prefix.remove() @@ 
-345,7 +345,7 @@ def fix_satu_satunya(self, node): if mwt: mwtmisc = mwt.misc.copy() mwt.remove() - mwt = root.create_multiword_token([satu0, nya], satu0.form + nya.form, mwtmisc) + mwt = root.create_multiword_token([satu0, nya], form=satu0.form + nya.form, misc=mwtmisc) satu0.misc['SpaceAfter'] = '' root.text = root.compute_text() if node.multiword_token and node.no_space_after: diff --git a/udapi/block/ud/joinasmwt.py b/udapi/block/ud/joinasmwt.py index 02c54206..be93bd3c 100644 --- a/udapi/block/ud/joinasmwt.py +++ b/udapi/block/ud/joinasmwt.py @@ -22,19 +22,30 @@ def process_node(self, node): if node.multiword_token: return mwt_nodes = [node] - while (node.no_space_after and node.next_node and not node.next_node.multiword_token - and node.form[-1].isalpha() and node.next_node.form[0].isalpha()): + while (node.next_node and not node.next_node.multiword_token + and self.should_join(node, node.next_node)): node = node.next_node mwt_nodes.append(node) if len(mwt_nodes) > 1: - mwt_form = ''.join([n.form for n in mwt_nodes]) - mwt = node.root.create_multiword_token(mwt_nodes, mwt_form) - if node.misc['SpaceAfter'] == 'No': - mwt.misc['SpaceAfter'] = 'No' + self.create_mwt(mwt_nodes) + + def should_join(self, node, next_node): + return node.no_space_after and node.form[-1].isalpha() and next_node.form[0].isalpha() + + def create_mwt(self, mwt_nodes): + mwt_form = ''.join([n.form for n in mwt_nodes]) + mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form) + if mwt_nodes[0].node.misc['SpaceAfter'] == 'No': + mwt.misc['SpaceAfter'] = 'No' + for mwt_node in mwt_nodes: + del mwt_node.misc['SpaceAfter'] + if self.revert_orig_form: for mwt_node in mwt_nodes: - del mwt_node.misc['SpaceAfter'] - if self.revert_orig_form: - for mwt_node in mwt_nodes: - if mwt_node.misc['OrigForm']: - mwt_node.form = mwt_node.misc['OrigForm'] - del mwt_node.misc['OrigForm'] + if mwt_node.misc['OrigForm']: + mwt_node.form = mwt_node.misc['OrigForm'] + del 
mwt_node.misc['OrigForm'] + self.postprocess_mwt() + + # a helper method to be overriden + def postprocess_mwt(self, mwt): + pass diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. (The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) + Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. 
+ """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. 
+ if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. + for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). + prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. + node.remove() diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 2204eb4f..ee58084a 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. 
- if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': @@ -127,15 +127,19 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') - object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')] + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. + # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. + object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') @@ -150,7 +154,7 @@ def process_node(self, node): # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). 
if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) @@ -180,14 +184,6 @@ def process_node(self, node): if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') - # http://universaldependencies.org/u/dep/cc.html says - # "cc is the relation between a conjunct and a preceding - # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." - # No other upos is allowed in the documentation, although e.g. PART is common in the data. - # There are clear cases of adverbs in role of cc (e.g. "respektive" in Swedish and Czech). - if udeprel == 'cc' and upos not in ('CCONJ', 'ADV'): - self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) - if udeprel == 'cop': lemma = node.lemma if node.lemma != '_' else form self.cop_nodes[lemma].append(node) diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py index 1bb8188b..26c5624d 100644 --- a/udapi/block/ud/markfeatsbugs.py +++ b/udapi/block/ud/markfeatsbugs.py @@ -8,8 +8,6 @@ Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html """ from udapi.core.block import Block -import logging -import re class MarkFeatsBugs(Block): diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + 
fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index c5321221..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,10 @@ class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" def process_tree(self, root): + # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid. + for empty_node in root.empty_nodes: + del empty_node.misc['SpaceAfter'] + text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py index 7208b6ef..7de53881 100644 --- a/udapi/block/ud/sk/fixedeprels.py +++ b/udapi/block/ud/sk/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Slovak.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -14,9 +13,11 @@ class FixEdeprels(Block): 'a_hoci': 'hoci', 'ako': 'ako', # remove morphological case 'ako_na': 'ako', + 'ako_z': 'ako', 'akoby_z': 'z:gen', 'akže': 'ak', 'ani_keby': 'keby', + 'ani_keď': 'keď', 'až_keď': 'keď', 'do': 'do:gen', 'k': 'k:dat', diff --git a/udapi/block/ud/splittoken.py b/udapi/block/ud/splittoken.py new file mode 100644 index 00000000..16c60a38 --- /dev/null +++ b/udapi/block/ud/splittoken.py @@ -0,0 +1,107 @@ +""" +Block ud.SplitToken will split a given token into multiple tokens. 
+""" +from udapi.core.block import Block +import re +import logging + + +class SplitToken(Block): + """ + Split a token into two or more. A MISC attribute is used to mark the tokens + that should be split. (The attribute may have been set by an annotator or + by a previous block that tests the specific conditions under which splitting + is desired.) Multiword tokens are currently not supported: The node to be + split cannot belong to a MWT. Note that the result will not be a MWT either + (use the block ud.AddMwt if that is desired). There will be simply a new + attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes + (indicating that this was an error in the source text). + """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. + It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. 
We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. + if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. 
For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node + last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index d94f8cc5..9d053cb7 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -1,9 +1,15 @@ """Block udpipe.Base for tagging and parsing using UDPipe.""" from udapi.core.block import Block -from udapi.tool.udpipe import UDPipe from udapi.tool.udpipeonline import UDPipeOnline from udapi.core.bundle import Bundle +# Import UDPipe only if available (requires ufal.udpipe) +try: + from udapi.tool.udpipe import UDPipe + UDPIPE_AVAILABLE = True +except ImportError: + UDPIPE_AVAILABLE = False + KNOWN_MODELS = { 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', @@ -121,13 +127,12 @@ class Base(Block): # pylint: 
disable=too-many-arguments def __init__(self, model=None, model_alias=None, online=False, tokenize=True, tag=True, parse=True, resegment=False, - delete_nodes=False, **kwargs): - """Create the udpipe.En block object.""" + ranges=False, delete_nodes=False, **kwargs): super().__init__(**kwargs) self.model, self.model_alias, self.online = model, model_alias, online self._tool = None self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment - self.delete_nodes = delete_nodes + self.ranges, self.delete_nodes = ranges, delete_nodes @property def tool(self): @@ -144,11 +149,16 @@ def tool(self): if self.online: self._tool = UDPipeOnline(model=self.model) else: + if not UDPIPE_AVAILABLE: + raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1") self._tool = UDPipe(model=self.model) return self._tool def process_document(self, doc): - tok, tag, par, reseg = self.tokenize, self.tag, self.parse, self.resegment + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: @@ -160,7 +170,7 @@ def process_document(self, doc): subroot.remove() if tok: new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, - tag=tag, parse=par) + tag=tag, parse=par, ranges=ranges) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index df6aaabf..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -30,7 +30,7 @@ class Eval(Block): def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, coref_mention=None, coref_entity=None, empty_nodes=False, - 
expand_code=True, **kwargs): + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -38,6 +38,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle @@ -70,7 +71,7 @@ def process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node or self.mwt: for bundle in doc.bundles: # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) @@ -96,7 +97,7 @@ def process_bundle(self, bundle): if self.bundle: exec(self.expand_eval_code(self.bundle)) - if self.tree or self.node: + if self.tree or self.node or self.mwt: trees = bundle.trees for tree in trees: if self._should_process_tree(tree): @@ -121,6 +122,11 @@ def process_tree(self, tree): this = node exec(self.expand_eval_code(self.node)) + if self.mwt: + for mwt in tree.multiword_tokens: + this = mwt + exec(self.expand_eval_code(self.mwt)) + def process_start(self): if self.start: exec(self.expand_eval_code(self.start)) diff --git a/udapi/block/util/joinsentence.py b/udapi/block/util/joinsentence.py new file mode 100644 index 00000000..578f3865 --- /dev/null +++ b/udapi/block/util/joinsentence.py @@ -0,0 +1,77 @@ +""" +Block util.JoinSentence will join a given sentence with the preceding one. +""" +import logging +from udapi.core.block import Block + +class JoinSentence(Block): + """ + Joins a sentence with the preceding one. There are two ways how to indicate + the sentences that this block should process. + + Method 1: Parameter sent_id provides the id of the sentence that should be + merged with the preceding one. 
At most one sentence pair from the input will + be merged, even if there are multiple sentences with the given id. + + Method 2: A MISC attribute can be specified that, if found, will trigger + joining of the current sentence to the previous one. With this approach, + multiple sentence pairs can be merged during one run. + """ + + def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be appended to the previous one + misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + if misc_name: + if sent_id: + logging.fatal('Cannot combine misc_value with sent_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + self.sent_id = sent_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + previous_tree = None + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to join all zones but we do not try to do it at present. + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + # The MISC attribute we are looking for should logically occur + # on the first node of the sentence but we can take it from any node. 
+ join_commands = [n for n in root.descendants if n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if join_commands: + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove from the node the MISC attribute that triggered the sentence split. + for n in join_commands: + n.misc[self.misc_name] = '' + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + else: + previous_tree = root + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + root = bundle.get_tree() + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + # We have found our sentence. No need to process the rest of the document. + break diff --git a/udapi/block/util/markmwtbugsatnodes.py b/udapi/block/util/markmwtbugsatnodes.py new file mode 100644 index 00000000..ebc2ef4e --- /dev/null +++ b/udapi/block/util/markmwtbugsatnodes.py @@ -0,0 +1,25 @@ +"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes. + Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.""" + +from udapi.core.block import Block + +class MarkMwtBugsAtNodes(Block): + """ + If a node belongs to a multiword token and the MWT has Bug in MISC, copy + the Bug to the node so that filtering trees with bugs works. + The same bug note will be copied to all nodes in the MWT. + """ + + ###!!! Do we want to do the same thing also with ToDo attributes? 
+ def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def process_node(self, node): + if node.multiword_token: + if node.multiword_token.misc['Bug']: + self.bug(node, node.multiword_token.misc['Bug']) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index b150d551..4cce4ab8 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -1,9 +1,10 @@ """util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" from udapi.core.block import Block +from pathlib import Path class Normalize(Block): """Normalize the ordering of attributes in the FEATS and MISC columns. - + The attribute-value pairs in the FEATS column in CoNLL-U files must be sorted alphabetically (case-insensitive) according to the guidelines (https://universaldependencies.org/format.html#morphological-annotation). @@ -20,7 +21,8 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", + sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. @@ -29,6 +31,9 @@ def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, s `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. `start_sent_id`: the first sent_id number `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. + `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. 
+ `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. + `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. """ super().__init__(**kwargs) self.feats = feats @@ -37,13 +42,28 @@ def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, s self.empty_node_ord = empty_node_ord self.next_sent_id = start_sent_id self.sent_id_prefix = sent_id_prefix - if sent_id_prefix or start_sent_id != 1: + self.sent_id_from_filename = sent_id_from_filename + self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc + self.newdoc_from_filename = newdoc_from_filename + if sent_id_reset_at_newdoc and not sent_id_from_filename: + raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") + if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: self.sent_id = True + # TODO: normalize also the order of standardized comments like text, sent_id,... def process_bundle(self, bundle): + is_newdoc = any(tree.newdoc for tree in bundle.trees) + if self.newdoc_from_filename and is_newdoc: + tree = next(tree for tree in bundle.trees if tree.newdoc) + tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem if self.sent_id: - bundle.bundle_id = self.sent_id_prefix + str(self.next_sent_id) + if self.sent_id_reset_at_newdoc and is_newdoc: + self.next_sent_id = 1 + prefix = self.sent_id_prefix + if self.sent_id_from_filename: + prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix + bundle.bundle_id = prefix + str(self.next_sent_id) self.next_sent_id += 1 for tree in bundle: diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index aa7131b7..9a895b88 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -51,7 +51,7 @@ class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" - def __init__(self, node, n=5, stats=STATS, **kwargs): + def __init__(self, node, n=5, stats=STATS, empty=False, 
**kwargs): """Args: `node`: Python expression to be evaluated for each node and if True, the node will be considered "matching". @@ -62,6 +62,7 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): `children` = number of children nodes, `p_lemma` = lemma of a parent node, etc). See `udapi.core.Node.get_attrs` for a full list of statistics. + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.node = node @@ -73,11 +74,13 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): self.match[stat] = Counter() self.every[stat] = Counter() self.overall = Counter() + self.empty = empty def process_tree(self, root): self.overall['trees'] += 1 tree_match = False - for node in root.descendants: + nodes = root.descendants_and_empty if self.empty else root.descendants + for node in nodes: matching = self.process_node(node) self.overall['nodes'] += 1 if matching: diff --git a/udapi/block/util/splitsentence.py b/udapi/block/util/splitsentence.py index 2886cd5d..b6ca57d8 100644 --- a/udapi/block/util/splitsentence.py +++ b/udapi/block/util/splitsentence.py @@ -10,35 +10,87 @@ class SplitSentence(Block): If the sent_id of the current sentence matches the parameter, splits the sentence into two. The first token of the second sentence is also given as a parameter. + + Alternatively, a MISC attribute can be specified that triggers sentence + splitting at the given token. With this approach, multiple sentence splits + can be performed during one run. 
""" - def __init__(self, sent_id=None, word_id=None, **kwargs): + def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs): """ Args: sent_id: which sentence should be split (new ids will have A and B appended) word_id: which word should be the first word of the second sentence (tokens and words will be renumbered) + misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split + MISC attributes that have triggered sentence split will be removed from their node. """ super().__init__(**kwargs) - if not sent_id: - logging.fatal('Missing parameter sent_id') - if not word_id: - logging.fatal('Missing parameter word_id') + if misc_name: + if sent_id or word_id: + logging.fatal('Cannot combine misc_value with sent_id or word_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + if not word_id: + logging.fatal('Missing parameter word_id') self.sent_id = sent_id self.word_id = word_id + self.misc_name = misc_name + self.misc_value = misc_value def process_document(self, document): for bundle_no, bundle in enumerate(document.bundles): - if bundle.bundle_id == self.sent_id: + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to split all zones but we do not try to do it at present. + # (The zones may be translations to other languages and it is not likely that we would + # want to split each translation at the same position.) 
+ if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if split_points: + # Create as many new bundles as there are split points. + n_new = len(split_points) + current_bid = bundle.bundle_id + idletter = 'B' # a letter will be added to bundle ids to distinguish them + for i in range(n_new): + new_bundle = document.create_bundle() + new_bundle.bundle_id = current_bid + idletter + new_root = Root(zone='') + new_bundle.add_tree(new_root) + # Identify nodes to move to the new bundle. + first_node_id = split_points[i].ord + if i < n_new - 1: + next_first_node_id = split_points[i+1].ord + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id] + else: + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id] + new_root.steal_nodes(nodes_to_move) + self.make_zeros_roots(new_root) + new_root.text = new_root.compute_text() + # The new bundle was created at the end of the document. + # Move it to the position right after the current bundle. + document.bundles.pop() + document.bundles.insert(bundle_no + i + 1, new_bundle) + idletter = chr(ord(idletter) + 1) + # Remove from the node the MISC attribute that triggered the sentence split. + split_points[i].misc[self.misc_name] = '' + # Update the id of the current bundle, fix its zero-dependents and recompute sentence text. + bundle.bundle_id += 'A' + self.make_zeros_roots(root) + root.text = root.compute_text() + # Update the bundle numbers of the new bundles and all bundles after them. 
+ updated_no = bundle_no + 1 + for b in document.bundles[(bundle_no+1):]: + b.number = updated_no + updated_no += 1 + elif bundle.bundle_id == self.sent_id: logging.info('Found!') - # In general, a bundle may contain multiple trees in different zones. - # In UD data, we always expect just one zone (labeled '') per bundle. - # This code could be extended to split all zones but we do not try to do it at present. - # (The zones may be translations to other languages and it is not likely that we would - # want to split each translation at the same position.) - if len(bundle.trees) != 1: - logging.fatal('Cannot process bundles that have less or more than 1 zone') - if not bundle.has_tree(zone=''): - logging.fatal('Cannot process bundles that do not have the zone with empty zone id') root = bundle.get_tree() nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id] if len(nodes_to_move) == 0: @@ -60,22 +112,23 @@ def process_document(self, document): new_root.steal_nodes(nodes_to_move) # The steal_nodes() method does not make sure that all nodes newly attached # to the artificial root have the 'root' relation. Fix it. - n_root = 0 - for n in root.descendants: - if n.parent.is_root(): - n.deprel = 'root' - n_root += 1 - if n_root > 1: - logging.warning('More than one 0:root relation in the first part of the sentence.') - n_root = 0 - for n in new_root.descendants: - if n.parent.is_root(): - n.deprel = 'root' - n_root += 1 - if n_root > 1: - logging.warning('More than one 0:root relation in the second part of the sentence.') + self.make_zeros_roots(root) + self.make_zeros_roots(new_root) # Update the sentence text attributes of the new sentences. root.text = root.compute_text() new_root.text = new_root.compute_text() # We have found our sentence. No need to process the rest of the document. 
break + + def make_zeros_roots(self, root): + """ + The steal_nodes() method does not make sure that all nodes newly attached + to the artificial root have the 'root' relation. Fix it. + """ + n_root = 0 + for n in root.descendants: + if n.parent.is_root(): + n.deprel = 'root' + n_root += 1 + if n_root > 1: + logging.warning('More than one 0:root relation in newly segmented sentence %s.' % root.bundle.bundle_id) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index abe20963..ad647477 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -117,7 +117,9 @@ def process_tree(self, tree): # pylint: disable=too-many-branches if mwt and node._ord > last_mwt_id: print('\t'.join((mwt.ord_range, '_' if mwt.form is None else mwt.form, - '_\t_\t_\t_\t_\t_\t_', + '_\t_\t_', + '_' if mwt._feats is None else str(mwt.feats), + '_\t_\t_', '_' if mwt._misc is None else str(mwt.misc)))) last_mwt_id = mwt.words[-1]._ord @@ -134,10 +136,10 @@ def process_tree(self, tree): # pylint: disable=too-many-branches '_' if node._feats is None else str(node.feats), head, node.deprel, node.raw_deps, '_' if node._misc is None else str(node.misc)))) - # Empty sentences are not allowed in CoNLL-U, + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we will print an artificial node, so we can print the comments. - if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 41539670..a8a7ab3d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -14,6 +14,7 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... 
@@ -22,7 +23,7 @@ class TextModeTrees(BaseWriter): - """An ASCII pretty printer of dependency trees. + r"""An ASCII pretty printer of dependency trees. .. code-block:: bash @@ -109,7 +110,7 @@ class TextModeTrees(BaseWriter): │ ╰─┶ boxer NOUN acl:relcl ╰─╼ . PUNCT punct - Some non-projective trees cannot be printed witout crossing edges. + Some non-projective trees cannot be printed without crossing edges. TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: ─┮ @@ -122,17 +123,17 @@ class TextModeTrees(BaseWriter): (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted - (by reveresing the background and foreground colors). + (by reversing the background and foreground colors). This block's method `process_tree` can be called on any node (not only root), which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. 
For use in LaTeX, you can insert the output of this block (without colors) - into \begin{verbatim}...\end{verbatim}, but you need to compile with pdflatex (xelatex not supported) - and you must add the following code into the preambule:: + into ``\begin{verbatim}...\end{verbatim}``, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preamble:: - \\usepackage{pmboxdraw} + \usepackage{pmboxdraw} \DeclareUnicodeCharacter{256D}{\textSFi} %╭ \DeclareUnicodeCharacter{2570}{\textSFii} %╰ @@ -143,41 +144,44 @@ class TextModeTrees(BaseWriter): def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, - mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, + print_mwt=False, mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, layout='classic', **kwargs): """Create new TextModeTrees block object. Args: - print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? - print_sentence: Print plain-text detokenized sentence on a line above each tree? - add_empty_line: Print an empty line after each tree? - indent: Number of characters to indent node depth in the tree for better readability. - minimize_cross: Minimize crossings of edges in non-projective trees? - Trees without crossings are subjectively more readable, but usually - in practice also "deeper", that is with higher maximal line length. - color: Print the node attribute with ANSI terminal colors? - Default = 'auto' which means that color output only if the output filehandle - is interactive (console). Each attribute is assigned a color (the mapping is - tested on black background terminals and can be changed only in source code). - If you plan to pipe the output (e.g. 
to "less -R") and you want the colors, - you need to set explicitly color=1, see the example in Synopsis. - attributes: A comma-separated list of node attributes which should be printed. Possible - values are ord, form, lemma, upos, xpos, feats, deprel, deps, misc. - print_undef_as: What should be printed instead of undefined attribute values (if any)? - print_doc_meta: Print `document.meta` metadata before each document? - print_comments: Print comments (other than sent_id and text)? - print_empty: Print empty nodes? - mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. - If `print_comments and re.search(r'^ %s = ' % mark, root.comment, re.M)` - the comment is highlighted. - Empty string means no highlighting. Default = 'ToDo|ToDoOrigText|Bug|Mark'. - marked_only: print only trees containing one or more marked nodes/comments. Default=False. - hints: use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes - or follows its parent. Default=True. If False, plain ├ is used in both cases. - layout: 'classic' (default) shows word attributes immediately next to each node, - 'compact' never print edges after (right to) words even in non-projectivities, - 'align-words' as 'compact' but all first attributes (forms by default) are aligned, - 'align' as 'align-words' but all attributes are aligned in columns. + print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? + print_text: Print plain-text detokenized sentence on a line above each tree? + add_empty_line: Print an empty line after each tree? + indent: Number of characters to indent node depth in the tree for better readability. + minimize_cross: Minimize crossings of edges in non-projective trees? + Trees without crossings are subjectively more readable, but usually + in practice also "deeper", that is with higher maximal line length. + color: Print the node attribute with ANSI terminal colors? 
+ Default = 'auto' which means that color output only if the output filehandle + is interactive (console). Each attribute is assigned a color (the mapping is + tested on black background terminals and can be changed only in source code). + If you plan to pipe the output (e.g. to "less -R") and you want the colors, + you need to set explicitly color=1, see the example in Synopsis. + attributes: A comma-separated list of node attributes which should be printed. Possible + values are ``ord``, ``form``, ``lemma``, ``upos``, ``xpos``, ``feats``, ``deprel``, ``deps``, ``misc``. + print_undef_as: What should be printed instead of undefined attribute values (if any)? + print_doc_meta: Print ``document.meta`` metadata before each document? + print_comments: Print comments (other than ``sent_id`` and ``text``)? + print_empty: Print empty nodes? Default=True + print_mwt: Print multi-word tokens? Default=False + mark: A regex pattern. If ``re.search(mark + '=', str(node.misc))`` matches, the node is highlighted. + If ``print_comments`` and ``re.search(r'^ %s = ' % mark, root.comment, re.M)`` matches, + the comment is highlighted. Empty string means no highlighting. + Default = ``'(ToDo|ToDoOrigText|Bug|Mark)'``. + marked_only: Print only trees containing one or more marked nodes/comments. Default ``False``. + hints: Use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes + or follows its parent. Default ``True``. If ``False``, plain ├ is used in both cases. 
+ layout: Tree layout style: + + - ``'classic'`` (default): shows word attributes immediately next to each node + - ``'compact'``: never print edges after (right to) words even in non-projectivities + - ``'align-words'``: like ``'compact'`` but all first attributes (forms by default) are aligned + - ``'align'``: like ``'align-words'`` but all attributes are aligned in columns """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -190,6 +194,7 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_doc_meta = print_doc_meta self.print_comments = print_comments self.print_empty = print_empty + self.print_mwt = print_mwt self.mark = mark self.marked_only = marked_only self.layout = layout @@ -244,21 +249,21 @@ def should_print_tree(self, root, allnodes): return False return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: - if root.is_root(): + if root.is_root() and not self.print_mwt: allnodes = [root] + root.descendants_and_empty else: - allnodes = root.descendants(add_self=1) + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] allnodes.extend(empty) allnodes.sort() else: - allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root, allnodes): + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + if not force_print and not self.should_print_tree(root, allnodes): return - self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} + self._index_of = {allnodes[i].ord_range if allnodes[i].is_mwt() else allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) self.lengths = [0] * len(allnodes) @@ -285,7 +290,7 @@ def process_tree(self, root): if self.layout == 'classic': self.add_node(idx, node) else: - 
if idx_node.parent is not node: + if idx_node.is_mwt() or idx_node.parent is not node: self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: precedes_parent = idx < self._index_of[node.ord] @@ -303,7 +308,7 @@ def process_tree(self, root): if self.layout == 'classic': for idx, node in enumerate(allnodes): - if node.is_empty(): + if node.is_empty() or node.is_mwt(): self.add_node(idx, node) else: columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] @@ -353,7 +358,8 @@ def before_process_document(self, document): os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text @@ -361,7 +367,7 @@ def _add(self, idx, text): def add_node(self, idx, node): """Render a node with its attributes.""" - if not node.is_root(): + if node.is_mwt() or not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) marked = self.is_marked(node) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 5ccceb78..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. """ - def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -35,10 +35,14 @@ def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? 
+ zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). + whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -97,10 +101,12 @@ def process_bundle(self, bundle): if self.should_print_tree(tree, allnodes): marked_trees.append(tree) if marked_trees: + if self.whole_bundle: + marked_trees = bundle print("
| ") - self.process_tree(tree) + self.process_tree(tree, force_print=True) print(" | ") print("