diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..9530d5c7 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,62 @@ +# Use the latest 2.1 version of CircleCI pipeline process engine. +# See: https://circleci.com/docs/2.0/configuration-reference +version: 2.1 + +# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. +# See: https://circleci.com/docs/2.0/orb-intro/ +orbs: + # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files + # Orb commands and jobs help you with common scripting around a language/tool + # so you don't have to copy and paste it everywhere. + # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python + python: circleci/python@1.5.0 + +# Define a job to be invoked later in a workflow. +# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + parameters: + python-version: + type: string + # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/ + # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub + # A list of available CircleCI Docker convenience images is available here: https://circleci.com/developer/images/image/cimg/python + # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container + # Change the version below to your required version of python + docker: + - image: cimg/python:<< parameters.python-version >> + # Checkout the code as the first step. This is a dedicated CircleCI step. 
+ # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. + # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. + # Then run your tests! + # CircleCI will report the results back to your VCS provider. + steps: + - checkout + - python/install-packages: + pkg-manager: pip + - run: + name: Install Udapi + command: pip install ".[test]" + - run: mkdir -p test-results + - run: + name: Run pytest tests + command: pytest --junitxml=test-results/junit.xml -o junit_family=legacy + - store_test_results: + path: test-results + - run: + name: Color TextModeTrees + command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 + - run: + name: External tests + command: cd udapi/core/tests && ./external_tests.sh + + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows +workflows: + test-matrix: + jobs: + - build-and-test: + matrix: + parameters: + python-version: ["3.9", "3.11", "3.13"] diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..0285eddb --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,70 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. + # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + url: https://pypi.org/p/udapi + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ diff --git a/.gitignore b/.gitignore index a75e7c05..adc7bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.cache .idea +*.egg-info/ *.pyc -.cache +dist/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..8804cc4e --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,23 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Currently, RTD needs to select an OS with OpenSSL>=1.1.1 because of +# urllib3's dependence on that system library. (alternately, pin urllib3<2 +# See https://github.com/urllib3/urllib3/issues/2168 +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: false + +python: + install: + - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d45259b8..00000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python -python: - - "3.3" - - "3.4" - - "3.5" -before_install: - - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - - sudo apt-get update -qq - - sudo apt-get install -qq gcc-4.8 g++-4.8 - - CC=g++-4.8 pip install ufal.udpipe -install: - - python setup.py install -script: - - python -m pytest - - cd udapi/core/tests && ./external_tests.sh diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 00000000..98e26605 --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,35 @@ +Udapi Change Log +---------------- +See https://github.com/udapi/udapi-python/commits/master for details. 
+ +0.5.1 2025-11-05 + - make udapy compatible with Python 3.13 + +0.5.0 2025-10-18 + - added mwt.feats + - added root.prev_tree and root.next_tree + - .github/workflows/python-publish.yml + - edits by Dan Zeman in block.ud.* + +0.4.0 2025-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + +0.3.0 2022-04-06 + - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) + - edits by Dan Zeman in block.ud.* + - Circle-CI (instead of Travis-CI) + +0.2.3 2021-02-23 + - support for enhanced dependencies and coreference + - requires Python 3.6+ due to f-strings + - speed-up (benchmark 40.5s -> 10.4s) + +0.2.2 2018-01-08 + - support for loading/storing documents from/to strings + - allow private modules (starting with dot instead of udapi.block) + - MorphoDiTa wrapper udapi/tool/morphodita.py + - root.sent_id returns always the same as root.address() + +0.2.1 2017-10-23 the first PyPI release diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..9cecc1d4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. 
You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. 
Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+    {one line to give the program's name and a brief idea of what it does.}
+    Copyright (C) {year}  {name of author}
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    {project}  Copyright (C) {year}  {fullname}
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md index 4621c918..36465c78 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,29 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=shield)](https://circleci.com/gh/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) [![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.3 or higher. -- If the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser is needed, make sure you have a C++11 compiler (e.g. [g++ 4.7 or newer](.travis.yml#L9)). +- You need Python 3.9 or higher. +- It is recommended to install Udapi in a Python virtual environment. +- If you need the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser (to be used from Udapi) + install it (with `pip install --upgrade ufal.udpipe`). ## Install Udapi for developers -Let's clone the git repo to `~/udapi-python/`, install dependencies -and setup `$PATH` and `$PYTHONPATH` accordingly. +Let's clone the git repo e.g. to `~/udapi-python/` and make an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) ```bash cd git clone https://github.com/udapi/udapi-python.git -pip3 install --user -r udapi-python/requirements.txt -echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc -echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc -echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc -source ~/.bashrc # or open new bash +cd udapi-python +pip install -e . ``` ## Install Udapi for users -This is similar to the above, but installs Udapi to the standard (user) Python paths. 
+This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. ``` -pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git +pip install --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. diff --git a/bin/udapy b/bin/udapy index c756c5cb..83c7a6f2 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,79 +1,7 @@ #!/usr/bin/env python3 +"""Thin wrapper for backward compatibility. Calls udapi.cli.main().""" +import sys +from udapi.cli import main -import logging -import argparse - -from udapi.core.run import Run - -# Parse command line arguments. -argparser = argparse.ArgumentParser( - description='udapy - Python interface to Udapi - API for Universal Dependencies') -argparser.add_argument( - "-q", "--quiet", action="store_true", - help="Warning, info and debug messages are suppressed. Only fatal errors are reported.") -argparser.add_argument( - "-v", "--verbose", action="store_true", - help="Warning, info and debug messages are printed to the STDERR.") -argparser.add_argument( - "-s", "--save", action="store_true", - help="Add write.Conllu to the end of the scenario") -argparser.add_argument( - "-T", "--save_text_mode_trees", action="store_true", - help="Add write.TextModeTrees color=1 to the end of the scenario") -argparser.add_argument( - "-H", "--save_html", action="store_true", - help="Add write.TextModeTreesHtml color=1 to the end of the scenario") -argparser.add_argument( - "-A", "--save_all_attributes", action="store_true", - help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") -argparser.add_argument( - "-C", "--save_comments", action="store_true", - help="Add print_comments=1 (to be used after -T and -H)") -argparser.add_argument( - "-M", "--marked_only", action="store_true", - help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") 
-argparser.add_argument( - "-N", "--no_color", action="store_true", - help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") -argparser.add_argument( - 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") - -args = argparser.parse_args() - -# Set the level of logs according to parameters. -if args.verbose: - level = logging.DEBUG -elif args.quiet: - level = logging.CRITICAL -else: - level = logging.INFO - -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', - level=level) - -# Process and provide the scenario. if __name__ == "__main__": - if args.save: - args.scenario = args.scenario + ['write.Conllu'] - if args.save_text_mode_trees: - args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] - if args.save_html: - args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] - if args.save_all_attributes: - args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] - if args.save_comments: - args.scenario = args.scenario + ['print_comments=1'] - if args.marked_only: - args.scenario = args.scenario + ['marked_only=1'] - if args.no_color: - args.scenario = args.scenario + ['color=0'] - - runner = Run(args) - # udapy is often piped to head etc., e.g. - # `seq 1000 | udapy -s read.Sentences | head` - # Let's prevent Python from reporting (with distracting stacktrace) - # "BrokenPipeError: [Errno 32] Broken pipe" - try: - runner.execute() - except BrokenPipeError: - pass + sys.exit(main()) diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain path to udapi-python. 
+py %~dp$PATH:0\udapy %* diff --git a/demo/python-demo.sh b/demo/python-demo.sh index aefa17cf..d83e51d9 100755 --- a/demo/python-demo.sh +++ b/demo/python-demo.sh @@ -3,4 +3,4 @@ export PATH=../bin:$PATH export PYTHONPATH=../:$PYTHONPATH -udapy read.Conllu filename=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu +udapy read.Conllu files=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu diff --git a/docs/conf.py b/docs/conf.py index 3e7864a5..b7d0f6e5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,6 @@ import sys sys.path.insert(0, os.path.abspath('..')) - # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. @@ -52,7 +51,7 @@ # General information about the project. project = 'Udapi' -copyright = '2017, Martin Popel' +copyright = '2023, Martin Popel' author = 'Martin Popel' # The version info for the project you're documenting, acts as replacement for @@ -62,14 +61,14 @@ # The short X.Y version. version = '0' # The full version, including alpha/beta/rc tags. -release = '1' +release = '3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -168,7 +167,7 @@ def run_apidoc(_): module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) print(module) - from sphinx.apidoc import main + from sphinx.ext.apidoc import main main(['--separate', '-o', cur_dir, module, '--force']) def setup(app): diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..a537f220 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +colorama>=0.4.6 +termcolor +ufal.udpipe +sphinx_rtd_theme diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..18d5c717 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "udapi" +version = "0.5.2" +description = "Python framework for processing Universal Dependencies data" +readme = "README.md" +requires-python = ">=3.9" +license = "GPL-3.0-or-later" +authors = [ + {name = "Martin Popel", email = "popel@ufal.mff.cuni.cz"} +] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "colorama", + "termcolor", +] + +[project.urls] +Homepage = "https://github.com/udapi/udapi-python" + +[project.optional-dependencies] +test = ["pytest"] +udpipe = ["ufal.udpipe"] + +[project.scripts] +udapy = "udapi.cli:main" + +[tool.setuptools] +packages = {find = {}} +include-package-data = true diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/setup.py b/setup.py deleted file mode 100644 index 1ec8e468..00000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -from setuptools import setup, find_packages - -# python_requires is supported by pip only from November 2016, -# so let's check the Python version also the old way. 
-import sys -if sys.version_info < (3, 3): - raise SystemExit('Udapi requires Python 3.3 or higher.') - -setup( - name='udapi-python', - version='0.2', - description='Python framework for processing Universal Dependencies data', - author='Martin Popel', - author_email='popel@ufal.mff.cuni.cz', - url='https://github.com/udapi/udapi-python', - packages=find_packages(), - scripts=['bin/udapy'], - tests_require=['pytest'], - install_requires=['colorama', 'termcolor', 'ufal.udpipe'], - python_requires='>=3.3', -) diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..e079f8a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb new file mode 100644 index 00000000..70bea240 --- /dev/null +++ b/tutorial/01-visualizing.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "Udapi is an API and framework for processing [Universal Dependencies](http://universaldependencies.org/). In this tutorial, we will focus on the Python version of Udapi. Perl and Java versions are [available](http://udapi.github.io/) as well, but they are missing some of the features.\n", + "\n", + "Udapi can be used from the shell (e.g. Bash), using the wrapper script `udapy`. It can be also used as a library, from Python, IPython or Jupyter notebooks. We will show both of these ways bellow.\n", + "\n", + "This tutorial uses Details sections for extra info (if you want to know more or if you run into problems). You need to click on it to show its content.\n", + "
Details\n", + "It is a substitute for footnotes. The content may be long and showing it in the main text may be distracting.\n", + "
\n", + "\n", + "### Install (upgrade) Udapi\n", + "First, make sure you have the newest version of Udapi. If you have already installed Udapi [using git clone](https://github.com/udapi/udapi-python#install-udapi-for-developers), just run `git pull`. If you have not installed Udapi yet, run\n", + "
Details\n", + "
    \n", + "
  • The command below installs Udapi from GitHub (from the master branch). With pip3 install --user --upgrade udapi, you can install the last version released on PyPI (possibly older).\n", + "
  • The exclamation mark (!) in Jupyter or IPython means that the following command will be executed by the system shell (e.g. Bash).\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, make sure you can run the command-line interface `udapy`, e.g. by printing the help message." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: udapy [optional_arguments] scenario\r\n", + "\r\n", + "udapy - Python interface to Udapi - API for Universal Dependencies\r\n", + "\r\n", + "Examples of usage:\r\n", + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\r\n", + " udapy -T < sample.conllu | less -R\r\n", + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\r\n", + "\r\n", + "positional arguments:\r\n", + " scenario A sequence of blocks and their parameters.\r\n", + "\r\n", + "optional arguments:\r\n", + " -h, --help show this help message and exit\r\n", + " -q, --quiet Warning, info and debug messages are suppressed. 
Only fatal errors are reported.\r\n", + " -v, --verbose Warning, info and debug messages are printed to the STDERR.\r\n", + " -s, --save Add write.Conllu to the end of the scenario\r\n", + " -T, --save_text_mode_trees\r\n", + " Add write.TextModeTrees color=1 to the end of the scenario\r\n", + " -H, --save_html Add write.TextModeTreesHtml color=1 to the end of the scenario\r\n", + " -A, --save_all_attributes\r\n", + " Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)\r\n", + " -C, --save_comments Add print_comments=1 (to be used after -T and -H)\r\n", + " -M, --marked_only Add marked_only=1 to the end of the scenario (to be used after -T and -H)\r\n", + " -N, --no_color Add color=0 to the end of the scenario, this overrides color=1 of -T and -H\r\n", + "\r\n", + "See http://udapi.github.io\r\n" + ] + } + ], + "source": [ + "!udapy -h" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details: If the previous command fails with \"udapy: command not found\"\n", + "This means that Udapi is not properly installed. When installing Udapi with pip3 --user, it is installed into ~/.local/lib/python3.6/site-packages/udapi/ (or similar depending on your Python version) and the wrapper into ~/.local/bin. Thus you need to\n", + "
\n",
+    "export PATH=\"$HOME/.local/bin/:$PATH\"\n",
+    "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browse CoNLL-U files\n", + "### Get sample UD data\n", + "\n", + "Download and extract [ud20sample.tgz](http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz). There are just 100 sentences for each of the 70 treebanks (`sample.conllu`), plus 4 bigger files (`train.conllu` and `dev.conllu`) for German, English, French and Czech. For full UD ([2.0](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983) or [newer](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424)), go to [Lindat](https://lindat.cz)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-01 07:53:37-- http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "Resolving ufal.mff.cuni.cz (ufal.mff.cuni.cz)... 195.113.20.52\n", + "Connecting to ufal.mff.cuni.cz (ufal.mff.cuni.cz)|195.113.20.52|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4670982 (4,5M) [application/x-gzip]\n", + "Saving to: ‘ud20sample.tgz.1’\n", + "\n", + "ud20sample.tgz.1 100%[===================>] 4,45M 1,49MB/s in 3,0s \n", + "\n", + "2020-12-01 07:53:40 (1,49 MB/s) - ‘ud20sample.tgz.1’ saved [4670982/4670982]\n", + "\n", + "/home/martin/udapi/python/notebook/sample\n" + ] + } + ], + "source": [ + "!wget http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "!tar -xf ud20sample.tgz\n", + "%cd sample" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's choose one of the sample files and see the raw [CoNLL-U format](https://universaldependencies.org/format.html).\n", + "
Details: executing from Bash, IPython, Jupyter\n", + "
    \n", + "
  • If you see \"No such file or directory\" error, make sure you executed the previous cell. Note that the cd command is not prefixed by an exclamation mark because that would run in a sub-shell, which \"forgets\" the changed directory when finished. It is prefixed by a percent sign, which marks it as IPython magic.\n", + "
  • cat is another IPython magic command, this time an alias for the shell command of the same name (so you can prefix cat with an exclamation mark, if you prefer), which prints a given file. With automagic on, you can use it without the percent sign.\n", + "
  • In this tutorial, we use | head to show just the first 10 lines of the output (preventing thus big ipynb file size). You can ignore the \"cat: write error: Broken pipe\" warning.\n", + "
  • When using Jupyter, you can omit the | head because long outputs are automatically wrapped in a text box with a scrollbar.\n", + "
  • When running this from IPython or Bash, you can use a pager: less UD_Ancient_Greek/sample.conllu\n", + "
\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# newdoc id = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "1\tἐρᾷ\tἐράω\tVERB\tv3spia---\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\r\n", + "2\tμὲν\tμέν\tADV\td--------\t_\t1\tadvmod\t_\t_\r\n", + "3\tἁγνὸς\tἁγνός\tADJ\ta-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t4\tnmod\t_\t_\r\n", + "4\tοὐρανὸς\tοὐρανός\tNOUN\tn-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t1\tnsubj\t_\t_\r\n", + "5\tτρῶσαι\tτιτρώσκω\tVERB\tv--ana---\tTense=Past|VerbForm=Inf|Voice=Act\t1\txcomp\t_\t_\r\n", + "6\tχθόνα\tχθών\tNOUN\tn-s---fa-\tCase=Acc|Gender=Fem|Number=Sing\t5\tobj\t_\tSpaceAfter=No\r\n", + "7\t,\t,\tPUNCT\tu--------\t_\t1\tpunct\t_\t_\r\n", + "cat: write error: Broken pipe\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files with `udapy -T`\n", + "While the CoNLL-U format was designed with readibility (by both machines and humans) on mind, it may be still a bit difficult to read and interpret by humans. Let's visualize the dependency tree structure using ASCII-art by piping the conllu file into `udapy -T`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-12-01 08:00:33,276 [ INFO] execute - No reader specified, using read.Conllu\n", + "2020-12-01 08:00:33,276 [ INFO] execute - ---- ROUND ----\n", + "2020-12-01 08:00:33,276 [ INFO] execute - Executing block Conllu\n", + "2020-12-01 08:00:33,305 [ INFO] execute - Executing block TextModeTrees\n", + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\n", + "loaded_from = -\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\n", + "─┮\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─╼ \u001b[33mδὲ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m │\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡───────────────┾ \u001b[33mλαμβάνει\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -T | 
head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details:\n", + "
    \n", + "
  • You may be used to see dependency trees where the root node is on the top and words are ordered horizontally (left to right). Here, the root is on left and words are ordered vertically (top to bottom).\n", + "
  • The colors are implemented using the colorama package and ANSI escape codes. When running this from IPython or Bash and using less, you need to instruct it to display the colors with -R:\n", + "\n", + "cat UD_Ancient_Greek/sample.conllu | udapy -T | less -R\n", + "\n", + "
  • You can also use udapy -T -N to disable the colors.\n", + "
  • udapy -q suppresses all Udapi messages (warnings, info, debug) printed on the standard error output, so only fatal errors are printed. By default only debug messages are suppresses, but these can be printed with udapy -v.\n", + "
  • But you already know this because you have read udapy -h, am I right?\n", + "
\n", + "
\n", + "\n", + "`udapy -T` is a shortcut for `udapy write.TextModeTrees color=1`, where `write.TextModeTrees` is a so-called *block* (a basic Udapi processing unit) and `color=1` is its parameter. See [the documentation](https://udapi.readthedocs.io/en/latest/udapi.block.write.html#module-udapi.block.write.textmodetrees) (or even [the source code](https://github.com/udapi/udapi-python/blob/master/udapi/block/write/textmodetrees.py) of `write.TextModeTrees` to learn about further parameters. Now, let's print also the LEMMA and MISC columns and display the columns vertically aligned using parameters `layout=align attributes=form,lemma,upos,deprel,misc`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "loaded_from = -\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "─┮ \r\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[36mἐράω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m _\u001b[0m\r\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[36mμέν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[36mἁγνός\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m _\u001b[0m\r\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[36mοὐρανός\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[36mτιτρώσκω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m _\u001b[0m\r\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[36mχθών\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[36m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[36mἔρως\u001b[0m 
\u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─╼ │ \u001b[33mδὲ\u001b[0m \u001b[36mδέ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m _\u001b[0m\r\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[36mγαῖα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " ┡───┾ \u001b[33mλαμβάνει\u001b[0m \u001b[36mλαμβάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m _\u001b[0m\r\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[36mγάμος\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[36mτυγχάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[36m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + "\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -q write.TextModeTrees color=1 layout=align attributes=form,lemma,upos,deprel,misc | head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files from IPython/Jupyter\n", + "So far, we were using Udapi only via its command-line interface `udapy`, which is handy, but not very Pythonic. So let's now use Udapi as a library and load the English conllu sample file into a document `doc` and visualize the sixth tree (i.e. `doc[5]` in zero-based indexing)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮\n", + " │ ╭─╼ \u001b[33mThe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ ╭─┶ \u001b[33mthird\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnsubj:pass\u001b[0m\n", + " │ ┢─╼ \u001b[33mwas\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux\u001b[0m\n", + " │ ┢─╼ \u001b[33mbeing\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux:pass\u001b[0m\n", + " ╰─┾ \u001b[33mrun\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " │ ╭─╼ \u001b[33mby\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ ┢─╼ \u001b[33mthe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " ┡─┾ \u001b[33mhead\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobl\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mof\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ │ ┢─╼ \u001b[33man\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ │ ┢─╼ \u001b[33minvestment\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mcompound\u001b[0m\n", + " │ ╰─┶ \u001b[33mfirm\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ╰─╼ \u001b[33m.\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "import udapi\n", + "doc = udapi.Document(\"UD_English/sample.conllu\")\n", + "doc[5].draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details:\n", + "
    \n", + "
  • doc = udapi.Document(filename) is a shortcut for\n", + "
    \n",
    +    "import udapi.core.document\n",
    +    "doc = udapi.core.document.Document(filename)\n",
    +    "
    \n", + "
  • We can print the whole document using doc.draw().\n", + "
  • doc.draw(**kwargs) is a shortcut for creating a write.TextModeTrees block and applying it on the document:\n", + "
    \n",
    +    "import udapi.block.write.textmodetrees\n",
    +    "block = udapi.block.write.textmodetrees.TextModeTrees(**kwargs)\n",
    +    "block.run(doc)\n",
    +    "
    \n", + "
\n", + "
\n", + "\n", + "The `draw()` method takes the same parameters as the `write.TextModeTrees` block, so we can for example display only the node ID (aka `ord`, i.e. word-order index), form and [universal (morpho-syntactic) features](https://universaldependencies.org/u/feat/index.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮ \n", + " │ ╭─╼ \u001b[32m1\u001b[0m \u001b[33mThe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " │ ╭─┶ \u001b[32m2\u001b[0m \u001b[33mthird\u001b[0m Degree=Pos|NumType=Ord\u001b[0m\n", + " │ ┢─╼ \u001b[32m3\u001b[0m \u001b[33mwas\u001b[0m Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\u001b[0m\n", + " │ ┢─╼ \u001b[32m4\u001b[0m \u001b[33mbeing\u001b[0m VerbForm=Ger\u001b[0m\n", + " ╰─┾ \u001b[32m5\u001b[0m \u001b[33mrun\u001b[0m Tense=Past|VerbForm=Part|Voice=Pass\u001b[0m\n", + " │ ╭─╼ \u001b[32m6\u001b[0m \u001b[33mby\u001b[0m _\u001b[0m\n", + " │ ┢─╼ \u001b[32m7\u001b[0m \u001b[33mthe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " ┡─┾ \u001b[32m8\u001b[0m \u001b[33mhead\u001b[0m Number=Sing\u001b[0m\n", + " │ │ ╭─╼ \u001b[32m9\u001b[0m \u001b[33mof\u001b[0m _\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m10\u001b[0m \u001b[33man\u001b[0m Definite=Ind|PronType=Art\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m11\u001b[0m \u001b[33minvestment\u001b[0m Number=Sing\u001b[0m\n", + " │ ╰─┶ \u001b[32m12\u001b[0m \u001b[33mfirm\u001b[0m Number=Sing\u001b[0m\n", + " ╰─╼ \u001b[32m13\u001b[0m \u001b[33m.\u001b[0m _\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "doc[5].draw(layout=\"align\", attributes=\"ord,form,feats\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document representation in Udapi\n", + "\n", + "Udapi 
[document](https://github.com/udapi/udapi-python/blob/master/udapi/core/document.py) consists of a sequence of so-called *bundles*, mirroring a sequence of sentences in a typical natural language text.\n", + "\n", + "A [bundle](https://github.com/udapi/udapi-python/blob/master/udapi/core/bundle.py) corresponds to a sentence,\n", + "possibly in multiple versions or with different representations, such as sentence-tuples from parallel corpora, or paraphrases in the same language or alternative analyses (e.g. parses produced by different parsers). If there are more trees in a bundle, they must be distinguished by a so-called *zone* (a label which contains the language code).\n", + "\n", + "Each tree is represented by a special (artificial) [root](https://github.com/udapi/udapi-python/blob/master/udapi/core/root.py) node, which is added to the top of a CoNLL-U tree in the Udapi model. The root node bears the ID of a given tree/sentence (`sent_id`) and its word order (`ord`) is 0. Technically, Root is subclass of Node, with some extra methods.\n", + "\n", + "The [Node](https://github.com/udapi/udapi-python/blob/master/udapi/core/node.py) class corresponds to a node\n", + "of a dependency tree. It provides access to all the CoNLL-U-defined attributes (`ord`, `form`, `lemma`, `upos`, `xpos`, `feats`, `deprel`, `deps`, `misc`). 
There are methods for tree traversal (`parent`, `root`, `children`, `descendants`); word-order traversal (`next_node`, `prev_node`); tree manipulation (`parent` setter) including word-order changes (`shift_after_node(x)`, `shift_before_subtree(x)`, etc.); and utility methods: `is_descendant_of(x)`, `is_nonprojective()`, `precedes(x)`, `is_leaf()`, `is_root()`, `get_attrs([])`, `compute_text()`, `draw()`.\n", + "\n", + "## Exercise 1: Count prepositions and postpositions\n", + "[Prepositions and postpositions](https://en.wikipedia.org/wiki/Preposition_and_postposition) are together called *adpositions* and assigned the [ADP](https://universaldependencies.org/u/pos/ADP.html) universal part-of-speech tag (`upos`) in UD. Some languages (e.g. English) use mostly prepositions, others mostly postpositions.\n", + "* Do you know any English postpositions?\n", + "* Guess the typical adposition type (i.e. whether a given language uses more prepositions or postpositions) for at least 10 languages of your choice (from those in UD2.0).\n", + "* Complete the following code and find out how many prepositions and postpositions are in `UD_English/sample.conllu` (which has been loaded into `doc`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prepositions, postpositions = 0, 0\n", + "# Iterate over all nodes in the document (in all trees)\n", + "for node in doc.nodes:\n", + " if node.upos == \"ADP\":\n", + " # TODO: fix this code to actually distinguish prepositions and postpositions\n", + " prepositions += 1\n", + "# Print the results\n", + "prepositions, postpositions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't know how to proceed click on the following hints.\n", + "
Hint 1:\n", + "In some dependency grammars, adpositions govern nouns (i.e. adposition is the *parent* of a given noun node). In other dependency grammars, adpositions depend on nouns (i.e. noun is the *parent* of a given adposition). Find out which style is being used by UD. Check the UD documentation or inspect some of the tree visualizations and guess.\n", + "
\n", + "
Hint 2:\n", + "See the Node documentation and find out how to obtain dependency parent and dependency children. Note that these are properties of a given node, rather than methods, so you should not write parentheses () after the property name.\n", + "
\n", + "
Hint 3:\n", + "doc.nodes iterates over all nodes in the document sorted by the word order, but this would be cumbersome to exploit. Find a method of Node to detect the relative word order of two nodes (within the same tree/sentence).\n", + "
\n", + "
Hint 4:\n", + "Use node.parent and node.precedes(another_node).\n", + "The latter is a shortcut for node.ord < another_node.ord.\n", + "
\n", + "
Solution:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\":\n",
+    "        if node.precedes(node.parent):\n",
+    "            prepositions += 1\n",
+    "        else:\n",
+    "            postpositions += 1\n",
+    "
\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2: Explore English postpositions\n", + "The previous exercise indicates there are 7 occurrences of postpositions in the English sample. Find these 7 occurrences and visualize them using `node.draw()`. Count which adpositions (`lemma`) with which dependency relations (`deprel`) are responsible for these occurrences. Recompute these statistics on the bigger English training data. Can you explain these occurrences? What are the reasons? Is any occurrence an annotation error?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For the statistics, you may find useful: count[\"any string\"] += 1\n", + "import collections\n", + "count = collections.Counter()\n", + "big_doc = udapi.Document(\"UD_English/train.conllu\")\n", + "\n", + "for node in doc.nodes:\n", + " # TODO detect postposition\n", + " pass\n", + "\n", + "# Print the statistics\n", + "count.most_common()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Solution 1:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node):\n",
+    "        node.parent.draw()\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "
\n", + "
\n", + "
Hint 1:\n", + "We can see there are many particles of phrasal verbs, e.g. \"busted up\".\n", + "These seem to be correctly annotated as ADP according to the UD guidelines.\n", + "Let's filter out those cases and focus on the rest and let's switch to the big train data.\n", + "
\n", + "
Solution 2:\n", + "
\n",
+    "count = collections.Counter()\n",
+    "for node in big_doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node) and node.parent.upos != \"VERB\":\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "count.most_common()\n",
+    "
\n", + "Alternatively to node.parent.upos != \"VERB\",\n", + "you could also filter out node.deprel != \"compound:prt\",\n", + "or directly focus on node.deprel == \"case\"\n", + "
\n", + "
Partial answer:\n", + "Most of the occurrences are actually annotated correctly,\n", + "although they are not typically considered as postpositions.\n", + "For example, node.deprel == \"fixed\" is being used for multi-word adpositions,\n", + "such as \"because of\", where \"of\" depends on \"because\" for technical (and consistency) reasons,\n", + "but the whole multi-word adposition precedes its governing noun, so it is actually a multi-word preposition.\n", + "\n", + "What about the remaining occurrences, after filtering out node.deprel not in {\"compound:prt\", \"fixed\"}?\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next tutorial, 02-blocks.ipynb (not finished yet), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..425f7df5 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1,9 @@ +# Udapi tutorial + +To run this tutorial, install [Jupyter Notebook](https://jupyter.org/install.html) (or JupyterLab) and run `jupyter notebook` from this directory. + +Don't display the tutorial `ipynb` files on GitHub because it cannot render the collapsible Details, Hints and Solution sections, so you would miss important parts of the tutorial. 
+If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: + +- [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) +- 02-blocks.ipynb (not finished yet) diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..d27ff8c4 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..86d975b6 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/__init__.py b/udapi/__init__.py index e69de29b..6c281c0f 100644 --- a/udapi/__init__.py +++ b/udapi/__init__.py @@ -0,0 +1,3 @@ +from .core.document import Document +from .core.run import create_block +from .core.node import CycleError diff --git a/udapi/block/corefud/__init__.py b/udapi/block/corefud/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/corefud/bridgingclusters.py b/udapi/block/corefud/bridgingclusters.py new file mode 100644 index 00000000..30ac49a7 --- /dev/null +++ b/udapi/block/corefud/bridgingclusters.py @@ -0,0 +1,17 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class BridgingClusters(Block): + + def process_node(self,node): + + if 'Bridging' in node.misc and "+" in node.misc['BridgingAllTargetClusterTexts']: + print("SENTENCE : "+node.root.get_sentence()) + print("SOURCE MENTION: "+node.misc['MentionText']) + print("RELATION: "+node.misc['Bridging']) + print("TARGET MENTION: "+node.misc['BridgingTargetMentionText']) + print("TARGET CLUSTER: "+node.misc['BridgingAllTargetClusterTexts']) + print() + + diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py new file mode 100644 index 00000000..74483368 --- /dev/null +++ 
b/udapi/block/corefud/concatmentionmisc.py @@ -0,0 +1,24 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class ConcatMentionMisc(Block): + """All MISC attributes named MentionMisc_... are concatenated into MentionMisc""" + + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + matchObj = re.match('MentionMisc_([^[]+)((\[\d+\])?)',attrname) + if matchObj: + innerattrib = matchObj.group(1) + index = matchObj.group(2) + + finalattr = 'MentionMisc'+index + value = node.misc[attrname].replace(",", "%2C") + + if finalattr not in node.misc: + node.misc[finalattr] = f'{innerattrib}:{value}' + else: + node.misc[finalattr] += f',{innerattrib}:{value}' + del node.misc[attrname] + diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..fc45540a --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,94 @@ +from udapi.core.block import Block +from collections import defaultdict, Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = defaultdict(Counter) + + def _report_stats(self, counter, header_id=None): + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counters = defaultdict(Counter) + empty_seqs = [] + empty_pars = [] + curr_seq = [] + 
curr_par = [] + is_empty_par = True + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") + newdoc = tree.newdoc + empty_seqs = [] + empty_pars = [] + curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") + + if self.report_per_file: + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") + + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) + + def process_end(self): + if self.report_total: + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, 
TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py new file mode 100644 index 00000000..5aaf94e7 --- /dev/null +++ b/udapi/block/corefud/delete.py @@ -0,0 +1,84 @@ +"""Delete coreference annotation (Entity|Bridge|SplitAnte) and optionally also empty nodes.""" + +from udapi.core.block import Block +import udapi.core.coref +import logging + +class Delete(Block): + + def __init__(self, coref=True, empty=False, misc=False, **kwargs): + """Args: + coref: delete coreference attributes in MISC, i.e (Entity|Bridge|SplitAnte) + empty: delete all empty nodes and references to them (from DEPS and MISC[Functor]) + misc: delete all attributes in MISC except for SpaceAfter + """ + super().__init__(**kwargs) + self.coref = coref + self.empty = empty + self.misc = misc + + def is_root_reachable_by_deps(self, node, parents_to_ignore=None): + """ Check if the root node is reachable from node, possibly after deleting the parents_to_ignore nodes. + """ + stack = [(node, [])] + while stack: + proc_node, path = stack.pop() + # root is reachable + if proc_node == node.root: + return True + # path forms a cycle, the root cannot be reached through this branch + if proc_node not in path: + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] not in parents_to_ignore: + # process the parent recursively + stack.append((dep['parent'], path + [proc_node])) + return False + + def _deps_ignore_nodes(self, node, parents_to_ignore): + """ Retrieve deps from the node, recursively ignoring specified parents. 
+ """ + newdeps = [] + stack = [(node, [])] + while stack: + proc_node, skipped_nodes = stack.pop() + if proc_node not in skipped_nodes: + for dep in proc_node.deps: + if dep['parent'] in parents_to_ignore: + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + else: + # keep deps with a parent that shouldn't be ignored + newdeps.append(dep) + # If no newdeps were found (because of a cycle), return the root. + return newdeps if newdeps else [{'parent': node.root, 'deprel': 'root'}] + + def process_document(self, doc): + # This block should work both with coreference loaded (deserialized) and not. + if self.coref: + doc._eid_to_entity = None + for root in doc.trees: + if self.empty: + for node in root.descendants: + # process only the nodes dependent on empty nodes + if '.' in node.raw_deps: + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + node.deps = self._deps_ignore_nodes(node, root.empty_nodes) + # This needs to be done even if '.' not in node.raw_deps. + if '.' 
in node.misc['Functor'].split(':')[0]: + del node.misc['Functor'] + root.empty_nodes = [] + + if self.coref or self.misc: + for node in root.descendants + root.empty_nodes: + if self.misc: + node.misc = 'SpaceAfter=No' if node.no_space_after else None + if self.coref: + node._mentions = [] + if not self.misc: + for attr in ('Entity', 'Bridge', 'SplitAnte'): + del node.misc[attr] diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py new file mode 100644 index 00000000..1575cea6 --- /dev/null +++ b/udapi/block/corefud/fixcorefud02.py @@ -0,0 +1,56 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +NEW_ETYPE = { + "misc": "other", + "date": "time", + "loc": "place", + "location": "place", + "per": "person", + "org": "organization", + "_": "", + } + +class FixCorefUD02(Block): + """Fix errors in CorefUD 0.2 for release of CorefUD 1.0.""" + + def process_document(self, doc): + # For GUM + if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': + doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' + + for entity in doc.coref_entities: + if entity.etype: + # Harmonize etype. + # If gen/spec is distinguished, store it in all mentions' other['gstype']. + etype = entity.etype.lower() + if etype.startswith('spec') or etype.startswith('gen'): + gstype = 'gen' if etype.startswith('gen') else 'spec' + for m in entity.mentions: + m.other['gstype'] = gstype + if etype == 'spec': + etype = 'other' + etype = etype.replace('gen', '').replace('spec', '').replace('.', '') + etype = NEW_ETYPE.get(etype, etype) + + # etype="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than entity-based attribute. + # We don't know which of the mentions it should be assigned, but let's expect all non-first. 
+ # UD marks appositions with deprel appos, so once someone checks it is really redunant, + # TODO we can delete the appos mention attribute. + if etype == 'appos': + etype = '' + for mention in entity.mentions[1:]: + mention.other['appos'] = '1' + entity.etype = etype + + for mention in entity.mentions: + # Harmonize bridge relation labels + for bridge in mention.bridging: + rel = bridge.relation.lower() + if rel.endswith('-inv'): + rel = 'i' + rel.replace('-inv', '') + rel = rel.replace('-', '') + rel = rel.replace('indirect_', '') + bridge.relation = rel diff --git a/udapi/block/corefud/fixentityacrossnewdoc.py b/udapi/block/corefud/fixentityacrossnewdoc.py new file mode 100644 index 00000000..61e5e4f6 --- /dev/null +++ b/udapi/block/corefud/fixentityacrossnewdoc.py @@ -0,0 +1,25 @@ +from udapi.core.block import Block +import udapi.core.coref +import logging + +class FixEntityAcrossNewdoc(Block): + """ + Fix the error reported by validate.py --coref: + "[L6 Coref entity-across-newdoc] Same entity id should not occur in multiple documents" + by making the entity IDs (eid) unique in each newdoc document. + + This block uses Udapi's support for loading GUM-like GRP document-wide IDs + (so the implementation is simple, although unnecessarily slow). + After applying this block, IDs of all entities are prefixed with document numbers, + e.g. "e45" in the 12th document changes to "d12.e45". + If you prefer simple eid, use corefud.IndexClusters afterwards. 
+ """ + + def process_document(self, doc): + if not doc.eid_to_entity: + logging.warning(f"No entities in document {doc.meta}") + udapi.core.coref.store_coref_to_misc(doc) + assert doc.meta["global.Entity"].startswith("eid") + doc.meta["global.Entity"] = "GRP" + doc.meta["global.Entity"][3:] + udapi.core.coref.load_coref_from_misc(doc) + doc.meta["global.Entity"] = "eid" + doc.meta["global.Entity"][3:] diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py new file mode 100644 index 00000000..b4a42a43 --- /dev/null +++ b/udapi/block/corefud/fixinterleaved.py @@ -0,0 +1,84 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class FixInterleaved(Block): + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ + + def __init__(self, same_entity_only=True, both_discontinuous=False, + crossing_only=False, nested_same_subspan=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.crossing_only = crossing_only + self.nested_same_subspan = nested_same_subspan + + def process_tree(self, tree): + mentions, deleted = set(), set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if mA in deleted or mB in deleted: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + + # Fully nested spans are OK, except for same-subspan. + sA, sB = set(mA.words), set(mB.words) + if (sA <= sB) or (sB <= sA): + if not self.nested_same_subspan: + continue + elif not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + + # Crossing or interleaved+crossing? 
+ elif self.crossing_only: + if not sA.intersection(sB): + continue + else: + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + + mA.words = list(sA.union(sB)) + for wb in sB: + try: + wb._mentions.remove(mB) + except ValueError: + pass + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass + deleted.add(mB) + + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. + sA = set(mA.words) + for mC in sorted(mentions): + if mC in deleted or mC is mA or mC is mB: + continue + if sA != set(mC.words): + continue + # So mA and mC have the same span and we need to delete one of them to fix it. + # We will delete mA because it has the artificially enlarged span, + # while mC is from the original annotation. + for wa in sA: + try: + wa._mentions.remove(mA) + except ValueError: + pass + try: + mA.entity.mentions.remove(mA) + except ValueError: + pass + break + deleted.add(mA) diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py new file mode 100644 index 00000000..bc8e6504 --- /dev/null +++ b/udapi/block/corefud/fixparentheses.py @@ -0,0 +1,31 @@ +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). 
+ If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py new file mode 100644 index 00000000..48a3608d --- /dev/null +++ b/udapi/block/corefud/fixtovalidate.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block + +class FixToValidate(Block): + """This block fixes the CorefUD data so that the final documents are valid conllu files.""" + + def _set_root_deprel(self, doc): + for root in doc.trees: + for node in root.children: + if node.deprel != "root": + node.deprel = "root" + + def _unset_root_deprel(self, doc): + for node in doc.nodes: + parent = node.parent + if node.deprel == "root" and parent is not None and not parent.is_root(): + #print("\t".join(['Non-0-root:', node.address(), node.upos, str(node.feats), node.parent.upos, str(node.parent.feats)])) + if parent.upos == "PUNCT" and parent.parent is not None: + node.parent = parent.parent + if node.upos == "CCONJ": + 
node.deprel = "cc" + elif node.upos == "ADJ" and parent.upos == "PROPN": + node.deprel = "amod" + elif node.upos == "NOUN" and parent.upos == "VERB": + node.deprel = "obl" + else: + node.deprel = "parataxis" + + def _space_before_pardoc(self, doc): + last_node = None + for i, tree in enumerate(doc.trees): + if i > 0: + if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: + del last_node.misc["SpaceAfter"] + last_node = tree.descendants[-1] + + def process_document(self, doc): + self._set_root_deprel(doc) + self._unset_root_deprel(doc) + self._space_before_pardoc(doc) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py new file mode 100644 index 00000000..d6093ece --- /dev/null +++ b/udapi/block/corefud/guessspan.py @@ -0,0 +1,33 @@ +from udapi.core.block import Block + +class GuessSpan(Block): + """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" + + def process_coref_mention(self, mention): + mwords = mention.head.descendants(add_self=True) + # TODO add heuristics from corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. 
+ to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py new file mode 100644 index 00000000..bf6d798d --- /dev/null +++ b/udapi/block/corefud/gum2corefud.py @@ -0,0 +1,111 @@ +import re +import logging +from collections import defaultdict +from udapi.core.block import Block + +class Gum2CorefUD(Block): + + def process_tree(self, tree): + docname = tree.bundle.document.meta['docname'] + '_' + + eid_to_entity = tree.bundle.document._eid_to_entity + unfinished_mentions = defaultdict(list) + for node in tree.descendants: + misc_entity = node.misc['Entity'] + if not misc_entity: + continue + # Attribute Entity may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for entity in entities: + # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity + # but the closing tag is shortent just to GRP. 
+ opening, closing = (entity[0] == '(', entity[-1] == ')') + entity = entity.strip('()') + if not opening and not closing: + logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.") + elif not opening and closing: + name = docname + entity + if not unfinished_mentions[name]: + raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + else: + attrs = entity.split('-') + if len(attrs) == 6: + etype, grp, infstat, minspan, ctype, wiki = attrs + elif len(attrs) == 5: + wiki = None + etype, grp, infstat, minspan, ctype = attrs + elif len(attrs) > 6: + logging.warning(f"Entity {entity} at {node} has more than 6 attributes.") + etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5) + else: + raise ValueError(f"Less than 5 attributes in {entity} at {node}") + name = docname + grp + entity = eid_to_entity.get(name) + if entity is None: + entity = node.create_coref_entity(eid=name, etype=etype) + mention = entity.mentions[0] + mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" + if wiki: + mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') + else: + mention = entity.create_mention(head=node) + if closing: + mention.words = [node] + else: + unfinished_mentions[name].append(mention) + del node.misc['Entity'] + + misc_bridges = node.misc['Bridge'] + if misc_bridges: + # E.g. Entity=event-12|Bridge=12<124,12<125 + for misc_bridge in misc_bridges.split(','): + try: + trg_str, src_str = [docname + grp for grp in misc_bridge.split('<')] + except ValueError as err: + raise ValueError(f"{node}: {misc_bridge} {err}") + try: + trg_entity = eid_to_entity[trg_str] + src_entity = eid_to_entity[src_str] + except KeyError as err: + logging.warning(f"{node}: Cannot find entity {err}") + else: + mention = src_entity.mentions[-1] + # TODO: what relation should we choose for Bridging? 
+ # relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}" + relation = '_' + mention.bridging.append((trg_entity, relation)) + del node.misc['Bridge'] + + misc_split = node.misc['Split'] + if misc_split: + # E.g. Entity=(person-54)|Split=4<54,9<54 + src_str = docname + misc_split.split('<')[-1] + ante_entities = [] + for x in misc_split.split(','): + ante_str, this_str = [docname + grp for grp in x.split('<')] + if this_str != src_str: + raise ValueError(f'{node} invalid Split: {this_str} != {src_str}') + # logging.warning + # There are just three such cases in GUM and all are bugs, + # so let's ignore them entirely (the `else` clause will be skipped if exiting `for` w/ `break`). + # break + ante_entities.append(eid_to_entity[ante_str]) + else: + eid_to_entity[src_str].split_ante = ante_entities + del node.misc['Split'] + + for entity_name, mentions in unfinished_mentions.items(): + for mention in mentions: + logging.warning(f"Mention {name} opened at {mention.head}, but not closed in the same tree. Deleting.") + entity = mention.entity + mention.words = [] + entity._mentions.remove(mention) + if not entity._mentions: + del eid_to_entity[name] diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py new file mode 100644 index 00000000..3f5d74d8 --- /dev/null +++ b/udapi/block/corefud/indexclusters.py @@ -0,0 +1,35 @@ +"""Block corefud.IndexClusters""" +from udapi.core.block import Block + + +class IndexClusters(Block): + """Re-index the coreference entity IDs (eid). The final entity IDs are of the "e" form, + where are ordinal numbers starting from the one specified by the `start` parameter. + This block can be applied on multiple documents within one udapy call. 
+ For example, to re-index eid in all conllu files in the current directory + (keeping the IDs unique across all the files), use: + `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` + + Parameters: + ----------- + start : int + the starting index (default=1) + prefix : str + prefix of the IDs before the number (default="e") + """ + + def __init__(self, start=1, prefix='e'): + self.start = start + self.prefix = prefix + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + new_eid_to_entity = {} + for idx, entity in enumerate(entities, self.start): + new_eid = self.prefix + str(idx) + entity.eid = new_eid + new_eid_to_entity[new_eid] = entity + self.start = idx + 1 + doc._eid_to_entity = new_eid_to_entity diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py new file mode 100644 index 00000000..08296531 --- /dev/null +++ b/udapi/block/corefud/link2cluster.py @@ -0,0 +1,137 @@ +import logging +from udapi.core.block import Block + +class Link2Cluster(Block): + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. + + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). + delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. 
Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. + """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): + super().__init__(**kwargs) + self.id_attr = id_attr + self.ante_attr = ante_attr + self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if 
self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention + + def process_document(self, doc): + id2node = {} + links = [] + bridges = [] + for node in doc.nodes_and_empty: + this_id = node.misc[self.id_attr] + if this_id != '': + id2node[this_id] = node + ante_id = node.misc[self.ante_attr] + if ante_id != '': + if ante_id == this_id: + logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}") + else: + links.append([ante_id, this_id]) + if self.delete_orig_attrs: + for attr in (self.id_attr, self.ante_attr): + del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] + + # It seems faster&simpler to process the links in any order and implement entity merging, + # rather than trying to sort the links so that no entity merging is needed. + for ante_id, this_id in links: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if not this_node.coref_mentions and not ante_node.coref_mentions: + # None of the nodes is part of any mention/entity. Let's create them. + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) + elif this_node.coref_mentions and ante_node.coref_mentions: + # Both of the nodes are part of mentions in different entities. + # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). 
+ # While the official API supports "stealing" a single mention (m.entity = another_entity), + # the implementation below using _mentions and _entity is a bit faster. + e_ante, e_this = this_node.coref_entities[0], ante_node.coref_entities[0] + assert e_ante != e_this + for mention in e_ante.mentions: + mention._entity = e_this + e_this._mentions.extend(e_ante.mentions) + e_this._mentions.sort() + e_ante._mentions.clear() + else: + # Only one of the nodes is part of an entity. Let's add the second one to this entity. + if ante_node.coref_mentions: + self._new_mention(ante_node.coref_entities[0], this_node) + else: + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py new file mode 100644 index 00000000..92773dc2 --- /dev/null +++ b/udapi/block/corefud/load.py @@ -0,0 +1,12 @@ +from udapi.core.block import Block +import udapi.core.coref + +class Load(Block): + """Load coreference-related MISC attributes into memory. 
Allow lenient mode by strict=0.""" + + def __init__(self, strict=True): + self.strict = strict + + def process_document(self, doc): + if doc._eid_to_entity is None: + udapi.core.coref.load_coref_from_misc(doc, self.strict) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py new file mode 100644 index 00000000..8064e67f --- /dev/null +++ b/udapi/block/corefud/markcrossing.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MarkCrossing(Block): + """Find mentions with crossing spans.""" + + def __init__(self, same_entity_only=False, continuous_only=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.continuous_only = continuous_only + self.print_form = print_form + self.log = log + self.mark = mark + self._logged = {} + + def _print(self, mention): + if self.print_form: + return ' '.join([w.form for w in mention.words]) + else: + return mention.span + + def process_node(self, node): + if len(node.coref_mentions) > 1: + for mA, mB in itertools.combinations(node.coref_mentions, 2): + if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.continuous_only and (',' in mA.span or ',' in mB.span): + continue + if self.mark: + node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + cross_id = node.root.sent_id + mA.span + mB.span + if cross_id not in self._logged: + self._logged[cross_id] = True + print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py new file mode 100644 index 00000000..c00f73b1 --- /dev/null +++ b/udapi/block/corefud/markinterleaved.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + 
+class MarkInterleaved(Block): + """Find mentions with interleaved spans.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if set(mA.words).intersection(set(mB.words)): + continue + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py new file mode 100644 index 00000000..8db8a657 --- /dev/null +++ b/udapi/block/corefud/marknested.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkNested(Block): + """Find nested mentions.""" + + def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False, + print_form=False, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = 
both_discontinuous + self.multiword_only = multiword_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if not (sA <= sB) and not (sB <= sA): + continue + if self.multiword_only and (len(sA) == 1 or len(sB) == 1): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. 
+ + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. 
+ nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB 
= ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue + if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py new file mode 100644 index 00000000..f3cfd7b3 --- /dev/null +++ b/udapi/block/corefud/marksamesubspan.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkSameSubSpan(Block): + """Find mentions with the same subspan.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.nested_only = nested_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if self.nested_only and not (sA <= sB) and not (sB <= sA): + continue + if not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git 
a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py new file mode 100644 index 00000000..61b613cb --- /dev/null +++ b/udapi/block/corefud/mergesamespan.py @@ -0,0 +1,52 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MergeSameSpan(Block): + """ + Multiple same-span mentions are considered invalid in CoNLL-U, whether they + belong to the same entity or not. If they occur, merge them into one. + Note: We currently do not have mentions across sentence boundaries in the + CorefUD data, so this block processes one sentence at a time. + """ + + def __init__(self, same_entity_only=False, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + # Reduce non-determinism in which mention is removed: + # If the mentions belong to different entities, sort them by entity (entity) ids. + if mA.entity.eid > mB.entity.eid: + mA, mB = mB, mA + + sA, sB = set(mA.words), set(mB.words) + if sA != sB: + continue + + # If the mentions belong to different entities, we should merge the + # entities first, i.e., pick one entity as the survivor, move the + # mentions from the other entity to this entity, and remove the + # other entity. + if mA.entity != mB.entity: + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}") + ###!!! TODO: As of now, changing the entity of a mention is not supported in the API. + #for m in mB.entity.mentions: + # m.entity = mA.entity + # Remove mention B. It may have been removed earlier because of + # another duplicate, that is the purpose of try-except. + ###!!! TODO: If we remove a singleton, we are destroying the entity. 
Then we must also handle possible bridging and split antecedents pointing to that entity! + mB.words = [] + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py new file mode 100644 index 00000000..dee358d6 --- /dev/null +++ b/udapi/block/corefud/miscstats.py @@ -0,0 +1,35 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStats(Block): + """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" + + def __init__(self, maxvalues=10, **kwargs): + + """Create the corefud.MiscStats + + Args: + maxvalues: the number of most frequent values + to be printed for each attribute. + + """ + super().__init__(**kwargs) + self.maxvalues = maxvalues + self.valuecounter = {} + self.totalcounter = Counter() + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + self.totalcounter[shortattrname] += 1 + + def process_end(self): + for attrname in self.valuecounter: + print() + print(attrname+"\t"+str(self.totalcounter[attrname])) + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + print("\t"+str(value)+"\t"+str(freq)) diff --git a/udapi/block/corefud/miscstatstex.py b/udapi/block/corefud/miscstatstex.py new file mode 100644 index 00000000..25d3751a --- /dev/null +++ b/udapi/block/corefud/miscstatstex.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStatsTex(Block): + """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" + + def __init__(self, maxvalues=10, **kwargs): + + """Create the corefud.MiscStats + + Args: + maxvalues: the number of most frequent values 
+ to be printed for each attribute. + + """ + super().__init__(**kwargs) + self.maxvalues = maxvalues + self.valuecounter = {} + self.totalcounter = Counter() + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + self.totalcounter[shortattrname] += 1 + + def process_end(self): + for attrname in self.valuecounter: + + total = self.totalcounter[attrname] + distrvalues = [] + + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + value = re.sub(r'_',r'\\_',value) + distrvalues.append(f'\\attr{{{str(value)}}} {100*freq/total:2.1f}~\\%') + + attrname = re.sub(r'_',r'\\_',attrname) + print(f" \\item attribute \\attr{{{attrname}}}, {total:,} occurrences, values: "+", ".join(distrvalues)) +# print(f" \\item attribute \\attr\{{attrname}\}, {str(total)} occurrences, distribution of values: "+", ".join(distrvalues)) + + diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py new file mode 100644 index 00000000..00a32e9f --- /dev/null +++ b/udapi/block/corefud/movehead.py @@ -0,0 +1,95 @@ +import logging +from collections import Counter +from udapi.core.block import Block +from udapi.core.node import find_minimal_common_treelet + +class MoveHead(Block): + """Block corefud.MoveHead moves the head to the highest node in each mention.""" + + def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs): + self.counter = Counter() + self.bugs = bugs + self.keep_head_if_possible = keep_head_if_possible + super().__init__(**kwargs) + + def _eparents(self, node): + if node._raw_deps != '_': + return [d['parent'] for d in node.deps] + if node.parent: + return [node.parent] + return [] + + def find_head(self, mention): + mwords = set(mention.words) + + # First, check the simplest case: no empty words and a treelet in basic 
dependencies. + basic_heads = [w for w in mention.words if not w.parent or not w.parent in mwords] + assert basic_heads + if len(basic_heads) == 1: + return basic_heads[0], 'treelet' + + # Second, check also enhanced dependencies (but only within basic_heads for simplicity). + enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))] + if not enh_heads: + enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))] + if not enh_heads: + return mention.head, 'cycle' + if len(enh_heads) == 1: + return enh_heads[0], 'treelet' + + # Third, find non-empty parents (ancestors in future) of empty nodes. + empty_nodes, non_empty = [], [] + for w in enh_heads: + (empty_nodes if w.is_empty() else non_empty).append(w) + if empty_nodes: + for empty_node in empty_nodes: + parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()] + if parents: + if parents[0] not in non_empty: + non_empty.append(parents[0]) + else: + # TODO we should climb up, but preventing cycles + # We could also introduce empty_node.nonempty_ancestor + if 'warn' in self.bugs: + logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}") + if 'mark' in self.bugs: + node.misc['Bug'] = 'no-parent-of-empty' + non_empty.sort() + + # Fourth, check if there is a node within the enh_heads governing all the mention nodes + # and forming thus a "gappy treelet", where the head is clearly the "highest" node. + (highest, added_nodes) = find_minimal_common_treelet(*non_empty) + if highest in enh_heads: + return highest, 'gappy' + if highest in mwords: + if 'warn' in self.bugs: + logging.warning(f"Strange mention {mention.head} with highest node {highest}") + if 'mark' in self.bugs: + highest.misc['Bug'] = 'highest-in-mwords' + mention.head.misc['Bug'] = 'highest-head' + + # Fifth, try to convervatively preserve the original head, if it is one of the possible heads. 
+ if self.keep_head_if_possible and mention.head in enh_heads: + return mention.head, 'nontreelet' + + # Finally, return the word-order-wise first head candidate as the head. + return enh_heads[0], 'nontreelet' + + def process_coref_mention(self, mention): + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + self.counter[category] += 1 + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head + + def process_end(self): + logging.info("corefud.MoveHead overview of mentions:") + total = self.counter['total'] + for key, value in self.counter.most_common(): + logging.info(f"{key:>16} = {value:6} ({100*value/total:5.1f}%)") diff --git a/udapi/block/corefud/printentities.py b/udapi/block/corefud/printentities.py new file mode 100644 index 00000000..7230c6a5 --- /dev/null +++ b/udapi/block/corefud/printentities.py @@ -0,0 +1,55 @@ +import re +import os.path +from udapi.core.block import Block +from collections import Counter, defaultdict + +class PrintEntities(Block): + """Block corefud.PrintEntities prints all mentions of a given entity.""" + + def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True, + aggregate_mentions=True, **kwargs): + """Params: + eid_re: regular expression constraining ID of the entities to be printed + min_mentions: print only entities with with at least N mentions + print_ranges: print also addressess of all mentions + (compactly, using the longest common prefix of sent_id) + mark_head: mark the head (e.g. 
as "red **car**") + """ + super().__init__(**kwargs) + self.eid_re = re.compile(str(eid_re)) if eid_re else None + self.min_mentions = min_mentions + self.print_ranges = print_ranges + self.mark_head = mark_head + self.aggregate_mentions = aggregate_mentions + + def process_document(self, doc): + if 'docname' in doc.meta: + print(f"Coref entities in document {doc.meta['docname']}:") + for entity in doc.coref_entities: + if self.eid_re and not self.eid_re.match(entity.eid): + continue + if len(entity.mentions) < self.min_mentions: + continue + print(f" {entity.eid} has {len(entity.mentions)} mentions:") + if self.aggregate_mentions: + counter = Counter() + ranges = defaultdict(list) + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + counter[forms] += 1 + if self.print_ranges: + ranges[forms].append(mention.head.root.address() + ':' +mention.span) + for form, count in counter.most_common(): + print(f"{count:4}: {form}") + if self.print_ranges: + if count == 1: + print(' ' + ranges[form][0]) + else: + prefix = os.path.commonprefix(ranges[form]) + print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') + else: + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + print(' ' + forms) + if self.print_ranges: + print(f" {mention.head.root.address()}:{mention.span}") diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py new file mode 100644 index 00000000..d011f686 --- /dev/null +++ b/udapi/block/corefud/printmentions.py @@ -0,0 +1,186 @@ +import random +from collections import Counter +from udapi.core.block import Block +from udapi.block.write.textmodetreeshtml import TextModeTreesHtml +from udapi.block.write.textmodetrees import TextModeTrees + +class PrintMentions(Block): + """Print mentions with various properties.""" + + def 
__init__(self, continuous='include', almost_continuous='include', treelet='include',
+                 forest='include', almost_forest='include', oneword='include', singleton='include',
+                 empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
+                 print_total=True, print_should=True,
+                 print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
+                 minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
+                 print_undef_as='_', print_doc_meta=True, print_comments=False,
+                 mark='(Mark)', hints=True, layout='classic',
+                 **kwargs):
+        """Create the PrintMentions block.
+
+        The first group of parameters (continuous..empty) are mention filters
+        accepting 'include' (default), 'exclude' or 'only' (or 0/1 shortcuts).
+        The remaining parameters control the output and are mostly forwarded
+        to the underlying write.TextModeTrees(Html) printing block.
+        """
+        super().__init__(**kwargs)
+        self.continuous = self._convert(continuous)
+        self.almost_continuous = self._convert(almost_continuous)
+        self.treelet = self._convert(treelet)
+        self.forest = self._convert(forest)
+        self.almost_forest = self._convert(almost_forest)
+        self.oneword = self._convert(oneword)
+        self.singleton = self._convert(singleton)
+        self.empty = self._convert(empty)
+
+        self.max_trees = max_trees
+        self.html = html
+        self.shuffle = shuffle
+        if shuffle:
+            # Fixed seed, so repeated runs sample the same mentions.
+            random.seed(42)
+        self.print_other_forms = print_other_forms
+        # BUGFIX: the original lines ended with stray commas
+        # (self.print_total = print_total,) which stored 1-tuples, so
+        # print_total=False / print_should=False were always truthy and
+        # could never disable these features.
+        self.print_total = print_total
+        self.print_should = print_should
+        print_class = TextModeTreesHtml if html else TextModeTrees
+        self.print_block = print_class(
+            print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent,
+            minimize_cross=minimize_cross, color=color, attributes=attributes,
+            print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments,
+            mark=mark, hints=hints, layout=layout)
+
+    def _convert(self, value):
+        """Normalize a filter parameter to 'include'/'exclude'/'only'."""
+        if value in {'include', 'exclude', 'only'}:
+            return value
+        if value == 1:
+            return 'only'
+        if value == 0:
+            return 'exclude'
+        # BUGFIX: use an f-string, so a non-str value (e.g. 2) raises the
+        # intended ValueError instead of a TypeError from str concatenation.
+        raise ValueError(f'unknown value {value}')
+
+    def before_process_document(self, document):
+        self.print_block.before_process_document(document)
+
+    def after_process_document(self, document):
+        self.print_block.after_process_document(document)
+
+    def _ok(self, 
condition, value): + if value == 'include': + return True + return (condition and value == 'only') or (not condition and value=='exclude') + + def _is_auxiliary_etc(self, node): + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': + return True + if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: + return True + return False + + def _is_forest(self, mention, mwords, almost): + for w in mention.words: + # UD unfortunatelly does not use the copula-as-head style for copula construction, + # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children. + # However, in the cop-as-head stule, only "my" would depend on "fault" (and should be part of the mention). + # It is difficult to tell apart which w.children are related to w and which to the copula. + # We thus ignore these cases completely (we expect any child is potentially related to the copula). + if any(ch.udeprel == 'cop' for ch in w.children): + continue + for ch in w.children: + if ch not in mwords: + if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). 
+ # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). + if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + return True + + def _is_almost_continuous(self, mention): + if ',' not in mention.span: + return True + nonempty = [w for w in mention.words if not w.is_empty()] + if not nonempty: + return True + mwords = set(mention.words) + gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords] + for gap_node in gap_nodes: + if not gap_node.is_empty(): + return False + return True + + def process_document(self, doc): + mentions = [] + for entity in doc.coref_entities: + if self._ok(len(entity.mentions) == 1, self.singleton): + mentions.extend(entity.mentions) + if self.shuffle: + random.shuffle(mentions) + else: + mentions.sort() + + seen_trees = 0 + for mention in mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + continue + if not self._ok(',' not in mention.span, self.continuous): + continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): + continue + + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue + + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not 
self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue + + for w in mention.words: + w.misc['Mark'] = 1 + + seen_trees += 1 + if self.max_trees and seen_trees > self.max_trees: + if not self.print_total: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + return + else: + this_form = ' '.join([w.form for w in mention.words]) + print("# Mention = " + this_form) + if self.print_other_forms: + counter = Counter() + for m in mention.entity.mentions: + forms = ' '.join([w.form for w in m.words]) + if forms != this_form: + counter[forms] += 1 + if counter: + print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') + for form, count in counter.most_common(self.print_other_forms): + print(f' "{form}"({count})', end='') + print() + self.print_block.process_tree(mention.head.root) + for w in mention.words: + del w.misc['Mark'] + + if self.print_total: + if self.max_trees and seen_trees > self.max_trees: + print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}') + diff --git a/udapi/block/corefud/removemisc.py b/udapi/block/corefud/removemisc.py new file mode 100644 index 00000000..f132aaed --- /dev/null +++ b/udapi/block/corefud/removemisc.py @@ -0,0 +1,18 @@ +from udapi.core.block import Block +import re + +class RemoveMisc(Block): + """Deleting all temporary attributes after primary conversions""" + + def __init__(self, attrnames='', **kwargs): + """ Arg: attrnames = comma-separated list of Misc attributes to be deleted""" + super().__init__(**kwargs) + self.attrs4deletion = set(attrnames.split(',')) + + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if shortattrname in self.attrs4deletion: + del node.misc[attrname] + diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..4551873c --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. 
+ """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} diff --git a/udapi/block/corefud/singleparent.py b/udapi/block/corefud/singleparent.py new file mode 100644 index 00000000..ee9b1948 --- /dev/null +++ b/udapi/block/corefud/singleparent.py @@ -0,0 +1,47 @@ +"""If an empty node has multiple (enhanced-deps) parents, only the highest one is kept.""" +from udapi.core.block import Block +from collections import Counter +from udapi.core.node import find_minimal_common_treelet +import logging + +class SingleParent(Block): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._reasons = Counter() + + def process_tree(self, tree): + for empty in tree.empty_nodes: + self._reasons['_empty'] += 1 + if len(empty.deps) > 1: + self._reasons['_more-parents'] += 1 + parents = [d['parent'] for d in empty.deps] + nonempty_parents = [p for p in parents if not p.is_empty()] + if len(nonempty_parents) != len(parents): + self._reasons['empty-parent'] += 1 + #empty.misc['Mark'] = f"empty-parent:{empty.deps}" + logging.warning(f"Empty node {empty} has an empty parent.") + if not nonempty_parents: + empty.deps = [] + self._reasons['no-nonempty-parent'] += 1 + continue + (highest, added_nodes) = find_minimal_common_treelet(*nonempty_parents) + if highest in nonempty_parents: + self._reasons['one-governs'] += 1 + empty.deps = [d for d in empty.deps if d['parent'] is highest] + continue + nonempty_parents.sort(key=lambda n:n._get_attr('depth')) + if len(nonempty_parents)>1 and nonempty_parents[0]._get_attr('depth') == nonempty_parents[0]._get_attr('depth'): + self._reasons['same-depth'] += 1 + #empty.misc['Mark'] = f"same-depth:{empty.deps}" + else: + self._reasons['one-highest'] += 1 + #empty.misc['Mark'] = f"one-highest:{empty.deps}" + empty.deps = [d for d in empty.deps if d['parent'] is nonempty_parents[0]] + + def 
after_process_document(self, document): + message = "\n" + for k, v in self._reasons.most_common(): + message += f"{k}={v}\n" + #document.meta["bugs"] = message + logging.info(message) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py new file mode 100644 index 00000000..527159e9 --- /dev/null +++ b/udapi/block/corefud/stats.py @@ -0,0 +1,305 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class Stats(Block): + """Block corefud.Stats prints various coreference-related statistics.""" + + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, report_words_per_doc=False, report_entity_range=False, + selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', + exclude_singletons=False, exclude_nonsingletons=False, style='human', + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + highlight_docnames=None, + **kwargs): + super().__init__(**kwargs) + self.m_len_max = m_len_max + self.e_len_max = e_len_max + self.report_basics = report_basics + self.report_mentions = report_mentions + self.report_entities = report_entities + self.report_details = report_details + self.report_words_per_doc = report_words_per_doc + self.report_entity_range = report_entity_range + self.exclude_singletons = exclude_singletons + self.exclude_nonsingletons = exclude_nonsingletons + self.style = style + if style not in 'tex tex-table tex-doc human'.split(): + raise ValueError(f'Unknown style {style}') + self.per_doc = per_doc + self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc filename'.split(): + raise ValueError(f'Unknown style {style}') + self.docname = docname + self.docname_len = docname_len + self.highlight_docnames = highlight_docnames + self._header_printed = False + self._lines_printed = None + + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + 
self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + self.selected_upos = None if selected_upos == 'all' else selected_upos.split() + self.entity_ranges = [] + + def process_document(self, doc): + self.total_nodes += len(list(doc.nodes)) + self.counter['documents'] += 1 + node2docord, current_docord = {}, 0 + if self.report_entity_range: + for node in doc.nodes_and_empty: + node2docord[node] = current_docord + current_docord += 1 + + for entity in doc.coref_entities: + len_mentions = len(entity.mentions) + if len_mentions == 1: + self.singletons += 1 + if len_mentions == 1 and self.exclude_singletons: + continue + elif len_mentions > 1 and self.exclude_nonsingletons: + continue + if self.report_entity_range: + self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head]) + self.longest_entity = max(len_mentions, self.longest_entity) + self.counter['c_total_len'] += len_mentions + self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 + + self.entities += 1 + if not self.report_mentions and not self.report_details: + continue + for mention in entity.mentions: + self.mentions += 1 + all_words = len(mention.words) + non_empty = len([w for w in mention.words if not w.is_empty()]) + self.m_words += all_words + self.longest_mention = max(non_empty, self.longest_mention) + self.counter['m_total_len'] += non_empty + self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 + if self.report_details: + upos = 'other' + if not self.selected_upos or mention.head.upos in self.selected_upos: + upos = mention.head.upos + self.counter['m_head_upos_' + upos] += 1 + self.counter['m_with_empty'] += 1 if all_words > non_empty else 0 + self.counter['m_with_gaps'] += 1 if ',' in mention.span else 0 + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + 
self.counter['m_nontreelet'] += 1 if heads > 1 else 0 + + if self.report_basics: + doc_words = 0 + for tree in doc.trees: + self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) + if tree.newdoc: + self.counter['newdocs'] += 1 + if doc_words > self.counter['max_words_per_doc']: + self.counter['max_words_per_doc'] = doc_words + doc_words = 0 + doc_words += len(tree.descendants) + + def after_process_document(self, doc): + if self.per_doc: + self.process_end(skip=False, doc=doc) + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + self.entity_ranges = [] + + def process_end(self, skip=True, doc=None): + if not self._lines_printed: + self.print_header() + self._lines_printed = 0 + if self.per_doc: + if skip: + self.print_footer() + return + else: + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + if self.style.startswith('tex'): + if self.highlight_docnames and re.search(self.highlight_docnames, docname): + docname = r"\NEW " + docname + docname = docname.replace('_', r'\_') + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') + elif self.style.startswith('tex-'): + print(f"{self.counter['documents']:4} documents &") + self._lines_printed += 1 + + mentions_nonzero = 1 if self.mentions == 0 else self.mentions + entities_nonzero = 1 if self.entities == 0 else self.entities + total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes + + columns =[ ] + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:6,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:9,}"), + ('empty', f"{self.counter['empty']:7,}"),] + if self.report_words_per_doc: + columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"), + 
('words/doc', f"{self.counter['words']/self.counter['newdocs']:7,.0f}"),] + if self.report_entities: + columns += [('entities', f"{self.entities:7,}"), + ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), + ('longest_entity', f"{self.longest_entity:6}"), + ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] + if self.report_entity_range: + self.entity_ranges.sort() + percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0 + columns += [('entity_range_95percentile', f"{percentile:6,}"),] + for i in range(1, self.e_len_max + 1): + percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero + columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) + if self.report_mentions: + columns += [('mentions', f"{self.mentions:7,}"), + ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), + ('longest_mention', f"{self.longest_mention:6}"), + ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero + columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + if self.report_details: + columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"), + ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"), + ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}"),] + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + for upos in upos_list: + columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) + + if self.style.startswith('tex'): + print(" &".join(c[1] for c in columns), end=" \\\\\n") + elif self.style == 'human': + for c in columns: + 
print(f"{c[0]:>15} = {c[1].strip():>10}") + if not self.per_doc: + self.print_footer() + elif self._lines_printed > self.max_rows_per_page: + self.print_footer(False) + self._lines_printed = 0 + + def print_header(self): + if not self.style.startswith('tex-'): + return + if self.style == 'tex-doc': + if self._lines_printed is None: + print(r'\documentclass[multi=mypage]{standalone}') + print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}') + print(r'\title{Udapi coreference statistics}') + print(r'\begin{document}') + print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') + lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{text size} ' + lines[2] += r'& \MC{4}{total number of} ' + lines[3] += r'& docs & sents & words &empty n.' 
+ if self.report_words_per_doc: + lines[0] += "rr " + lines[1] += r'& & ' + lines[2] += r'&\MC{2}{words/doc}' + lines[3] += r'& max & avg ' + if self.report_entities: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.report_entity_range: + lines[0] += "r " + lines[1] += r'& ' + lines[2] += r'& range ' + lines[3] += r'& p95 ' + if self.e_len_max: + for i in range(1, self.e_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' + if self.report_mentions: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7 + if self.report_details: + lines[0] += "rrrr " + lines[1] += r'& \MC{3}{mention type} ' + lines[2] += r'&w/empty& w/gap&non-tree' + lines[3] += r'& [\%] ' * 3 + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + lines[0] += "@{~}r" * len(upos_list) + lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}" + lines[2] += ''.join(f'&{upos:7}' for upos in upos_list) + lines[3] += r'& [\%] ' * len(upos_list) + lines[0] += r'@{}}\toprule' + last_col = 1 + lines[1] += r'\\' + lines[2] += r'\\' + lines[3] += r'\\\midrule' + if self.report_basics: + lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}' + lines[2] += r'\cmidrule(lr){2-5}' + last_col += 4 + if 
self.report_words_per_doc: + lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}' + last_col += 2 + if self.report_entities: + _cols = 5 if self.report_entity_range else 5 + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += _cols + if self.e_len_max: + last_col += self.e_len_max + lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' + if self.report_mentions: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 + if self.m_len_max: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}' + last_col += self.m_len_max + 1 + if self.report_details: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}" + lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}' + print("\n".join(lines)) + + def print_footer(self, end_doc=True): + if not self.style.startswith('tex-'): + return + print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}') + if self.style == 'tex-doc' and end_doc: + print(r'\end{document}') diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py new file mode 100644 index 00000000..99e8a046 --- /dev/null +++ b/udapi/block/demo/complexity.py @@ -0,0 +1,268 @@ +"""demo.Complexity prints statistics on syntactic complexity. +""" +from udapi.core.basewriter import BaseWriter +from collections import deque + + +def non_punct(nodes): + return [n for n in nodes if n.upos != 'PUNCT'] + + +def is_np(node): + return node.upos in ("NOUN", "PROPN") or (node.upos == "PRON" and node.feats["PronType"] == "Prs" and not node.feats["Poss"]) + + +def is_vp(node): + """E.g. prosili, naléhali a žadonili => 1 coordinated verb phrase, head “prosili”. + + [POS == “VERB”, [deprel == “conj”, POS == “VERB”]], unique coordination heads + TODO: zahrnout i non-VERB? 
+ - vznikla a byla přijata(conj,ADJ,parent=vznikla) + - je(cop,AUX) nešťastný(ADJ) a nechá(conj,VERB,parent=nešťastný) se nalákat + - "podařilo se to a dokladem(ClauseHead,NOUN,conj,parent=podařilo) je(cop,AUX,parent=dokladem)" + - omezit se jen na (či využít) ClauseHead, nebo zahrnout i non-finite verbs (koordinace infinitivů či příčestí)? + "stihl(ClauseHead) napsat(VerbForm=Inf) a publikovat(VerbForm=Inf)" ... napsat ani publikovat nejsou ClauseHead + "rozhodl se ukončit a ukazuje(ClauseHead,parent=ukončit)" správně by mělo být parent=rozhodl, ale parser dělá chyby. + - Parsing vůbec dělá mnoho chyb v koordinacích, takže je vhodné podmínky velmi omezit. + """ + return node.upos == "VERB" or node.misc["ClauseHead"] + + +def is_relcl(node): + """Is a given node a head of a relative clause? + + Unfortunatelly, UDPipe 2.4 produces just acl instead of acl:relcl. + """ + if node.deprel == 'acl:relcl': + return True + return node.udeprel == 'acl' and any('Rel' in c.feats['PronType'] for c in node.children) + + +def is_postponed_nom_mod(node): + """Is a given node a postponed nominal modifier? + + Silvie: [(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)), child with higher word order than parent + [deprel != “conj”, POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)] + + TODO: Tohle hledá v češtině zcela běžné jevy jako "vznik díla". Nechceme hledat něco jiného? 
+ """ + return node.udeprel != 'conj' and is_np(node) and node.parent.precedes(node) and is_np(node.parent) + + +def is_postponed_adj_mod(node): + # TODO můžeme rozlišovat holý přívlastek ("písní ruských") a rozvitý ("milenec známý z pozdějšího zpracování") + return node.parent.precedes(node) and is_np(node.parent) and node.upos == 'ADJ' #and not node.children + + +def is_complex_nominal(node): + """[(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)) 2x descendant [deprel != “conj”]] + TODO: punct, case, cc a dep taky ignorovat? + TODO: opravdu descendants a ne children? (descendants snadno roste nad všechny meze, je-li tam třeba vedlejší věta) + TODO: beztak bude chtít odfiltrovat copuly: "Jádrem tvorby jsou sbírky." - Jádrem má 3 děti. + TODO: a nezvýšit ten limit z 2x aspoň na 3x? + """ + return is_np(node) and len([n for n in node.descendants if n.deprel not in ('conj', 'punct', 'case', 'cc', 'dep', 'cop')]) > 1 + + +def is_finite_clause_head(node): + """Is a given node a head of a finite clause? + + Silvie: [(POS == „VERB“ & feats:Verbform == „Fin“ | Verbform == „Part“} ) ] OR [(POS in {„ADJ“, „NOUN“, „PROPN“}, [child POS == „AUX“)]] + - POS == „VERB“ je zbytečné, protože VerbForm=Part je nastaveno i u ADJ ("je nucen" apod.) + - child POS == „AUX“ zase matchuje i např. na "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby." + - adjectivized predicates (převažující(VerbForm=Part) básně) by neměly být určeny jako clause_head + + * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl, + but for better robustness we include these as well. + * Similarly "dep" and "orphan" are mostly parsing errors. + * TODO: by uncommenting the nsubj/csubj line, we find few more real clause heads, but also some false positives. 
+ """ + # TODO appos + if ((node.udeprel in {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl', 'parataxis', 'amod', 'dep', 'orphan'} + and is_finite_verb(node)) + #or any(c.udeprel in {'nsubj', 'csubj'} for c in node.children) + or (any(c.udeprel == 'cop' for c in node.children) and node.udeprel != 'xcomp')): + return True + xcomp_child = next((c for c in node.children if c.udeprel == 'xcomp'), None) + return xcomp_child and any(c.udeprel == 'cop' for c in xcomp_child.children) + + +# TODO: zahrnout i: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf) +def is_finite_verb(node): + return (node.feats['VerbForm'] in {'Fin', 'Part'} and + (node.upos == 'VERB' or + node.upos == 'ADJ' and any(c.deprel == 'aux:pass' for c in node.children))) + + +def is_adjectivized_predicate(node): + """E.g. kouřící komín, zbitý kluk + + Silvie: [(POS == „ADJ“ & feats:VerbForm == „Part“), parent [POS in {„NOUN“, „PROPN“}] ] + - parent [POS in {„NOUN“, „PROPN“}] zamezí případům jako + "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" + "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" asi chyba parsingu, mělo být parent=myšlení? + - dotaz naopak matchuje na "způsob, jakým jsou popsány", proto přidávám podmínku not node.misc["ClauseHead"] + """ + return (node.feats["VerbForm"] == "Part" + and node.upos == "ADJ" + and (node.parent.upos in {"NOUN","PROPN"} or (node.udeprel == "conj" and node.parent.upos == "ADJ")) + and not node.misc["ClauseHead"]) + + +def is_controlled_predicate(node): + """E.g. Mohli jsme odejít i zůstat. + + TODO: Chceme zahrnout i druhý a další člen koordinace, např. "stihl napsat a publikovat", + tedy node.udeprel == "conj" and node.parent.udeprel == "xcomp"? 
+ """ + return node.deprel == "xcomp" + +class Complexity(BaseWriter): + + def __init__(self, matches=False, **kwargs): + super().__init__(**kwargs) + self.matches = matches + + + def report(self, category, groups, expand_type='no'): + if self.matches: + for group in groups: + self.print_match(category, group, expand_type) + else: + print("\t" + str(len(groups)), end='') + + + def expand_subtree(self, nodes, expand_type): + if expand_type == 'no': + return nodes + if len(nodes) > 1: + raise Exception("expanding more than one node not implemented yet") + if expand_type == 'subtree': + return nodes[0].descendants(add_self=True) + #if expand_type == 'subtree_except_conj': + #result = nodes + #for child in group.children: + #if child.udeprel != 'conj': + #result.extend(child.descendants(add_self=True)) + #return = sorted(result) + if expand_type == 'subtree_within_clause': + stack = [n for n in nodes[0].children if n.udeprel != 'conj'] + while stack: + node = stack.pop() + if not node.misc["ClauseHead"]: + nodes.append(node) + stack.extend(node.children()) + return sorted(nodes) + raise ValueError("unknown expand value " + expand_type) + + + def print_match(self, category, group, expand_type='no'): + nodes = self.expand_subtree(group, expand_type) + lemmas = " ".join(n.lemma for n in nodes) + tags = " ".join(n.upos for n in nodes) + n_tokens = str(len(non_punct(nodes))) + print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens])) + + + def get_main_clauses(self, root): + main_heads = [] + for main_head in root.children: + main_heads.append(main_head) + main_heads.extend(n for n in main_head.children if n.udeprel == 'conj') + return [[n] for n in main_heads] + + + def get_coord_phrase(self, root, phrase_type_function): + results = [] + for node in root.descendants: + if phrase_type_function(node): + conjuncts = [n for n in node.children if n.udeprel == 'conj' and phrase_type_function(n)] + if conjuncts: + conjunctions = [] + for conj in conjuncts: + # 
TODO multiword conjunctions (udeprel=flat)? + conjunctions.extend([n for n in conj.children if n.udeprel == 'cc']) + results.append(sorted([node] + conjuncts + conjunctions)) + return results + + # TODO koordinace hlavních i vedlejších vět + def get_t_units(self, main_heads): + results = [] + for main_head in main_heads: + main_clause = [main_head] + dep_heads = [] + stack = main_head.children + while stack: + node = stack.pop() + if node.misc["ClauseHead"]: + dep_heads.append(node) + else: + main_clause.append(node) + stack.extend(node.children) + main_clause = sorted(main_clause) + + for dep_clause_head in dep_heads: + results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) + return results + + # TODO complex t-unit má jinou definici: 3 klauze + def get_complex_t_units(self, root): + results = [] + for node in root.descendants: + if node.deprel != 'root' and node.misc["ClauseHead"]: # TODO: exclude the main clause? + results += self.get_t_units([node]) + return results + + + def process_tree(self, root): + print("# " + root.text) + + allnodes = root.descendants + depth, clause_depth = {0: 0}, {0: 0} + queue = deque(root.children) + clause_heads = [] + while queue: + node = queue.popleft() + depth[node.ord] = depth[node.parent.ord] + 1 + clause_depth[node.ord] = clause_depth[node.parent.ord] + if is_finite_clause_head(node): + node.misc['ClauseHead'] = 1 + clause_heads.append(node) + clause_depth[node.ord] += 1 + queue.extend(node.children) + max_depth = sorted(depth.values())[-1] + max_clause_depth = sorted(clause_depth.values())[-1] + + t_units = self.get_t_units([n for n in root.children if n.deprel == 'root']) + total_t_units_length = sum(len(t_unit) for t_unit in t_units) + mean_t_unit_length = total_t_units_length / (len(t_units) or 1) # TODO co reportovat, když věta nemá žádné t-units? 
+ + if not self.matches: + print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='') + + self.report("clauses", [[n] for n in clause_heads], 'subtree') + self.report("adjectivized_predicates", [[n] for n in allnodes if is_adjectivized_predicate(n)]) + self.report("controlled_predicates", [[n] for n in allnodes if is_controlled_predicate(n)]) + self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause') + self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp)) + self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np)) + self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET"))) + self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV")) + self.report("t-units", t_units) + self.report("complex_t-units", self.get_complex_t_units(root)) + # TODO: najde "básně a písně" a "rychtář a rychtářka" UDPipe kdovíproč určil jako ADV a ADV. Zkontrolovat, máme-li nejlepší možný UDPipe model. + self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause') + self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)]) + self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)]) + self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)]) + + if not self.matches: + # TODO: pro total koordinace asi nemá smysl reportovat matches, jen total count? 
+ self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True)) + + nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE'] + brackets = str(len([n for n in allnodes if n.form == '('])) + dashes = str(len([n for n in allnodes if n.form in '-–—―'])) # hyphen, en-dash, em-dash, horizonatal bar + colons = str(len([n for n in allnodes if n.form == ':'])) + semicolons = str(len([n for n in allnodes if n.form == ';'])) + print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons])) diff --git a/udapi/block/demo/newspeak.py b/udapi/block/demo/newspeak.py new file mode 100644 index 00000000..6be2caf5 --- /dev/null +++ b/udapi/block/demo/newspeak.py @@ -0,0 +1,66 @@ +"""demo.Newspeak block for 1984-like newspeak-ization of Czech. + +This is just a demo/draft. + +Usage: + $ echo 'Nejhorší žena je lepší než nejlepší muž.' | \ + udapy -q read.Sentences udpipe.Cs demo.Newspeak write.Sentences + Převelenedobrá žena je veledobrá než převeledobrý muž. +""" +from udapi.core.block import Block +from udapi.tool.morphodita import MorphoDiTa + +ANTONYMS = { + 'špatný': 'dobrý', + 'pomalý': 'rychlý', + # 'muž': 'žena', this does not work because xpos contains gender, + # we would also need to exploit the parsing and change gender of all congruent adj children. 
+} + + +class Newspeak(Block): + """Change all comparatives to vele-x and superlatives to převele-x.""" + + def __init__(self, morphodita_path='models/morphodita/cs/', + morphodita_model='czech-morfflex-131112.dict', + **kwargs): + """Create the PreVele block object.""" + super().__init__(**kwargs) + self.morphodita = MorphoDiTa(model=morphodita_path + morphodita_model) + + def process_tree(self, tree): + + # apply process_node on all nodes + super().process_tree(tree) + + # Capitalize if needed + first_node = tree.descendants[0] + if tree.text[0].isupper() and not first_node.form[0].isupper(): + first_node.form = first_node.form[0].upper() + first_node.form[1:] + + # Recompute the sentence string + tree.text = tree.compute_text() + + def process_node(self, node): + antonym = ANTONYMS.get(node.lemma) + if antonym is not None: + if node.xpos[11] == 'N': + if node.form.lower().startswith('ne'): + node.lemma = antonym + node.xpos = node.xpos[:10] + 'A' + node.xpos[11:] + node.form = node.form[2:] + else: + forms = self.morphodita.forms_of_lemma(antonym, node.xpos) + if forms: + node.lemma = antonym + node.xpos = node.xpos[:10] + 'N' + node.xpos[11:] + node.form = 'ne' + forms[0].form + + degree = node.feats["Degree"] + if degree in ("Sup", "Cmp"): + new_xpos = node.xpos[:9] + '1' + node.xpos[10:] + forms = self.morphodita.forms_of_lemma(node.lemma, new_xpos) + if forms: + new_form = "vele" if degree == "Cmp" else "převele" + new_form += forms[0].form + node.form = new_form diff --git a/udapi/block/eval/conll17.py b/udapi/block/eval/conll17.py index 12158e55..61e86383 100644 --- a/udapi/block/eval/conll17.py +++ b/udapi/block/eval/conll17.py @@ -25,7 +25,7 @@ For evaluating multiple systems and testsets (as in CoNLL2017) -stored in `systems/testset_name/system_name.conllu` you can use:: +stored in `systems/system_name/testset_name.conllu` you can use:: #!/bin/bash SYSTEMS=`ls systems` diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py new file mode 
100644 index 00000000..22f42a42 --- /dev/null +++ b/udapi/block/eval/conll18.py @@ -0,0 +1,337 @@ +r"""Block&script eval.Conll18 for evaluating LAS,UAS,etc as in CoNLL2018 UD shared task. + +This is a reimplementation of the CoNLL2018 shared task official evaluation script, +http://universaldependencies.org/conll18/evaluation.html + +The gold trees and predicted (system-output) trees need to be sentence-aligned +e.g. using `util.ResegmentGold`. +Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization. + +An example usage and output:: + + $ udapy read.Conllu zone=gold files=gold.conllu \ + read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll18 + Metric | Precision | Recall | F1 Score | AligndAcc + -----------+-----------+-----------+-----------+----------- + Words | 27.91 | 52.17 | 36.36 | 100.00 + UPOS | 27.91 | 52.17 | 36.36 | 100.00 + XPOS | 27.91 | 52.17 | 36.36 | 100.00 + Feats | 27.91 | 52.17 | 36.36 | 100.00 + Lemma | 27.91 | 52.17 | 36.36 | 100.00 + UAS | 16.28 | 30.43 | 21.21 | 58.33 + LAS | 16.28 | 30.43 | 21.21 | 58.33 + CLAS | 10.34 | 16.67 | 12.77 | 37.50 + + +For evaluating multiple systems and testsets (as in CoNLL2018) +stored in `systems/system_name/testset_name.conllu` you can use:: + + #!/bin/bash + SYSTEMS=`ls systems` + [[ $# -ne 0 ]] && SYSTEMS=$@ + set -x + set -e + for sys in $SYSTEMS; do + mkdir -p results/$sys + for testset in `ls systems/$sys`; do + udapy read.Conllu zone=gold files=gold/$testset \ + read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll18 print_results=0 print_raw=LAS \ + > results/$sys/${testset%.conllu} + done + done + python3 `python3 -c 'import udapi.block.eval.conll18 as x; print(x.__file__)'` -r 100 + +The last line executes this block as a script and computes bootstrap resampling with 100 resamples +(default=1000, it is recommended to keep the default or higher value unless testing the 
interface). +This prints the ranking and confidence intervals (95% by default) and also p-values for each +pair of systems with neighboring ranks. If the difference in LAS is significant +(according to a paired bootstrap test, by default if p < 0.05), +a line is printed between the two systems. + +The output looks like:: + + 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001 + ------------------------------------------------------------ + 2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001 + ------------------------------------------------------------ + 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001 + ------------------------------------------------------------ + 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001 + ------------------------------------------------------------ + 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001 + ------------------------------------------------------------ + 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001 + ------------------------------------------------------------ + 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002 + ------------------------------------------------------------ + 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001 + ------------------------------------------------------------ + 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003 + ------------------------------------------------------------ + 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448 + 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029 + ------------------------------------------------------------ + 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334 + 13. conll18-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003 + ------------------------------------------------------------ + 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062 + 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283 + 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121 + 17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445 + 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166 + 19. 
IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447 + 20. naistCL 67.49 ± 0.15 (67.34 .. 67.63) +""" +import argparse +import difflib +import logging +import os +import random +import sys +from collections import Counter +from udapi.core.basewriter import BaseWriter + +CONTENT = {'nsubj', 'obj', 'iobj', 'csubj', 'ccomp', 'xcomp', 'obl', 'vocative', 'expl', + 'dislocated', 'advcl', 'advmod', 'discourse', 'nmod', 'appos', 'nummod', 'acl', + 'amod', 'conj', 'fixed', 'flat', 'compound', 'list', 'parataxis', 'orphan', 'goeswith', + 'reparandum', 'root', 'dep'} +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} +UNIV_FEATS = {'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr', 'Gender', 'Animacy', + 'Number', 'Case', 'Definite', 'Degree', 'VerbForm', 'Mood', 'Tense', 'Aspect', + 'Voice', 'Evident', 'Polarity', 'Person', 'Polite'} + +class Conll18(BaseWriter): + """Evaluate LAS, UAS, MLAS and BLEX.""" + + def __init__(self, gold_zone='gold', print_raw=False, print_results=True, print_counts=False, + **kwargs): + """Args: + gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")? + print_raw - Print raw counts (pred, gold, aligned, correct) for each sentence. + This is useful for bootstrap resampling post-processing to get confidence intervals. + The parameter print_raw specifies a given metric + (UAS, LAS, MLAS, BLEX, UPOS, XPOS, Feats, Lemma) or is 0 (or False) by default. + print_results - Print a table with overall results after all document are processed. + print_counts - Print counts of correct/gold/system instead of prec/rec/f1 for all metrics. 
+ """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.total_count = Counter() + self.print_raw = print_raw + self.print_results = print_results + self.print_counts = print_counts + + def _ufeats(self, feats): + return '|'.join(sorted(x for x in feats.split('|') if x.split('=', 1)[0] in UNIV_FEATS)) + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + pred_forms = [n.form.lower() for n in pred_nodes] + gold_forms = [n.form.lower() for n in gold_nodes] + matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False) + aligned = [] + for diff in matcher.get_opcodes(): + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit == 'equal': + aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi])) + align_map, feats_match = {tree: gold_tree}, {} + for p_node, g_node in aligned: + align_map[p_node] = g_node + feats_match[p_node] = self._ufeats(str(p_node.feats)) == self._ufeats(str(g_node.feats)) + + count = Counter() + count['pred'] = len(pred_nodes) + count['gold'] = len(gold_nodes) + count['Words'] = len(aligned) + count['pred_cont'] = len([n for n in pred_nodes if n.udeprel in CONTENT]) + count['gold_cont'] = len([n for n in gold_nodes if n.udeprel in CONTENT]) + count['alig_cont'] = len([n for _, n in aligned if n.udeprel in CONTENT]) + + for p_node, g_node in aligned: + count['UPOS'] += 1 if p_node.upos == g_node.upos else 0 + count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0 + count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0 + count['UFeats'] += 1 if feats_match[p_node] else 0 + if feats_match[p_node] and p_node.upos == g_node.upos and p_node.xpos == g_node.xpos: + count['AllTags'] += 1 + if align_map.get(p_node.parent) == g_node.parent and not p_node.misc['Rehanged']: + count['UAS'] += 1 + if p_node.udeprel == g_node.udeprel: + 
count['LAS'] += 1 + if g_node.udeprel in CONTENT: + count['CLAS'] += 1 + if g_node.lemma == '_' or g_node.lemma == p_node.lemma: + count['BLEX'] += 1 + if self._morpho_match(p_node, g_node, align_map, feats_match): + if not p_node.misc['FuncChildMissing']: + count['MLAS'] += 1 + self.total_count.update(count) + + if self.print_raw: + if self.print_raw in {'CLAS', 'BLEX', 'MLAS'}: + scores = [str(count[s]) for s in ('pred_cont', 'gold_cont', 'alig_cont', + self.print_raw)] + else: + scores = [str(count[s]) for s in ('pred', 'gold', 'Words', self.print_raw)] + print(' '.join(scores)) + + def _morpho_match(self, p_node, g_node, align_map, feats_match): + if p_node.upos != g_node.upos or not feats_match[p_node]: + return False + p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL and not c.misc['Rehanged']] + g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] + if len(p_children) != len(g_children): + return False + for p_child, g_child in zip(p_children, g_children): + if align_map.get(p_child) != g_child: + return False + if p_child.udeprel != g_child.udeprel: + return False + if p_child.upos != g_child.upos or not feats_match[p_child]: + return False + return True + + def process_end(self): + if not self.print_results: + return + + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + metrics = ('Words', 'UPOS', 'XPOS', 'UFeats', 'AllTags', + 'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX') + if self.print_counts: + print("Metric | Correct | Gold | Predicted | Aligned") + else: + print("Metric | Precision | Recall | F1 Score | AligndAcc") + print("-----------+-----------+-----------+-----------+-----------") + for metric in metrics: + correct = self.total_count[metric] + if metric in {'CLAS', 'BLEX', 'MLAS'}: + pred, gold = self.total_count['pred_cont'], self.total_count['gold_cont'] + alig = self.total_count['alig_cont'] + else: + pred, gold = self.total_count['pred'], 
self.total_count['gold']
+                alig = self.total_count['Words']
+            if self.print_counts:
+                print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
+                    metric, correct, gold, pred, alig))
+            else:
+                precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig)
+                alignacc = "{:10.2f}".format(100 * alignacc) if metric != 'Words' else ""
+                print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
+                    metric, 100 * precision, 100 * recall, 100 * fscore, alignacc))
+
+
+def prec_rec_f1(correct, pred, gold, alig=0):
+    precision = correct / pred if pred else 0
+    recall = correct / gold if gold else 0
+    alignacc = correct / alig if alig else 0
+    fscore = 2 * correct / (pred + gold) if pred + gold else 0
+    return precision, recall, fscore, alignacc
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dir_results", "-d", default="results", help="directory with results")
+    parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples")
+    parser.add_argument("--confidence", "-c", default=95, type=float, help="use x-percent confidence interval")
+    parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets")
+    parser.add_argument("--systems", "-s", default='all', help="comma-separated systems")
+    parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time")
+    args = parser.parse_args()
+    res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence
+    alpha = (1 - conf/100) / 2
+    index_lo = int(alpha * (resamples - 1))
+    index_hi = resamples - 1 - index_lo
+    index_mid = int(resamples / 2)
+    if args.systems == 'all':
+        systems = os.listdir(res_dir)
+    else:
+        systems = args.systems.split(',')
+    if args.tests == 'all':
+        tests = set()
+        for system in systems:
+            tests.update(os.listdir(res_dir + '/' + system))
+        tests = sorted(tests)
+    else:
+        tests = args.tests.split(',')
+    if args.randseed:
+        random.seed(args.randseed)
+    results = []
+
+    print('Loading...', file=sys.stderr)
+    
for system in systems: + sys_results = [] + results.append(sys_results) + for i_test, test in enumerate(tests): + filename = '/'.join((res_dir, system, test)) + try: + with open(filename) as res_file: + sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file]) + except FileNotFoundError: + logging.warning(filename + ' not found') + samples = len(sys_results) + + print('Resampling...', file=sys.stderr) + boot_results = [] + for i_resample in range(resamples): + print(i_resample + 1, file=sys.stderr, end='\r') + resample_results = [] + boot_results.append(resample_results) + for i_system in range(len(systems)): + pred, gold, words, correct = ([0] * len(tests) for _ in range(4)) + for _ in range(samples): + i_test, pre, gol, wor, corr = random.choice(results[i_system]) + pred[i_test] += pre + gold[i_test] += gol + words[i_test] += wor + correct[i_test] += corr + fscore_sum = 0 + for i_test in range(len(tests)): + _prec, _rec, fscore, _aligacc = prec_rec_f1(correct[i_test], pred[i_test], gold[i_test]) + fscore_sum += fscore + resample_results.append(fscore_sum / len(tests)) + print('\n', file=sys.stderr) + + sys_fscores = [] + for i_system, system in enumerate(systems): + sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)]) + final_results = [] + sys_sys_wins = [[0] * len(systems) for x in range(len(systems))] + for i_system, system in enumerate(systems): + for j_system in range(i_system): + for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]): + if i > j: + sys_sys_wins[i_system][j_system] += 1 + elif i < j: + sys_sys_wins[j_system][i_system] += 1 + fscores = sorted(sys_fscores[i_system]) + final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]]) + + sorted_systems = sorted(final_results, key=lambda x: -x[1]) + for rank, sys_results in enumerate(sorted_systems): + i_system, f1_mid, f1_lo, f1_hi = sys_results + if rank < len(systems) - 1: + j_worse_sys = 
sorted_systems[rank + 1][0] + p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1) + p_str = " p=%.3f" % p_value + else: + p_value, p_str = 1, "" + print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" % + (rank + 1, systems[i_system], + 100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str)) + if p_value < (1 - conf/100): + print('-' * 60) + + +if __name__ == "__main__": + main() diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index a4f93a1b..e4889770 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -110,11 +110,34 @@ def process_tree(self, tree): return self.visited_zones[tree.zone] += 1 - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] - common = find_lcs(pred_tokens, gold_tokens) - - if self.focus is not None: + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants] + + # lcs("abc", "acb") can be either "ab" or "ac". + # We want to prefer the LCS with the highest number of non-focused tokens. + # E.g. if focus="," then lcs("a,c", "ac,") should be "ac" and the comma should be evaluated + # as non-aligned, i.e. eval.F1 should return precision=recall=f1=0 for this sentence. 
+ if self.focus is None: + common = find_lcs(pred_tokens, gold_tokens) + else: + nf_pred_tokens = [x for x in pred_tokens if not self.focus.fullmatch(x)] + nf_gold_tokens = [x for x in gold_tokens if not self.focus.fullmatch(x)] + nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) + i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] + while i < len(pred_tokens) and j < len(gold_tokens): + if c == len(nf_common): + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) + break + while nf_common[c] != pred_tokens[i]: + un_pred.append(pred_tokens[i]) + i += 1 + while nf_common[c] != gold_tokens[j]: + un_gold.append(gold_tokens[j]) + j += 1 + common += find_lcs(un_pred, un_gold) + un_pred, un_gold = [], [] + while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: + i, j, c = i+1, j+1, c+1 common = [x for x in common if self.focus.fullmatch(x)] pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] @@ -133,6 +156,13 @@ def process_tree(self, tree): self._pred[x] += 1 self._total[x] += 1 + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) @@ -172,22 +202,29 @@ def process_end(self): # difflib.SequenceMatcher does not compute LCS, so let's implement it here -# TODO: make faster by trimming common prefix and sufix def find_lcs(x, y): """Find longest common subsequence.""" m, n = len(x), len(y) - C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m + 1): - for j in range(1, n + 1): - C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) - index = C[m][n] - lcs = [None] * index - while m > 0 and n > 0: - if 
x[m - 1] == y[n - 1]: - lcs[index - 1] = x[m - 1] - m, n, index = m - 1, n - 1, index - 1 - elif C[m - 1][n] > C[m][n - 1]: - m -= 1 - else: - n -= 1 - return lcs + if m == 0 or n == 0: + return [] + elif x[0] == y[0]: + i = 1 + while i < min(m, n) and x[i] == y[i]: + i += 1 + return x[:i] + (find_lcs(x[i:], y[i:]) if i < min(m, n) else []) + else: + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m - 1] == y[n - 1]: + lcs[index - 1] = x[m - 1] + m, n, index = m - 1, n - 1, index - 1 + elif C[m - 1][n] > C[m][n - 1]: + m -= 1 + else: + n -= 1 + return lcs diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and adposition. +""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? 
+ 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 'dovnitř+Gen': 'Ill', + 'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? 
+ 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným + 'místo+Loc': 'Sbs', # annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 
'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 'v_jméno+Gen': 'Pur', + 'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', 
+ 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 'kontra+Nom': 'Adv', + 'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 
+        'pakliže': 'Cnd',
+        'pokud': 'Cnd',
+        'zda': 'Cnd',
+        # coordinators
+        'a': 'Conj',
+        'i': 'Conj',
+        'ani': 'Nnor',
+        'nebo': 'Disj',
+        'či': 'Disj',
+        'ale': 'Advs',
+        'avšak': 'Advs',
+        'však': 'Advs',
+        'nýbrž': 'Advs',
+        'neboť': 'Reas',
+        'tedy': 'Cnsq',
+        'tak': 'Cnsq'
+    }
+
+    def process_node(self, node):
+        """
+        Derives a case value from preposition and morphological case. Stores it
+        as MSFCase in MISC.
+        """
+        # Do not do anything for function words.
+        # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes)
+        # because their Case is only agreement feature inherited from the head noun.
+        if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']:
+            node.misc['MSFFunc'] = 'Yes'
+            return
+        elif node.udeprel in ['det', 'amod']:
+            node.misc['MSFFunc'] = 'No'
+            return
+        else:
+            node.misc['MSFFunc'] = 'No'
+        # Get all case markers (adpositions) attached to the current node.
+        adpositions = []
+        for c in node.children:
+            if c.udeprel == 'case':
+                lemma = c.lemma
+                # If it has outgoing 'fixed' relations, it is a multiword adposition.
+                fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed']
+                if fixedchildren:
+                    lemma += '_' + '_'.join(fixedchildren)
+                adpositions.append(lemma)
+        # We assume that all features were copied from FEATS to MISC in mwe.MsfInit.
+        # They may have been further processed there, so we take the input from there.
+        msfcase = node.misc['MSFCase']
+        if adpositions:
+            adpostring = '_'.join(adpositions)
+            caseadpostring = adpostring + '+' + msfcase
+            if caseadpostring in self.adposmap:
+                msfcase = self.adposmap[caseadpostring]
+            else:
+                # logging.warn() is a deprecated alias; logging.warning() is the
+                # documented API and avoids a DeprecationWarning.
+                logging.warning(f"No Case value found for '{caseadpostring}'.")
+                msfcase = caseadpostring
+        # Omer wants to collect cases from both adpositions and subordinators
+        # but we will consider subordinators only if we do not have any case
+        # from morphology or adpositions.
+ if not msfcase: + subordinators = [] + for c in node.children: + if c.udeprel == 'mark': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + subordinators.append(lemma) + if subordinators: + subordstring = '_'.join(subordinators) + if subordstring in self.adposmap: + msfcase = self.adposmap[subordstring] + # To lump coordinators with all the above makes even less sense but for + # the moment we do it. + if not msfcase: + coordinators = [] + for c in node.children: + if c.udeprel == 'cc': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + coordinators.append(lemma) + if coordinators: + coordstring = '_'.join(coordinators) + if coordstring in self.adposmap: + msfcase = self.adposmap[coordstring] + node.misc['MSFCase'] = msfcase diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py new file mode 100644 index 00000000..fbdf73e5 --- /dev/null +++ b/udapi/block/msf/createabstract.py @@ -0,0 +1,45 @@ +""" +Morphosyntactic features (UniDive): +Create abstract nodes representing dropped arguments of predicates (if verbal +morphology signals that the subject is third person singular, and there is no +subject node, create an abstract node and copy the features there). +""" +from udapi.core.block import Block +import re + +class CreateAbstract(Block): + + def process_node(self, node): + """ + If a node has MSFVerbForm=Fin and at least one of the agreement features + MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these + features characterize the subject (this block is not suitable for languages + with polypersonal agreement). Check that the subject is present. 
If not, + create an abstract node to represent it. + """ + if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]): + # Current node is a finite predicate. Does it have a subject? If not, create an abstract one. + if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]): + # There could already be an abstract subject. We have to look for it in the enhanced graph. + if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]): + # Create an abstract subject. + subject = node.create_empty_child('nsubj') + subject.upos = 'PRON' + subject.feats['PronType'] = 'Prs' + subject.misc['MSFPronType'] = 'Prs' + subject.feats['Case'] = 'Nom' + subject.misc['MSFCase'] = 'Nom' + for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']: + msf = 'MSF' + f + if node.misc[msf]: + subject.feats[f] = node.misc[msf] + subject.misc[msf] = node.misc[msf] + subject.misc['MSFFunc'] = 'No' + # Regardless of whether it had a subject or not, the agreement features + # should be removed from the verb. + ###!!! We also may want to check if the pre-existing subject has all the features. + node.misc['MSFNumber'] = '' + node.misc['MSFPerson'] = '' + node.misc['MSFGender'] = '' + node.misc['MSFAnimacy'] = '' + node.misc['MSFPolite'] = '' diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py new file mode 100644 index 00000000..ceca12af --- /dev/null +++ b/udapi/block/msf/init.py @@ -0,0 +1,53 @@ +""" +Morphosyntactic features (UniDive): +Initialization. Copies features from FEATS as MSF* attributes to MISC. +""" +from udapi.core.block import Block +import re + +class Init(Block): + + + def process_node(self, node): + """ + For every feature in FEATS, creates its MSF* counterpart in MISC. + """ + for f in node.feats: + # Only selected features will be copied. Certain features are not + # interesting for the morphosyntactic annotation. 
+ if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']: + node.misc['MSF'+f] = node.feats[f] + # We are particularly interested in the Case feature but some nominals + # lack it (e.g. acronyms or numbers). If there is a preposition, it may + # indicate the expected case of the nominal. + if not node.feats['Case']: + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + node.misc['MSFCase'] = adpositions[0].feats['Case'] + # If we did not find a preposition to help us, we may be able to read + # the case off an adjectival modifier or determiner. + if not node.misc['MSFCase']: + modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']] + if modifiers: + node.misc['MSFCase'] = modifiers[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if not node.misc['MSFCase']: + if node.udeprel == 'nsubj': + node.misc['MSFCase'] = 'Nom' + elif node.udeprel == 'obj': + node.misc['MSFCase'] = 'Acc' + # If the node contains Phrase features in MISC (periphrastic verb forms + # detected by Lenka's code), replace the MS features with them. + phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)] + for pf in phrasefeatures: + msf = pf + if msf == 'PhraseForm': + msf = 'MSFVerbForm' + else: + msf = re.sub(r"Phrase", 'MSF', pf) + node.misc[msf] = node.misc[pf] + node.misc[pf] = '' diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py new file mode 100644 index 00000000..22f68c9d --- /dev/null +++ b/udapi/block/msf/numphrase.py @@ -0,0 +1,36 @@ +""" +Morphosyntactic features (UniDive): +Case in Number Phrases like 'pět mužů' (five men) in Czech. 
+""" +from udapi.core.block import Block + +class NumPhrase(Block): + + + def process_node(self, node): + """ + Nouns with a 'nummod:gov' dependent are morphologically in genitive, + but the case of the whole phrase (number + counted noun) is different, + probably nominative or accusative. + """ + quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']] + current_case = node.misc['MSFCase'] + if (current_case == 'Gen' or current_case == '') and quantifiers: + quantifier_case = quantifiers[0].misc['MSFCase'] + # The quantifier may lack the case feature (e.g. numbers expressed by digits) + # but we may be able to guess it from a preposition or other factors. + if quantifier_case == '': + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + quantifier_case = adpositions[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if quantifier_case == '': + if node.udeprel == 'nsubj': + quantifier_case = 'Nom' + elif node.udeprel == 'obj': + quantifier_case = 'Acc' + node.misc['MSFCase'] = quantifier_case diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py new file mode 100644 index 00000000..cf5a8f81 --- /dev/null +++ b/udapi/block/msf/phrase.py @@ -0,0 +1,168 @@ +""" +Morphosyntactic features (UniDive): +An abstract block as a base for derivation of blocks that discover periphrastic +verb forms and save them as Phrase features in MISC. This block provides the +methods that save the features in MISC. It is based on the Writer module by +Lenka Krippnerová. 
+"""
+from udapi.core.block import Block
+import logging
+
+class Phrase(Block):
+
+    def __init__(self, feature_prefix='CW', **kwargs):
+        """
+        Parameters:
+        feature_prefix (string) - The prefix of phrase features (e. g. 'CW', 'Phrase'), default is 'CW'
+        """
+        super().__init__(**kwargs)
+        self.feature_prefix = feature_prefix
+
+        self.dictionary = {
+            'person': f'{feature_prefix}Person',
+            'number': f'{feature_prefix}Number',
+            'mood': f'{feature_prefix}Mood',
+            'tense': f'{feature_prefix}Tense',
+            'voice': f'{feature_prefix}Voice',
+            'aspect':f'{feature_prefix}Aspect',
+            'form': f'{feature_prefix}Form',
+            'reflex': f'{feature_prefix}Reflex',
+            'polarity': f'{feature_prefix}Polarity',
+            'gender': f'{feature_prefix}Gender',
+            'animacy': f'{feature_prefix}Animacy',
+            'ords': feature_prefix,
+            'expl': f'{feature_prefix}Expl',
+            'analytic': 'Analytic',
+        }
+
+        # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation
+        # we do not want to include these negative particles in the phrase; these are expressions like "never", etc.
+        self.negation_fixed = {
+            # Belarusian
+            'ні' : ['раз'],
+            'ня' : ['толькі'],
+
+            # Upper Sorbian
+            'nic' : ['naposledku'],
+
+            # Polish
+            'nie' : ['mało'],
+
+            # Pomak
+            'néma' : ['kak'],
+
+            # Slovenian
+            'ne' : ['le'],
+
+            # Russian and Old East Slavic
+            'не' : ['то', 'токмо'],
+            'ни' : ['в', 'раз', 'шатко'],
+            'нет' : ['нет']
+        }
+
+    def process_node(self, node):
+        """
+        Override this in a derived class!
+        """
+        logging.fatal('process_node() not implemented.')
+
+
+
+    def write_node_info(self, node,
+                        tense = None,
+                        person = None,
+                        number = None,
+                        mood = None,
+                        voice = None,
+                        form = None,
+                        reflex = None,
+                        polarity = None,
+                        ords = None,
+                        gender = None,
+                        animacy = None,
+                        aspect = None,
+                        expl=None,
+                        analytic=None):
+        arguments = locals()
+        del arguments['self'] # delete self and node from arguments,
+        del arguments['node'] # we want only grammatical categories
+        for key,val in arguments.items():
+            # PEP 8: compare against the None singleton with 'is not', not '!='.
+            if val is not None:
+                node.misc[self.dictionary[key]] = val
+
+    def has_fixed_children(self, node):
+        """
+        Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed.
+        """
+        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
+
+        if fixed_children:
+            if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []):
+                return True
+        return False
+
+    def get_polarity(self, nodes):
+        """
+        Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes.
+        Returns an empty string if there are zero or more than one such nodes.
+        """
+        neg_count = 0
+        for node in nodes:
+            if node.feats['Polarity'] == 'Neg':
+                neg_count += 1
+
+        if neg_count == 1:
+            return 'Neg'
+
+        # neg_count can be zero or two, in either case we want to return an empty string so that the PhrasePolarity attribute is not generated
+        else:
+            return ''
+
+    def get_negative_particles(self, nodes):
+        """
+        Returns a list of all negative particles found among the children
+        of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed.
+ """ + neg_particles = [] + for node in nodes: + neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)] + if neg: + neg_particles += neg + return neg_particles + + + def get_is_reflex(self,node,refl): + if node.feats['Voice'] == 'Mid': + return 'Yes' + if len(refl) == 0: + return node.feats['Reflex'] + return 'Yes' + + def get_expl_type(self,node, refl): + if node.feats['Voice'] == 'Mid': + return 'Pv' + if not refl: + return '' + if refl[0].deprel == 'expl': + return 'Pv' + return refl[0].deprel.split(':')[1].capitalize() + + def is_expl_pass(self,refl): + if len(refl) == 0: + return False + return refl[0].deprel == 'expl:pass' + + def get_voice(self,node,refl): + voice = node.feats['Voice'] + if self.is_expl_pass(refl): + return 'Pass' + return voice + + def get_analytic_bool(self,node): + auxes = [x for x in node.children if x.udeprel == 'aux'] + + if auxes: + return 'Yes' + else: + return 'No' + diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py new file mode 100644 index 00000000..e169a2de --- /dev/null +++ b/udapi/block/msf/removefunc.py @@ -0,0 +1,17 @@ +""" +Morphosyntactic features (UniDive): +Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes). +""" +from udapi.core.block import Block + +class RemoveFunc(Block): + + + def process_node(self, node): + """ + Removes MSF* features if MSFFunc=Yes. 
+ """ + if node.misc['MSFFunc'] == 'Yes': + msfeats = [x for x in node.misc if x.startswith('MSF')] + for msf in msfeats: + node.misc[msf] = '' diff --git a/udapi/block/msf/romance/preprocessor.py b/udapi/block/msf/romance/preprocessor.py new file mode 100644 index 00000000..ad7aec1e --- /dev/null +++ b/udapi/block/msf/romance/preprocessor.py @@ -0,0 +1,20 @@ +from udapi.core.block import Block + +class Preprocessor(Block): + + + def process_node(self, node): + + # In Porttinari treebank, the negative adverb não is not marked with feat Polarity=Neg + if node.lemma == 'não' and node.upos == 'ADV': + node.feats['Polarity'] = 'Neg' + + if node.upos == 'ADV' and node.feats['PronType'] == 'Neg': + node.feats['PronType'] = '' + node.feats['Polarity'] = 'Neg' + + # In Romanian RRT treebank, there is no annotation of the voice feature + # Automatically assign passive voice + pass_auxes = [x for x in node.children if x.deprel == 'aux:pass'] + if pass_auxes: + node.feats['Voice'] = 'Pass' \ No newline at end of file diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py new file mode 100644 index 00000000..ed05fa89 --- /dev/null +++ b/udapi/block/msf/romance/romance.py @@ -0,0 +1,965 @@ +import udapi.block.msf.phrase +from enum import Enum + +AUXES_HAVE = ['ter', 'haber', 'avere'] +AUXES_BE = ['estar', 'essere'] +MODALS = ['poder', 'deber', 'querer', 'saber', # Spanish + Portuguese + 'potere', 'dovere', 'volere', 'sapere'] # Italian + +class Aspect(str, Enum): + ANT = 'Ant' + IMP = 'Imp' + IMPPROG = 'ImpProg' + PERF = 'Perf' + PERFPROG = 'PerfProg' + PROG = 'Prog' + PQP = 'Pqp' + PQPPROG = 'PqpProg' + +class Tense(str, Enum): + FUT = 'Fut' + FUTFUT = 'FutFut' + PAST = 'Past' + PASTFUT = 'PastFut' + PASTPRES = 'PastPres' + PRES = 'Pres' + +class Romance(udapi.block.msf.phrase.Phrase): + + def __init__(self, neg=True, **kwargs): + """ + Parameters: + neg (bool) - If True, process negation and generate the PhrasePolarity=Neg attribute. 
+ feature_prefix (string) - The prefix of phrase features (e. g. 'CG', 'Phrase'), default is 'CG' + """ + super().__init__(**kwargs) + self.neg = neg + + def process_node(self, node): + + if node.misc[self.feature_prefix] != '': + return + + cop = [x for x in node.children if x.udeprel == 'cop'] + + # only expl or expl:pv, no expl:impers or expl:pass + refl = [x for x in node.children if (x.lemma == 'se' or x.lemma == 'soi') and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass'] + + if refl: + expl='Pv' + else: + expl=None + + if cop: + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modal verbs among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + adp = [x for x in node.children if x.upos == 'ADP'] + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(cop[0], auxes, expl, polarity, phrase_ords, node) + else: + # no auxiliaries, only cop + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_copulas(node, cop, expl, polarity, phrase_ords) + return + + if node.upos == 'VERB': #TODO maybe add "or node.feats['VerbForm'] == 'Part'"? 
+ + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modals among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # infinitive with a subject is a subjunctive + subj = [x for x in node.children if x.udeprel == 'subj'] + if node.feats['VerbForm'] == 'Inf' and subj: + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + mood='Sub', + form='Fin', + tense=Tense.FUT.value, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + analytic=self.get_analytic_bool(node), + ords=[node.ord] + ) + return + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if not auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node) + + + else: + # no passive auxiliaries + if not aux_pass: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(node, auxes, expl, polarity, phrase_ords, node) + + # head verb has only passive auxiliary and no more other auxiliaries + elif not auxes_without_pass: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + 
if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # TODO phrase-level features are currently determined based on the first passive auxiliary, but it can happen that there are more than one passive auxiliary + self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node) + + # head verb has passive auxiliary and also other auxiliaries + else: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, expl, polarity, phrase_ords, node) + + def find_auxes_and_neg(self, node): + """ + Find all auxiliaries and negative adverbials among node.children and classifies them. + + Parameters: + node (udapi.core.node.Node): head word, look for auxiliaries in its children + + Returns: + tuple: a classification of auxiliaries consisting of: + - auxiliaries directly modifying the node, + - negative adverbs modifying the node, + - modal verbs, + - auxiliaries modifying a modal verb, + - negative adverbs modifying a modal verb. 
+ """ + + node_auxes = [] + node_neg = [] + modals = [] + modal_auxes = [] + modal_neg = [] + + for child in node.children: + if child.udeprel == 'aux': + if child.lemma in MODALS: + modals.append(child) + modal_auxes = node_auxes # auxiliaries found so far are assumed to modify the modal verb (they come before it) + node_auxes = [] + + modal_neg = node_neg + node_neg = [] + + else: + node_auxes.append(child) + + elif child.upos == 'ADV' and child.feats['Polarity'] == 'Neg': + node_neg.append(child) + + return node_auxes, node_neg, modals, modal_auxes, modal_neg + + def process_modal_verbs(self, modals, modal_auxes, modal_neg): + """ + Annotates modal verb forms with the Phrase* attributes. + The modal verbs are kept as a single verb form, without including the infinitive of the content word. + + Parameters: + modals (list): all modal verbs among the children of the head content verb (currently assumes there is only one.) + modal_auxes (list): auxiliaries of the modal verb(s) + modal_neg (list): negative adverbs of the modal verb(s) + + """ + if not modal_auxes: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [n.ord for n in modal_neg] + phrase_ords.sort() + + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0]) + self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0]) + + else: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + [n.ord for n in modal_neg] + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0]) + + def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node): + aspect = '' + tense = node.feats['Tense'] + + # phrase already annotated + if 
head_node.misc[self.feature_prefix] != '': + return + + xcomps = [x for x in node.children if x.udeprel == 'xcomp'] + if node.lemma in ['ir', 'aller', 'estar', 'ter'] and node.upos == 'VERB' and xcomps: + node.misc['PeriAux'] = 'Yes' + + voice = node.feats['Voice'] + auxes = [x for x in xcomps[0].children if x.udeprel == 'aux'] + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # European Portuguese: estar + a + Inf + if node.lemma == 'estar': + + if node.feats['Tense'] == 'Pres': + tense=Tense.PRES.value + aspect =Aspect.PROG.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + elif node.lemma == 'ter' and len(xcomps) > 1: + tense=Tense.PAST.value + aspect=Aspect.PROG.value + xcomps[0].misc['PeriAux'] = 'Yes' + + elif node.feats['Tense'] == 'Pres': + tense=Tense.FUT.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + if auxes_without_pass: + if auxes[0].lemma == 'estar': + aspect += 'Prog' + if auxes[0].lemma == 'haber': + aspect += 'Perf' + + + + adp_a = [x for x in xcomps[-1].children if x.lemma == 'a' and x.udeprel == 'mark'] + cop = [x for x in xcomps[0].children if x.udeprel == 'cop'] + phrase_ords = [node.ord] + [x.ord for x in xcomps] + [x.ord for x in auxes] + [x.ord for x in cop] + if adp_a: + phrase_ords += [x.ord for x in adp_a] + + if aux_pass: + voice='Pass' + + phrase_ords.sort() + + self.write_node_info(xcomps[-1], + tense = tense, + number = node.feats['Number'], + person = node.feats['Person'], + aspect = aspect, + mood = 
node.feats['Mood'], + form = 'Fin', + voice=voice, + expl = expl, + polarity = polarity, + analytic='Yes', + ords=phrase_ords) + return + + def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node): + """ + Annotate simple verb forms or passive verb forms that contain only a passive auxiliary. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words of the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. + """ + + if node.misc['PeriAux'] != '': + return + + # Portuguese + # presente -> PhraseTense=Pres, PhraseAspect='' + # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro simple -> PhraseTense=Fut, PhraseAspect='' + + # Italian + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro semplice -> PhraseTense=Fut, PhraseAspect='' + + aspect = '' + tense = node.feats['Tense'] + form = node.feats['VerbForm'] + + if node.feats['Mood'] == 'Ind': + + # Portuguese + # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf + + # Spanish + # pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # pass remoto -> PhraseTense=Past, PhraseAspect=Perf + elif node.feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + + # Portuguese + # pretérito mais que perfeito 
simples -> PhraseTense=Past, PhraseAspect=Pqp + elif node.feats['Tense'] == 'Pqp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + else: + # viitorul popular/colocvial (intentional future) -> PhraseTense=Fut, PhraseAspect='' + o = [x for x in node.children if x.lemma == 'o' and x.upos == 'PART'] + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + + if o and sa: + tense = Tense.FUT.value + phrase_ords.append(o[0].ord) + phrase_ords.append(sa[0].ord) + + phrase_ords.sort() + + + + # Portuguese + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks? + + # Italian + # Congiuntivo presente -> PhraseTense=Pres, PhraseAspect='' + if node.feats['Mood'] == 'Sub': + + if node.feats['Tense'] == 'Past': + aspect=Aspect.IMP.value + + # Portuguese + # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + if node.feats['Mood'] == 'Cnd': + aspect='' + tense=Tense.PRES.value + + adp_en = [x for x in head_node.children if x.upos == 'ADP' and x.lemma == 'en' and x.udeprel == 'mark'] + if node.feats['VerbForm'] == 'Part' and adp_en: + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form = 'Ger' + + + self.write_node_info(head_node, + person=node.feats['Person'], + aspect=aspect, + 
number=node.feats['Number'], + mood=node.feats['Mood'], + form=form, + tense=tense, + gender=head_node.feats['Gender'], + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic=self.get_analytic_bool(head_node), + ords=phrase_ords + ) + + def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_ords, head_node): + """ + Annotate periphrastic verb forms with the Phrase* attributes. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + auxes (list[udapi.core.node.Node]): All auxiliaries except the passive auxiliaries. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. 
+ """ + + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + if len(auxes) == 1: + # Cnd + if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'): + + # Portuguese + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Portuguese + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # Antepospretérito -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale passato -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + polarity=polarity, + voice=head_node.feats['Voice'], + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + 
tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if node.feats['VerbForm'] == 'Ger': + + # Portuguese + Spanish + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Portuguese + Spanish + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # Portuguese + Spanish + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Auxiliary 'ter' / 'haber' / 'avere' / 'essere' followed by a participle + if node.feats['VerbForm'] == 'Part': + + # Portuguese + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + + # Spanish + # Futuro compuesto antefuturo -> PhraseTense=Fut, PhraseAspect=Perf + + # Italian + # Futuro anteriore -> 
PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + form='Fin' + mood=auxes[0].feats['Mood'] + + adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark'] + if auxes[0].feats['VerbForm'] == 'Part' and adp_en: + tense=Tense.PAST.value + aspect='' + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form='Ger' + + + # Romanian + # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].lemma == 'avea': + tense = Tense.PAST.value + aspect = Aspect.PERF.value + form = 'Fin' + + # Spanish + # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Pres': + + # Portuguese + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub': + tense = Tense.PASTPRES.value + + # subjonctive mood not annotated in Romanian data + if auxes[0].lemma == 'fi': + mood='Sub' + else: + tense=Tense.PAST.value + + # Portuguese + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + + # Spanish + # pretérito pluscuamperfecto -> PhraseTense=Past, PhraseAspect=Pqp + + # Italian + # Trapassato prossimo -> PhraseTense=Past, PhraseAspect=Pqp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # Spanish + # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant + + # Italian + # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant + + # French + # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant + elif auxes[0].feats['Tense'] 
== 'Past': + tense=Tense.PAST.value + aspect = Aspect.ANT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=mood, + aspect=aspect, + form=form, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # auxiliary 'ir' or 'vrea' followed by infinitive + if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf': + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].lemma == 'vrea': + tense = Tense.FUT.value + aspect = '' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect='' + if auxes[0].lemma == 'avea' and node.feats['VerbForm'] == 'Inf': + tense=Tense.PRES.value + aspect='' + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + 
polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect='' + # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect='' + if node.feats['VerbForm'] == 'Fin': + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + if sa: + phrase_ords.append(sa[0].ord) + phrase_ords.sort() + + tense=Tense.FUT.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=head_node.feats['Number'], + person=head_node.feats['Person'], + mood=head_node.feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + elif len(auxes) == 2: + # Romanian + # viitor anterior -> PhraseTense=Fut, PhraseAsoect=Perf + if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PERF.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ perfect -> PhraseTense=Past + if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + aspect='', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Portuguese + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] 
== 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar + if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter cond + estar pp + gerund -> PhraseTense=Past, 
PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + aspect=Aspect.PQPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, expl, polarity, phrase_ords): + """ + Annotate non-verbal predicates with copula using the Phrase* attributes. + + This method is specialized for non-periphrastic copulas. + If any auxiliaries are present, process_periphrastic_verb_forms() is called instead. + + Parameters + node (udapi.core.node.Node): The non-verbal predicate that should receive the Phrase* attributes, i.e., the head of the phrase. + cop (list[udapi.core.node.Node]): The copula nodes. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. 
+ """ + + # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb) + self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node) + + # adjust PhraseAspect based on the lemma of the copula + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + node.misc['PeriAspect'] = Aspect.PERF.value + elif cop[0].lemma == 'estar': + node.misc['PeriAspect'] = Aspect.IMP.value \ No newline at end of file diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..9d15418f --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,97 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' 
in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + person = '3' + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + for cop_verb in cop: + if cop_verb.feats['Person'] != '': + person=cop_verb.feats['Person'] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=person, + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..32714630 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,94 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word.
+""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl), + analytic=self.get_analytic_bool(node) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass', + analytic=self.get_analytic_bool(node) + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = 
self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + tense=copVerb.feats['Tense'], + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + form='Conv', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + voice=self.get_voice(copVerb, refl), + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py new file mode 100644 index 00000000..9cc17717 --- /dev/null +++ b/udapi/block/msf/slavic/future.py @@ -0,0 +1,207 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects future tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Future(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # future tense for Serbian and Croatian + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')] + if node.upos != 'AUX' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice + cop = [x for x in node.children if x.deprel == 'cop'] + + phrase_nodes = [node] + refl + aux_other + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + if not cop: + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema + form='Fin', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + else: + prep = [x for x in node.children if x.upos == 'ADP'] + phrase_nodes += prep + phrase_ords += [x.ord for x in prep] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + return + + # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense + # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + + if node.feats['Tense'] == 'Pres' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # future tense of perfect verbs + # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data + # in some languages ​​(e.g. in Russian) these verbs have the Tense Fut, in others (e.g. 
in Czech) they have the Tense Pres + if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + form='Fin', + aspect='Perf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + # future tense of imperfect verbs and passive voice + # in some languages ​​the verb is in the infinitive, in some it is in the l-participle + # the condition node.upos == 'ADJ' is due to the passive voice - the n-participle is marked as ADJ, but the auxiliary verb is not cop, but aux + if node.upos == 'VERB' or node.upos == 'ADJ': + + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut'] + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + if aux: + auxVerb = aux[0] + self.write_node_info(node, + tense='Fut', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # simple future tense - e.g. 
in Serbian, the future tense can be formed by combining a verb with a full meaning and an auxiliary verb into one word, i.e. without an auxiliary verb + # or verbs like pojede, půjdeme... in Czech + + if not aux and node.feats['Tense'] == 'Fut': + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut'] + if cop: + copVerb = cop[0] + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Fut', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py new file mode 100644 index 00000000..5a30d05e --- /dev/null +++ b/udapi/block/msf/slavic/imperative.py @@ -0,0 +1,89 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects imperative verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in 
phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Imp', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py new file mode 100644 index 00000000..83bc0766 --- /dev/null +++ b/udapi/block/msf/slavic/infinitive.py @@ -0,0 +1,107 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects infinitive verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Infinitive(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB': + aux = [x for x in node.children if x.udeprel == 'aux'] + if not aux: # the list of auxiliary verbs must be empty - we don't want to mark infinitives which are part of any other phrase (for example the infinitive is part of the future tense in Czech) + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 
'Inf'] + if aux and not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Pass', + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + number=node.feats['Number'], + analytic=self.get_analytic_bool(node) + ) + return + + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + if cop and not aux_forb: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + voice=self.get_voice(cop[0], refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + # there is a rare verb form called supine in Slovenian, it is used instead of infinitive as the argument of motion verbs + if node.feats['VerbForm'] == 'Sup': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Act', + form='Sup', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py new file mode 100644 index 00000000..130d972d --- /dev/null +++ b/udapi/block/msf/slavic/past.py @@ -0,0 +1,212 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects past tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Past(udapi.block.msf.phrase.Phrase): + + def get_person_for_langs_with_simple_past(self, node, person): + """ + returns the person which is known from subject, languages with the simple past tense (e. g. Russian) do not express person in these verb forms + if the person was not taken from the subject, the third person would be filled in automatically due to languages ​​with a compound past but simple forms for the third person (e. g. Czech) + """ + subj = [x for x in node.children if x.udeprel == 'nsubj'] + if subj: + subj = subj[0] + if subj.feats['Person'] != '': + person = subj.feats['Person'] + return person + + def process_node(self, node): + + past_tenses = ['Past', 'Imp', 'Pqp'] + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)] + + # there is person 0 in Polish and Ukrainian which is for impersonal statements + # in Polish, verbs with Person=0 have also Tense=Past, in Ukrainian the tense is not specified + if node.feats['Person'] == '0': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood=node.feats['Mood'], + voice='Act', #In Polish, impersonal statements 
are annotated with Voice=Act. In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + # compound past tense + if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in ['Pres', '']] + aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + aux_pqp + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux: + person = aux[0].feats['Person'] + + elif not aux: + person = '3' + + if aux_pqp: + person = aux_pqp[0].feats['Person'] + + # in Slovenian, the participles are not annotated as Tense='Past', the Tense feature is missing here + # but in Bulgarian, there are cases where the participles are annotated as Tense='Imp' + tense = 'Past' + if node.feats['Tense'] == 'Imp': + tense = 'Imp' + if node.feats['Tense'] == 'Pqp': + tense = 'Pqp' + + self.write_node_info(node, + tense=tense, + person=person, + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + 
gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + # the past tense of some Slavic languages ​​is formed only by a verb without an auxiliary verb (e.g. Polish) + # or imperfect (special case of the past tense) e.g. in Bulgarian or Croatian + elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv': + + # the past tense is formed only by a content verb, not with an auxiliary + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + + if not aux_forb: + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + + # passive + elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux_past_tense: + aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. 
the auxiliary 'jsem' in the phrase 'byl jsem přinucen' + + phrase_nodes = [node] + aux_past_tense + aux_pres_tense + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_pres_tense: + person = aux_pres_tense[0].feats['Person'] + person = self.get_person_for_langs_with_simple_past(node, person) + + self.write_node_info(node, + tense=aux_past_tense[0].feats['Tense'], + person=person, + number=aux_past_tense[0].feats['Number'], + mood='Ind', + voice='Pass', + form='Fin', + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + else: + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if cop and not aux_cnd: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux_past_tense + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_past_tense: + person = aux_past_tense[0].feats['Person'] + + # In ru, be, uk, the person is not expressed in past tense and the verbform is Fin, not Part + if cop[0].feats['VerbForm'] == 'Fin': + person = '' + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + tense=cop[0].feats['Tense'], + person=person, + number=cop[0].feats['Number'], + mood='Ind', + voice=self.get_voice(cop[0], refl), + form='Fin', + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + 
gender=cop[0].feats['Gender'], + animacy=cop[0].feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py new file mode 100644 index 00000000..0672812b --- /dev/null +++ b/udapi/block/msf/slavic/preprocessor.py @@ -0,0 +1,83 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block serves as a preprocessor for Slavic languages before the other blocks +are applied to detect periphrastic verb forms. It improves harmonization of +annotations across the treebanks by addressing some known divergences. +""" + +from udapi.core.block import Block + +class Preprocessor(Block): + + def process_node(self,node): + + # in Ukrainian the active verb forms are not marked as PhraseVoice=Act + if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '': + node.feats['Voice'] = 'Act' + + # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ + # we change the UPOS to ADJ when a participle expresses case + #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + # node.upos = 'ADJ' + + # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the last Slavic languages ​​feats['Mood'] == 'Cnd' + if node.deprel == 'aux:cnd': + node.feats['Mood'] = 'Cnd' + + # unify polarities - some languages ​​mark only Neg (Russian), some mark both Neg and Pos (Czech) + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + + # In Ukrainian, there is no explicit annotation of reflexive verbs + # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly + # We add the feature Voice=Mid to reflexive verbs + if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')): + node.feats['Voice'] = 'Mid' + + 
# makedonstina tvori budouci cas pomoci pomocneho slova ќе, u nejz neni nijak vyznaceno, ze se podili na tvorbe budouciho casu + # stejne tak bulharstina pomoci pomocneho slova ще + # makedonstina a bulharstina + if node.feats['Tense'] == 'Pres': + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + if len(aux) == 1: + aux[0].feats['Tense'] = 'Fut' + + # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender + if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc': + subj = [x for x in node.children if x.udeprel == 'nsubj'] + + # for relative pronouns, only one gender is indicated + if len(subj) == 1: + conj = [x for x in subj[0].children if x.deprel == 'conj'] + if len(conj) == 0: + node.feats['Gender'] = subj[0].feats['Gender'] + node.feats['Number'] = subj[0].feats['Number'] + + # participles in passive are sometimes annotated as VERB, sometimes as ADJ + #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + # node.upos = 'ADJ' + + # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. 
Macedonian treebank) + # we add the Reflex=Yes feature + if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers': + node.feats['Reflex'] = 'Yes' + + # fixing the mistake in Macedonian treebank (mk_mtb-ud-test.conllu), in sent_id=other0010, there is personal pronoun 'ми' marked as expl:pv, it should be iobj + if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs': + node.deprel = '' + node.udeprel = 'iobj' + + # in Old Church Slavonic, there is feature Mood=Sub, but this is a notation for conditional mood + if node.feats['Mood'] == 'Sub': + node.feats['Mood'] = 'Cnd' + + # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation + if node.feats['VerbForm'] == 'Inf': + node.feats['Tense'] = '' + + # in the russian Syntagrus corpus, the negative particles have no Polarity=Neg feature + if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod': + node.feats['Polarity'] = 'Neg' + + # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech + # but there are many verbs with no Aspect value, so the problem is still there diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py new file mode 100644 index 00000000..7521a08d --- /dev/null +++ b/udapi/block/msf/slavic/present.py @@ -0,0 +1,132 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects present tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if 
x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/mwe/normalize.py b/udapi/block/mwe/normalize.py new file mode 100644 index 00000000..e7ebf24f --- /dev/null +++ b/udapi/block/mwe/normalize.py @@ -0,0 +1,68 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and normalizes it so that the type is always annotated at the first word of + the expression.""" +from udapi.core.block import Block +import logging +import re + +class Normalize(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. 
+ # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves them back but makes sure that the type is annotated at the + first word of the expression (as opposed to the syntactic head or to + any other word). + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for n in nodes: + # Erase the previous MWE annotations so we can start from scratch. + n.misc['Mwe'] = '' + # There may be multiple MWEs this node is member of. + annotations = [] + for m in mwes_by_nodes[n.ord]: + if n.ord == mwes[m]['nodes'][0]: + annotations.append("%s:%s" % (m, mwes[m]['type'])) + else: + annotations.append(m) + if annotations: + n.misc['Mwe'] = ';'.join(annotations) diff --git a/udapi/block/mwe/possessives.py b/udapi/block/mwe/possessives.py new file mode 100644 index 00000000..0849a210 --- /dev/null +++ b/udapi/block/mwe/possessives.py @@ -0,0 +1,74 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC, + looks for dependent possessive pronouns and reports how they are treated.""" +from udapi.core.block import Block +import logging +import re + +class Possessives(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. 
+ The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then surveys the possessive pronouns. + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for m in mwes: + mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]] + mweheads = [x for x in mwenodes if not x.parent in mwenodes] + mwedescendantset = set() + for x in mweheads: + mwedescendantset = mwedescendantset.union(set(x.descendants)) + mwedescendants = list(sorted(mwedescendantset)) + # Is there a possessive pronoun? 
+ possprons = [x for x in mwedescendants if x.upos == 'PRON' and x.feats['Poss'] == 'Yes'] + inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]] + outpp = [x for x in possprons if not m in mwes_by_nodes[x.ord]] + observation = '' + if inpp and outpp: + observation = 'both' + elif inpp: + observation = 'in' + elif outpp: + observation = 'out' + if observation: + expression = ' '.join([x.form if m in mwes_by_nodes[x.ord] else '('+x.form+')' for x in mwedescendants]) + print(observation + ': ' + expression) diff --git a/udapi/block/mwe/tosubdeprels.py b/udapi/block/mwe/tosubdeprels.py new file mode 100644 index 00000000..3682c0c7 --- /dev/null +++ b/udapi/block/mwe/tosubdeprels.py @@ -0,0 +1,62 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and projects it to subtypes of dependency relation labels. The motivation is + that a parser could learn to predict the multiword expressions.""" +from udapi.core.block import Block +import logging +import re + +class ToSubDeprels(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. 
+ # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves the type of the MWE as a subtype of the deprels inside. + """ + nodes = root.descendants + (mwes, mwes_by_nodes) = self.collect_mwes(root) + # Now we hopefully know the type of every multiword expression in the sentence. + for n in nodes: + if mwes_by_nodes[n.ord]: + for m in mwes_by_nodes[n.ord]: + type = re.sub(r"\.", '', mwes[m]['type'].lower()) + # Add the MWE type to the DEPREL if the parent is also in the same MWE. + if n.parent.ord > 0 and m in mwes_by_nodes[n.parent.ord]: + n.deprel += ':' + type diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py new file mode 100644 index 00000000..4f5fc877 --- /dev/null +++ b/udapi/block/read/addbratann.py @@ -0,0 +1,230 @@ +"""Add Brat coreference annotation from *.ann files. + +So far, tested on French LitBank data only. 
+ +T12 HIST 362 366 qui +T13 HIST 349 362 une aventure +R1431 Coreference Arg1:T12 Arg2:T13 + +""" + +from udapi.core.block import Block +from udapi.core.files import Files +import logging +from bisect import bisect_left +import networkx as nx + +def _m(range_s, range_e, offset): + return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" + +class AddBratAnn(Block): + + def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True, + coref_attr="R", no_type_value='_Unsorted_', + **kwargs): + """Args: + files: file names with the coreference annotations (*.ann) + offset: what number to substract from the chatacter indices in the ann files + detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset + """ + super().__init__(**kwargs) + self.zone = zone + self.files = Files(filenames=files) + self.offset = offset + self.detect_bom = detect_bom + self.keep_mention_id = keep_mention_id + self.coref_attr = coref_attr + self.no_type_value = no_type_value + + def process_document(self, document): + + # Read all the important info from the *.ann file. + mentions, attrs, split_ante, clusters = {}, [], [], [] + ann_filehandle = self.files.next_filehandle() + offset = self.offset + if self.detect_bom: + txt_filename = self.files.filename.replace("ann", "txt") + with open(txt_filename, 'rb') as txt_fh: + raw_bytes = txt_fh.read(3) + if raw_bytes == b'\xef\xbb\xbf': + offset += 1 + + for line in ann_filehandle: + line = line.rstrip('\n') + if not "\t" in line: + logging.warning(f"Unexpected line without tabs: {line}") + elif line.startswith("T"): + # T13 HIST 349 362 une aventure + try: + mention_id, type_and_range, form = line.split("\t") + # Usually range are two numbers, but can be more, e.g. 
type_and_range="Abstract 605 653;654 703" + # Let's take the first and last number only.´ + parts = type_and_range.split() + ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1]) + + # If form ends with spaces, remove them and adjust range_e + stripped_form = form.rstrip(" ") + if form != stripped_form: + num_spaces = len(form) - len(stripped_form) + logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})") + form = stripped_form + range_e = range_e - num_spaces + + + mentions[mention_id] = [ne_type, range_s, range_e, form] + if self.keep_mention_id: + attrs.append(["mention_id", mention_id, mention_id]) + except Exception as e: + logging.warning(f"Unexpected mention line: {line}\n{e}") + elif line.startswith(self.coref_attr): + try: + cor_attr, mention_ids = line.rstrip().split("\t") + parts = mention_ids.split() + assert(parts[0] == "Coreference") + except Exception as e: + logging.warning(f"Unexpected coref line: '{line}'\n{e}") + clusters.append([p.split(":")[1] for p in parts[1:]]) + elif line.startswith("#"): + pass # Let's ignore annotators' comments + else: + logging.warning(f"Unexpected line in {self.files.filename}:\n{line}") + + # Some Brat ann files use link-based representation, e.g. + # R123 Coreference Arg1:T11 Arg2:T13 + # R124 Coreference Arg1:T12 Arg2:T14 + # R125 Coreference Arg1:T13 Arg2:T14 + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). + # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]] + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]] + # Note that if creating entities for link, in their original order, + # R123 and R125 would result in creating two entities and when hitting R125 + # we would need to merge them, i.e. delete one of them and move their mentions to the other. 
+ # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components. + coref_graph = nx.Graph() + for mention_ids in clusters: + coref_graph.add_node(mention_ids[0]) + for mention_id in mention_ids[1:]: + coref_graph.add_node(mention_id) + coref_graph.add_edge(mention_id, mention_ids[0]) + clusters = [list(component) for component in nx.connected_components(coref_graph)] + + # Create entity objects for non-singletons. + entity_map = {} + for mention_ids in clusters: + etype, etype_index = None, 0 + for index, m_id in enumerate(mention_ids): + if mentions[m_id][0] == self.no_type_value: + pass + elif etype is None: + etype, etype_index = mentions[m_id][0], index + elif etype != mentions[m_id][0]: + logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.") + if etype is None: + etype = "other" + entity = document.create_coref_entity(etype=etype) + for m_id in mention_ids: + if m_id in entity_map: + logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}") + else: + entity_map[m_id] = entity + + # Collect TokenRange (as pre-filled by UDPipe) for each token. + tokens, starts, ends = [], [], [] + for tree in document.trees: + for token in tree.token_descendants: + tokens.append(token) + range_s, range_e = token.misc["TokenRange"].split(":") + starts.append(int(range_s)) + ends.append(int(range_e)) + + # Create mention objects. + mention_map = {} + for mention_id, mention_values in mentions.items(): + + # Find Udapi tokens for each mention. 
+ ne_type, range_s, range_e, form = mention_values + index_s = bisect_left(starts, range_s - offset) + if starts[index_s] != range_s - offset and index_s > 0: + index_s -= 1 + index_e = bisect_left(ends, range_e - offset) + mtokens = tokens[index_s : index_e+1] + token_s, token_e = tokens[index_s], tokens[index_e] + + # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries. + # If the start token is a multi-word token (MWT), + # we can still try to find the proper word within the MWT. + ok_s, ok_e = True, True + if starts[index_s] != range_s - offset: + ok_s = False + if token_s.is_mwt(): + mtokens.pop(0) + first_form = form.split()[0] + new_start = ends[index_s] + for w in reversed(token_s.words): + mtokens = [w] + mtokens + new_start -= len(w.form) + if w.form == first_form or new_start < range_s - offset: + ok_s = True + break + + # similarly for the end token + if ends[index_e] != range_e - offset: + ok_e = False + if token_e.is_mwt(): + mtokens.pop() + last_form = form.split()[-1] + new_end = starts[index_e] + for w in token_e.words: + mtokens.append(w) + new_end += len(w.form) + if w.form == last_form or new_end > range_e - offset: + ok_e = True + break + + if not ok_s or not ok_e: + logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})" + f" crosses token boundaries: {token_s.misc} ({token_s.form}) " + f".. {token_e.misc} ({token_e.form})") + + # Project tokens (including MWTs) to words and check forms match. + words, udapi_form = [], "" + for token in mtokens: + words += token.words + udapi_form += token.form + if not token.no_space_after: + udapi_form += " " + udapi_form = udapi_form.rstrip() + if form != udapi_form: + logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'") + + # Make sure all words of the mention are in the same sentence. 
+ root = words[0].root + mwords = [words[0]] + for word in words[1:]: + if word.root is root: + mwords.append(word) + else: + logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.") + + # Create entities for singletons + if mention_id not in entity_map: + entity_map[mention_id] = document.create_coref_entity(etype=ne_type) + + # Create the Udapi mention object + mention = entity_map[mention_id].create_mention(words=mwords) + mention_map[mention_id] = mention + + # Fill-in the additional mention attributes. + for attr_name, mention_id, attr_value in attrs: + if mention_id in mention_map: + mention_map[mention_id].other[attr_name] = attr_value + + # Fill-in split antecedents + for arg1, arg2 in split_ante: + if arg1 in entity_map and arg2 in entity_map: + if entity_map[arg1] in entity_map[arg2].split_ante: + logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})") + else: + entity_map[arg2].split_ante.append(entity_map[arg1]) + else: + logging.warning(f"{arg1} or {arg2} not indexed in entity_map") diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py new file mode 100644 index 00000000..4d0b7771 --- /dev/null +++ b/udapi/block/read/addtext.py @@ -0,0 +1,59 @@ +"""read.AddText is a reader for adding word-wrapped plain-text to existing trees.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +import logging + +class AddText(BaseReader): + r"""A reader for plain-text files to be stored to existing trees. + + For example LitBank conll files are segmented to sentences and tokenized, + but the SpacesAfter attributes are missing. We need to load the original + (raw) texts, which are not tokenized and not segmented, only word-wrapped + (to 70 characters per line). 
+ + Args: + add_newpar: add newpar CoNLL-U annotations on empty lines (and the beginning of file) + """ + def __init__(self, zone='', add_newpar=True, **kwargs): + super().__init__(zone=zone, **kwargs) + self.add_newpar = add_newpar + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def process_document(self, document): + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return + text = ''.join(self.filehandle.readlines()) + i, end, was_newpar = 0, len(text)-1, True + while i <= end and text[i].isspace(): + i += 1 + + for bundle in document.bundles: + root = bundle.get_tree(zone=self.zone) + if self.add_newpar and was_newpar: + root.newpar = True + was_newpar = False + for node in root.token_descendants: + if text[i:i+len(node.form)] == node.form: + i += len(node.form) + if i > end or text[i].isspace(): + del node.misc['SpaceAfter'] + was_newpar = i+1 < end and text[i+1] == '\n' and text[i] == '\n' + while i <= end and text[i].isspace(): + i += 1 + else: + node.misc['SpaceAfter'] = 'No' + was_newpar = False + else: + logging.warning('Node %s does not match text "%s"', node, text[i:i+20]) + return + root.text = root.compute_text() + self.finished = not self.files.has_next_file() diff --git a/udapi/block/read/ccv.py b/udapi/block/read/ccv.py new file mode 100644 index 00000000..eb449362 --- /dev/null +++ b/udapi/block/read/ccv.py @@ -0,0 +1,78 @@ +"""Ccv class is a reader for Corpus of Czech Verse json files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText +import json + +class Ccv(BaseReader): + r"""A reader for Corpus of Czech Verse json files. + + See https://github.com/versotym/corpusCzechVerse + Each verse (line) is stored as one tree (although it is quite often not a whole sentence). 
+ Start of each stanza is marked with `newpar`. + Start of each poem is marked with `newdoc = [poem_id]`. + + Args: + tokenize: create nodes + """ + def __init__(self, tokenize=True, **kwargs): + self.tokenize = tokenize + self._cache = None + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self): + if self._cache: + return self._cache.pop() + else: + trees = self.read_trees() + if not trees: + return None + self._cache = list(reversed(trees[1:])) + return trees[0] + + def read_trees(self): + if self.filehandle is None: + return None + poems = json.load(self.filehandle) + all_trees = [] + for poem in poems: + poem_trees = [] + for stanza in poem["body"]: + stanza_trees = [] + for line in stanza: + root = Root() + root.text = line["text"] + root.json["rhyme"] = line["rhyme"] + root.json["metre"] = line["metre"] + root.json["stress"] = line["stress"] + stanza_trees.append(root) + if self.tokenize: + words = [[]] + [[w] for w in line["words"]] + for index, puncts in line["punct"].items(): + for punct in puncts: + words[int(index)].append({"token": punct, "lemma": punct}) + for word in words: + for w in word: + node = root.create_child(form=w["token"], lemma=w["lemma"]) + if "morph" in w: + node.xpos = w["morph"] + node.misc["xsampa"] = w["xsampa"] + node.misc["phoebe"] = w["phoebe"] + SetSpaceAfterFromText.process_tree(None, root) + stanza_trees[0].newpar = True + poem_trees.extend(stanza_trees) + root = poem_trees[0] + root.newdoc = poem["poem_id"] + root.json["p_author"] = poem["p_author"] + root.json["b_author"] = poem["b_author"] + root.json["biblio"] = poem["biblio"] + all_trees.extend(poem_trees) + return all_trees diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py new file mode 100644 index 00000000..d0aef1ee --- /dev/null +++ b/udapi/block/read/conll.py @@ -0,0 +1,162 @@ 
+""""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009).""" +import json +import logging +import re + +import udapi.block.read.conllu +from udapi.core.root import Root +from udapi.core.node import Node + + +class Conll(udapi.block.read.conllu.Conllu): + """A reader of the CoNLL-U files.""" + + def __init__(self, separator='tab', + attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): + """Create the Conll reader object. + + This is a subclass of udapi.block.read.conllu.Conllu, + which adds a support for arbitrary column names and thus supporting not only CoNLL-U, + but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats. + + Args: + separator: How are the columns separated? + Default='tab' is the only possibility in valid CoNLL-U files. + 'space' means one or more whitespaces (this does not allow forms with space). + 'doublespace' means two or more spaces. + attributes: comma-separated list of column names in the input files + (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc') + Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U). + For ignoring a column, use "_" as its name. + Column "ord" marks the column with 1-based word-order number/index (usually called ID). + Column "head" marks the column with dependency parent index (word-order number). + + For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use + `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_` + but note that attributes upos, feats and deprel will contain language-specific values, + not valid according to UD guidelines and a further conversion will be needed. + You will lose the projective_HEAD and projective_DEPREL attributes. + + For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`. + You will lose the predicted_* attributes and semantic/predicate annotation. + + TODO: allow storing the rest of columns in misc, e.g. 
`node.misc[feats]` + for feats which do not use the name1=value1|name2=value2 format. + """ + super().__init__(**kwargs) + self.node_attributes = attributes.split(',') + self.separator = separator + + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. + + def parse_node_line(self, line, root, nodes, parents, mwts): + if self.separator == 'tab': + fields = line.split('\t') + elif self.separator == 'space': + fields = line.split() + elif self.separator == 'doublespace': + fields = re.split(' +', line) + else: + raise ValueError('separator=%s is not valid' % self.separator) + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], misc=fields[9]) + empty.ord = fields[0] + empty.raw_deps = fields[8] # TODO + return + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, value) + + nodes.append(node) + + # Acknowledged code duplication with read.Conllu + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + parents = [0] + mwts = [] + for line in lines: + if line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes, parents, mwts) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + # Empty sentences are not allowed in CoNLL-U, + # but if the users want to save just the sentence string and/or sent_id + # they need to create one artificial node and mark it with Empty=Yes. + # In that case, we will delete this node, so the tree will have just the (technical) root. + # See also udapi.block.write.Conllu, which is compatible with this trick. 
+ if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': + nodes.pop() + root._children = [] + root._descendants = [] + + # Set dependency parents (now, all nodes of the tree are created). + for node_ord, node in enumerate(nodes[1:], 1): + try: + parent = nodes[parents[node_ord]] + except IndexError: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) + parent = root + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node._children: + climbing = parent._parent + while climbing: + if climbing is node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) + + # Create multi-word tokens. + for fields in mwts: + range_start, range_end = fields[0].split('-') + words = nodes[int(range_start):int(range_end) + 1] + root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + + return root diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py new file mode 100644 index 00000000..2adbd00f --- /dev/null +++ b/udapi/block/read/conll2012.py @@ -0,0 +1,153 @@ +""""Conll2012 is a reader block for the coreference in CoNLL-2012 format. + +This implementation was tested on the LitBank files only +(and quickly on Portuguese Corref-PT and Summ-it++v2), so far. +LitBank does not use most of the columns, so the implementation +should be improved to handle other types of CoNLL-2012 files. 
+""" +import json +import logging +import re + +import udapi.block.read.conllu +from udapi.core.root import Root +from udapi.core.node import Node + +RE_BEGIN = re.compile(r'^#begin document ([^ ]+)') + +class Conll2012(udapi.block.read.conllu.Conllu): + """A reader of the Conll2012 files.""" + + def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs): + """Create the Conll2012 reader object. + + Args: + attributes: comma-separated list of column names in the input files + (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref' suitable for LitBank) + For ignoring a column, use "_" as its name. + Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based) + word-order number/index (usualy called ID). + For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'. + For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'. + For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'. + emptyval: a symbol that represents an empty value, especially in the coref column + (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2) + For FantasyCoref, use emptyval='-'. + """ + super().__init__(**kwargs) + self.node_attributes = attributes.split(',') + self._docname = 'd' + self.emptyval = emptyval + + def parse_comment_line(self, line, root): + if line.startswith("#end document"): + return + match = RE_BEGIN.match(line) + if match: + docname = match.group(1) + # LitBank and FantasyCoref use e.g. + # #begin document (1023_bleak_house_brat); part 0 + if docname.startswith('(') and docname.endswith(');'): + docname = docname[1:-2] + # Summ-it++v2 uses e.g. + # #begin document /home/andre/Recursos-fontes/Summit/Summ-it_v3.0/corpusAnotado_CCR/CIENCIA_2002_22010/CIENCIA_2002_22010.txt + elif docname.startswith('/home/'): + docname = docname.split('/')[-1] + # Corref-PT-SemEval uses e.g. 
+ # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml + docname = docname.replace('.txt', '').replace('.xml', '') + # FantasyCoref may use parentheses within the document ID e.g. + # #begin document (051_Fundevogel_(Bird-foundling)); part 000 + docname = docname.replace('(', '').replace(')', '') + + root.newdoc = docname + self._global_entity = 'eid-etype-head-other' + root.comment += '$GLOBAL.ENTITY\n' + self._docname = docname + else: + logging.warning(f"Unexpected comment line: {line}") + + def parse_node_line(self, line, root, nodes): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'docname': + # FantasyCoref may use parentheses within the document ID + value = value.replace('(', '').replace(')', '') + if value != self._docname: + logging.warning(f"Document name mismatch {value} != {self._docname}") + + # convert the zero-based index to one-based + # but Corref-PT uses a mix of one-based and zero-based + elif attribute_name == 'ord': + #setattr(node, 'ord', int(value) + 1) + if node.ord not in(int(value) + 1, int(value)): + logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") + + elif attribute_name == 'coref': + if value and value != self.emptyval: + # LitBank always separates chunks by a vertical bar, e.g. (13)|10) + # Summ-it++v2 does not, e.g. 
(13)10) + if '|' in value: + chunks = value.split("|") + else: + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', value) if x] + modified_entities = [] + escaped_docname = self._docname.replace("-", "") + for entity in chunks: + entity_num = entity.replace("(", "").replace(")","") + modified_entity = f"{escaped_docname}_e{entity_num}--1" + if entity.startswith("(") and entity.endswith(")"): + modified_entity = "(" + modified_entity + ")" + elif entity.startswith("("): + modified_entity = "(" + modified_entity + elif entity.endswith(")"): + modified_entity = f"{escaped_docname}_e{entity_num}" + ")" + + # to avoid parentheses clashes, put the entities with ")" first + if modified_entity.startswith("("): + modified_entities.append(modified_entity) + else: + modified_entities.insert(0, modified_entity) + node.misc['Entity'] = ''.join(modified_entities) + + elif attribute_name == 'form' or (attribute_name != '_' and value != '_'): + setattr(node, attribute_name, value) + nodes.append(node) + + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + for line in lines: + if line == '': + pass + elif line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + return root + + def read_trees(self): + if self.max_docs: + raise NotImplementedError("TODO implement max_docs in read.Conll2012") + # Corref-PT does not put an empty line before #end document, + # so we need to split both on #end document and empty lines. 
+ return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+|\n#end document\n', self.filehandle.read()) if s] + + def read_tree(self): + raise NotImplementedError("TODO implement read_tree in read.Conll2012") diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8303d096..e19cd676 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -1,69 +1,51 @@ """"Conllu is a reader block for the CoNLL-U files.""" +import json import logging import re from udapi.core.basereader import BaseReader from udapi.core.root import Root +from udapi.core.node import Node # Compile a set of regular expressions that will be searched over the lines. # The equal sign after sent_id was added to the specification in UD v2.0. # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') -RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') +RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$') +RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') +RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') class Conllu(BaseReader): """A reader of the CoNLL-U files.""" - def __init__(self, strict=False, separator='tab', empty_parent='warn', - attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): + def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs): """Create the Conllu reader object. Args: strict: raise an exception if errors found (default=False, i.e. a robust mode) - separator: How are the columns separated? - Default='tab' is the only possibility in valid CoNLL-U files. - 'space' means one or more whitespaces (this does not allow forms with space). - 'doublespace' means two or more spaces. - empty_parent: What to do if HEAD is _? 
Default=warn - issue a warning and attach to the root + empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. - attributes: comma-separated list of column names in the input files - (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc') - Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U). - For ignoring a column, use "_" as its name. - Column "ord" marks the column with 1-based word-order number/index (usualy called ID). - Column "head" marks the column with dependency parent index (word-order number). - - For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use - `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_` - but note attributes that upos, feats and deprel will contain language-specific values, - not valid according to UD guidelines and a further conversion will be needed. - You will loose the projective_HEAD and projective_DEPREL attributes. - - For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`. - You will loose the predicted_* attributes and semantic/predicate annotation. - - TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]` - for feats which do not use the name1=value1|name2=value2 format. 
+ fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range """ super().__init__(**kwargs) - self.node_attributes = attributes.split(',') self.strict = strict - self.separator = separator self.empty_parent = empty_parent + self.fix_cycles = fix_cycles - @staticmethod - def parse_comment_line(line, root): + def parse_comment_line(self, line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: root.sent_id = sent_id_match.group(1) + root.comment += '$SENT_ID\n' return text_match = RE_TEXT.match(line) if text_match is not None: root.text = text_match.group(1) + root.comment += '$TEXT\n' return pardoc_match = RE_NEWPARDOC.match(line) @@ -71,42 +53,89 @@ def parse_comment_line(line, root): value = True if pardoc_match.group(2) is None else pardoc_match.group(2) if pardoc_match.group(1) == 'newpar': root.newpar = value + root.comment += '$NEWPAR\n' else: root.newdoc = value + root.comment += '$NEWDOC\n' + return + + json_match = RE_JSON.match(line) + if json_match is not None: + container = root.json + if json_match.group(1) == 'doc_': + if '__doc__' not in root.json: + root.json['__doc__'] = {} + container = root.json['__doc__'] + container[json_match.group(2)] = json.loads(json_match.group(3)) + return + + entity_match = RE_GLOBAL_ENTITY.match(line) + if entity_match is not None: + global_entity = entity_match.group(1) + if self._global_entity and self._global_entity != global_entity: + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") + self._global_entity = global_entity + root.comment += '$GLOBAL.ENTITY\n' return root.comment += line[1:] + "\n" - # pylint: disable=too-many-locals,too-many-branches,too-many-statements - # Maybe the code could be refactored, but it is speed-critical, - # so benchmarking is needed because calling extra methods may result in slowdown. 
- def read_tree(self, document=None): + def read_trees(self): + if not self.max_docs: + # Valid CoNLL-U files must have sentences separated by a single empty line. + # However, some users have to work with invalid files e.g. ending with two empty lines. + # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow + # as s.split('\n\n') and this time is negligble + # relative to the main CoNLL-U parsing in read_tree_from_lines(). + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+', self.filehandle.read()) if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. + trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + if tree: + trees.append(tree) + else: + lines.append(line) + return trees + + def read_tree(self): if self.filehandle is None: return None + lines = [] + for line in self.filehandle: + line = line.rstrip() + if line == '': + break + lines.append(line) + return self.read_tree_from_lines(lines) + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. 
+ def read_tree_from_lines(self, lines): root = Root() nodes = [root] parents = [0] mwts = [] - for line in self.filehandle: - line = line.rstrip() - if line == '': - break + for line in lines: if line[0] == '#': self.parse_comment_line(line, root) else: - if self.separator == 'tab': - fields = line.split('\t') - elif self.separator == 'space': - fields = line.split() - elif self.separator == 'doublespace': - fields = re.split(' +', line) - else: - raise ValueError('separator=%s is not valid' % self.separator) - if len(fields) != len(self.node_attributes): + fields = line.split('\t') + if len(fields) != 10: if self.strict: raise RuntimeError('Wrong number of columns in %r' % line) - fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + fields.extend(['_'] * (10 - len(fields))) # multi-word tokens will be processed later if '-' in fields[0]: mwts.append(fields) @@ -118,26 +147,29 @@ def read_tree(self, document=None): empty.raw_deps = fields[8] # TODO continue - node = root.create_child() - - # TODO slow implementation of speed-critical loading - for (n_attribute, attribute_name) in enumerate(self.node_attributes): - if attribute_name == 'head': - try: - parents.append(int(fields[n_attribute])) - except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': - if self.empty_parent == 'warn': - logging.warning("Empty parent/head index in '%s'", line) - parents.append(0) - else: - raise exception - elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) - elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_': - setattr(node, attribute_name, fields[n_attribute]) + if fields[3] == '_': + fields[3] = None + if fields[4] == '_': + fields[4] = None + if fields[7] == '_': + fields[7] = None + + # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc + node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], 
deprel=fields[7], misc=fields[9]) + root._descendants.append(node) + node._ord = int(fields[0]) + if fields[8] != '_': + node.raw_deps = fields[8] + try: + parents.append(int(fields[6])) + except ValueError as exception: + if not self.strict and fields[6] == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception nodes.append(node) @@ -151,24 +183,49 @@ def read_tree(self, document=None): # they need to create one artificial node and mark it with Empty=Yes. # In that case, we will delete this node, so the tree will have just the (technical) root. # See also udapi.block.write.Conllu, which is compatible with this trick. - if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes': + if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': nodes.pop() + root._children = [] + root._descendants = [] # Set dependency parents (now, all nodes of the tree are created). - # TODO: parent setter checks for cycles, but this is something like O(n*log n) - # if done for each node. It could be done faster if the whole tree is checked at once. - # Also parent setter removes the node from its old parent's list of children, - # this could be skipped here by not using `node = root.create_child()`. 
for node_ord, node in enumerate(nodes[1:], 1): try: - node.parent = nodes[parents[node_ord]] + parent = nodes[parents[node_ord]] except IndexError: - raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if self.fix_cycles: + logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}") + parent = root + else: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node) + parent = root + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node._children: + climbing = parent._parent + while climbing: + if climbing is node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) # Create multi-word tokens. for fields in mwts: - range_start, range_end = fields[0].split('-') + try: + range_start, range_end = fields[0].split('-') + except ValueError: + logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}") + raise words = nodes[int(range_start):int(range_end) + 1] - root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9]) return root diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. + +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). + +This code has been only tested on Hungarian KorKor files for CorefUD so far. 
+However, in the end, it is not used there (xtsv files are used instead of conllup). +""" +import logging +import re + +import udapi.block.read.conll +from udapi.core.root import Root +from udapi.core.node import Node + +RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') +COLUMN_MAP = { + 'ID': 'ord', +} +NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + +class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overridden). + Default='autodetect' which means the column names will be loaded from the global.columns header. + For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). 
+ """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py new file mode 100644 index 00000000..73e05f3b --- /dev/null +++ b/udapi/block/read/oldcorefud.py @@ -0,0 +1,119 @@ +"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.read.conllu +from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks + +class OldCorefUD(udapi.block.read.conllu.Conllu): + + def __init__(self, replace_hyphen_in_id_with='', **kwargs): + """Create the read.OldCorefUD reader object. 
+ + Args: + substitute_hyphen_in_id_for: string to use as a replacement for hyphens in ClusterId + The new format does not allow hyphens in eid (IDs of entity entities), + so we need to replace them. + """ + super().__init__(**kwargs) + self.replace_hyphen_in_id_with = replace_hyphen_in_id_with + self.orig2new = {} + self.new2orig = {} + + def _fix_id(self, cid): + if not cid or '-' not in cid: + return cid + new_cid = self.orig2new.get(cid) + if new_cid is None: + new_cid = cid.replace('-', self.replace_hyphen_in_id_with) + base, counter = new_cid, 1 + while new_cid in self.new2orig: + counter += 1 + new_cid = f"{base}{counter}" + self.new2orig[new_cid] = cid + self.orig2new[cid] = new_cid + return new_cid + + def process_document(self, doc, strict=True): + super().process_document(doc) + + eid_to_entity = {} + for node in doc.nodes_and_empty: + index, index_str = 0, "" + eid = node.misc["ClusterId"] + if not eid: + index, index_str = 1, "[1]" + eid = node.misc["ClusterId[1]"] + eid = self._fix_id(eid) + while eid: + entity = eid_to_entity.get(eid) + if entity is None: + entity = CorefEntity(eid) + eid_to_entity[eid] = entity + mention = CorefMention(words=[node], entity=entity) + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + etype = node.misc["ClusterType" + index_str] + if etype: + if entity.etype is not None and etype != entity.etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + entity.etype = etype + + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention) + for link_str in bridging_str.split(','): + target, relation = link_str.split(':') + target = self._fix_id(target) + if target == eid: + _error("Bridging cannot self-reference the same entity: " + target, strict) + if target not in eid_to_entity: + eid_to_entity[target] = CorefEntity(target) + mention._bridging.append((eid_to_entity[target], relation)) + + 
split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. + for ante_str in split_ante_str.replace('+', ',').split(','): + ante_str = self._fix_id(ante_str) + if ante_str in eid_to_entity: + if ante_str == eid: + _error("SplitAnte cannot self-reference the same entity: " + eid, strict) + split_antes.append(eid_to_entity[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + ante_cl = CorefEntity(ante_str) + eid_to_entity[ante_str] = ante_cl + split_antes.append(ante_cl) + entity.split_ante = sorted(split_antes) + + # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. + # We also need to escape forbidden characters. + mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',') + mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') + index += 1 + index_str = f"[{index}]" + eid = self._fix_id(node.misc["ClusterId" + index_str]) + # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), + # not by the keys (eid). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. + for entity in eid_to_entity.values(): + if not entity._mentions: + _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + entity._mentions.sort() + doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())} + + # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). 
+ attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + + +def _error(msg, strict): + if strict: + raise ValueError(msg) + logging.error(msg) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 14840a50..7487d580 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -4,7 +4,28 @@ class Sentences(BaseReader): - """A reader for plain-text sentences (one sentence per line) files.""" + r"""A reader for plain-text sentences (one sentence per line) files. + + Args: + ignore_empty_lines: if True, delete empty lines from the input. + Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. 
+ """ + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") + self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line + self.rstrip = rstrip + super().__init__(**kwargs) @staticmethod def is_multizone_reader(): @@ -18,8 +39,25 @@ def read_tree(self, document=None): if self.filehandle is None: return None line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: + while line in {'\n', '\r\n'}: + preceded_by_empty_line = True + line = self.filehandle.readline() + if line == '': + return None root = Root() - root.text = line.rstrip() + root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root + + # The first line in a file also marks a start of new document + def after_process_document(self, document): + if self.newdoc_if_empty_line: + document.bundles[0].trees[0].newdoc = True diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py new file mode 100644 index 00000000..161b6b6e --- /dev/null +++ b/udapi/block/read/text.py @@ -0,0 +1,74 @@ +"""Text class is a reader for word-wrapped plain-text files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root + + +class Text(BaseReader): + r"""A reader for plain-text files with sentences on one or more lines. + + Sentences are separated by one or more empty lines. + Newlines within sentences are substituted by a space. + + Args: + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. 
You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. + empty_line: how empty lines are handled. Default 'new_sentence' preserves + the current behaviour (empty lines mark sentence boundaries). Use + 'keep' to read the entire file content into a single sentence (tree), including + empty lines. Use 'newpar' to behave like 'new_sentence' but also set + `root.newpar = True` on each sentence. + """ + def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs): + if empty_line not in {'new_sentence', 'keep', 'newpar'}: + raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'") + self.rstrip = rstrip + self.empty_line = empty_line + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self, document=None): + if self.filehandle is None: + return None + if self.empty_line == 'keep': + content = self.filehandle.read() + if content == '': + return None + root = Root() + root.text = content + return root + lines = [] + line = None + while True: + line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). 
+ if line == '': + if not lines: + return None + else: + break + elif line in {'\n', '\r\n'}: + if not lines: + continue + else: + break + else: + lines.append(line.rstrip(self.rstrip)) + + root = Root() + root.text = " ".join(lines) + if self.empty_line == 'newpar': + root.newpar = True + return root diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 9ad272e3..4c5a87ab 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -8,7 +8,7 @@ class Vislcg(BaseReader): # TODO check validity and raise helpful exceptions if not valid # pylint: disable=too-many-branches - def read_tree(self, document=None): + def read_tree(self): if self.filehandle is None: return None diff --git a/udapi/block/segment/merge.py b/udapi/block/segment/merge.py new file mode 100644 index 00000000..9ada45f1 --- /dev/null +++ b/udapi/block/segment/merge.py @@ -0,0 +1,46 @@ +"""Block segment.Merge""" +from udapi.core.block import Block + +class Merge(Block): + """"Re-segmenter merging selected sentences (trees). + + This class merges sentences ending with semicolons, + but it can be used as a base class for merging based on different criteria + by overriding one of the `should_*` methods. + """ + + @staticmethod + def should_merge_tokens(first, second): + """Is there actually a sentence boundary between the first and second node?""" + if first.form[-1] == ';': + return True + return False + + def should_merge_bundles(self, first_bundle, second_bundle): + """Is there actually a sentence boundary between the first and second bundle?""" + first_tree = self._get_our_tree(first_bundle) + second_tree = self._get_our_tree(second_bundle) + return self.should_merge_tokens(first_tree.descendants[-1], second_tree.descendants[0]) + + + def _get_our_tree(self, bundle): + for tree in bundle: + if self._should_process_tree(tree): + return tree + raise ValueError("Bundle %s contains no tree to process." 
% bundle.address()) + + + def process_document(self, doc): + old_bundles = doc.bundles + prev_bundle = old_bundles[0] + new_bundles = [prev_bundle] + for bundle in old_bundles[1:]: + if self.should_merge_bundles(prev_bundle, bundle): + for tree in bundle: + prev_tree = prev_bundle.get_tree(tree.zone) + prev_tree.steal_nodes(tree.descendants) + prev_tree.text = prev_tree.compute_text() + else: + new_bundles.append(bundle) + prev_bundle = bundle + doc.bundles = new_bundles \ No newline at end of file diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py new file mode 100644 index 00000000..58be9b6d --- /dev/null +++ b/udapi/block/segment/simple.py @@ -0,0 +1,91 @@ +"""Block segment.Simple""" +from udapi.core.block import Block +from udapi.core.bundle import Bundle +import re + +class Simple(Block): + """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase. + The exceptions are: + 1) abbreviations of names, e.g. "A. Merkel" + 2) predefined list of nonfinal abbreviations, e.g. "e.g." 
+ + Parameters + ---------- + keep_spaces : bool + do not strip whitespaces from the `text` attribute of the sentences created by segmentation + """ + + def __init__(self, keep_spaces=False, **kwargs): + super().__init__(**kwargs) + self.keep_spaces = keep_spaces + + @staticmethod + def is_nonfinal_abbrev(token): + """Is a given token an abbreviation (without the final period) which cannot end a sentence?""" + if re.search('(např|e.g.)$', token): + return True + return False + + + def is_boundary(self, first, second): + """Is there a sentence boundary between the first and second token?""" + if not first or not second: + return False + if first[-1] in '"“»›)': + first = first[:-1] + if not first: + return False + if second[0] in '"„«¿¡‹(': + second = second[1:] + if not second: + return False + if not second[0].isupper() or second[0].isdigit(): + return False + if not first[-1] in '.!?': + return False + if first[-1] == '.': + # correctly count length in "„A. Merkel" + if first[0] in '"„«¿¡‹(': + first = first[1:] + if len(first) == 2 and first[0].isupper(): + return False + if self.is_nonfinal_abbrev(first[:-1]): + return False + return True + + + def segment_string(self, string): + """Return a list of sentences in a given string.""" + tokens = string.split(' ') + previous = tokens[0] + segments = [previous] + for token in tokens[1:]: + if self.is_boundary(previous, token): + if self.keep_spaces: + segments[-1] += ' ' + segments.append(token) + else: + segments[-1] += ' ' + token + previous = token + return segments + + + def process_document(self, doc): + old_bundles = doc.bundles + new_bundles = [] + for bundle in old_bundles: + new_bundles.append(bundle) + for tree in bundle: + if self._should_process_tree(tree): + if tree.children: + raise ValueError("Segmenting already tokenized text is not supported.") + sentences = self.segment_string(tree.text) + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + if len(sentences) > 1: + 
tree.text = sentences[0] + for i, sentence in enumerate(sentences[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle.create_tree(tree.zone).text = sentence + new_bundles.append(new_bundle) + doc.bundles = new_bundles diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py index 5451b3a1..913dae61 100644 --- a/udapi/block/tokenize/onwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -1,9 +1,40 @@ """Block tokenize.OnWhitespace""" +import re from udapi.core.block import Block class OnWhitespace(Block): - """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" + """Base tokenizer, splits on whitespaces, fills SpaceAfter=No. + + Use the parameter `keep_spaces=True` to preserve all whitespaces in the sentence + in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field. + It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following + whitespace is marked by `SpaceAfter=No` and a single following space results in no + whitespace-related markup. + If loading the text using `read.Sentences` and all whitespaces need to be preserved + (in order to be able to reconstruct the original document), the `read.Sentences` block + must be called with `rstrip=''`, `rstrip=\n` or `rstrip=\r\n` to prevent stripping the + trailing whitespace, e.g.:: + $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace keep_spaces=1 write.Conllu + + # sent_id = 1 + # text = Hello world + 1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s + 2 world _ _ _ _ 0 _ _ _ + Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is + followed by a single space. 
+ + Parameters + ---------- + keep_spaces : bool + preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default False) + """ + + escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'}) + + def __init__(self, keep_spaces=False, **kwargs): + super().__init__(**kwargs) + self.keep_spaces = keep_spaces @staticmethod def tokenize_sentence(string): @@ -13,24 +44,23 @@ def tokenize_sentence(string): def process_tree(self, root): if root.children: raise ValueError('Tree %s is already tokenized.' % root) - sentence = ' '.join(root.text.split()) + #sentence = ' '.join(root.text.split()) + sentence = root.text tokens = self.tokenize_sentence(sentence) + + # Check if there are any spaces before the first token + spaces_before = "" + m = re.match(r'\s+', sentence) + if m: + spaces_before = m.group(0) + sentence = sentence[len(spaces_before):] + for i, token in enumerate(tokens, 1): - space_after = False + spaces_after = "" - # Delete the token from the begining of the sentence. - if sentence.startswith(token): - sentence = sentence[len(token):] - # This is the expected case. The sentence starts with the token. - # If it is followed by a space, delete the space and set space_after=True. - if not len(sentence): - space_after = True - elif sentence.startswith(' '): - space_after = True - sentence = sentence[1:] - else: - # The token (returned from tokenization) does not match the start of sentence. - # E.g. '. . . word' is tokenized as '... word'. + # The token (returned from tokenization) does not match the start of sentence. + # E.g. '. . . word' is tokenized as '... word'. + if not sentence.startswith(token): # Let's delete the start of sentence anyway, # using a non-greedy regex and the expected next token # returned from the tokenization. 
@@ -40,8 +70,28 @@ def process_tree(self, root): # $sentence = $rest if (defined $rest); raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence)) + # Delete the token from the begining of the sentence. + sentence = sentence[len(token):] + + # Set the SpaceAfter and SpacesAfter properly + m = re.match(r'\s+', sentence) + if m is not None: + spaces_after = m.group(0) + sentence = sentence[len(spaces_after):] + + # normalize whitespace + if not self.keep_spaces: + spaces_before = "" + # spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0 + spaces_after = "" if not len(spaces_after) and len(sentence) else " " + # create a new node node = root.create_child(form=token) node.ord = i - if not space_after: - node.misc = 'SpaceAfter=No' + + if i == 1 and spaces_before: + node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table) + if not spaces_after: + node.misc["SpaceAfter"] = 'No' + elif spaces_after != " ": + node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table) diff --git a/udapi/block/transform/flatten.py b/udapi/block/transform/flatten.py index ded64fb1..d218ad27 100644 --- a/udapi/block/transform/flatten.py +++ b/udapi/block/transform/flatten.py @@ -4,6 +4,22 @@ class Flatten(Block): """Apply `node.parent = node.root; node.deprel = 'root'` on all nodes.""" - def process_node(self, node): - node.parent = node.root - node.deprel = 'root' + def __init__(self, oneroot=False, **kwargs): + """Args: + oneroot: only the first node will have deprel 'root'. + All other nodes will depend on the first node with deprel 'dep'. + This option makes the trees valid according to the validator. 
+ (default=False) + """ + super().__init__(**kwargs) + self.oneroot = oneroot + + def process_tree(self, tree): + for node in tree.descendants: + node.parent = node.root + node.deprel = 'root' + if self.oneroot: + first = tree.descendants[0] + for node in tree.descendants[1:]: + node.parent = first + node.deprel = 'dep' diff --git a/udapi/block/tutorial/addarticles.py b/udapi/block/tutorial/addarticles.py index 20a4295f..96f0ba2f 100644 --- a/udapi/block/tutorial/addarticles.py +++ b/udapi/block/tutorial/addarticles.py @@ -1,4 +1,8 @@ """tutorial.AddArticles block template.""" +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. from udapi.core.block import Block class AddArticles(Block): diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py index ccc26a66..97677d89 100644 --- a/udapi/block/tutorial/addcommas.py +++ b/udapi/block/tutorial/addcommas.py @@ -1,21 +1,29 @@ """tutorial.AddCommas block template.""" from udapi.core.block import Block +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. 
class AddCommas(Block): """Heuristically insert nodes for missing commas.""" + def __init__(self, language='en', **kwargs): + super().__init__(**kwargs) + self.language = language + def process_node(self, node): + # TODO: Your task: implement some heuristics if self.should_add_comma_before(node): comma = node.create_child(form=',', deprel='punct', upos='PUNCT') comma.shift_before_node(node) def should_add_comma_before(self, node): - # TODO: Your task: implement some heuristics prev_node = node.prev_node if prev_node is None: return False - if prev_node.lemma == 'however': + if self.language == 'en' and prev_node.lemma == 'however': return True if any(n.deprel == 'appos' for n in prev_node.children): return True diff --git a/udapi/block/tutorial/parse.py b/udapi/block/tutorial/parse.py index 77928782..db732a12 100644 --- a/udapi/block/tutorial/parse.py +++ b/udapi/block/tutorial/parse.py @@ -9,11 +9,19 @@ util.MarkDiff gold_zone=gold \ write.TextModeTreesHtml marked_only=1 files=parse-diff.html """ +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. 
from udapi.core.block import Block class Parse(Block): """Dependency parsing.""" + def __init__(self, language='en', **kwargs): + super().__init__(**kwargs) + self.language = language + def process_tree(self, root): # TODO: Your task: implement a better heuristics than "right chain" for node in root.descendants: diff --git a/udapi/block/tutorial/removecommas.py b/udapi/block/tutorial/removecommas.py new file mode 100644 index 00000000..a07e2bba --- /dev/null +++ b/udapi/block/tutorial/removecommas.py @@ -0,0 +1,13 @@ +"""tutorial.RemoveCommas helper block.""" +from udapi.core.block import Block + + +class RemoveCommas(Block): + """Delete all comma nodes and edit SpaceAfter and text accordingly.""" + + def process_tree(self, root): + for node in root.descendants: + if node.form == ",": + node.remove(children="rehang") + del node.prev_node.misc['SpaceAfter'] + root.text = root.compute_text() diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index ffa78bbb..e7eb3989 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,5 +1,6 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block +import logging class AddMwt(Block): @@ -14,6 +15,9 @@ def process_node(self, node): orig_attr[attr] = getattr(node, attr) orig_attr['feats'] = node.feats.copy() orig_attr['misc'] = node.misc.copy() + # Defaults for the newly created MWT + mwt_misc = node.misc.copy() + mwt_form = node.form forms = analysis['form'].split() main = analysis.get('main', 0) @@ -36,12 +40,28 @@ def process_node(self, node): elif orig_attr['form'][0].isupper(): nodes[0].form = nodes[0].form.title() + node.misc = None for attr in 'lemma upos xpos feats deprel misc'.split(): if attr in analysis: values = analysis[attr].split() for i, new_node in enumerate(nodes): + if len(values) <= i: + logging.warning("Attribute '%s' not supplied for word no. 
%d" % (attr, i)) + for attr in 'form lemma upos xpos feats deprel misc'.split(): + logging.warning("%s = %s" % (attr, analysis.get(attr, ''))) if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) + # No MISC attribute should be duplicated on the word level and token level, + # so if copying MISC to a new_node, delete mwt_misc. + # However, SpaceAfter should be annotated only on the token level, + # so make sure it is not accidentally copied on the word level. + if attr == 'misc': + orig_attr['misc'].clear() + for a in 'SpaceAfter SpacesAfter SpacesBefore'.split(): + if new_node.misc[a]: + orig_attr['misc'][a] = new_node.misc[a] + del new_node.misc[a] + elif attr == 'feats' and '*' in values[i]: new_node.feats = values[i] for feat_name, feat_value in list(new_node.feats.items()): @@ -50,8 +70,23 @@ def process_node(self, node): else: setattr(new_node, attr, values[i]) - mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc']) - node.misc = None + # Entity (coreference) annotation should be only on the word level, + # so make sure it does not stay on the token level. + if mwt_misc['Entity']: + nodes[0].misc['Entity'] = mwt_misc['Entity'] + del mwt_misc['Entity'] + + # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT. 
+ if node.multiword_token: + mwt_words = node.multiword_token.words + mwt_form = node.multiword_token.form + if node.multiword_token.misc: + mwt_misc.update(node.multiword_token.misc) + node.multiword_token.remove() + mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes + nodes = mwt_words + + mwt = node.root.create_multiword_token(words=nodes, form=mwt_form, misc=mwt_misc) self.postprocess_mwt(mwt) def multiword_analysis(self, node): diff --git a/udapi/block/ud/addpuncttype.py b/udapi/block/ud/addpuncttype.py new file mode 100644 index 00000000..f5f20e06 --- /dev/null +++ b/udapi/block/ud/addpuncttype.py @@ -0,0 +1,91 @@ +""" +Some UD treebanks use features PunctType and PunctSide that classify +punctuation symbols. This block can be used to add such features to data where +they are missing – the classification is mostly deterministic. If the input +data already contains such features, their values will be overwritten. +""" +from udapi.core.block import Block + +# TODO We need to know the language, there are many other quotation styles, +# e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”. +# Danish uses uses the French quotes, but switched: »X«. 
+ +PUNCT_TYPES = { + '(': 'Brck', + ')': 'Brck', + '[': 'Brck', + ']': 'Brck', + '{': 'Brck', + '}': 'Brck', + '.': 'Peri', + '...': 'Elip', + '…': 'Elip', + ',': 'Comm', + ';': 'Semi', + ':': 'Colo', + '!': 'Excl', + '¡': 'Excl', # Spanish initial exclamation mark + '?': 'Qest', + '¿': 'Qest', # Spanish initial question mark + '/': 'Colo', # it is used this way in AnCora + '-': 'Dash', + '–': 'Dash', + '—': 'Dash', + '"': 'Quot', + "'": 'Quot', + '`': 'Quot', + '“': 'Quot', # opening English, closing Czech + '”': 'Quot', # closing English + '„': 'Quot', # opening Czech + '‘': 'Quot', # opening English, closing Czech + '’': 'Quot', # closing English + '‚': 'Quot', # opening Czech + '«': 'Quot', # opening French, closing Danish + '»': 'Quot', # closing French, opening Danish + '‹': 'Quot', + '›': 'Quot', + '《': 'Quot', # Korean, Chinese + '》': 'Quot', + '「': 'Quot', # Chinese, Japanese + '」': 'Quot', + '『': 'Quot', + '』': 'Quot' +} + +PUNCT_SIDES = { + '(': 'Ini', + ')': 'Fin', + '[': 'Ini', + ']': 'Fin', + '{': 'Ini', + '}': 'Fin', + '¡': 'Ini', # Spanish initial exclamation mark + '!': 'Fin', # but outside Spanish people may expect empty value + '¿': 'Ini', # Spanish initial question mark + '?': 'Fin', + '《': 'Ini', # Korean, Chinese + '》': 'Fin', + '「': 'Ini', # Chinese, Japanese + '」': 'Fin', + '『': 'Ini', + '』': 'Fin' +} + + +class AddPunctType(Block): + """Add features PunctType and PunctSide where applicable.""" + + def process_node(self, node): + # The two features apply only to PUNCT. If they already occur elsewhere, erase them. 
+ if node.upos != 'PUNCT': + node.feats['PunctType'] = '' + node.feats['PunctSide'] = '' + else: + if node.form in PUNCT_TYPES: + node.feats['PunctType'] = PUNCT_TYPES[node.form] + else: + node.feats['PunctType'] = '' + if node.form in PUNCT_SIDES: + node.feats['PunctSide'] = PUNCT_SIDES[node.form] + else: + node.feats['PunctSide'] = '' diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py new file mode 100644 index 00000000..a4b359ff --- /dev/null +++ b/udapi/block/ud/ar/fixedeprels.py @@ -0,0 +1,699 @@ +"""Block to fix case-enhanced dependency relations in Arabic.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'أَنَّ': [], + 'أَن': [], + 'إِنَّ': [], + 'إِذَا': [], + 'لَو': [], + 'حَيثُ': [], + 'مِثلَ': [], + 'لِأَنَّ': [], + 'كَمَا': [], +# 'فِي_حِينَ': [], + 'فَ': [] + } + + # Reduction and normalization of prepositions and conjunctions, including + # the derived and compound ones. The Latin transliterations are not really + # needed in the process. We include them here as documentation, but also + # to help the poor editor with rendering the lines. Ideally, each line + # should have left-to-right text at both the beginning and end. 
+ substitution = [ + {'target': ('min:gen', 'مِن:gen'), + 'sources': + [('ibtida min', 'اِبتِدَاء_مِن')] + }, + {'target': ('ʾiṯra:gen', 'إِثرَ:gen'), # ʾiṯra = right after + 'sources': + [('ʾiṯra', 'إِثرَ')] + }, + {'target': ('ʾaṯnāʾa:gen', 'أَثنَاءَ:gen'), # ʾaṯnāʾa = during + 'sources': + [('ʾaṯnāʾa', 'أَثنَاءَ')] + }, + {'target': ('ʾiḏ', 'إِذ'), # ʾiḏ = because + 'sources': + [('ʾiḏ', 'إِذ'), + ('ʾiḏ ʾanna', 'إِذ_أَنَّ')] + }, + {'target': ('ʾiḏā', 'إِذَا'), # ʾiḏā = if + 'sources': + [('ʾiḏā', 'إِذَا'), + ('ʾiḏā', 'إِذًا')] + }, + ] + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'اِبتِدَاء_مِن': 'مِن:gen', + 'إِثرَ': 'إِثرَ:gen', # ʾiṯra = right after + 'أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'إِذ': 'إِذ', # ʾiḏ = because + 'إِذ_أَنَّ': 'إِذ', # ʾiḏ ʾanna + 'إِذًا': 'إِذَا', + 'إِذَا': 'إِذَا', # remove morphological case; ʾiḏā = if + 'إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'أَلَّا': 'إِلَّا', + 'إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_إِذَا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَن': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ_هُوَ': 'إِلَّا', # ʾillā = except, unless + 'إِلَى': 'إِلَى:gen', # ʾilā = to + 'إِلَى_أَن': 'إِلَى:gen', + 'إِلَى_أَنَّ': 'إِلَى_أَنَّ', # until? that? 
+ 'إِلَى_أَنَّ_لَدَى': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن_بَينَ': 'إِلَى_أَنَّ', + 'إِلَى_بَعدَ': 'إِلَى:gen', + 'إِلَى_بَينَ': 'إِلَى_بَينِ:gen', # ʾilā bayni = to between + 'إِلَى_جَانِب': 'إِلَى_جَانِبِ:gen', # ʾilā ǧānibi = beside + 'إِلَى_حَوَالَى': 'إِلَى:gen', # ila hawala = to around X + 'إِلَى_حَوَالَى_مِن': 'إِلَى:gen', # ila hawala min + 'إِلَى_حَيثُ': 'إِلَى:gen', + 'إِلَى_حِينَ': 'فِي_حِينِ', # during + 'إِلَى_خَارِجَ': 'إِلَى_خَارِجِ:gen', # ʾilā ḫāriǧi = out + 'إِلَى_فِي': 'إِلَى:gen', + 'إِلَى_قَبلَ': 'إِلَى_قَبلِ:gen', # ʾilā qabli = until before X (e.g. until one year ago) + 'إِلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'إِلَى_نَحوَ': 'إِلَى:gen', # to about N + 'أَمَّا': 'أَمَامَ:gen', + 'إِمَّا_لِ': 'لِ:gen', + 'أَمَامَ': 'أَمَامَ:gen', # ʾamāma = in front of + 'أَمَامَ_مِن': 'أَمَامَ:gen', + 'أَن': 'أَنَّ', # remove morphological case; ʾanna = that + 'أَنَّ': 'أَنَّ', # remove morphological case; ʾanna = that + 'إِن': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّ': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّمَا': 'إِنَّ', + 'إِيَّا': 'إِلَّا', + 'بِ': 'بِ:gen', # bi = for, with + 'بِ_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'بِ_إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'بِ_اِستِثنَاء': 'بِاِستِثنَاءِ:gen', # biistiṯnāʾi = with exception of + 'بِ_اِسم': 'بِاِسمِ:gen', # biismi = in name of + 'بِ_إِضَافَة_إِلَى': 'بِاَلإِضَافَةِ_إِلَى:gen', # bi-al-ʾiḍāfati ʾilā = in addition to + 'بِ_إِضَافَة_إِلَى_أَنَّ': 'إِلَى_أَنَّ', + 'بِ_إِضَافَة_لِ': 'بِاَلإِضَافَةِ_إِلَى:gen', # in addition to + 'بِ_اِعتِبَار': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_أَنَّ': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_مِن': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِمَاد_عَلَى': 'بِاَلِاعتِمَادِ_عَلَى:gen', # 
bi-al-i-ʼʿtimādi ʿalā = depending on + 'بِ_إِلَى': 'بِ:gen', + 'بِ_أَنَّ': 'أَنَّ', # that + 'بِ_أَن': 'بِ:gen', + 'بِ_إِنَّ': 'بِ:gen', + 'بِ_أَنَّ_أَمَامَ': 'أَنَّ', # that + 'بِ_أَنَّ_لَا': 'أَنَّ', # that + 'بِ_أَنَّ_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هما_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ_عَلَى': 'أَنَّ', # that + 'بِ_اِنطِلَاق': 'بِ:gen', + 'بِ_تَالِي_إِنَّ': 'بِ:gen', + 'بِ_تَعَاوُن_مَعَ': 'بِاَلتَّعَاوُنِ_مَعَ:gen', # bi-at-taʿāwuni maʿa = in cooperation with + 'بِ_تُهمَة': 'بِتُهمَةِ:gen', # bituhmati = on charges of + 'بِ_تَوَازِي_مَعَ': 'بِاَلتَّوَازِي_مَعَ:gen', # bi-at-tawāzī maʿa = in parallel with + 'بِ_ثُمَّ': 'بِ:gen', + 'بِ_جَانِب': 'بِجَانِبِ:gen', # biǧānibi = next to + 'بِ_جِهَة': 'بِ:gen', + 'بِ_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'بِ_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'بِ_حُضُور': 'فِي_حُضُورِ:gen', # together with + 'بِ_حَقّ': 'بِ:gen', + 'بِ_حُكم': 'بِ:gen', + 'بِ_حُلُول': 'بِ:gen', + 'بِ_حَوَالَى': 'بِ:gen', # bi hawala = with around X + 'بِ_حَيثُ': 'بِ:gen', + 'بِ_خُصُوص': 'بِخُصُوصِ:gen', # biḫuṣūṣi = with regard + 'بِ_خِلَاف': 'بِخِلَافِ:gen', # biḫilāfi = in addition to + 'بِ_دَاخِلَ': 'دَاخِلَ:gen', + 'بِ_دَعوَى': 'بِ:gen', + 'بِ_دَور': 'بِ:gen', # bidawri = with role, in turn? + 'بِ_دُون': 'دُونَ:gen', + 'بِ_دُونَ': 'دُونَ:gen', # bi dūni = without + 'بِ_دُونَ_أَن': 'دُونَ:gen', # bi dūni ʾan = without + 'بِ_رِعَايَة': 'بِ:gen', + 'بِ_رَغم': 'رَغمَ:gen', # despite + 'بِ_رَغم_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَن': 'بِ:gen', + 'بِ_رَغم_مِن_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَنَّ_هُوَ': 'بِ:gen', + 'بِ_رِفقَة': 'بِرِفقَةٍ:gen', # birifqatin = in company of + 'بِ_رِئَاسَة': 'بِ:gen', + 'بِ_سَبّ': 'بِ:gen', + 'بِ_سَبَب': 'بِسَبَبِ:gen', # bisababi = because of + 'بِ_شَأن': 'بِشَأنِ:gen', # bišaʾni = about, regarding (lit. 
with + matter) + 'بِ_شَرط_أَن': 'بِ:gen', + 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to + 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of + 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as + 'بِ_صُورَة': 'بِ:gen', + 'بِ_عَكس': 'بِ:gen', + 'بِ_عَلَى': 'بِ:gen', + 'بِ_عَن': 'بِ:gen', + 'بِ_عَين': 'بِ:gen', + 'بِ_غَضّ_نَظَر_عَن': 'بِغَضِّ_اَلنَّظَرِ_عَن:gen', # biġaḍḍi an-naẓari ʿan = regardless of + 'بِ_فَضل': 'بِفَضلِ:gen', # bifaḍli = thanks to + 'بِ_فِي': 'بِ:gen', + 'بِ_قَدر': 'بِ:gen', + 'بِ_قُرب_مِن': 'بِاَلقُربِ_مِن:gen', # bi-al-qurbi min = near (with proximity to) + 'بِ_قَصد': 'بِقَصدِ:gen', # biqaṣdi = with intention + 'بِ_كَ': 'بِ:gen', + 'بِ_لِ': 'بِ:gen', + 'بِ_لَا': 'بِ:gen', + 'بِ_مَا_أَنَّ': 'بِ:gen', + 'بِ_مَثَابَة': 'بِ:gen', + 'بِ_مِثلَ': 'مِثلَ', # miṯla = like + 'بِ_مُجَرَّد': 'بِ:gen', + 'بِ_مُسَاعَدَة': 'بِ:gen', + 'بِ_مُشَارَكَة': 'بِمُشَارَكَةِ:gen', # bimušārakati = with participation of + 'بِ_مُقَارَنَة_بِ': 'بِاَلمُقَارَنَةِ_بِ:gen', # bi-al-muqāranati bi = in comparison to + 'بِ_مُقتَضَى': 'بِمُقتَضَى:gen', # bimuqtaḍā = with requirement of + 'بِ_مِقدَار': 'بِ:gen', + 'بِ_مِن': 'بِ:gen', + 'بِ_مُنَاسَبَة': 'بِمُنَاسَبَةِ:gen', # bimunāsabati = on the occasion of + 'بِ_مُوجِب': 'بِمُوجِبِ:gen', # bimūǧibi = with motive + 'بِ_نَتِيجَة': 'بِ:gen', + 'بِ_نَحوَ': 'بِ:gen', # by about N + 'بِ_نِسبَة': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati (bin-nisbati) = in proportion/relation to + 'بِ_نِسبَة_إِلَى': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati ʾilā (bin-nisbati ʾilā) = in proportion/relation to + 'بِ_نِسبَة_لِ': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نِسبَة_لِ_مِن': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نَظَر_إِلَى': 'بِ:gen', + 'بِ_نِيَابَة_عَن': 'بِاَلنِّيَابَةِ_عَن:gen', # bi-an-niyābati ʿan = on behalf of + 'بِ_هَدَف': 'بِهَدَفِ:gen', # bihadafi = with goal + 'بِ_وَ_لِ': 'بِ:gen', + 
'بِ_وَاسِطَة': 'بِوَاسِطَةِ:gen', # biwāsiṭati = by means of + 'بِ_وَاقِع': 'بِ:gen', + 'بِ_وَسَط': 'بِوَسَطِ:gen', # biwasaṭi = in the middle of + 'بِ_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'بِ_وَصف': 'بِ:gen', + 'بازاء': 'بِ:gen', + 'بالتسخين': 'بِ:gen', + 'بَدَلًا_مِن': 'بَدَلًا_مِن:gen', # badalan min = instead of + 'بدون': 'دُونَ:gen', # without + 'بشان': 'بِشَأنِ:gen', + 'بَعدَ': 'بَعدَ:gen', # baʿda = after + 'بَعدَ_أَن': 'بَعدَ:gen', # baʿda ʾan = after + clause + 'بَعدَ_حَوَالَى': 'بَعدَ:gen', # baada hawala + 'بَعدَ_نَحوَ': 'بَعدَ:gen', # after about N + 'بَعدَمَا': 'بَعدَ:gen', # baʿdamā = after + 'بُعَيدَ': 'بُعَيدَ:gen', # buʿayda = shortly after + 'بَل': 'قَبلَ:gen', + 'بِنَاء_عَلَى': 'بناء_عَلَى:gen', + 'بناء_عَلَى': 'بناء_عَلَى:gen', # bnāʾ ʿalā = based on + 'بناء_لِ': 'لِ:gen', + 'بَيدَ': 'بِ:gen', + 'بَيدَ_أَنَّ': 'بِ:gen', + 'بَينَ': 'بَينَ:gen', # bayna = between + 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala + 'بينا': 'بَينَ:gen', # bayna = between + 'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between + 'بَينَمَا': 'بَينَ:gen', + 'بَينَمَا_لَم': 'بَينَ:gen', + 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing + 'تَحتَ': 'تَحتَ:gen', # tahta = under + 'ثَمَّ': 'بِ:gen', + 'ثُمَّ': 'بِ:gen', + 'جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'حَتَّى': 'حَتَّى:gen', # ḥattā = until + 'حَتَّى_أَنَّ': 'حَتَّى:gen', # before + 'حَتَّى_إِنَّ': 'حَتَّى:gen', # before + 'حَتَّى_بِ': 'حَتَّى:gen', # before + 'حَتَّى_لَو': 'لَو', # even if + 'حَتَّى_وَ_لَو': 'لَو', # even if + 'حَتَّى_وإن': 'إِنَّ', + 'حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَسَبَمَا': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَوَالَى': 'حَوَالَى', # ḥawālā = around, about + 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X + 'حَولَ': 'حَولَ:gen', # ḥawla = about + 'حولما_إِذَا': 'إِذَا', + 'حَولَ_مَا_إِذَا': 'إِذَا', + 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning + 'حَيثُ': 'حَيثُ', # remove morphological case; ḥayṯu = where 
(SCONJ, not ADV) + 'حِينَمَا': 'فِي_حِينِ', # during + 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during + 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind + 'دَاخِل': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِلَ': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دُونَ': 'دُونَ:gen', # dūna = without + 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without + 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without + 'دونما': 'دُونَ:gen', + 'ذٰلِكَ_بَعدَمَا': 'بَعدَ:gen', + 'ذٰلِكَ_عِندَمَا': 'بِ:gen', + 'ذٰلِكَ_لِأَنَّ': 'لِأَنَّ', # because + 'ذٰلِكَ_لِكَي': 'لِكَي', # li-kay = in order to + 'ذٰلِكَ_نَظَر_لِ': 'بِ:gen', + 'رَغمَ': 'رَغمَ:gen', # raġma = despite + 'رَغمَ_أَنَّ': 'رَغمَ:gen', # raġma ʾanna = despite + clause + 'رَغمَ_أَنَّ_مِن': 'رَغمَ:gen', # raġma ʾanna min = despite + 'رَهنَ': 'رَهنَ:gen', # rahna = depending on + 'رَيثَمَا': 'رَهنَ:gen', # rahna = depending on + 'سِوَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_أَنَّ_هُوَ': 'سِوَى:gen', # siwā = except for + 'سِوَى_بِ': 'سِوَى:gen', # siwā = except for + 'سِوَى_عَلَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for + 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against + 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among + 'طَالَمَا': + 'طَالَمَا', # ṭālamā = as long as + 'طالَما': + 'طَالَمَا', # ṭālamā = as long as + 'طَالَمَا_أَنَّ': + 'طَالَمَا', # ṭālamā = as long as + 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout + 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during + 'عبر': 'عَبرَ:gen', + 'عَبرَ': 'عَبرَ:gen', # ʿabra = via + 'عَدَا': 'عَدَا:gen', # ʿadā = except for + 'عَقِبَ': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_أَن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_مِن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَلَى': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أبواب': 'عَلَى:gen', + 'عَلَى_إِثرَ': 'إِثرَ:gen', # ʿalā ʾiṯri = right after + 'عَلَى_أَثَر': 'عَلَى:gen', + 'عَلَى_اِختِلَاف': 'عَلَى:gen', + 'عَلَى_أَسَاس': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based 
on + 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on + 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that + 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'عَلَى_الفور': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_إِلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_عَلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_مِن_شَأن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ_لَدَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ_فِي': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بَينَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حَدّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'عَلَى_حَولَ': 'عَلَى:gen', + 'عَلَى_رَأس': 'عَلَى_رَأسِ:gen', # ʿalā raʾsi = on top of + 'عَلَى_رَغم': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغمَ_أَنَّ': 'رَغمَ:gen', # ʿalā raġma ʾanna = despite + clause + 'عَلَى_رَغم_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ_هُوَ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_طَرِيقَة': 'عَلَى_طَرِيقَةِ:gen', # ʿalā ṭarīqati = on the way + 'عَلَى_عَكس': 'عَلَى:gen', + 'عَلَى_غِرَار': 'عَلَى_غِرَارِ:gen', # ʿalā ġirāri = similar to + 'عَلَى_قَيد': 'عَلَى:gen', + 'عَلَى_لِسَان': 'عَلَى:gen', + 'عَلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'عَلَى_مدى': 'عَلَى:gen', + 'عَلَى_مَدَى': 'عَلَى_مَدَى:gen', # ʿalā madā = on period + 'عَلَى_مَقرَبَة_مِن': 'عَلَى_مَقرَبَةٍ_مِن:gen', # ʿalā maqrabatin min = in the vicinity of + 'عَلَى_مِن': 'عَلَى:gen', + 'عَلَى_نَحوَ': 'عَلَى:gen', # to about N + 'عَلَى_يَد': 
'عَلَى:gen', + 'عَن': 'عَن:gen', # ʿan = about, from + 'عَن_أَن': 'عَن:gen', + 'عَن_أَنَّ': 'عَن:gen', + 'عَن_أَنَّ_وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'عَن_بِ': 'عَن:gen', + 'عَن_طَرِيق': 'عَن_طَرِيقِ:gen', # ʿan ṭarīqi = via + 'عَن_فِي_أَن': 'عَن:gen', + 'عَن_قُربَ': 'قُربَ:gen', # qurba = near + 'عَن_مِثلَ': 'مِثلَ', # miṯla = like + 'عَن_مِن': 'عَن:gen', + 'عِندَ': 'عِندَمَا', # ʿinda = when + 'عِندَمَا': 'عِندَمَا', # ʿindamā = when + 'غَيرَ': 'إِلَّا', + 'فَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_إِذَا': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَدَل_مِن_أَن': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَينَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_عَلَى': 'فَ', # fa = so (advcl or coordination) + 'فَ_فِي': 'فَ', # fa = so (advcl or coordination) + 'فَ_مِن': 'فَ', # fa = so (advcl or coordination) + 'فَورَ': 'فَورَ:gen', # fawra = as soon as + 'فَوقَ': 'فَوقَ:gen', # fawqa = above, over + 'فِي': 'فِي:gen', # fī = in + 'فِي_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'فِي_أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'فِي_إِطَار': 'فِي_إِطَار:gen', # fī ʾiṭār = in frame + 'فِي_اعقاب': 'فِي_أَعقَابِ:gen', + 'فِي_إِلَى': 'فِي:gen', + 'فِي_أَن': 'فِي:gen', + 'فِي_أَنَّ': 'فِي:gen', + 'فِي_أَنَّ_عَلَى': 'فِي:gen', + 'فِي_أَنَّ_لَدَى': 'فِي:gen', + 'فِي_أَنَّ_مِن': 'فِي:gen', + 'فِي_بِ': 'فِي:gen', + 'فِي_بِ_فِي': 'فِي:gen', + 'فِي_بَاطِن': 'فِي:gen', + 'فِي_بَعدَ': 'فِي:gen', + 'فِي_بَينَ': 'بَينَ:gen', + 'فِي_حَال': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَدّ': 'فِي:gen', + 'فِي_حُضُور': 'فِي_حُضُورِ:gen', # fī ḥuḍūri = in presence of + 'فِي_حَقّ': 'فِي:gen', + 'فِي_حُكم': 'فِي:gen', + 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X + 'فِي_حِين': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِين_أَنَّ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ_أَنَّ_هُوَ': + 'فِي_حِينِ', # fī ḥīni = while + 
'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِلَالَ': 'فِي:gen', + 'فِي_دَاخِل': + 'دَاخِلَ:gen', + 'فِي_دَاخِلَ': 'فِي:gen', + 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to + 'فِي_سِيَاق': 'فِي:gen', + 'فِي_شَأن': 'فِي_شَأنِ:gen', # fī šaʾni = in regard of + 'فِي_شَكل': 'فِي:gen', + 'فِي_صَفّ': 'فِي:gen', + 'فِي_صُورَة': 'فِي:gen', + 'فِي_ضَوء': 'فِي_ضَوءِ:gen', # fī ḍawʾi = in light of + 'فِي_ظِلّ': 'فِي_ظِلِّ:gen', # fī ẓilli = in light of + 'فِي_عُقب': 'فِي_أَعقَابِ:gen', # fī ʾaʿqābi = in the aftermath of + 'فِي_غَضن': 'فِي:gen', + 'فِي_غُضُون': 'فِي:gen', + 'فِي_مَا': 'فِي:gen', + 'فِي_مِثلَ': 'مِثلَ', # miṯla = like + 'فِي_مَجَال': 'فِي_مَجَالِ:gen', # fī maǧāli = in the area of + 'فِي_مستشفى': 'فِي:gen', + 'فِي_مَعَ': 'فِي:gen', + 'فِي_مُقَابِلَ': 'مُقَابِلَ:gen', + 'فِي_مَقدَم': 'فِي:gen', + 'فِي_مِن': 'فِي:gen', + 'فِي_مُنَاسَبَة': 'فِي_مُنَاسَبَةِ:gen', # fī munāsabati = on the occasion of + 'فِي_مُوَاجَهَة': 'فِي:gen', + 'فِي_نَحوَ': 'فِي:gen', # in about N + 'فِي_نِطَاق': 'فِي:gen', + 'فِي_وَجه': 'فِي:gen', + 'فِي_وَسط': 'وَسطَ:gen', + 'فِي_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'فِيمَا': 'فِيمَا', # fīmā = while + 'قُبَالَةَ': 'قُبَالَةَ:gen', # qubālata = in front of, facing + 'قَبلَ': 'قَبلَ:gen', # qabla = before + 'قَبلَ_أَن': 'قَبلَ:gen', # qabla = before + 'قَبلَ_حَوَالَى': 'قَبلَ:gen', # qabla hawala + 'قَبلَ_نَحوَ': 'قَبلَ:gen', # before about N + 'قُبَيلَ': 'قُبَيلَ:gen', # qubayla = before + 'قُربَ': 'قُربَ:gen', # qurba = near + 'قَيدَ': 'فِي:gen', + 'كَ': 'كَ:gen', # ka = in (temporal?) 
+ 'كَ_أَنَّ': 'كَ:gen', + 'كَ_لِ': 'كَ:gen', + 'كَ_وَ_وَ': 'كَ:gen', + 'كَأَنَّمَا': 'كَأَنَّمَا', # ka-ʾannamā = as if + 'كُلَّمَا': 'كُلَّمَا', # kullamā = whenever + 'كَمَا': 'كَمَا', # remove morphological case; kamā = as + 'كَي': 'لِكَي', # kay = in order to + 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 'لِ:gen', + 'لِ': 'لِ:gen', # li = to + 'لِ_أَجَلّ': 'لِ:gen', + 'لِ_إِلَى': 'لِ:gen', + 'لِ_أَمَامَ_وَ': 'لِ:gen', + 'لِ_أَن': 'لِ:gen', + 'لِ_بِ': 'لِ:gen', + 'لِ_جِهَة': 'لِ:gen', + 'لِ_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'لِ_حَوَالَى': 'لِ:gen', # li hawala = for around X + 'لِ_خَارِجَ': 'لِخَارِجِ:gen', # liḫāriǧi = out + 'لِ_دُخُول': 'لِ:gen', + 'لِ_دَرَجَة_أَنَّ': 'لِ:gen', + 'لِ_سَبَب': 'لِ:gen', + 'لِ_صَالِح': 'لِصَالِحِ:gen', # liṣāliḥi = in interest of + 'لِ_عَلَى': 'لِ:gen', + 'لِ_عَن': 'لِ:gen', + 'لِ_عِندَ': 'لِ:gen', + 'لِ_فِي': 'لِ:gen', + 'لِ_فِي_بَينَ': 'لِ:gen', + 'لِ_كَون': 'لِكَونِ', # likawni = because + 'لِ_لِئَلّا': 'لِ:gen', + 'لِ_مِثلَ': 'مِثلَ', # miṯla = like + 'لِ_مَعَ': 'لِ:gen', + 'لِ_مِن': 'لِ:gen', + 'لِ_نَحوَ': 'لِ:gen', # to/for about N + 'لِ_وَ': 'لِ:gen', + 'لِ_وَ_فِي': 'لِ:gen', + 'لَا': 'إِلَّا', + 'لَا_سِيَّمَا_بَعدَ': 'بَعدَ:gen', + 'لَا_سِيَّمَا_وَ_أَنَّ': 'أَنَّ', + 'لَا_سِيَّمَا_وَ_أَنَّ_هُوَ': 'أَنَّ', + 'لِأَنَّ': 'لِأَنَّ', # remove morphological case; li-ʾanna = because + 'لدى': 'لَدَى:gen', + 'لَدَى': 'لَدَى:gen', # ladā = with, by, of, for + 'لِذَا': 'لِذَا', # liḏā = so, therefore + 'لِذَا_فَ': 'لِ:gen', + 'لِذٰلِكَ': 'لِذَا', # liḏā = so, therefore + 'لٰكِنَّ': 'مَعَ:gen', + 'لكن_إِذَا': 'إِذَا', + 'لكن_بِ': 'بِ:gen', + 'لٰكِن_بَعدَ': 'بَعدَ:gen', + 'لكن_دَاخِلَ': 'دَاخِلَ:gen', + 'لكن_لَدَى': 'لَدَى:gen', + 'لٰكِن_مَعَ': 'مَعَ:gen', + 'لِكَي': 'لِكَي', # li-kay = in order to + 'لَمَّا': 'كُلَّمَا', + 'لَمَّا_لِ': 'كُلَّمَا', + 'لَو': 'لَو', # law = if + 'لَو_أَنَّ': 'لَو', # if + 'لَو_مِن': 'لَو', # if + 'ما': 'مِمَّا', + 'مَا': 'مِمَّا', + 'ما_دَام': 'مِمَّا', + 'مادامت': 'مِمَّا', + 
'مَالَم': 'مَالَم', # mālam = unless + 'مَا_إِذَا': 'إِذَا', + 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like + 'مِثلَمَا': 'مِثلَ', # miṯla = like + 'مَعَ': 'مَعَ:gen', # maʿa = with + 'مَعَ_أَنَّ': 'مَعَ:gen', + 'مَعَ_بِ': 'مَعَ:gen', + 'مَعَ_فِي': 'مَعَ:gen', + 'مَعَ_مِن_بَينَ': 'بَينَ:gen', + 'مقابل': 'مُقَابِلَ:gen', + 'مُقَابِلَ': 'مُقَابِلَ:gen', # muqābila = in exchange for, opposite to, corresponding to + 'مُقَابِلَ_حَوَالَى': 'مُقَابِلَ:gen', # muqabila hawala + 'مُقَارَن_بِ': 'بِ:gen', + 'مِمَّا': 'مِمَّا', # mimmā = that, which + 'مِمَّا_لَدَى': 'مِمَّا', # mimmā = that, which + 'مِن': 'مِن:gen', # min = from + 'مِن_اجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل_أَن': 'مِن:gen', + 'مِن_إِلَى': 'مِن:gen', + 'مِن_أَن': 'مِن:gen', + 'مِن_أَنَّ': 'مِن:gen', + 'مِن_بِ': 'مِن:gen', + 'مِن_بَعدَ': 'مِن:gen', + 'مِن_بَينَ': 'بَينَ:gen', + 'مِن_تَحتَ': 'مِن:gen', + 'مِن_ثَمَّ': 'مِن:gen', + 'مِن_ثُمَّ': 'مِن:gen', + 'مِن_جَانِب': 'إِلَى_جَانِبِ:gen', # min ǧānibi = beside + 'مِن_جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'مِن_حَوَالَى': 'مِن:gen', # min hawala = from around X + 'مِن_حَولَ': 'مِن:gen', + 'مِن_حَيثُ': 'مِن:gen', + 'مِن_خَارِج': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خَارِجَ': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خِلَالَ': 'مِن_خِلَالِ:gen', # min ḫilāli = through, during + 'مِن_دَاخِلَ': 'مِن_دَاخِلِ:gen', # min dāḫili = from inside + 'مِن_دُون': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُونَ': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُون_أَن': 'مِن_دُونِ:gen', + 'مِن_دُونَ_أَن': 'مِن_دُونِ:gen', # min dūni ʾan = without, beneath, underneath + clause + 'مِن_زَاوِيَة': 'مِن:gen', + 'مِن_شَأن': 'مِن_شَأنِ:gen', # min šaʾni = from matter + 'مِن_ضِمنَ': 'مِن_ضِمنِ:gen', # min ḍimni = from within = including + 'مِن_طَرَف': 'مِن:gen', + 'مِن_عَلَى': 'مِن:gen', + 'مِن_عِندَ': 
'مِن:gen', + 'مِن_غَير_أَن': 'مِن:gen', + 'مِن_فَوقَ': 'مِن_فَوقِ:gen', # min fawqi = from above + 'مِن_فِي': 'مِن:gen', + 'مِن_قَبلَ': 'مِن_قِبَلِ:gen', + 'مِن_قِبَل': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_قِبَل_بِ_فِي': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_مِثلَ': 'مِثلَ', # miṯla = like + 'مِن_مِن': 'مِن:gen', + 'مِن_مِن_بَينَ': 'بَينَ:gen', + 'مِن_مَوقِع': 'مِن:gen', + 'مِن_نَاحِيَة': 'مِن:gen', + 'مِن_وَرَاءَ': 'مِن_وَرَاءِ:gen', # min warāʾi = from behind + 'مُنذُ': 'مُنذُ:gen', # munḏu = since + 'مُنذُ_أَن': 'مُنذُ:gen', + 'مُنذُ_نَحوَ': 'مُنذُ:gen', # since about N + 'مُنذُ_وَ_فِي': 'مُنذُ:gen', + 'مَهمَا': 'مَهمَا', # mahmā = regardless + 'نَاهِيك_بِ': 'بِ:gen', + 'نَتِيجَة_لِ': 'لِ:gen', + 'نَحوَ': 'نَحوَ', # naḥwa = about, approximately + 'نَحوَ_بِ': 'بِ:gen', # about by N + 'هذا_بالأضافة': 'بِ:gen', + 'وان': 'أَنَّ', + 'وإن': 'إِنَّ', + 'وبشان': 'بِشَأنِ:gen', + 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'وِفقَ': 'وِفقَ:gen', # wifqa = according to + 'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to + 'ولو': 'إِذَا', # walaw = even if + 'ولو_أَنَّ': 'إِذَا' # walaw = even if + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). 
+ cdeprel : str + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + assert(bdeprel[-1] != ':') + edeprel = bdeprel + if cdeprel: + assert(cdeprel[0] != ':') + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): + """ + Occasionally the edeprels automatically derived from the Arabic basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. + """ + for node in tree.descendants_and_empty: + for edep in node.deps: + if edep['deprel'] == 'advcl:pred:إِذَن' or edep['deprel'] == 'advcl:pred:كدا' or edep['deprel'] == 'advcl:pred:لكن': + edep['deprel'] = 'advcl:pred' + continue + if edep['deprel'] == 'nmod:بِأَسْرِ:gen': + edep['deprel'] = 'nmod' + continue + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) + if m: + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Arabic clauses often start with وَ wa "and", which does not add + # much to the meaning but sometimes gets included in the enhanced + # case label. Remove it if there are more informative subsequent + # morphs. + cdeprel = re.sub(r'^وَ_', r'', cdeprel) + cdeprel = re.sub(r'^وَ:', r'', cdeprel) + cdeprel = re.sub(r'^وَ$', r'', cdeprel) + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking.
+ for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # Split preposition from morphological case (if any), normalize + # the preposition and add the fixed morphological case where + # applicable. + m = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel) + adposition = m.group(1) if m else cdeprel + if adposition in self.unambiguous: + cdeprel = self.unambiguous[adposition] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/basic2enhanced.py b/udapi/block/ud/basic2enhanced.py new file mode 100644 index 00000000..bc5c8b25 --- /dev/null +++ b/udapi/block/ud/basic2enhanced.py @@ -0,0 +1,23 @@ +"""Block ud.Basic2Enhanced for copying basic dependencies to enhanced where missing. + +UD treebanks are not required to have enhanced dependencies (https://universaldependencies.org/u/overview/enhanced-syntax.html). 
+However, if such annotation is present (in the DEPS column of CoNLL-U), +it must be present in all nodes and all nodes must be reachable from the root +in the enhanced-deps graph (as checked by the validator). +There may be use cases where enhanced deps are annotated only in some kinds of nodes (e.g. empty nodes) +and the rest of the nodes are expected to be the same as in the basic dependencies. +To make such a file valid, one can use this block. + +This block should not be used on a file with no enhanced dependencies: +It makes no sense to just duplicate the HEAD+DEPREL information also in the DEPS column. +""" +from udapi.core.block import Block + + +class Basic2Enhanced(Block): + """Make sure DEPS column is always filled.""" + + def process_tree(self, tree): + for node in tree.descendants_and_empty: + if node.raw_deps == "_": + node.raw_deps = f"{node.parent.ord}:{node.deprel}" diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py new file mode 100644 index 00000000..49b79da1 --- /dev/null +++ b/udapi/block/ud/ca/addmwt.py @@ -0,0 +1,194 @@ +"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions. + +According to the UD guidelines, contractions such as "del" = "de el" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs.
+""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + return None + + def fix_personal_pronoun(self, node): + # There is a mess in lemmas and features of personal pronouns. 
+ if node.upos == 'PRON': + if re.match("^jo$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Nom|Number=Sing|Person=1|PronType=Prs' + if re.match("^(em|m'|-me|'m|me|m)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs' + if re.match("^mi$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs' + if re.match("^tu$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs' + if re.match("^(et|t'|-te|'t|te|t)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs' + if re.match("^ti$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). 
+ if re.match("^ell$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^ella$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(el|-lo|'l|lo)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(la|-la)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(l')$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(ho|-ho)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs' + if re.match("^(li|-li)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Dat|Number=Sing|Person=3|PronType=Prs' + if re.match("^(es|s'|-se|'s|se|s)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes' + if re.match("^si$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes' + # If nosaltres can be used after a preposition, we should not tag it as nominative. + if re.match("^nosaltres$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Plur|Person=1|PronType=Prs' + # Nós is the majestic first person singular. In accusative and dative, it is identical to first person plural. 
+ if re.match("^nós$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Sing|Person=1|Polite=Form|PronType=Prs' + if re.match("^(ens|-nos|'ns|nos|ns)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs' + if re.match("^vosaltres$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|PronType=Prs' + # Vós is the formal second person singular. In accusative and dative, it is identical to second person plural. + # Vostè is even more formal than vós. In accusative and dative, it is identical to third person singular. + if re.match("^(vós|vostè)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Sing|Person=2|Polite=Form|PronType=Prs' + if re.match("^vostès$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|Polite=Form|PronType=Prs' + if re.match("^(us|-vos|-us|vos)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). + if re.match("^ells$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Plur|Person=3|PronType=Prs' + if re.match("^elles$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # Els is masculine accusative, or dative in any gender. + if re.match("^(els|-los|'ls|los|ls)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs' + if re.match("^(les|-les)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # There are also "adverbial" pronominal clitics that can occur at direct object positions. 
+ if re.match("^(en|n'|'n|-ne|n|ne)$", node.form, re.IGNORECASE): + node.lemma = 'en' + node.feats = 'Case=Gen|Person=3|PronType=Prs' + if re.match("^(hi|-hi)$", node.form, re.IGNORECASE): + node.lemma = 'hi' + node.feats = 'Case=Loc|Person=3|PronType=Prs' + + def report_suspicious_lemmas(self, node): + # There are offset issues of split multi_word_expressions. + # Sometimes a word gets the lemma of the neighboring word. + if node.form.lower()[:1] != node.lemma.lower()[:1]: + # Exclude legitimate cases where the lemma starts with a different letter. + hit = True + if node.lemma == 'jo' and re.match("(em|ens|m'|me|mi|nos|nosaltres|'ns)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'tu' and re.match("(et|'t|us|vosaltres|vostè)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'el' and re.match("(la|l|l'|les)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ell' and re.match("(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'es' and re.match("(s|se)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'em' and re.match("('m|m|m')", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'en' and re.match("('n|n'|ne|-ne)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'anar' and re.match("(va|van|vàrem)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ser' and re.match("(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'estar' and re.match("(sigut)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'caure' and re.match("(queia|queies|quèiem|quèieu|queien)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ampli' and re.match("(àmplia|àmplies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'indi' and re.match("(índies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'obvi' and 
re.match("(òbvia)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ossi' and re.match("(òssies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ús' and re.match("(usos)", node.form, re.IGNORECASE): + hit = False + # Form = '2001/37/CE', lemma = 'CE' + # Form = 'nº5', lemma = '5' + # Form = 'kg.', lemma = 'quilogram' + # Form = 'un', lemma = '1' + if node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE) or re.match("^quil[oò]", node.lemma, re.IGNORECASE) or re.match("^[0-9]+$", node.lemma): + hit = False + if hit: + print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address())) diff --git a/udapi/block/ud/ca/elque.py b/udapi/block/ud/ca/elque.py new file mode 100644 index 00000000..6b3ad22b --- /dev/null +++ b/udapi/block/ud/ca/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que...'). +It is written for Catalan but a similar block should work for Spanish and other +Romance languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if node.lemma == 'que' and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. 
+ if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. 
+ attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. 
+ """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,7 +34,9 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, + previous_form_label='CorrectForm', previous_text_label='OrigText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,33 +56,66 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). 
+ Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. + previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form - if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" + if node.form not in ("''", "``") and 
self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return @@ -112,13 +147,14 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -130,6 +166,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -181,18 +221,37 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc[self.added_label] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splitting and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -201,20 +260,33 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): - if self.allow_space(form): - self.store_orig_form(node, form) - node.form = form - elif self.allow_goeswith: - forms = form.split() - node.form = forms[0] - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -235,8 +307,14 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_orig_form(node, form) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc[self.added_label] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): @@ -261,6 +339,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 4c203ddc..a690c95b 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -1,17 +1,30 @@ """Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" import udapi.block.ud.addmwt +import re +import logging +# Define static rules for 'aby', 'kdyby' and similar forms. 
MWTS = { - 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, - 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ 
Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. + 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, } for v in MWTS.values(): v['upos'] = 'SCONJ AUX' @@ -25,23 +38,52 @@ person = '1' elif 'Person=2' in v['feats']: person = '2' - v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) v['deprel'] = '* aux' v['lemma'] = v['form'].split()[0] + ' být' v['main'] = 0 v['shape'] = 'siblings' +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. # nač -> na + co -for prep in 'na za o'.split(): +for prep in 'na o za'.split(): MWTS[prep + 'č'] = { 'form': prep + ' co', 'lemma': prep + ' co', 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } +# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!) +# A few additional instances were found in older texts, too (e.g. 16th century). +# We must do it separately, as the preposition is vocalized. 
+MWTS['seč'] = { + 'form': 'se' + ' co', + 'lemma': 's' + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RV--4---------- PQ--4----------', + 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', +} + +# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below). +MWTS['toliť'] = { + 'form': 'to li ť', + 'lemma': 'ten li ť', + 'upos': 'DET SCONJ PART', + 'xpos': '* J,------------- TT-------------', + 'feats': '* _ _', + 'deprel': '* mark discourse', + 'main': 0, + 'shape': 'siblings' +} + class AddMwt(udapi.block.ud.addmwt.AddMwt): @@ -49,25 +91,153 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis - - # There is no VerbType=verbconj in the UD_Czech data. - # The purpose of this rule is rather to show that - # it is possible to write such "dynamic" rules - # (which cannot be included in static MWTS). - if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': - return { - 'form': node.form.lower()[:-1] + ' neboť', - 'lemma': '* neboť', - 'upos': '* CCONJ', - 'xpos': 'Vt-S---3P-NA--2 J^-------------', - 'feats': '* _', - 'deprel': '* cc', - 'main': 0, - 'shape': 'subtree', - } + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) 
+ if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'jest': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jest', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---3P-AAI-2', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # dajžto = dajž + to + if subtokens[1] == 'to': + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ten', + 'upos': '* DET', + 'xpos': '* PDNS4----------', + 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason is that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. 
+ if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } return None def postprocess_mwt(self, mwt): diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py new file mode 100644 index 00000000..4e2be633 --- /dev/null +++ b/udapi/block/ud/cs/fixedeprels.py @@ -0,0 +1,685 @@ +"""Block to fix case-enhanced dependency relations in Czech.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'aby': [], + 'ač': [], + 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... 
ale možná ne když je doprovázeno předložkou + 'ať': [], + 'byť': [], + 'i_když': [], + 'jak': [], + 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole + 'jako': [], + 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], + 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], + 'protože': [], + 'takže': [], + 'třebaže': [], + 'že': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") + 'abi': 'aby', + 'aby_na': 'na:loc', + 'ačkoliv': 'ačkoli', + 'ať': 'ať', # remove morphological case + 'ať_forma': 'formou:gen', + 'ať_jako': 'jako', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', + 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', + 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', + 'ať_z_strana': 'ze_strany:gen', + 'až_do': 'do:gen', + 'až_o': 'o:acc', + 'během': 'během:gen', + 'bez': 'bez:gen', + 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! 
a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) + 'bez_zřetel_k': 'bez_zřetele_k:dat', + 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', + 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', + 'cesta': 'cestou:gen', + 'co_jako': 'jako', + 'coby': 'coby', # remove morphological case + 'daleko': 'nedaleko:gen', + 'daleko_od': 'od:gen', + 'dík': 'díky:dat', + 'díky': 'díky:dat', + 'dle': 'dle:gen', + 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', + 'do_k': 'k:dat', + 'do_oblast': 'do_oblasti:gen', + 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', + 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', + 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', + 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', + 'jak_aby': 'jak', + 'jak_ad': 'jak', + 'jakkoliv': 'jakkoli', + 'jako': 'jako', # remove morphological case + 'jako_kupříkladu': 'jako', + 'jakoby': 'jako', + 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', + 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', + 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', + 'k': 'k:dat', + 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', + 'kdykoliv': 'kdykoli', + 'kol': 'kolem:gen', + 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', + 'konec': 'koncem:gen', + 'krom': 'kromě:gen', + 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', + 'liž': 'li', + 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', + 'na_báze': 'na_bázi:gen', + 'na_čelo': 'na_čele:gen', + 'na_mimo': 'na:loc', # na kurtě i mimo něj + 'na_než': 'na:acc', # na víc než čtyři a půl kilometru + 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is 
same + 'na_podklad': 'na_podkladě:gen', + 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_strana': 'na_straně:gen', + 'na_účet': 'na_účet:gen', + 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier + 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', + 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', + 'na_základ': 'na_základě:gen', + 'na_základna': 'na_základně:gen', + 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... + 'namísto': 'namísto:gen', + 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', + 'narozdíl_od': 'na_rozdíl_od:gen', + 'následek': 'následkem:gen', + 'navzdory': 'navzdory:dat', + 'nedaleko': 'nedaleko:gen', + 'než': 'než', # remove morphological case + 'nežli': 'nežli', # remove morphological case + 'o_jako': 'jako', + 'o_o': 'o:acc', + 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče + 'ohledně': 'ohledně:gen', + 'okolo': 'okolo:gen', + 'oproti': 'oproti:dat', + 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', + 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', + 'po_vzor': 'po_vzoru:gen', + 'poblíž': 'poblíž:gen', + 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', + 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', + 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', + 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', + 'podle': 'podle:gen', + 'pomoc': 'pomocí:gen', + 'pomocí': 'pomocí:gen', + 'postup': 'postupem:gen', + 'pouze_v': 'v:loc', + 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', + 'prostřednictví': 'prostřednictvím:gen', + 'prostřednictvím': 'prostřednictvím:gen', + 'proti': 'proti:dat', + 'proto_aby': 'aby', + 'protože': 'protože', # remove morphological case + 'před_během': 'během:gen', # před a během utkání + 
'před_po': 'po:loc', # před a po vyloučení Schindlera + 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error + 'přestože': 'přestože', # remove morphological case + 'při': 'při:loc', + 'při_pro': 'při:loc', + 'při_příležitost': 'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X + 's_ohled_k': 's_ohledem_k:dat', + 's_ohled_na': 's_ohledem_na:acc', + 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', + 's_přihlédnutí_k': 's_přihlédnutím_k:dat', + 's_přihlédnutí_na': 's_přihlédnutím_na:acc', + 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', + 's_vyloučení': 's_vyloučením:gen', + 's_zřetel_k': 'se_zřetelem_k:dat', + 's_zřetel_na': 'se_zřetelem_na:acc', + 'severně_od': 'od:gen', + 'skrz': 'skrz:acc', + 'směr_do': 'směrem_do:gen', + 'směr_k': 'směrem_k:dat', + 'směr_na': 'směrem_na:acc', + 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', + 'společně_s': 'společně_s:ins', + 'spolu': 'spolu_s:ins', + 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', + 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', + 'takže': 'takže', # remove morphological case + 'takže_a': 'takže', + 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', + 'u': 'u:gen', + 'u_příležitost': 'u_příležitosti:gen', + 'uprostřed': 'uprostřed:gen', + 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) + 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', + 'v_čelo': 'v_čele:gen', + 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', + 'v_dohoda_s': 'v_dohodě_s:ins', + 'v_duch': 'v_duchu:gen', + 'v_důsledek': 'v_důsledku:gen', + 'v_forma': 've_formě:gen', + 'v_jméno': 've_jménu:gen', + 'v_k': 'k:dat', + 'v_kombinace_s': 'v_kombinaci_s:ins', + 'v_konfrontace_s': 'v_konfrontaci_s:ins', + 'v_kontext_s': 'v_kontextu_s:ins', + 'v_na': 
'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', + 'v_oblast': 'v_oblasti:gen', + 'v_oblast_s': 's:ins', + 'v_obor': 'v_oboru:gen', + 'v_otázka': 'v_otázce:gen', + 'v_podoba': 'v_podobě:gen', + 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', + 'v_proces': 'v_procesu:gen', + 'v_prospěch': 've_prospěch:gen', + 'v_protiklad_k': 'v_protikladu_k:dat', + 'v_průběh': 'v_průběhu:gen', + 'v_případ': 'v_případě:gen', + 'v_případ_že': 'v_případě_že', + 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', + 'v_rozpor_s': 'v_rozporu_s:ins', + 'v_řada': 'v_řadě:gen', + 'v_shoda_s': 've_shodě_s:ins', + 'v_služba': 've_službách:gen', + 'v_směr': 've_směru:gen', + 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc + 'v_smysl': 've_smyslu:gen', + 'v_součinnost_s': 'v_součinnosti_s:ins', + 'v_souhlas_s': 'v_souhlasu_s:ins', + 'v_soulad_s': 'v_souladu_s:ins', + 'v_souvislost_s': 'v_souvislosti_s:ins', + 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', + 'v_spojený_s': 've_spojení_s:ins', + 'v_spojitost_s': 've_spojitosti_s:ins', + 'v_spolupráce_s': 've_spolupráci_s:ins', + 'v_s_spolupráce': 've_spolupráci_s:ins', + 'v_srovnání_s': 've_srovnání_s:ins', + 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', + 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', + 'v_věc': 've_věci:gen', + 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', + 'v_zájem': 'v_zájmu:gen', + 'v_záležitost': 'v_záležitosti:gen', + 'v_závěr': 'v_závěru:gen', + 'v_závislost_na': 'v_závislosti_na:loc', + 'v_závislost_s': 'v_závislosti_s:ins', + 'v_znamení': 've_znamení:gen', + 'včetně': 'včetně:gen', + 'vedle': 'vedle:gen', + 'versus': 'versus:nom', + 'vina': 'vinou:gen', + 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', + 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', + 'vzhledem': 'vzhledem_k:dat', + 'vzhledem_k': 'vzhledem_k:dat', + 'z': 
        'z:gen',
        'z_důvod': 'z_důvodu:gen',
        'z_hledisko': 'z_hlediska:gen',
        'z_oblast': 'z_oblasti:gen',
        'z_řada': 'z_řad:gen',
        'z_strana': 'ze_strany:gen',
        'z_nedostatek': 'z_nedostatku:gen',
        'z_titul': 'z_titulu:gen',
        'z_začátek': 'ze_začátku:gen',
        'za_pomoc': 'za_pomoci:gen',
        'za_účast': 'za_účasti:gen',
        'za_účel': 'za_účelem:gen',
        'začátek': 'začátkem:gen',
        'zásluha': 'zásluhou:gen',
        'zatím_co': 'zatímco',
        'závěr': 'závěrem:gen',
        'závisle_na': 'nezávisle_na:loc',
        'že': 'že', # remove morphological case
        'že_ať': 'ať',
        'že_jako': 'že',
        'že_jakoby': 'že',
        'že_za': 'za:gen'
    }

    def copy_case_from_adposition(self, node, adposition):
        """
        In some treebanks, adpositions have the Case feature and it denotes the
        valency case that the preposition's nominal must be in.

        Returns 'adposition:case' (with the case lowercased) if a child of the
        node with the given adposition lemma has a non-empty Case feature;
        otherwise returns None.
        """
        # The following is only partial solution. We will not see
        # some children because they may be shared children of coordination.
        prepchildren = [x for x in node.children if x.lemma == adposition]
        if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
            return adposition+':'+prepchildren[0].feats['Case'].lower()
        else:
            return None

    @staticmethod
    def compose_edeprel(bdeprel, cdeprel):
        """
        Composes enhanced deprel from the basic part and optional case
        enhancement.

        Parameters
        ----------
        bdeprel : str
            Basic deprel (can include subtype, e.g., 'acl:relcl').
        cdeprel : str or None
            Case enhancement (can be composed of adposition and morphological
            case, e.g., 'k:dat'). It is optional and it can be None or empty
            string if there is no case enhancement.

        Returns
        -------
        Full enhanced deprel (str).
        """
        edeprel = bdeprel
        if cdeprel:
            edeprel += ':'+cdeprel
        return edeprel

    def process_tree(self, tree):
        """
        Occasionally the edeprels automatically derived from the Czech basic
        trees do not match the whitelist. For example, the noun is an
        abbreviation and its morphological case is unknown.

        We cannot use the process_node() method because it ignores empty nodes.
        """
        for node in tree.descendants_and_empty:
            for edep in node.deps:
                m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel'])
                if m:
                    bdeprel = m.group(1)
                    cdeprel = m.group(2)
                    solved = False
                    # Issues caused by errors in the original annotation must be fixed early.
                    # Especially if acl|advcl occurs with a preposition that unambiguously
                    # receives a morphological case in the subsequent steps, and then gets
                    # flagged as solved.
                    if re.match(r'advcl', bdeprel):
                        # The following advcl should in fact be obl.
                        if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! But we should also fix the dependency in the basic tree!
                            bdeprel = 'obl'
                            cdeprel = 'do:gen'
                        elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! But we should also fix the dependency in the basic tree!
                            bdeprel = 'obl'
                            cdeprel = 'k:dat'
                        elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych'
                            bdeprel = 'obl'
                            cdeprel = 'místo:gen'
                        elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! But we should also fix the dependency in the basic tree!
                            bdeprel = 'obl'
                            cdeprel = 'od:gen'
                        elif re.fullmatch(r'podle(?::gen)?', cdeprel):
                            bdeprel = 'obl'
                            cdeprel = 'podle:gen'
                        elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" The tree conversion should be fixed.
                            bdeprel = 'obl'
                            cdeprel = 's:ins'
                        elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel):
                            bdeprel = 'obl'
                            cdeprel = 'v_duchu:gen'
                        elif re.fullmatch(r'v', cdeprel):
                            bdeprel = 'obl'
                            cdeprel = 'v:loc'
                        # "byl by pro, abychom..." ###!!! Also fix the tree conversion.
                        elif re.fullmatch(r'pro(?::acc)?', cdeprel):
                            cdeprel = 'aby'
                    elif re.match(r'acl', bdeprel):
                        # The following acl should in fact be nmod.
                        if re.fullmatch(r'k(?::dat)?', cdeprel):
                            bdeprel = 'nmod'
                            cdeprel = 'k:dat'
                        elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"'
                            bdeprel = 'nmod'
                            cdeprel = 'na_způsob:gen'
                        elif re.fullmatch(r'od(?::gen)?', cdeprel):
                            bdeprel = 'nmod'
                            cdeprel = 'od:gen'
                        elif re.fullmatch(r'v', cdeprel):
                            bdeprel = 'nmod'
                            cdeprel = 'v:loc'
                    else: # bdeprel is 'obl' or 'nmod'
                        # The following subordinators should be removed if they occur with nominals.
                        if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here!
                            cdeprel = ''
                        # Removing 'až' must be done early. The remainder may be 'počátek'
                        # and we will want to convert it to 'počátkem:gen'.
                        elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel):
                            cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel)
                        elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září
                            cdeprel = 'gen'
                    edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
                    # If one of the following expressions occurs followed by another preposition
                    # or by morphological case, remove the additional case marking. For example,
                    # 'jako_v' becomes just 'jako'.
                    for x in self.outermost:
                        exceptions = self.outermost[x]
                        m = re.fullmatch(x+r'([_:].+)?', cdeprel)
                        if m and m.group(1) and not x+m.group(1) in exceptions:
                            cdeprel = x
                            edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
                            solved = True
                            break
                    if solved:
                        continue
                    for x in self.unambiguous:
                        # All secondary prepositions have only one fixed morphological case
                        # they appear with, so we can replace whatever case we encounter with the correct one.
                        m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel)
                        if m:
                            cdeprel = self.unambiguous[x]
                            edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
                            solved = True
                            break
                    if solved:
                        continue
                    # The following prepositions have more than one morphological case
                    # available. Thanks to the Case feature on prepositions, we can
                    # identify the correct one.
                    if re.match(r'(obl|nmod)', bdeprel):
                        m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel)
                        if m:
                            adpcase = self.copy_case_from_adposition(node, m.group(1))
                            if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase):
                                cdeprel = adpcase
                                edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
                                continue
                ###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well.
                if re.match(r'^(acl|advcl):', edep['deprel']):
                    # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations).
                    edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel'])
                    edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating
                    edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel'])
                    if edep['deprel'] == 'acl:v' and node.form == 'patře':
                        edep['deprel'] = 'nmod:v:loc'
                        node.deprel = 'nmod'
                        node.lemma = 'patro'
                        node.upos = 'NOUN'
                        node.xpos = 'NNNS6-----A----'
                        node.feats['Aspect'] = ''
                        node.feats['Gender'] = 'Neut'
                        node.feats['Tense'] = ''
                        node.feats['VerbForm'] = ''
                        node.feats['Voice'] = ''
                elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']):
                    if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
                        # This is a same-case noun-noun modifier, which just happens to be in the locative.
                        # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has
                        # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant.
                        edep['deprel'] = 'nmod'
                    elif edep['deprel'] == 'obl:loc':
                        # Annotation error. The first occurrence in PDT dev:
                        # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...'
                        # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'.
                        # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'.
                        edep['deprel'] = 'obl:v:loc'
                    elif edep['deprel'] == 'obl:arg:loc':
                        # Annotation error. The first occurrence in PDT dev:
                        edep['deprel'] = 'obl:arg:na:loc'
                    elif edep['deprel'] == 'nmod:loc':
                        # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa.
                        edep['deprel'] = 'nmod:nom'
                    elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc':
                        # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object?
                        # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now.
                        edep['deprel'] = 'obl'
                    elif edep['deprel'] == 'nmod:voc':
                        # 'v 8. čísle tiskoviny Ty rudá krávo'
                        edep['deprel'] = 'nmod:nom'
                    elif edep['deprel'] == 'nmod:co:nom':
                        # Annotation error: 'kompatibilní znamená tolik co slučitelný'
                        # 'co' should be relative pronoun rather than subordinating conjunction.
                        edep['deprel'] = 'acl:relcl'
                        node.deprel = 'acl:relcl'
                    elif re.match(r'^(obl(:arg)?):li$', edep['deprel']):
                        edep['deprel'] = 'advcl:li'
                    elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']):
                        edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel'])
                    elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']):
                        if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':ins'
                    elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']):
                        edep['deprel'] += ':acc'
                    elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']):
                        edep['deprel'] += ':gen'
                    elif re.match(r'^obl:místo_za:acc$', edep['deprel']):
                        # 'chytají krávu místo za rohy spíše za ocas'
                        # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution.
                        for c in node.children:
                            if c.form == 'místo':
                                c.upos = 'ADV'
                                c.deprel = 'cc'
                        edep['deprel'] = 'obl:za:acc'
                    elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']):
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel'])
                    elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']):
                        edep['deprel'] = re.sub(r':gen$', '', edep['deprel'])
                        # The case is unknown. We need 'acc' or 'loc'.
                        # The locative is probably more frequent but it is not so likely with every noun.
                        # If there is an nummod:gov child, it must be accusative and not locative.
                        # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.)
                        if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma):
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':loc'
                    elif re.match(r'^obl:arg:na_konec$', edep['deprel']):
                        # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku'
                        edep['deprel'] = 'obl:arg:na:acc'
                    elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']):
                        if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':ins'
                    elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']):
                        if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':loc'
                    elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']):
                        # Annotation error.
                        if node.form == 's':
                            ohled = node.next_node
                            na = ohled.next_node
                            noun = na.next_node
                            self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc')
                            self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed')
                            self.set_basic_and_enhanced(na, node, 'fixed', 'fixed')
                            self.set_basic_and_enhanced(node, noun, 'case', 'case')
                    elif re.match(r'^nmod:pára:nom$', edep['deprel']):
                        # Annotation error: 'par excellence'.
                        edep['deprel'] = 'nmod'
                        for c in node.children:
                            if c.udeprel == 'case' and c.form.lower() == 'par':
                                c.lemma = 'par'
                                c.upos = 'ADP'
                                c.xpos = 'RR--X----------'
                                c.feats['Case'] = ''
                                c.feats['Gender'] = ''
                                c.feats['Number'] = ''
                                c.feats['Polarity'] = ''
                                c.feats['AdpType'] = 'Prep'
                    elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']):
                        if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':loc'
                    elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']):
                        if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':ins'
                    elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']):
                        # Accusative would be possible but unlikely.
                        edep['deprel'] += ':ins'
                    elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']):
                        # Genitive would be possible but unlikely.
                        edep['deprel'] += ':ins'
                    elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci':
                        # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition.
                        # Find the content nominal.
                        cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)]
                        vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v']
                        if len(cnouns) > 0 and len(vs) > 0:
                            cnoun = cnouns[0]
                            v = vs[0]
                            self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins')
                            self.set_basic_and_enhanced(v, cnoun, 'case', 'case')
                            self.set_basic_and_enhanced(node, v, 'fixed', 'fixed')
                    elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']):
                        # ':nom' occurs in 'karneval v Rio de Janeiro'
                        edep['deprel'] = re.sub(r':nom$', '', edep['deprel'])
                        if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
                            edep['deprel'] += ':acc'
                        else:
                            edep['deprel'] += ':loc'
                    elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']):
                        # There is just one occurrence and it is an error:
                        # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...'
                        # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'.
                        edep['deprel'] = 'obl:s:ins'
                    elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']):
                        # Instrumental would be possible but unlikely.
                        edep['deprel'] += ':acc'
                    else:
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly'
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate!
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel'])
                        edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel'])
                        edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel'])

    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
        '''
        Modifies the incoming relation of a node both in the basic tree and in
        the enhanced graph. If the node does not yet depend in the enhanced
        graph on the current basic parent, the new relation will be added without
        removing any old one. If the node already depends multiple times on the
        current basic parent in the enhanced graph, all such enhanced relations
        will be removed before adding the new one.
        '''
        old_parent = node.parent
        node.parent = parent
        node.deprel = deprel
        node.deps = [x for x in node.deps if x['parent'] != old_parent]
        new_edep = {}
        new_edep['parent'] = parent
        new_edep['deprel'] = edeprel
        node.deps.append(new_edep)
diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py
new file mode 100644
index 00000000..7fcb0e12
--- /dev/null
+++ b/udapi/block/ud/cs/fixmorpho.py
@@ -0,0 +1,471 @@
"""
A Czech-specific block to fix lemmas, UPOS and morphological features in UD.
It should increase consistency across the Czech treebanks. It focuses on
individual closed-class verbs (such as the auxiliary "být") or on entire classes
of words (e.g. whether or not nouns should have the Polarity feature). It was
created as part of the Hičkok project (while importing nineteenth-century Czech
data) but it should be applicable on any other Czech treebank.
"""
from udapi.core.block import Block
import logging
import re

class FixMorpho(Block):

    def process_node(self, node):
        # Do not touch words marked as Foreign or Typo. They may not behave the
        # way we expect in Czech data.
        if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes':
            return
        #----------------------------------------------------------------------
        # NOUNS, PROPER NOUNS, AND ADJECTIVES
        #----------------------------------------------------------------------
        # Nouns do not have polarity but the Prague-style tagsets may mark it.
        if node.upos in ['NOUN', 'PROPN']:
            if node.feats['Polarity'] == 'Pos':
                node.feats['Polarity'] = ''
            elif node.feats['Polarity'] == 'Neg':
                logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).')
        # For some nouns, there is disagreement in whether to tag and lemmatize
        # them as proper nouns. We must be careful and not add too many to this
        # rule, as many of them could be used as surnames and then they should
        # be PROPN.
+ if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()): + node.lemma = node.lemma.lower() + node.upos = 'NOUN' + # Lemmatization. + if node.upos == 'NOUN' and node.lemma == 'zem': + node.lemma = 'země' + if node.upos == 'ADJ': + # Adjectives should be lemmatized to lowercase even if they are part of + # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized + # to "malý". Exception: Possessive adjectives derived from personal + # names, e.g., "Karlův". + if node.feats['Poss'] != 'Yes': + node.lemma = node.lemma.lower() + # Short forms of adjectives are rare in Modern Czech and uninflected + # (they are used as predicates), so they lack the Case feature. But + # they were inflected for Case in the past, so it is better to add + # Case=Nom for consistency. + if node.feats['Variant'] == 'Short' and node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + #---------------------------------------------------------------------- + # PRONOUNS AND DETERMINERS + #---------------------------------------------------------------------- + # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form. + if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()): + node.feats['Variant'] = 'Short' + # Forms of "my" should be lemmatized as "já". + if node.upos == 'PRON' and node.lemma == 'my': + node.lemma = 'já' + # Forms of "vy" should be lemmatized as "ty". + if node.upos == 'PRON' and node.lemma == 'vy': + node.lemma = 'ty' + # Forms of "oni" should be lemmatized as "on" and cases that allow + # a preposition should have PrepCase. 
+ if node.upos == 'PRON' and node.lemma in ['on', 'oni']: + node.lemma = 'on' + if node.feats['Case'] not in ['Nom', 'Voc']: + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + elif re.match(r'[nň]', node.form.lower()): + node.feats['PrepCase'] = 'Pre' + # In 19th century data, the grammaticalized usages of "se", "si" are + # tagged as PART (rather than a reflexive PRON, which is the standard). + # Even if it already was tagged PRON, some features may have to be added. + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: + node.lemma = 'se' + node.upos = 'PRON' + node.feats['PronType'] = 'Prs' + node.feats['Reflex'] = 'Yes' + if node.form.lower() == 'se': + # Occasionally "se" can be genitive: "z prudkého do se dorážení". + if not node.feats['Case'] == 'Gen': + node.feats['Case'] = 'Acc' + else: + node.feats['Case'] = 'Dat' + node.feats['Variant'] = 'Short' + # As the genitive/accusative form of "on", "jeho" should have PrepCase. + if node.upos == 'PRON' and node.form.lower() == 'jeho': + node.feats['PrepCase'] = 'Npr' + # Possessive pronouns have Person, Gender[psor] and Number[psor]. + # Although it is questionable, plural possessors are lemmatized to singular + # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj". + # Some source corpora lack Person and [psor] features, others do not respect + # the lemmatization rule, so in the end we have to look at the forms; but + # there are potentially many variants, especially in old texts. 
+ if node.upos == 'DET' and node.feats['Poss'] == 'Yes': + if node.form.lower().startswith('m'): + # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('t'): + # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('n'): + # náš našeho našemu našem naším naše naší naši našich našim našimi + node.lemma = 'můj' + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower().startswith('v'): + # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi + node.lemma = 'tvůj' + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower() == 'jeho': + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'jehož', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jejíž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jich|jejich', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichž|jejichž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif 
re.fullmatch(r'jichžto|jejichžto', node.form.lower()): + node.lemma = 'jehožto' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif node.lemma == 'čí': + node.feats['Poss'] = 'Yes' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Reflexive possessive pronoun should not forget the Reflex=Yes feature. + if node.upos == 'DET' and node.lemma == 'svůj': + node.feats['Reflex'] = 'Yes' + # Demonstrative, interrogative, relative, negative, total and indefinite + # pronouns (or determiners, because some of them get the DET tag). + if node.upos in ['PRON', 'DET']: + # Relative pronoun "jenž" should be PRON, not DET + # (it inflects for Gender but it can never be used as congruent attribute). + if re.fullmatch(r'(jenž|jenžto)', node.lemma): + node.upos = 'PRON' + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + else: + node.feats['PrepCase'] = 'Pre' + # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above) + # (it inflects for Gender but it can never be used as congruent attribute). + elif node.form.lower() in ['ješto', 'ježto']: + node.lemma = 'jenžto' + node.upos = 'PRON' + node.feats['PrepCase'] = 'Npr' + # Relative pronoun "an" is PRON (not DET). + elif node.lemma == 'an': + node.upos = 'PRON' + node.feats['PronType'] = 'Rel' + # Pronoun "kdo" is PRON (not DET). + elif node.lemma == 'kdo': + node.lemma = 'kdo' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "kdož" is PRON (not DET). 
+ elif node.lemma == 'kdož': + node.lemma = 'kdož' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "někdo", "kdosi" is PRON (not DET). + elif re.fullmatch(r'(kdosi|někdo)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "nikdo" is PRON (not DET). + elif node.lemma == 'nikdo': + node.lemma = 'nikdo' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "co" is PRON (not DET). + elif node.lemma == 'co': + node.lemma = 'co' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "což" is PRON (not DET). + elif node.lemma in ['což', 'cože']: + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "něco" is PRON (not DET). 
+ elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "nic" is PRON (not DET). + elif node.lemma == 'nic': + node.lemma = 'nic' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "týž" is DET and PronType=Dem. + elif re.fullmatch(r'(tentýž|týž)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + # Pronoun "každý" is DET and PronType=Tot. + elif node.lemma == 'každý': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot. + elif node.form.lower() == 'vše': + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif node.lemma == 'všechen': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()): + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp. + elif node.lemma in ['sám', 'samý']: + node.lemma = 'samý' + node.upos = 'DET' + node.feats['PronType'] = 'Emp' + node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else '' + #---------------------------------------------------------------------- + # PRONOMINAL NUMERALS AND ADVERBS + #---------------------------------------------------------------------- + # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot. 
+ if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba': + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['NumForm'] = 'Word' + node.feats['PronType'] = 'Tot' + # Pronominal cardinal numerals should be DET, not NUM. + if node.upos == 'NUM': + if re.fullmatch(r'(mnoho|málo|několik)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Ind' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho? + elif re.fullmatch(r'(toliko?)', node.lemma): + node.lemma = 'tolik' + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kolik)', node.lemma): + node.upos = 'DET' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + if node.upos in ['ADV', 'NUM']: + if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Ind' + elif re.fullmatch(r'(tolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Dem' + elif re.fullmatch(r'(kolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Pronominal adverbs have PronType but most of them do not have Degree + # and Polarity. 
+ if node.upos == 'ADV': + if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma): + node.feats['PronType'] = 'Dem' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma): + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kdežto)', node.lemma): + node.feats['PronType'] = 'Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma): + node.feats['PronType'] = 'Ind' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma): + node.feats['PronType'] = 'Neg' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + # Total pronominals can be negated ("nevždy"). Then they get Degree, too. + elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma): + node.feats['PronType'] = 'Tot' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # VERBS AND AUXILIARIES + #---------------------------------------------------------------------- + # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless + # of the fact that it can participate in purely existential constructions + # where it no longer acts as a copula. Czech tagsets typically do not + # distinguish AUX from VERB, which means that converted data may have to + # be fixed. 
+ if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']: + node.upos = 'AUX' + if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()): + node.upos = 'ADV' + node.lemma = 'lze' # not 'nelze' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + node.feats['Aspect'] = '' + node.feats['Mood'] = '' + node.feats['Tense'] = '' + node.feats['Person'] = '' + node.feats['Number'] = '' + node.feats['Degree'] = 'Pos' + if node.upos in ['VERB', 'AUX']: + # Most non-passive verb forms have Voice=Act, and infinitives should + # have it, too. Passive infinitives are always periphrastic. + # (This is not done in the PDT tagset, but we should add it.) + if node.feats['VerbForm'] == 'Inf': + node.feats['Voice'] = 'Act' + # Same for imperatives. + elif node.feats['Mood'] == 'Imp': + node.feats['Voice'] = 'Act' + # Some verbs lack the Aspect feature although they are not biaspectual. + if node.feats['Aspect'] == '': + if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma): + node.feats['Aspect'] = 'Perf' + # We must look at word form to distinguish imperfective "stát" from perfective "stát se". + elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Perf' + # Present forms of perfective verbs normally have Tense=Pres despite + # meaning future. 
However, a few imperfective verbs have a separate + # future form (distinct from present form), which gets Tense=Fut + # despite inflecting similarly to present forms. + if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()): + node.feats['Tense'] = 'Fut' + # Passive participles (including the short forms) should be ADJ, not VERB. + # But they keep the verbal features of VerbForm, Voice, Aspect. + if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + # But now we need an adjectival lemma. + ###!!! Bohužel to občas zahodí normalizaci, kterou tam Martinův tým zavedl ručně, např. "rozhřita" mělo lemma "rozehřát", ale já teď místo "rozehřátý" vyrobím "rozhřitý". + ###!!! odepříno - odepříný místo odepřený + ###!!! dovolíno - dovolíný místo dovolený + ###!!! vyslyšána - vyslyšaný místo vyslyšený + ###!!! obmezený místo omezený, oslyšaný místo oslyšený + node.misc['LDeriv'] = node.lemma + node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower()) + node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán + if node.feats['Polarity'] == 'Neg': + node.lemma = re.sub(r'^ne', '', node.lemma) + if node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + node.feats['Variant'] = 'Short' + #---------------------------------------------------------------------- + # ADVERBS + #---------------------------------------------------------------------- + # Words that indicate the speaker's attitude are tagged ADV in UD, + # although the Czech tagsets often treat them as particles. 
+ if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma): + node.upos = 'ADV' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + node.misc['CzechParticle'] = 'Yes' + # Adverb "brzo" should be lemmatized as "brzy". + if node.upos == 'ADV' and node.form.lower() == 'brzo': + node.lemma = 'brzy' + if node.upos == 'ADV' and node.form.lower() == 'teprv': + node.lemma = 'teprve' + # All non-pronominal adverbs (and also some pronominal ones) should + # have Degree and Polarity. At least for now we also exclude adverbial + # numerals, e.g. "jednou" – "nejednou". + if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '': + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + if node.feats['Polarity'] == '': + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # PREPOSITIONS + #---------------------------------------------------------------------- + # Preposition "u" may combine with Case=Loc|Acc in old texts, and then + # it functions as a vocalized counterpart of "v". Nevertheless, we always + # lemmatize it as "u" and thus AdpType is Prep, not Voc. + if node.upos == 'ADP' and node.form.lower() == 'u': + node.lemma = 'u' + node.feats['AdpType'] = 'Prep' + #---------------------------------------------------------------------- + # CONJUNCTIONS + #---------------------------------------------------------------------- + # As a conjunction (and not particle/adverb), "ani" is coordinating and + # not subordinating. 
+ if node.upos == 'SCONJ' and node.lemma == 'ani': + node.upos = 'CCONJ' + if node.upos == 'CCONJ' and node.lemma == 'nebť': + node.lemma = 'neboť' + #---------------------------------------------------------------------- + # PARTICLES (other than those already grabbed above) + #---------------------------------------------------------------------- + # "jako" should be SCONJ but 19th century data have it as PART. + if node.upos == 'PART': + if node.lemma == 'jako': + node.upos = 'SCONJ' + elif node.lemma == 'ti': + node.lemma = 'ť' diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py new file mode 100644 index 00000000..da9f5bda --- /dev/null +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -0,0 +1,979 @@ +""" +Block to identify missing or ill-valued features in Czech. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + # The convention used in PDT is not consistent. Adjectives are fully disambiguated + # (three genders, two animacies, three numbers, seven cases), even though some + # forms are shared among many feature combinations. On the other hand, pronouns + # and determiners omit some features in the context of certain values of other + # features (e.g., gender and animacy are not distinguished in plural if the case + # is genitive, dative, locative or instrumental). + # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like + # adjectives. + # Here we can trigger one of the two conventions. 
It should become a block parameter + # in the future. + pdt20 = False # True = like in PDT 2.0; False = like in ČNK + + def process_node(self, node): + # Czech constraints should not be applied to foreign words. + if node.feats['Foreign'] == 'Yes': + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['VerbForm'] == 'Vnoun': + # verbal nouns: bytí, dělání, ... + self.check_allowed_features(node, { + 'VerbForm': ['Vnoun'], + 'Gender': ['Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] + }) + elif node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 
'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 
'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: # regular adjectives, including short forms + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, 
ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). + self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Variant': ['Short'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + # No gender in dual and plural: + # Plur ... jich, jim, je, nich, jimi + # Here we require PrepCase but disallow Variant. + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as subject + # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni"). + # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody) + # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only). 
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kdo to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc'], + 'Animacy': ['Anim'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, co to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. + # Update from the Hičkok project: 'ješto' is lemmatized to + # 'jenžto' (see below), meaning that this branch should not be + # needed for the new data. 
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] + }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + # Update from the Hičkok project: In Old Czech, both 'jenž' and + # 'jenžto' (or its variant 'ješto') can be used uninflected, + # accompanied by a resumptive pronoun which provides the inflection. + # In this case, the Hičkok data will not annotate Gender, Animacy, + # Number and Case of the relative pronoun. Therefore, we require + # the full set of features if any of them is present; otherwise, + # we only expect PronType and PrepCase. + if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. 
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. 
na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). + }) + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. + # Normally determiners do not change j->n after prepositions but we + # have an example in Old Czech (štěpové zlatí, na nichžto větviech...) + elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). + }) + # Feminine personal possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma): + if node.feats['Reflex'] == 'Yes': + 
self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". + # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc. + self.check_adjective_like(node, ['PronType', 'NumType'], { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Ord', 'Mult'] + }) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. 
+ if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). + self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + if node.feats['NumType'] == 'Sets': + # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery' + # Number should perhaps be only Plur because the counted noun will be Plur. + # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny', + # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy). 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Sets'], + 'PronType': ['Tot'], # for 'oboje' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). + elif node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + 
self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dvé|obé)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'obé' + 'NumForm': ['Word'], + 'Gender': ['Neut'], + 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES ################################################ + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). 
+ if node.feats['Mood'] == 'Cnd': + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Emph': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 
'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. 
+ if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. 
+ self.check_allowed_features(node, { + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] + }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + else: + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. 
+ self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." = "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## + else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) + self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. 
This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_features = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. + if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py new file mode 100644 index 00000000..e9367d46 --- /dev/null +++ b/udapi/block/ud/da/fixmultisubject.py @@ -0,0 +1,123 @@ +""" +Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates +that have more than one subject dependent. +""" +from udapi.core.block import Block +import re + +class FixMultiSubject(Block): + """ + Make sure that a predicate has at most one subject. Note that it can + only fix instances that follow a certain pattern observed in the Danish + data. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)] + if len(subjects) > 1: + # Pattern 1: A node is attached as xcomp to the current node, and + # one of the subjects is closer to that xcomp than to the current + # node. + xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and + # possibly not so many other mis-attached dependents. 
+ advclchildren = [x for x in node.children if x.udeprel == 'advcl'] + # Pattern 3: Instead of xcomp or advcl, there is a simple amod + # (under a verb!), in fact an adjective with a copula that should + # have been advcl. Alternatively, the nonverbal clause is headed + # by a noun, and the deprel is obl instead of amod. + amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)] + if len(subjects) == 2 and len(xcompchildren) > 0: + for xcompnode in xcompchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] <= dn[0] and dn[1] <= dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp? + elif dx[1] <= dn[1] and dn[0] <= dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. 
+ xcompnode.misc['ToDo'] = 'check-xcomp' + break + elif len(subjects) == 2 and len(advclchildren) > 0: + for advclnode in advclchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + break + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the advcl node. + subjects[1].parent = advclnode + break + elif len(subjects) == 2 and len(amodchildren) > 0: + for amodnode in amodchildren: + if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the amod node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the amod node. 
+ subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + +def dist(x, y): + if x.ord < y.ord: + a = x + b = y + else: + a = y + b = x + d = b.ord - a.ord + # Count the commas between the two nodes. A comma should be seen as increasing + # the distance of the nodes, that is, decreasing the probability that they + # are in the same clause. + nc = 0 + for i in a.root.descendants: + if i.ord > a.ord and i.ord < b.ord: + if i.form == ',': + nc += 1 + d += nc * 10 + return d diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py index 23ac54f9..18778a4a 100644 --- a/udapi/block/ud/de/addmwt.py +++ b/udapi/block/ud/de/addmwt.py @@ -16,15 +16,16 @@ 'durchs': {'form': 'durch das', }, 'fürs': {'form': 'fürs das', }, 'hinterm': {'form': 'hinter dem', }, + 'hinters': {'form': 'hinter das', }, 'im': {'form': 'in dem', }, 'ins': {'form': 'in das', }, 'übers': {'form': 'über das', }, 'ums': {'form': 'um das', }, - 'unters': {'form': 'unter das', }, 'unterm': {'form': 'unter dem', }, + 'unters': {'form': 'unter das', }, 'vom': {'form': 'von dem', }, - 'vors': {'form': 'vor das', }, 'vorm': {'form': 'vor dem', }, + 'vors': {'form': 'vor das', }, 'zum': {'form': 'zu dem', }, 'zur': {'form': 'zu der', }, } diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py new file mode 100644 index 00000000..65d12681 --- /dev/null +++ b/udapi/block/ud/de/fixgsd.py @@ -0,0 +1,58 @@ +""" +Block to fix annotation of UD German-GSD. +""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def process_node(self, node): + """ + Normalizes tokenization, lemmatization and tagging of ordinal numerals + that are expressed using digits followed by a period. 
+ https://github.com/UniversalDependencies/UD_German-GSD/issues/24 + """ + # Ignore periods that terminate a sentence, although they could belong + # to an ordinal numeral at the same time. + if node.form == '.' and node.next_node: + # Ignore number+period combinations that have an intervening space. + if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + # Merge the number and the period into one token. + number = node.prev_node + period = node + # The period should not have any children but if it does, re-attach them to the number. + for c in period.children: + c.parent = number + # The period should be followed by a space but if it isn't, mark it at the number. + number.misc['SpaceAfter'] = 'No' if period.no_space_after else '' + number.form += '.' + number.lemma = number.form + number.upos = 'ADJ' + number.xpos = 'ADJA' + number.feats = '_' + number.feats['NumType'] = 'Ord' + if number.udeprel == 'nummod': + number.deprel = 'amod' + period.remove() + # Even if the digits and the period are already in one token, check their annotation. + if re.match(r'^\d+\.$', node.form): + node.lemma = node.form + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats = '_' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' + # Finally, make sure that ordinal numerals expressed verbosely are tagged properly. + # Unlike for digits, do not remove the features for Gender, Number, and Case. + # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'. + if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE): + # Skip 'erst' that is used as an adverb. 
+ if node.lemma != 'erst' or node.upos != 'ADV': + node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma) + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py new file mode 100644 index 00000000..a3792a96 --- /dev/null +++ b/udapi/block/ud/de/fixhdt.py @@ -0,0 +1,109 @@ +""" +Block to fix annotation of UD German-HDT. + +It was created independently of ud.de.AddMwt but it aims to do essentially the +same thing. Future work: make the two blocks converge. + +Currently known differences: +- This block covers a wider range of contractions. +- This block generates morphological features for the syntactic words. +- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT). +- This block overrides the default attachment when the original relation is root, conj, reparandum. +- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures. +""" +from udapi.core.block import Block +import logging +import re + +class FixHDT(Block): + + def process_node(self, node): + # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD. + # The following contractions have been observed: + # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur + if node.upos == 'ADP' and node.feats['PronType'] == 'Art': + if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE): + # We need two nodes instead of one. Create a node. + # The parent should not be the root but unfortunately it is not guaranteed. 
+ node2 = node.create_child() + node2.shift_after_node(node) + if not re.match(r"^(root|conj|reparandum)$", node.udeprel): + node2.parent = node.parent + node.deprel = 'case' + node2.deprel = 'det' + mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc) + node.misc['SpaceAfter'] = '' + # We want to respect the original letter case in the forms of the syntactic words. + # We can use the isupper() method to find out whether all letters are uppercase. + # However, detecting first-letter capitalization requires more work. + up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0 + up2 = 2 if up == 2 else 0 + if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'an') + node.lemma = 'an' + elif re.match(r"^aufs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'auf') + node.lemma = 'auf' + elif re.match(r"^beim$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'bei') + node.lemma = 'bei' + elif re.match(r"^durchs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'durch') + node.lemma = 'durch' + elif re.match(r"^fürs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'für') + node.lemma = 'für' + elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'hinter') + node.lemma = 'hinter' + elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'in') + node.lemma = 'in' + elif re.match(r"^übers$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'über') + node.lemma = 'über' + elif re.match(r"^ums$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'um') + node.lemma = 'um' + elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'unter') + node.lemma = 'unter' + elif re.match(r"^vom$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'von') + node.lemma = 'von' + elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'vor') + 
node.lemma = 'vor' + elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'zu') + node.lemma = 'zu' + node.upos = 'ADP' + node.xpos = 'APPR' + node.feats = '_' + node.feats['AdpType'] = 'Prep' + # We must use search() because match() only checks at the beginning of the string. + if re.search("[m\.]$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'dem') + node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + elif re.search("s$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'das') + node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Acc' + node2.lemma = 'der' + elif re.search("r$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'der') + node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + node2.upos = 'DET' + node2.xpos = 'ART' + +def mimic_case(up, x): + if up >= 2: + return x.upper() + elif up == 1: + return x[:1].upper() + x[1:].lower() + else: + return x.lower() diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 8381c69f..ac753ed5 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -8,13 +8,13 @@ import udapi.block.ud.addmwt MWTS = { - 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, - 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur'}, - 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur'}, - 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur'}, - 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing'}, - 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Gender=*|Number=Sing'}, + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 
'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Definite=Def|Gender=Neut|Number=Plur|PronType=Art'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Plur|PronType=Art'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Definite=Def|Gender=*|Number=Sing|PronType=Art'}, } # shared values for all entries in MWTS diff --git a/udapi/block/ud/en/__init__.py b/udapi/block/ud/en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/en/setspaceafter.py b/udapi/block/ud/en/setspaceafter.py new file mode 100644 index 00000000..1ebc3054 --- /dev/null +++ b/udapi/block/ud/en/setspaceafter.py @@ -0,0 +1,46 @@ +"""Block ud.en.SetSpaceAfter for heuristic setting of SpaceAfter=No in English. + +Usage:: + + udapy -s ud.en.SetSpaceAfter < in.conllu > fixed.conllu + +Author: Martin Popel +""" +import udapi.block.ud.setspaceafter + + +class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): + """Block for heuristic setting of the SpaceAfter=No MISC attribute in English. + + """ + + def process_tree(self, root): + nodes = root.descendants + for i, node in enumerate(nodes[:-1]): + next_form = nodes[i + 1].form + + # Contractions like "don't" and possessive suffix 's should be annotated as MWT. + # However, older UD_English-EWT versions did not follow this rule and even v2.7 + # contains some forgotten occurrences, so let's handle these as well. + if next_form in {"n't", "'s"}: + self.mark_no_space(node) + + # Parsers may distinguish opening and closing single quotes by XPOS. 
+ elif node.form == "'" and node.xpos == "``": + self.mark_no_space(node) + elif next_form == "'" and nodes[i + 1].xpos == "''": + self.mark_no_space(node) + + + # hyphen-compounds + elif node.form == '-' and i: + if ((nodes[i - 1] is node.parent or nodes[i - 1].parent is node.parent) and + (nodes[i + 1] is node.parent or nodes[i + 1].parent is node.parent)): + self.mark_no_space(nodes[i - 1]) + self.mark_no_space(node) + + # $200 + elif node.form == '$' and nodes[i + 1].upos == 'NUM': + self.mark_no_space(node) + + super().process_tree(root) diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py index ee85b1d6..92f80160 100644 --- a/udapi/block/ud/es/addmwt.py +++ b/udapi/block/ud/es/addmwt.py @@ -1,6 +1,6 @@ """Block ud.es.AddMwt for heuristic detection of Spanish contractions. -According to the UD guidelines, contractions such as "dele" = "de ele" +According to the UD guidelines, contractions such as "del" = "de el" should be annotated using multi-word tokens. Note that this block should be used only for converting legacy conllu files. @@ -28,7 +28,7 @@ v['lemma'] = v['form'] v['upos'] = 'ADP DET' v['deprel'] = '* det' - v['feats'] = '_ *' + v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art' # The following are the default values # v['main'] = 0 # which of the two words will inherit the original children (if any) # v['shape'] = 'siblings', # the newly created nodes will be siblings @@ -46,6 +46,11 @@ def multiword_analysis(self, node): analysis = MWTS.get(node.form.lower(), None) if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. 
+ analysis = analysis.copy() + analysis['shape'] = 'subtree' return analysis if not self.verbpron or node.upos not in {'VERB', 'AUX'}: diff --git a/udapi/block/ud/es/elque.py b/udapi/block/ud/es/elque.py new file mode 100644 index 00000000..4d14b98d --- /dev/null +++ b/udapi/block/ud/es/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que, el cual...'). +It is written for Spanish but a similar block should work for other Romance +languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if re.match(r'^(que|cual)$', node.lemma) and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. 
+ if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. 
+ attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. + """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py new file mode 100644 index 00000000..7dea8e0d --- /dev/null +++ b/udapi/block/ud/es/fixexclamation.py @@ -0,0 +1,47 @@ +"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixExclamation(Block): + + def process_node(self, node): + """ + In Spanish AnCora, there are things like '¡Hola!' as one token. + The punctuation should be separated. One may question whether this + should include names of companies (Yahoo!) or products (la revista + Hello!) 
but it should, as company and product names often have + multiple tokens (even multiple full words, not just punctuation) + and these are also separated in UD. + """ + if re.search(r'^[¡!]\w', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_before_node(node) + punct.form = node.form[:1] + node.form = node.form[1:] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' + if re.search(r'\w[¡!]$', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_after_node(node) + punct.form = node.form[-1:] + node.form = node.form[:-1] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = node.misc['SpaceAfter'] + node.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py new file mode 100644 index 00000000..62fa0f4d --- /dev/null +++ b/udapi/block/ud/es/fixtenerque.py @@ -0,0 +1,47 @@ +"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixTenerQue(Block): + + def process_node(self, node): + """ + Some Spanish treebanks treat the verb 'tener' in constructions such as + 'tener que comer' as auxiliary. This is wrong and the validator will + flag it as an error. This block fixes such annotations. + + EDIT: 'ir a comer' is processed the same way. 
+ """ + if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX': + node.upos = 'VERB' + # In rare cases the auxiliary may have been promoted due to ellipsis. + # Most of the time however, it is attached as 'aux' to the main verb. + if node.udeprel == 'aux': + mainverb = node.parent + self.reattach(node, mainverb.parent, mainverb.deprel) + self.reattach(mainverb, node, 'xcomp') + # Some children of the former main verb should be reattached to 'tener'. + # Others (especially a direct object) should stay with the former main verb. + for c in mainverb.children: + if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel): + self.reattach(c, node, c.deprel) + # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. + for c in node.children: + if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord: + self.reattach(c, mainverb, 'mark') + + def reattach(self, node, parent, deprel): + """ + Changes the incoming dependency relation to a node. Makes sure that the + same change is done in the basic tree and in the enhanced graph. + """ + if node.deps: + # If the enhanced graph contains the current basic relation, remove it. + orig_n_deps = len(node.deps) + node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel] + # Add the new basic relation to the enhanced graph only if the original one was there. 
+ if len(node.deps) < orig_n_deps: + node.deps.append({'parent': parent, 'deprel': deprel}) + node.parent = parent + node.deprel = deprel diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py new file mode 100644 index 00000000..643ecd7c --- /dev/null +++ b/udapi/block/ud/es/fixverbfeats.py @@ -0,0 +1,38 @@ +"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD.""" +from udapi.core.block import Block +import logging +import re + +class FixVerbFeats(Block): + + def process_node(self, node): + """ + The features assigned to verbs in Spanish PUD are often wrong, although + the annotation was (reportedly) done manually. For example, infinitives + are tagged with VerbForm=Fin instead of VerbForm=Inf. + """ + if re.match(r'^(VERB|AUX)$', node.upos): + if re.search(r'[aei]r$', node.form, re.IGNORECASE): + # The infinitive has no features other than VerbForm. + node.feats = {} + node.feats['VerbForm'] = 'Inf' + node.lemma = node.form.lower() + elif re.search(r'ndo$', node.form, re.IGNORECASE): + if node.form.lower() != 'entiendo': + # The gerund has no features other than VerbForm. + # The lemma is not always straightforward but we have fixed it manually. + node.feats = {} + node.feats['VerbForm'] = 'Ger' + elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) 
+ gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py new file mode 100644 index 00000000..a2e4439c --- /dev/null +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -0,0 +1,103 @@ +""" +Block ud.FixAdvmodByUpos will change the dependency relation from advmod to something else +if the UPOS is not ADV. +""" +from udapi.core.block import Block + + +class FixAdvmodByUpos(Block): + """ + Make sure advmod is not used with UPOS it should not be used with. + """ + + def process_node(self, node): + if node.udeprel == 'advmod': + if node.upos in ['NOUN', 'PROPN', 'PRON', 'DET', 'NUM']: + node.deprel = 'obl' + elif node.upos == 'VERB': + node.deprel = 'advcl' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos in ['ADP', 'SCONJ']: + if node.parent.upos == 'VERB': + node.deprel = 'mark' + else: + node.deprel = 'case' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + else: + node.deprel = 'dep' + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. 
+ elif node.udeprel == 'expl': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.udeprel == 'mark': + if node.upos in ['PRON', 'DET']: + node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.upos == 'NOUN': + node.deprel = 'dep' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'NUM': + node.deprel = 'nummod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' + elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 
'dislocated', 'expl', 'nmod']: + node.deprel = 'nmod' diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py new file mode 100644 index 00000000..90596e35 --- /dev/null +++ b/udapi/block/ud/fixcompoundname.py @@ -0,0 +1,46 @@ +""" +Block ud.FixCompoundName finds compound relations between PROPN nodes and converts +them to flat:name. This is not necessarily correct in all situations. The difference +between compound and flat is that compound allows to distinguish head and modifier. +Multiword person names (given name and surname, or various other patterns) typically +should be analyzed as flat but there are treebanks that incorrectly use compound +for person names. This block can be used to fix them. +""" +from udapi.core.block import Block +import regex as re +import logging + + +class FixCompoundName(Block): + """ + Converts a compound relation between two PROPN nodes into a flat relation. + Compounds of a PROPN and a non-PROPN will be left alone, although they are + suspicious, too. + """ + + def process_node(self, node): + if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + origparent = node.parent + grandparent = origparent.parent + outdeprel = origparent.deprel + # See if there are other PROPN compound siblings. + # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.) + namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)] + # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. + # This is wrong but it is also different from personal names we are targeting here. + # Hence, we will skip "names" that contain numbers. + if any(re.search(r"\d", x.form) for x in namewords): + #logging.info(str([x.misc['Translit'] for x in namewords])) + ###!!! 
We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. + if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. + head = namewords[0] + rest = namewords[1:] + if head != origparent: + head.parent = grandparent + head.deprel = outdeprel + for n in rest: + n.parent = head + n.deprel = 'flat:name' diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py new file mode 100644 index 00000000..9b4ce191 --- /dev/null +++ b/udapi/block/ud/fixleaf.py @@ -0,0 +1,42 @@ +""" +Block ud.FixLeaf checks that function word dependents are leaves. +Certain known exceptions are observed (e.g., fixed expressions). +""" +from udapi.core.block import Block +import logging +import re + +class FixLeaf(Block): + """ + Make sure that function words are leaves unless one of the known exceptions + applies. + """ + + def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs): + """ + Args: + deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel: + # Every function dependent can have a fixed child. + # We will also allow conj, cc, punct, goeswith, reparandum. + allowed = ['fixed', 'punct', 'goeswith', 'reparandum'] + if deprel != 'cc': + allowed += ['conj', 'cc'] + children = [c for c in node.children if not (c.udeprel in allowed)] + # Re-attach the remaining children to an acceptable ancestor. + ancestor = node.parent + while ancestor.udeprel in self.deprels: + ancestor = ancestor.parent + for c in children: + c.parent = ancestor + # If there are enhanced dependencies, check whether we want to redirect them too. 
+ if c.deps: + for edep in c.deps: + if edep['parent'] == node: + edep['parent'] = ancestor diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py new file mode 100644 index 00000000..485b85f0 --- /dev/null +++ b/udapi/block/ud/fixmultiobjects.py @@ -0,0 +1,47 @@ +""" +Block ud.FixMultiObjects will ensure that no node has more than one (direct) object child. +""" +from udapi.core.block import Block + + +class FixMultiObjects(Block): + """ + Make sure there is at most one object. + """ + + def process_node(self, node): + objects = [x for x in node.children if x.udeprel == 'obj'] + if len(objects) > 1: + subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']] + # Some heuristics that could work in AnCora: + # If all objects are after the verb, keep the one that is closest to the verb. + if objects[0].ord > node.ord: + objects = objects[1:] + for o in objects: + o.deprel = 'obl:arg' + o.deps[0]['deprel'] = 'obl:arg' + elif objects[-1].ord < node.ord: + objects = objects[:-1] + for o in objects: + o.deprel = 'dislocated' + o.deps[0]['deprel'] = 'dislocated' + # ho experimenta tot + elif objects[-1].lemma in ['tot', 'todo']: + objects[-1].parent = objects[0] + objects[-1].deprel = 'nmod' + objects[-1].deps[0]['parent'] = objects[0] + objects[-1].deps[0]['deprel'] = 'nmod' + # X se llama Y + elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']: + objects[-1].deprel = 'xcomp' + objects[-1].deps[0]['deprel'] = 'xcomp' + elif len(subjects) == 0: + objects[0].deprel = 'nsubj' + objects[0].deps[0]['deprel'] = 'nsubj' + else: + objects[0].deprel = 'dislocated' + objects[0].deps[0]['deprel'] = 'dislocated' + # For the moment, we take the dummiest approach possible: The first object survives and all others are forced to a different deprel. 
+ #objects = objects[1:] + #for o in objects: + # o.deprel = 'iobj' diff --git a/udapi/block/ud/fixmultisubjects.py b/udapi/block/ud/fixmultisubjects.py new file mode 100644 index 00000000..f8aeca06 --- /dev/null +++ b/udapi/block/ud/fixmultisubjects.py @@ -0,0 +1,23 @@ +""" +Block ud.FixMultiSubjects will ensure that no node has more than one subject child (except those +marked as :outer). +""" +import re +from udapi.core.block import Block + + +class FixMultiSubjects(Block): + """ + Make sure there is at most one subject that is not marked as :outer. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r"^[nc]subj(:|$)", x.deprel) and not re.search(r":outer$", x.deprel)] + # For the moment, we take the dummiest approach possible: The first subject survives and all others are forced to a different deprel. + if len(subjects) > 1: + subjects = subjects[1:] + for s in subjects: + if re.match(r"^n", s.deprel): + s.deprel = 'obl' + else: + s.deprel = 'advcl' diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py new file mode 100644 index 00000000..a2b7b875 --- /dev/null +++ b/udapi/block/ud/fixmwtspace.py @@ -0,0 +1,22 @@ +""" +Block ud.FixMwtSpace looks for multiword tokens whose form contains a space, +which should be avoided. If found, the block checks whether it can remove +the multiword token seamlessly, that is, whether the syntactic words correspond +to the space-delimited parts of the multiword token. If possible, the MWT +line will be removed. 
+""" +from udapi.core.block import Block +import re + + +class FixMwtSpace(Block): + """Try to remove multiword tokens with spaces.""" + + def process_node(self, node): + if node.multiword_token: + mwt = node.multiword_token + if re.search(r' ', mwt.form): + if node == mwt.words[0]: + wordforms = [x.form for x in mwt.words] + if ' '.join(wordforms) == mwt.form: + mwt.remove() diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py new file mode 100644 index 00000000..f4d9a1ec --- /dev/null +++ b/udapi/block/ud/fixpseudocop.py @@ -0,0 +1,45 @@ +"""Block to fix annotation of verbs that are currently treated as copulas + but they should be treated as normal verbs (with secondary predication) + instead.""" +from udapi.core.block import Block +import re + +class FixPseudoCop(Block): + + def __init__(self, lemmas, noncopaux=False, **kwargs): + """Create the ud.FixPseudoCop block instance. + + Args: + lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed + noncopaux: do the same for non-copula auxiliaries with the given lemma + """ + super().__init__(**kwargs) + self.lemmas = lemmas.split(',') + self.noncopaux = noncopaux + + def process_node(self, node): + pseudocop = self.lemmas + if node.lemma in pseudocop: + # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set). + if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. 
It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux". + elif self.noncopaux and node.upos == 'AUX': + node.upos = 'VERB' diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py index e810c58d..854a24a8 100644 --- a/udapi/block/ud/fixpunct.py +++ b/udapi/block/ud/fixpunct.py @@ -2,11 +2,10 @@ Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct, and is always attached projectively, usually to the head of a neighboring subtree -to its left or right. -Punctuation normally does not have children. If it does, we will skip it. -It is unclear what to do anyway, and we won't have to check for cycles. +to its left or right (see https://universaldependencies.org/u/dep/punct.html). +Punctuation normally does not have children. If it does, we will fix it first. -Tries to re-attach punctuation projectively. +This block tries to re-attach punctuation projectively and according to the guidelines. It should help in cases where punctuation is attached randomly, always to the root or always to the neighboring word. However, there are limits to what it can do; for example it cannot always recognize whether a comma is introduced to separate @@ -14,11 +13,7 @@ this block is almost good, the block may actually do more harm than good. Since the punctuation should not have children, we should not create a non-projectivity -if we check the roof edges going to the right. -However, it is still possible that we will attach the punctuation non-projectively -by joining a non-projectivity that already exists. 
-For example, the left neighbor (node i-1) may have its parent at i-3, -and the node i-2 forms a gap (does not depend on i-3). +if we check the root edges going to the right. """ from udapi.core.block import Block # pylint: disable=no-self-use @@ -32,32 +27,57 @@ '{': '}', '"': '"', # ASCII double quotes "'": "'", # ASCII single quotes - '“': '”', # quotation marks used in English,... - '„': '“', # Czech, German, Russian,... - '«': '»', # French, Russian, Spanish,... + '“': '”', # quotation marks used in English, ... + '„': '“', # Czech, German, Russian, ... + '«': '»', # French, Russian, Spanish, ... '‹': '›', # dtto '《': '》', # Korean, Chinese '「': '」', # Chinese, Japanese - '『': '』', # dtto - '¿': '?', # Spanish question quotation marks - '¡': '!', # Spanish exclamation quotation marks + '『': '』', # ditto + '¿': '?', # Spanish paired question marks + '¡': '!', # Spanish paired exclamation marks } FINAL_PUNCT = '.?!' class FixPunct(Block): - """Make sure punct nodes are attached punctuation is attached projectively.""" + """Make sure punctuation nodes are attached projectively.""" - def __init__(self, **kwargs): - """Create the ud.FixPunct block instance.""" + def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwargs): + """Create the ud.FixPunct block instance. + + Args: + check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT. + The default is false, which means that fixed punctuation is detected only + based on the form with the exception of single & double quote character, + which is frequently ambiguous*, so UPOS=PUNCT is checked always. + *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol. + copy_to_enhanced: for all upos=PUNCT, let the enhanced depencies + be the same as the basic dependencies. 
+ """ super().__init__(**kwargs) self._punct_type = None + self.check_paired_punct_upos = check_paired_punct_upos + self.copy_to_enhanced = copy_to_enhanced + + def _is_punct(self, node): + if node.upos == 'PUNCT': + return True + if self.check_paired_punct_upos: + return False + if node.form in "'\"": + return False + if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values(): + return True + return False def process_tree(self, root): - # First, make sure no PUNCT has children + # First, make sure no PUNCT has children. + # This may introduce multiple subroots, which will be fixed later on + # (preventing to temporarily create multiple subroots here would prevent fixing some errors). for node in root.descendants: - while node.parent.upos == "PUNCT": + while self._is_punct(node.parent): node.parent = node.parent.parent # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. @@ -69,15 +89,40 @@ def process_tree(self, root): self._punct_type = [None] * (1 + len(root.descendants)) for node in root.descendants: if self._punct_type[node.ord] != 'closing': - closing_punct = PAIRED_PUNCT.get(node.form, None) + closing_punct = PAIRED_PUNCT.get(node.form) if closing_punct is not None: self._fix_paired_punct(root, node, closing_punct) # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type). for node in root.descendants: - if node.upos == "PUNCT" and not self._punct_type[node.ord]: + if node.upos == 'PUNCT' and not self._punct_type[node.ord]: self._fix_subord_punct(node) + # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". + # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. + # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). 
+ if len(root.children) > 1: + selected_subroot = next((n for n in root.children if n.udeprel == 'root'), root.children[0]) + for a_subroot in root.children: + if a_subroot != selected_subroot: + a_subroot.parent = selected_subroot + + # Check if the subroot is still marked with deprel=root. + # This may not hold if the original subroot was a paired punctuation, which was rehanged. + if root.children[0].udeprel != 'root': + root.children[0].udeprel = 'root' + if self.copy_to_enhanced: + root.children[0].deps = [{'parent': root, 'deprel': 'root'}] + for another_node in root.children[0].descendants: + if another_node.udeprel == 'root': + another_node.udeprel = 'punct' + + # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. + if self.copy_to_enhanced: + for node in root.descendants: + if node.upos == 'PUNCT': + node.deps = [{'parent': node.parent, 'deprel': node.deprel}] + def _fix_subord_punct(self, node): # Dot used as the ordinal-number marker (in some languages) or abbreviation marker. # TODO: detect these cases somehow @@ -99,13 +144,13 @@ def _fix_subord_punct(self, node): l_cand, r_cand = node.prev_node, node.next_node if node.form in FINAL_PUNCT: r_cand = None - while l_cand.ord > 0 and l_cand.upos == "PUNCT": - if self._punct_type[l_cand.ord] == 'opening': + while l_cand.ord > 0 and l_cand.upos == 'PUNCT': + if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node: l_cand = None break l_cand = l_cand.prev_node - while r_cand is not None and r_cand.upos == "PUNCT": - if self._punct_type[r_cand.ord] == 'closing': + while r_cand is not None and r_cand.upos == 'PUNCT': + if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node: r_cand = None break r_cand = r_cand.next_node @@ -115,18 +160,27 @@ def _fix_subord_punct(self, node): # because climbing higher would cause a non-projectivity (the punct would be the gap). 
l_path, r_path = [l_cand], [r_cand] if l_cand is None or l_cand.is_root(): - l_cand = None + l_cand, l_path = None, [] else: - while (not l_cand.parent.is_root() and l_cand.parent.precedes(node) - and not node.precedes(l_cand.descendants(add_self=1)[-1])): + while (not l_cand.parent.is_root() and l_cand.parent < node + and not node < l_cand.descendants(add_self=1)[-1]): l_cand = l_cand.parent l_path.append(l_cand) if r_cand is not None: - while (not r_cand.parent.is_root() and node.precedes(r_cand.parent) - and not r_cand.descendants(add_self=1)[0].precedes(node)): + while (not r_cand.parent.is_root() and node < r_cand.parent + and not r_cand.descendants(add_self=1)[0] < node): r_cand = r_cand.parent r_path.append(r_cand) + # Filter out candidates which would lead to non-projectivities, i.e. bugs + # punct-nonproj and punct-nonproj-gap as checked by the UD validator and ud.MarkBugs. + orig_parent = node.parent + l_path = [n for n in l_path if n and self._will_be_projective(node, n)] + r_path = [n for n in r_path if n and self._will_be_projective(node, n)] + l_cand = l_path[-1] if l_path else None + r_cand = r_path[-1] if r_path else None + node.parent = orig_parent + # Now select between l_cand and r_cand -- which will be the new parent? # The lower one. Note that if neither is descendant of the other and neither is None # (which can happen in rare non-projective cases), we arbitrarily prefer l_cand, @@ -152,14 +206,24 @@ def _fix_subord_punct(self, node): # We try to be conservative and keep the parent, unless we are sure it is wrong. 
if node.parent not in path: node.parent = cand - node.deprel = "punct" + node.deprel = 'punct' + + def _will_be_projective(self, node, cand): + node.parent = cand + return not node.is_nonprojective() and not self._causes_gap(node) + + def _causes_gap(self, node): + return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap() def _fix_paired_punct(self, root, opening_node, closing_punct): + if (self.check_paired_punct_upos + or opening_node.form in "'\"") and opening_node.upos != 'PUNCT': + return nested_level = 0 for node in root.descendants[opening_node.ord:]: if node.form == closing_punct: if nested_level > 0: - nested_level -= 0 + nested_level -= 1 else: self._fix_pair(root, opening_node, node) return @@ -167,18 +231,72 @@ def _fix_paired_punct(self, root, opening_node, closing_punct): nested_level += 1 def _fix_pair(self, root, opening_node, closing_node): + # Ideally, paired punctuation symbols should be attached to the single + # head of the subtree inside. Provided the inside segment is a single + # subtree. heads = [] - for node in root.descendants[opening_node.ord: closing_node.ord - 1]: - if node.parent.precedes(opening_node) or closing_node.precedes(node.parent): - if node.upos != 'PUNCT': - heads.append(node) - if len(heads) == 1: + punct_heads = [] + for node in root.descendants: + if node == opening_node or node == closing_node: + continue + # If this is a node inside of the pair, is its parent outside? + if node > opening_node and node < closing_node: + if node.parent < opening_node or node.parent > closing_node: + if node.upos == 'PUNCT': + punct_heads.append(node) + else: + heads.append(node) + # Not only the punctuation symbols must not be attached non-projectively, + # they also must not cause non-projectivity of other relations. This could + # happen if an outside node is attached to an inside node. To account for + # this, mark the inside parent as a head, too. 
+ elif node.parent > opening_node and node.parent < closing_node: + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) + else: + heads.append(node.parent) + + # Punctuation should not have children, but if there is no other head candidate, + # let's break this rule. + if len(heads) == 0: + heads = punct_heads + # If there are no nodes between the opening and closing mark (), + # let's treat the marks as any other (non-pair) punctuation. + if len(heads) == 0: + return + else: + # Ideally, there should be only a single head. + # If not, we could try e.g. to choose the "widests-span head": + # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] + # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] + # which often leads to selecting the same head for the opening and closing punctuation + # ignoring single words inside the paired punct which are non-projectively attached outside. + # However, this means that the paired punctuation will be attached non-projectively, + # which is forbidden by the UD guidelines. + # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. + # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of + # inside heads and inside parents of outside nodes). 
+ heads.sort(key=lambda x: x.ord) opening_node.parent = heads[0] - closing_node.parent = heads[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' - elif len(heads) > 1: - opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] - closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] - self._punct_type[opening_node.ord] = 'opening' - self._punct_type[closing_node.ord] = 'closing' + closing_node.parent = heads[-1] + + self._punct_type[opening_node.ord] = 'opening' + self._punct_type[closing_node.ord] = 'closing' + + # In rare cases, non-projective gaps may remain. Let's dirty fix these! + # E.g. in "the (lack of) reproducibility", the closing parenthesis + # should be attached to "of" rather than to "lack" + # -- breaking the paired-marks-have-same-parent rule + # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py). + if self._causes_gap(opening_node): + opening_node.parent = opening_node.next_node + while (opening_node.parent.ord < closing_node.ord - 1 + and (opening_node.parent.upos == 'PUNCT' or opening_node.is_nonprojective() + or self._causes_gap(opening_node))): + opening_node.parent = opening_node.parent.next_node + if self._causes_gap(closing_node): + closing_node.parent = closing_node.prev_node + while (closing_node.parent.ord > opening_node.ord + 1 + and (closing_node.parent.upos == 'PUNCT' or closing_node.is_nonprojective() + or self._causes_gap(closing_node))): + closing_node.parent = closing_node.parent.prev_node diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b --- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. 
Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. +""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. Change their deprel to 'parataxis'. + - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 453bb9c0..3ba20c5c 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -498,7 +498,7 @@ def fix_deprel(self, node): if self.lang == 'fr' and node.parent.form in {'M.', 'Mme', 'Dr'}: node.deprel = 'flat:name' elif node.deprel == 'prt': - if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no'}: + if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no', 'th'}: node.deprel = 'compound:prt' elif self.lang == 'tr': node.deprel = 'advmod:emph' diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py new file mode 100644 index 00000000..004ab4af --- /dev/null +++ b/udapi/block/ud/hi/fixaux.py @@ -0,0 +1,170 @@ +""" +Block to fix annotation of 
verbs that are currently treated as auxiliaries +but they should be treated as normal verbs instead. +""" +from udapi.core.block import Block +import logging +import re + +class FixAux(Block): + + def process_node(self, node): + self.fix_lemma(node) + # The following verbs appear in verb-verb compounds as the semantically + # less salient element: le (to take), de (to give), ḍāla / phenka (to throw), + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), + # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), + # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), + # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), + # gujara (to pass), ghera (to surround), baca (to escape). + # There are also jā (to go) and paṛa (to fall) but we do not list them here + # because they can also act as genuine auxiliaries. + hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] + recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + # Control and raising verbs. + # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + # दिखाना دکھانا (dikhānā) “to show” + # बनना بننا (bananā) “to become” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] + rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' + if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + node.deprel = 'compound' + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. 
+ node.upos = "VERB" + # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not. + # It is not even a verb (it does not have a verbal paradigm); it is more + # like an adjective morphologically, and like a noun syntactically. It means + # “the one who does the action of the content verb infinitive.” + # Some occurrences in the original annotation are case or mark, so we do not + # check AUX/aux here. + elif node.lemma == 'वाला' or node.lemma == 'والا': + node.upos = 'ADJ' + node.feats['AdpType'] = '' + node.feats['VerbForm'] = '' + node.feats['Aspect'] = '' + node.deprel = 'compound' + elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + + def fix_lemma(self, node): + """ + Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. If they + are tagged AUX, it means that either the validator fails to recognize a + correct auxiliary, or we fail here to recognize a spurious auxiliary that + must be fixed. 
+ """ + if node.upos == 'AUX': + # آنے is the oblique infinitive form of “to come” + if node.lemma == 'آنہ': + node.lemma = 'آ' + # بنانا बनाना “make, create, produce, cause to be/become” + # (I don't know why in some instances بنا was used as lemma for کر “to do”.) + if node.form == 'کر' and node.lemma == 'بنا': + node.lemma = 'کر' + # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) + if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': + node.lemma = 'چاہئے' + if node.form == 'چاہئیں': + node.lemma = 'چاہئے' + node.feats['Number'] = 'Plur' + # چاہے seems to be a wrong lemma of چاہیں_گے “would like” + if node.lemma == 'چاہے': + node.lemma = 'چاہ' + # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished” + if node.lemma == 'चुका': + node.lemma = 'चुक' + if node.lemma == 'چکا': + node.lemma = 'چک' + # दिया دیا is a perfective participle of देना دینا (denā) “to give” + if node.lemma == 'दिया': + node.lemma = 'दे' + if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': + node.lemma = 'دے' + # دکھائیں (dikhānā) “to show” + if node.form == 'دکھائیں': + node.lemma = 'دکھا' + # گا, گی, گے denote the future tense. They are written as separate + # words in Urdu (while they are just suffixes in Hindi). However, + # when written as a separate auxiliary, all these forms should share + # the same lemma. + if node.lemma == 'گی' or node.lemma == 'گے': + node.lemma = 'گا' + # گیا is a perfective participle of जाना جانا‎ (jānā) “to go” + # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. 
+ if node.lemma == 'जाना' or node.lemma == 'जान': + node.lemma = 'जा' + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': + node.lemma = 'جا' + # Wrongly lemmatized present forms of “to be”. + # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'हों' or node.lemma == 'है.': + node.lemma = 'है' + if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': + node.lemma = 'ہے' + # लिया لیا is a perfective participle of लेना لینا (lenā) “to take” + # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'लिया': + node.lemma = 'ले' + if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': + node.lemma = 'لے' + # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear” + if node.lemma == 'लगा': + node.lemma = 'लग' + if node.lemma == 'لگا': + node.lemma = 'لگ' + # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach” + if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच': + node.lemma = 'पहुंच' + # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” + if node.lemma == 'پڑے': + node.lemma = 'پڑ' + # پھرے is a perfective participle of پھرنا (pharnā) “to return” + if node.lemma == 'پھرے': + node.lemma = 'پھر' + # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay” + if node.lemma == 'रहा' or node.lemma == 'रहूं': + node.lemma = 'रह' + if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': + node.lemma = 'رہ' + # sakna to be able to + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': + node.lemma = 'سک' + # Wrongly lemmatized past forms of “to be”. 
+ if node.lemma == 'थी': + node.lemma = 'था' + if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': + node.lemma = 'تھا' + # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up” + if node.lemma == 'उठा': + node.lemma = 'उठ' + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' + # The compound part vālā is not an auxiliary. We handle it in process_node() + # but it must be lemmatized properly. + if node.lemma == 'والی': + node.lemma = 'والا' + # The postposition ke after a verbal stem is not an auxiliary. + # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” + if node.lemma == 'کا' and node.form == 'کے': + node.upos = 'ADP' + node.deprel = 'mark' diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py new file mode 100644 index 00000000..a8d50748 --- /dev/null +++ b/udapi/block/ud/id/addmwt.py @@ -0,0 +1,219 @@ +""" +Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with +MorphInd whose output is stored in MISC attribute MorphInd). +""" +import udapi.block.ud.addmwt +import logging +import re + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku

_PS1|kamu

_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB': + splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE) + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON VERB' + if re.search(r'^ku ', splitform.lower()): + lemma = re.sub(r'^ku ', 'aku ', splitform.lower()) + feats = 'Number=Sing|Person=1|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS1 VSA' + else: + lemma = re.sub(r'^kau ', 'kamu ', splitform.lower()) + feats = 'Number=Sing|Person=2|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS2 VSA' + deprel = 'nsubj *' + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia

_PS3|aku

_PS1|kamu

_PS2)\$$', node.misc['MorphInd']): + if node.upos == 'VERB': + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # For transitive verbs with the meN- prefix, -nya is an object clitic. + # For passive verbs with the di- prefix, -nya refers to a passive agent. + # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. + # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). + menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False + diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False + nominalization = not menverb and not diverb + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + if nominalization: + lemma = splitform.lower() + upos = 'VERB DET' + feats = '* Definite=Def|PronType=Art' + deprel = '* det' + else: + upos = 'VERB PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + # The agent of the passive verb is coded like a direct object of an active verb, + # so we might want to use obj:agent rather than obl:agent. However, full nominals + # as passive agents can be optionally accompanied by the preposition _oleh_ "by", + # which is an argument in favor of saying that they are oblique. 
So we currently + # mark all passive agents as obliques, although it is disputable in Austronesian + # languages (unlike Indo-European passives). + deprel = '* obl:agent' if diverb else '* obj' + xpos = re.sub(r'\+', ' ', node.xpos) + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'(NOUN|PROPN|X)', node.upos): + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = '* PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. 
+ # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' + xpos = 'NSD PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' + xpos = 'NSD PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' + xpos = 'NSD PS2' + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): + # nominalized adjective + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'ADJ DET' + feats = '* Definite=Def|PronType=Art' + if re.match(r' ', node.xpos): + xpos = re.sub(r'\+', ' ', node.xpos) + else: + xpos = 'ASP PS3' + deprel = '* det' + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE): + # semua = all (DET) + # semuanya = nominalization of semua, i.e., 'everything' (PRON) + # banyak = many, much (DET) + # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'DET DET' + feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE): + # satu = one (NUM) + # satunya = nominalization of satu, meaning 'the only one' + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'NUM DET' + feats = 'NumType=Card Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): + # Fused preposition and pronoun. + # Most of them are recognized as R--+PS3 by MorphInd. 
However, some are different: + # bersamanya = 'with him' = VSA+PS3 + # dibawahnya = 'under it' = VSP+PS3 + # didalamnya = 'inside it' = VSP+PS3 + # sekitarnya = 'around it' = D--+PS3 + # However: + # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + upos = 'ADP PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + xpos = 'R-- PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + xpos = 'R-- PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = 'R-- PS2' + if node.udeprel == 'case': + if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): + deprel = 'nmod' + else: + deprel = 'obl' + else: + deprel = '*' + deprel = 'case '+deprel + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + else: + # Do not warn about instances that are known exceptions. + # akibatnya = as a result (SCONJ); akibat = result + # bukannya = instead (PART); bukan = no, not + # layaknya = like (ADP); layak = worthy + # sebaiknya = should (AUX) + # sesampainya = once in / arriving at (ADP) + # tidaknya = whether or not (PART); tidak = no, not + # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. 
+ if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) + return None + elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): + splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = '* PART' + feats = '* _' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split()) < 2: + xpos = xpos + ' T--' + deprel = '* advmod:emph' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + return None + + def postprocess_mwt(self, mwt): + """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" + match = re.match(r'^\^(.*)\+(aku
<p>
_PS1|kamu
<p>
_PS2|dia
<p>
_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) + if not match: + match = re.match(r'^\^(aku
<p>
_PS1|kamu
<p>
_PS2)\+(.*)\$$', mwt.misc['MorphInd']) + if match: + mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' + mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py new file mode 100644 index 00000000..4ea23d06 --- /dev/null +++ b/udapi/block/ud/id/fixgsd.py @@ -0,0 +1,447 @@ +"""Block to fix annotation of UD Indonesian-GSD.""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def fix_upos_based_on_morphind(self, node): + """ + Example from data: ("kesamaan"), the correct UPOS is NOUN, as + suggested by MorphInd. + Based on my observation so far, if there is a different UPOS between + the original GSD and MorphInd, it's better to trust MorphInd + I found so many incorrect UPOS in GSD, especially when NOUNs become + VERBs and VERBs become NOUNs. + I suggest adding Voice=Pass when the script decides ke-xxx-an as VERB. + """ + if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE): + node.upos = 'NOUN' + if node.udeprel == 'acl': + node.deprel = 'nmod' + elif node.udeprel == 'advcl': + node.deprel = 'obl' + + def fix_semua(self, node): + """ + Indonesian "semua" means "everything, all". + Originally it was DET, PRON, or ADV. + Ika: I usually only labeled "semua" as DET only if it's followed by a + NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's + not followed by any NOUN/DET, I labeled them as PRON. + """ + if node.form.lower() == 'semua': + if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: + node.upos = 'DET' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + else: + node.upos = 'PRON' + if node.udeprel == 'det' or node.udeprel == 'advmod': + node.deprel = 'nmod' + node.feats['PronType'] = 'Tot' + + def fix_ordinal_numerals(self, node): + """ + Ordinal numerals should be ADJ NumType=Ord in UD. 
They have many different + UPOS tags in Indonesian GSD. This method harmonizes them. + pertama = first + kedua = second + ketiga = third + keempat = fourth + kelima = fifth + keenam = sixth + ketujuh = seventh + kedelapan = eighth + kesembilan = ninth + ke-48 = 48th + + However! The ke- forms (i.e., not 'pertama') can also function as total + versions of cardinal numbers ('both', 'all three' etc.). If the numeral + precedes the noun, it is a total cardinal; if it follows the noun, it is + an ordinal. An exception is when the modified noun is 'kali' = 'time'. + Then the numeral is ordinal regardless where it occurs, and together + with 'kali' it functions as an adverbial ordinal ('for the second time'). + """ + # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) + if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if node.parent.ord < node.ord or node.parent.lemma == 'kali': + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + else: + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['PronType'] = 'Tot' + if re.match(r'^(det|amod|nmod)$', node.udeprel): + node.deprel = 'nummod' + + def rejoin_ordinal_numerals(self, node): + """ + If an ordinal numeral is spelled using digits ('ke-18'), it is often + tokenized as multiple tokens, which is wrong. Fix it. 
+ """ + if node.form.lower() == 'ke': + dash = None + number = None + if node.next_node: + if node.next_node.form == '-': + dash = node.next_node + if dash.next_node and re.match(r'^\d+$', dash.next_node.form): + number = dash.next_node + node.form = node.form + dash.form + number.form + node.lemma = node.lemma + dash.lemma + number.lemma + elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): + number = node.next_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = node.form + '-' + number.form + node.form = node.form + number.form + node.lemma = node.lemma + '-' + number.lemma + if number: + # Let us pretend that these forms are always ordinal numerals. + # Situations where they act as total cardinals will be disambiguated + # in a subsequent call to fix_ordinal_numerals(). + node.upos = 'ADJ' + node.xpos = 'CO-' + node.feats['NumType'] = 'Ord' + node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'amod' + # Adjust SpaceAfter. + node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def rejoin_decades(self, node): + """ + In Indonesian, the equivalent of English "1990s" is written as "1990-an". + In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. 
+ """ + if node.form.lower() == 'an': + dash = None + number = None + if node.prev_node: + if node.prev_node.form == '-': + dash = node.prev_node + if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): + number = dash.prev_node + node.form = number.form + dash.form + node.form + node.lemma = number.lemma + dash.lemma + node.lemma + elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): + number = node.prev_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = number.form + '-' + node.form + node.form = number.form + node.form + node.lemma = number.lemma + '-' + node.lemma + if number: + # The combined token is no longer a numeral. It cannot quantify an entity. + # Instead, it is itself something like a noun (or perhaps proper noun). + node.upos = 'NOUN' + node.xpos = 'NSD' + node.feats['NumType'] = '' + # In some cases, "-an" is labeled as foreign for no obvious reason. + node.feats['Foreign'] = '' + node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'nmod' + # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. + #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def merge_reduplication(self, node): + """ + Reduplication is a common morphological device in Indonesian. 
Reduplicated + nouns signal plural but some reduplications also encode emphasis, modification + of meaning etc. In the previous annotation of GSD, reduplication was mostly + analyzed as three tokens, e.g., for plurals, the second copy would be attached + to the first one as compound:plur, and the hyphen would be attached to the + second copy as punct. We want to analyze reduplication as a single token. + Fix it. + """ + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + root = node.root + # Example of identical reduplication: negara-negara = countries + # Example of reduplication with -an: kopi-kopian = various coffee trees + # Example of reduplication with vowel substitution: bolak-balik = alternating + # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) + # Example of reduplication with se-: sehari-hari = daily (hari = day) + # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # This is specific to the reduplicated plurals. The rest will be done for any reduplications. + # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. + ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. + ###!!! Some other reduplications have slight modifications on one or the other side. 
+ if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): + first.feats['Number'] = 'Plur' + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): + first.lemma = first.lemma + '-' + node.lemma + first.upos = node.upos + first.xpos = node.xpos + first.feats = node.feats + first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = node.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + first.form = first.form + '-' + node.form + hyph.remove() + node.remove() + first.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([first, second], form=first.form + second.form, misc=mwtmisc) + else: + first.form = first.form + '-' + node.form + if node.no_space_after: + first.misc['SpaceAfter'] = 'No' + else: + first.misc['SpaceAfter'] = '' + hyph.remove() + node.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. 
+ # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): + prefix = node + stem = first # here it is not the first part at all + hyph = stem.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + stem.lemma = prefix.lemma + '-' + stem.lemma + stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) + # Neither the hyphen nor the prefix should have children. + # If they do, re-attach the children to the stem. + for c in hyph.children: + c.parent = stem + for c in prefix.children: + c.parent = stem + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = stem.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + stem.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([stem, second], form=stem.form + second.form, misc=mwtmisc) + else: + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. 
+ # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + + def fix_plural_propn(self, node): + """ + It is unlikely that a proper noun will have a plural form in Indonesian. + All examples observed in GSD should actually be tagged as common nouns. + """ + if node.upos == 'PROPN' and node.feats['Number'] == 'Plur': + node.upos = 'NOUN' + node.lemma = node.lemma.lower() + if node.upos == 'PROPN': + node.feats['Number'] = '' + + def fix_satu_satunya(self, node): + """ + 'satu' = 'one' (NUM) + 'satu-satunya' = 'the only' + """ + root = node.root + if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu': + satu0 = node.parent.parent + satu1 = node.parent + nya = node + dash = None + if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-': + dash = satu1.prev_node + satu0.misc['SpaceAfter'] = 'No' + dash.misc['SpaceAfter'] = 'No' + root.text = root.compute_text() + satu1.deprel = 'compound:redup' + nya.parent = satu0 + # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian. + if node.form == 'nya' and node.parent.form.lower() == 'satu': + satu0 = node.parent + nya = node + if satu0.next_node.form == '-': + dash = satu0.next_node + if dash.next_node.form.lower() == 'satu': + satu1 = dash.next_node + if satu1.ord == node.ord-1: + # Merge satu0 + dash + satu1 into one node. + satu0.form = satu0.form + dash.form + satu1.form + dash.remove() + satu1.remove() + # There should be a multi-word token comprising satu1 + nya. 
+ mwt = nya.multiword_token + if mwt: + mwtmisc = mwt.misc.copy() + mwt.remove() + mwt = root.create_multiword_token([satu0, nya], form=satu0.form + nya.form, misc=mwtmisc) + satu0.misc['SpaceAfter'] = '' + root.text = root.compute_text() + if node.multiword_token and node.no_space_after: + node.misc['SpaceAfter'] = '' + + def lemmatize_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. + # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_V[SP][AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. 
+ lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. 
+ if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + + def process_node(self, node): + self.fix_plural_propn(node) + self.fix_upos_based_on_morphind(node) + self.fix_semua(node) + self.rejoin_ordinal_numerals(node) + self.fix_ordinal_numerals(node) + self.rejoin_decades(node) + self.merge_reduplication(node) + self.fix_satu_satunya(node) + self.lemmatize_from_morphind(node) diff --git a/udapi/block/ud/joinasmwt.py b/udapi/block/ud/joinasmwt.py index 02c54206..be93bd3c 100644 --- a/udapi/block/ud/joinasmwt.py +++ b/udapi/block/ud/joinasmwt.py @@ -22,19 +22,30 @@ def process_node(self, node): if node.multiword_token: return mwt_nodes = [node] - while (node.no_space_after and node.next_node and not node.next_node.multiword_token - and node.form[-1].isalpha() and node.next_node.form[0].isalpha()): + while (node.next_node and not node.next_node.multiword_token + and self.should_join(node, node.next_node)): node = node.next_node mwt_nodes.append(node) if len(mwt_nodes) > 1: - mwt_form = ''.join([n.form for n in mwt_nodes]) - mwt = node.root.create_multiword_token(mwt_nodes, mwt_form) - if node.misc['SpaceAfter'] == 'No': - mwt.misc['SpaceAfter'] = 'No' + self.create_mwt(mwt_nodes) + + def should_join(self, node, next_node): + return node.no_space_after and node.form[-1].isalpha() and next_node.form[0].isalpha() + + def create_mwt(self, mwt_nodes): + mwt_form = ''.join([n.form for n in mwt_nodes]) + mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form) + if mwt_nodes[0].node.misc['SpaceAfter'] == 'No': + mwt.misc['SpaceAfter'] = 'No' + for mwt_node in mwt_nodes: + del mwt_node.misc['SpaceAfter'] + if 
self.revert_orig_form: for mwt_node in mwt_nodes: - del mwt_node.misc['SpaceAfter'] - if self.revert_orig_form: - for mwt_node in mwt_nodes: - if mwt_node.misc['OrigForm']: - mwt_node.form = mwt_node.misc['OrigForm'] - del mwt_node.misc['OrigForm'] + if mwt_node.misc['OrigForm']: + mwt_node.form = mwt_node.misc['OrigForm'] + del mwt_node.misc['OrigForm'] + self.postprocess_mwt() + + # a helper method to be overriden + def postprocess_mwt(self, mwt): + pass diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. (The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) + Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. 
Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. + """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." 
% self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. + for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). + prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. + node.remove() diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py new file mode 100644 index 00000000..044ff178 --- /dev/null +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -0,0 +1,27 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Kazakh.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. 
+ """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # баста = start + if re.match(r'^(баста|кет)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/la/addmwt.py b/udapi/block/ud/la/addmwt.py new file mode 100644 index 00000000..27831151 --- /dev/null +++ b/udapi/block/ud/la/addmwt.py @@ -0,0 +1,41 @@ +""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """ +import udapi.block.ud.addmwt + +MWTS = { + 'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'secum': {'lemma': 'sui cum', 'form': 'se cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, # can be singular or plural + 'nonne': {'lemma': 
'non ne', 'form': 'non ne', 'upos': 'PART PART', 'feats': 'Polarity=Neg Clitic=Yes|PartType=Int', 'deprel': 'advmod:neg discourse', 'shape': 'sibling'} +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + # v['xpos'] = '' # treebank-specific + if 'shape' not in v: + v['shape'] = 'subtree' + v['main'] = 0 + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + if node.form.endswith('.') and len(node.form) > 1 and node.form != '...': + # currently under discussion + return {'form': node.form[:-1] + ' .', + 'lemma': '* .', + 'upos': '* PUNCT', + 'xpos': '_ _', + 'feats': '* _', + 'deprel': '* punct', + 'main': 0, + 'shape': 'subtree'} + diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..a7b506e8 --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,338 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. 
class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs):
    """
    Latin-specific rules for missing or ill-valued morphological features.
    Any bug found is stored in MISC as a Bug attribute (see the base class
    ud.MarkFeatsBugs for the service methods used here).
    """

    def __init__(self, flavio=False, **kwargs):
        """
        Create the ud.la.MarkFeatsBugs block instance.

        Args:
        flavio=1: Accept features as defined by Flavio for treebanks he
            maintains. By default, a more conservative set of features and
            values is expected.
        """
        super().__init__(**kwargs)
        self.flavio = flavio

    def process_node(self, node):
        """Check the node's features against the Latin whitelist; record bugs in MISC."""
        rf = []  # required features for this node
        af = {}  # allowed features, mapped to their allowed values
        # PROIEL-specific: greek words without features
        # LLCT-specific: corrupted nodes
        if node.lemma in ['greek.expression', 'missing^token']:
            pass
        # NOUNS ################################################################
        elif node.upos == 'NOUN':
            if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns
                rf = ['Gender', 'Number', 'Case']
                af = {
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Number': ['Sing', 'Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                    'Degree': ['Dim'],
                    'Abbr': ['Yes'],
                    'Foreign': ['Yes'],
                    'VerbForm': ['Part', 'Vnoun']}
                if self.flavio:
                    # Flavio added InflClass but not everywhere, so it is not required.
                    af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
                    af['Proper'] = ['Yes']
                    af['Polarity'] = ['Neg']
                    af['Compound'] = ['Yes']
                    af['Variant'] = ['Greek']
                    af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # PROPER NOUNS #########################################################
        elif node.upos == 'PROPN':
            if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns
                rf = ['Gender', 'Number', 'Case']
                af = {
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Number': ['Sing', 'Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                    'Abbr': ['Yes'],
                    'Foreign': ['Yes']}
                if self.flavio:
                    af['Compound'] = ['Yes']
                    af['Variant'] = ['Greek']
                    af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
                    af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # ADJECTIVES ###########################################################
        elif node.upos == 'ADJ':
            if not node.feats['Abbr'] == 'Yes' and node.feats['Case']:
                rf = ['Gender', 'Number', 'Case']
                af = {
                    'NumType': ['Dist', 'Mult', 'Ord'],
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Number': ['Sing', 'Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                    'Degree': ['Cmp', 'Sup', 'Abs'],
                    'Abbr': ['Yes'],
                    'Foreign': ['Yes'],
                    'Polarity': ['Neg'],
                    'VerbForm': ['Part']}
                if self.flavio:
                    # Flavio added InflClass but not everywhere, so it is not required.
                    af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
                    af['Compound'] = ['Yes']
                    af['Proper'] = ['Yes']
                    af['Variant'] = ['Greek']
                    af['Degree'].append('Dim')
                    af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # PRONOUNS #############################################################
        elif node.upos == 'PRON':
            rf = ['PronType', 'Case']
            af = {
                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                'Proper': ['Yes'],
                'Compound': ['Yes'],
                'Polarity': ['Neg']
            }
            if node.feats['PronType'] == 'Prs':
                af['Reflex'] = ['Yes']
                if node.feats['Reflex'] == 'Yes': # seipsum, se
                    rf.extend(['Person'])
                    # seipsum has gender and number but se does not, so it is not required
                    af['Gender'] = ['Masc', 'Fem', 'Neut']
                    af['Number'] = ['Sing', 'Plur']
                    af['Person'] = ['3']
                    af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl']
                else: # not reflexive: ego, tu, is, nos
                    rf.extend(['Person', 'Number'])
                    af['Person'] = ['1', '2', '3']
                    af['Number'] = ['Sing', 'Plur']
                    # 3rd person must have gender
                    if node.feats['Person'] == '3': # is, id
                        rf.append('Gender')
                        af['Gender'] = ['Masc', 'Fem', 'Neut']
            elif re.match(r'^(Rel|Int)$', node.feats['PronType']):
                rf.extend(['Gender', 'Number'])
                af['Gender'] = ['Masc', 'Fem', 'Neut']
                af['Number'] = ['Sing', 'Plur']
            elif node.feats['PronType'] == 'Ind':
                rf = [f for f in rf if f != 'Case']
                af['Gender'] = ['Masc', 'Fem', 'Neut']
                af['Number'] = ['Sing', 'Plur']
            # lexical check of PronTypes
            af['PronType'] = []
            if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']:
                af['PronType'].append('Prs')
            elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']:
                af['PronType'].append('Ind')
            elif node.lemma in ['inuicem', 'invicem']:
                af['PronType'].append('Rcp')
                # BUG FIX: 'Case' may have been removed above already (when the
                # node has PronType=Ind); an unconditional rf.remove('Case')
                # would then raise ValueError.
                if 'Case' in rf:
                    rf.remove('Case')
            if node.lemma in ['qui', 'quicumque', 'quisquis']:
                af['PronType'].append('Rel')
            if node.lemma in ['ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']:
                af['PronType'].append('Int')
            if self.flavio:
                # Flavio added InflClass but not everywhere, so it is not required.
                af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron']
                af['Compound'] = ['Yes']
                af['Polarity'] = ['Neg']
                af['Form'] = ['Emp']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # DETERMINERS ##########################################################
        elif node.upos == 'DET':
            rf = ['PronType']
            if node.feats['Case']:
                rf.extend(['Gender', 'Number', 'Case'])
            af = {
                'Gender': ['Masc', 'Fem', 'Neut'],
                'Number': ['Sing', 'Plur'],
                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
                'Degree': ['Cmp', 'Abs', 'Sup'],
                'Polarity': ['Neg'],
                'Proper': ['Yes'],
                'PronType': []
            }
            if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster'
                rf.extend(['Poss', 'Person[psor]'])
                af['PronType'] = ['Prs']
                # BUG FIX: was the bare string 'Yes'; every other af entry is a
                # list and the base class's membership test only accepted the
                # string by the accident of substring matching.
                af['Poss'] = ['Yes']
                af['Person[psor]'] = ['1', '2', '3']
                af['Reflex'] = ['Yes']
                # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus).
                if node.feats['Person[psor]'] != '3':
                    rf.append('Number[psor]')
                    af['Number[psor]'] = ['Sing', 'Plur']
            if node.feats['PronType'] == 'Ind':
                af['NumType'] = ['Card']
            # lexical check of PronTypes
            if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']:
                if not af['PronType'] == ['Prs']:
                    af['PronType'].append('Prs')
            elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']:
                af['PronType'].append('Ind')
            elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']:
                af['PronType'].append('Tot')
            if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']:
                af['PronType'].append('Rel')
            if node.lemma in ['qui', 'quantus', 'quot']:
                af['PronType'].append('Int')
            elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']:
                af['PronType'].append('Dem')
            elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']:
                af['PronType'].append('Con')
            if self.flavio:
                # Flavio added InflClass but not everywhere, so it is not required.
                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron']
                af['Compound'] = ['Yes']
                af['Form'] = ['Emp']
                af['NumType'] = ['Card']
                af['Degree'].append('Dim')
                af['PronType'].append('Art')
                if re.match(r'^(unus|ambo)', node.lemma):
                    af['NumValue'] = ['1', '2']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # NUMERALS #############################################################
        elif node.upos == 'NUM':
            rf = ['NumType', 'NumForm']
            af = {
                'NumType': ['Card', 'Ord'],
                'NumForm': ['Word', 'Roman', 'Digit'],
                'Proper': ['Yes']}
            # Arabic digits and Roman numerals do not have inflection features.
            if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
                af['Gender'] = ['Masc', 'Fem', 'Neut']
                af['Number'] = ['Sing', 'Plur']
                af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
            if self.flavio:
                # Flavio added InflClass but not everywhere, so it is not required (e.g. duodecim).
                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron']
                af['NumForm'].append('Reference')
                af['Compound'] = ['Yes']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # VERBS AND AUXILIARIES ################################################
        elif re.match(r'^(VERB|AUX)$', node.upos):
            rf = ['VerbForm', 'Aspect']
            af = {
                'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'],
                'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'],
                'Polarity': ['Neg'],
                'Typo': ['Yes']
            }
            if node.feats['VerbForm'] not in ['Part', 'Conv']:
                rf.append('Tense')
                af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut']
            # The copula 'sum' has no voice; all other verbs and auxiliaries do.
            if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'):
                rf.append('Voice')
                af['Voice'] = ['Act', 'Pass']
            if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive
                rf.extend(['Mood', 'Person', 'Number'])
                af['Mood'] = ['Ind', 'Sub', 'Imp']
                af['Person'] = ['1', '2', '3']
                af['Number'] = ['Sing', 'Plur']
            elif node.feats['VerbForm'] == 'Part':
                rf.extend(['Gender', 'Number', 'Case'])
                # Gerunds (MISC TraditionalMood=Gerundium) are restricted to neuter singular.
                af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing']
                af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut']
                af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
                af['Degree'] = ['Abs', 'Cmp']
                if node.misc['TraditionalMood'].startswith('Gerundi'):
                    af['Voice'] = ['Pass']
                    # BUG FIX: was the bare string 'Prosp' instead of a list
                    # (only worked via accidental substring matching).
                    af['Aspect'] = ['Prosp']
            elif node.feats['VerbForm'] == 'Conv':
                rf.extend(['Case', 'Gender', 'Number'])
                af['Case'] = ['Abl', 'Acc']
                af['Gender'] = ['Masc']
                af['Number'] = ['Sing']
                af['Voice'] = ['Act']
            elif node.feats['VerbForm'] == 'Inf':
                af['Tense'].remove('Pqp')
            if self.flavio:
                # Flavio added InflClass but not everywhere, so it is not required.
                af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX']
                af['VerbType'] = ['Mod']
                if 'Degree' in af:
                    af['Degree'].append('Dim')
                else:
                    af['Degree'] = ['Dim']
                af['Compound'] = ['Yes']
                af['Proper'] = ['Yes']
                if re.match(r'^(Part|Conv)$', node.feats['VerbForm']):
                    af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
                elif node.feats['VerbForm'] == 'Inf':
                    af['Case'] = ['Nom', 'Acc', 'Abl']
                    af['Gender'] = ['Neut']
                    af['Number'] = ['Sing']
                    af['InflClass[nominal]'] = ['Ind']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # ADVERBS ##############################################################
        elif node.upos == 'ADV':
            af = {
                'AdvType': ['Loc', 'Tim'],
                'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'],
                'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'],
                'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum
                'Polarity': ['Neg']
            }
            if self.flavio:
                af['Compound'] = ['Yes']
                af['Form'] = ['Emp']
                af['VerbForm'] = ['Fin', 'Part']
                af['Degree'].append('Dim')
            self.check_allowed_features(node, af)
        # PARTICLES ############################################################
        elif node.upos == 'PART':
            af = {
                'PartType': ['Int', 'Emp'],
                'Polarity': ['Neg']
            }
            if self.flavio:
                af['Form'] = ['Emp']
                af['PronType'] = ['Dem']
                af['Compound'] = ['Yes']
            self.check_allowed_features(node, af)
        # CONJUNCTIONS #########################################################
        elif re.match(r'^[CS]CONJ$', node.upos):
            af = {
                'PronType': ['Rel', 'Con'],
                'Polarity': ['Neg'],
                'Compound': ['Yes']}
            if self.flavio:
                af['Compound'] = ['Yes']
                af['Form'] = ['Emp']
                af['VerbForm'] = ['Fin']
                af['NumType'] = ['Card']
                af['ConjType'] = ['Expl']
                af['AdvType'] = ['Loc']
            self.check_allowed_features(node, af)
        # ADPOSITIONS ##########################################################
        elif node.upos == 'ADP':
            rf = ['AdpType']
            af = {
                'AdpType': ['Prep', 'Post'],
                'Abbr': ['Yes']
            }
            if self.flavio:
                af['VerbForm'] = ['Part']
                af['Proper'] = ['Yes']
                af['Compound'] = ['Yes']
            # BUG FIX: rf was built but check_required_features() was never
            # called, so a missing AdpType went unreported.
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # X ####################################################################
        elif node.upos == 'X':
            af = {'Abbr': ['Yes']}
            # BUG FIX: af was built but never checked, so any feature on an
            # X token (other than nothing at all) went unreported.
            self.check_allowed_features(node, af)
        # THE REST: NO FEATURES ################################################
        else:
            self.check_allowed_features(node, {})
class Lemmatize(Block):
    """Fill in missing lemmas where the lemma is predictably identical to the form."""

    def process_node(self, node):
        """
        Some treebanks lack lemmas for some or all words. Occasionally we may be
        able to guess that the lemma is identical to the word form. This block
        will then fill out the lemma.

        For some parts of speech, we can only say that the form is the lemma if
        we have morphological features that will confirm it is the right form.
        """
        # BUG FIX: the original condition was
        #     node.lemma == '' or node.lemma == '_' and node.form != '_' and ...
        # and since 'and' binds tighter than 'or', the form/Typo guards applied
        # only to lemma == '_': a node with a completely empty lemma was
        # lemmatized even when its form was a typo (Typo=Yes) or '_'.
        # Parentheses make the guards apply to both spellings of "no lemma".
        if (node.lemma == '' or node.lemma == '_') and node.form != '_' and node.feats['Typo'] != 'Yes':
            # Many closed classes do not inflect and have the same lemma as the form (just lowercased).
            if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos):
                node.lemma = node.form.lower()
            # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV
            # ADV: use positive affirmative
            elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
                node.lemma = node.form.lower()
            # VERB and AUX: use the infinitive
            elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']):
                node.lemma = node.form.lower()
            # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN)
            # Note: This rule is wrong in German, where no nouns should be lowercased.
            elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
                node.lemma = node.form.lower()
            elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
                node.lemma = node.form
            # ADJ: use masculine singular nominative positive affirmative
            elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
                node.lemma = node.form.lower()
            # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma)
            # NOTE(review): this branch also catches ADJ with a non-positive
            # Degree or negative Polarity (which failed the stricter branch
            # above) — confirm that lemmatizing e.g. a comparative to its own
            # form is intended.
            elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']):
                node.lemma = node.form.lower()
            # NUM: use masculine nominative (number, if present at all, is lexical)
            elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']):
                node.lemma = node.form.lower()
    # Secondary prepositions with an unambiguous valency case: whatever case
    # marking follows them in an edeprel is replaced by the case listed here
    # (or removed entirely for pure subordinators).
    unambiguous = {
        'apie': 'apie:acc', # about (topic)
        'dėl': 'dėl:gen', # because of
        'iki': 'iki:gen', # until
        'iš': 'iš:gen', # from, out of
        'į': 'į:acc', # to, into, in
        'jei': 'jei', # remove morphological case # if
        'jeigu': 'jeigu', # remove morphological case # if
        'jog': 'jog', # remove morphological case # because
        'kadangi': 'kadangi', # remove morphological case # since, because
        'kai': 'kai', # remove morphological case # when
        'kaip': 'kaip', # remove morphological case # as, than
        'lyg': 'lyg', # remove morphological case # like
        'negu': 'negu', # remove morphological case # than
        'nei': 'nei', # remove morphological case # more than
        'nes': 'nes', # remove morphological case # because
        'nors': 'nors', # remove morphological case # though, although, when, if
        'nuo': 'nuo:gen', # from
        'pagal': 'pagal:acc', # according to, under, by
        'pagal_dėl': 'pagal:acc',
        'per': 'per:acc', # through, over (přes)
        'prie': 'prie:gen', # to, at, near, under
        'prieš': 'prieš:acc', # against
        'su': 'su:ins', # with
        'tarp': 'tarp:gen', # between
        'tarsi': 'tarsi', # remove morphological case # as if
        'virš': 'virš:gen' # above
    }

    def copy_case_from_adposition(self, node, adposition):
        """
        In some treebanks, adpositions have the Case feature and it denotes the
        valency case that the preposition's nominal must be in.

        Returns 'adposition:case' (case lowercased) if a child of `node` with
        the given adposition lemma carries a Case feature, otherwise None.
        """
        # The following is only partial solution. We will not see
        # some children because they may be shared children of coordination.
        prepchildren = [x for x in node.children if x.lemma == adposition]
        if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
            return adposition+':'+prepchildren[0].feats['Case'].lower()
        else:
            return None

    def process_node(self, node):
        """
        Occasionally the edeprels automatically derived from the Czech basic
        trees do not match the whitelist. For example, the noun is an
        abbreviation and its morphological case is unknown.

        Rewrites each case-enhanced deprel of `node` in place; the rewrite
        steps must run in this exact order (annotation-error fixes first,
        then outermost-marker stripping, then unambiguous-case replacement,
        then the Case-feature-based disambiguation).
        """
        for edep in node.deps:
            m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel'])
            if m:
                # NOTE(review): bdeprel is assigned but never used below.
                bdeprel = m.group(1)
                solved = False
                # Issues caused by errors in the original annotation must be fixed early.
                # Especially if acl|advcl occurs with a preposition that unambiguously
                # receives a morphological case in the subsequent steps, and then gets
                # flagged as solved.
                # NOTE(review): the two substitutions below ('do', 'k') look like
                # leftovers from the Czech block this file was derived from —
                # confirm they are wanted for Lithuanian.
                edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # "od nevidím do nevidím" ###!!! But we should also fix the dependency in the basic tree!
                edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel'])
                # If one of the following expressions occurs followed by another preposition
                # or by morphological case, remove the additional case marking. For example,
                # 'jako_v' becomes just 'jako'.
                for x in self.outermost:
                    exceptions = self.outermost[x]
                    m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel'])
                    if m and m.group(2) and not x+m.group(2) in exceptions:
                        edep['deprel'] = m.group(1)+':'+x
                        solved = True
                        break
                if solved:
                    continue
                for x in self.unambiguous:
                    # All secondary prepositions have only one fixed morphological case
                    # they appear with, so we can replace whatever case we encounter with the correct one.
                    m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
                    if m:
                        edep['deprel'] = m.group(1)+':'+self.unambiguous[x]
                        solved = True
                        break
                if solved:
                    continue
                # The following prepositions have more than one morphological case
                # available. Thanks to the Case feature on prepositions, we can
                # identify the correct one. Exclude 'nom' and 'voc', which cannot
                # be correct.
                m = re.match(r'^(obl(?::arg)?|nmod):(po|už)(?::(?:nom|voc))?$', edep['deprel'])
                if m:
                    adpcase = self.copy_case_from_adposition(node, m.group(2))
                    if adpcase and not re.search(r':(nom|voc)$', adpcase):
                        edep['deprel'] = m.group(1)+':'+adpcase
                        continue
                    # The remaining instance of 'po' should be ':acc'.
                    elif m.group(2) == 'po':
                        edep['deprel'] = m.group(1)+':po:acc'
                        continue
                    # The remaining 'už' are ':acc' (they are second conjuncts
                    # in coordinated oblique modifiers).
                    elif m.group(2) == 'už':
                        edep['deprel'] = m.group(1)+':už:acc'
                        continue

    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
        '''
        Modifies the incoming relation of a node both in the basic tree and in
        the enhanced graph. If the node does not yet depend in the enhanced
        graph on the current basic parent, the new relation will be added without
        removing any old one. If the node already depends multiple times on the
        current basic parent in the enhanced graph, all such enhanced relations
        will be removed before adding the new one.
        '''
        old_parent = node.parent
        node.parent = parent
        node.deprel = deprel
        # Drop all enhanced relations pointing at the former basic parent,
        # then add the single new enhanced relation.
        node.deps = [x for x in node.deps if x['parent'] != old_parent]
        new_edep = {}
        new_edep['parent'] = parent
        new_edep['deprel'] = edeprel
        node.deps.append(new_edep)
`node.misc['Bug'] = 'aux-chain'`, so the output conllu file can be searched for "Bug=" occurences. @@ -109,7 +116,10 @@ def process_node(self, node): for i_upos, i_feat in REQUIRED_FEATURE_FOR_UPOS.items(): if upos == i_upos and not feats[i_feat]: - self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) + # Some languages do not distinguish finite and non-finite forms of verbs. + # The VerbForm feature is not obligatory in those languages. + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}: + self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': if upos not in ('VERB', 'AUX'): @@ -117,22 +127,22 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - - subject_children = [n for n in node.children if 'subj' in n.udeprel] + subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: - self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child') - - object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')] + self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') + + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. + # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. 
+ object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') - # In addition to http://universaldependencies.org/svalidation.html - if parent.udeprel == 'punct': - self.log(node, 'punct-child', 'parent.deprel=punct') - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -144,7 +154,7 @@ def process_node(self, node): # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) @@ -174,14 +184,6 @@ def process_node(self, node): if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') - # http://universaldependencies.org/u/dep/cc.html says - # "cc is the relation between a conjunct and a preceding - # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." - # No other upos is allowed in the documentation, although e.g. PART is common in the data. - # There are clear cases of adverbs in role of cc (e.g. "respektive" in Swedish and Czech). 
class MarkFeatsBugs(Block):
    """
    Base block for identifying missing or ill-valued morphological features.

    Any bug found is recorded in the MISC column as a Bug attribute, which can
    later be used in filters and highlighted in text output. This class only
    implements service methods; a language-specific subclass must define the
    actual rules (see process_node for an example).
    """

    def bug(self, node, bugstring):
        """Record bugstring in the node's MISC Bug attribute ('+'-separated, deduplicated)."""
        recorded = node.misc['Bug'].split('+') if node.misc['Bug'] else []
        if bugstring not in recorded:
            recorded.append(bugstring)
        node.misc['Bug'] = '+'.join(recorded)

    def check_allowed_features(self, node, allowed):
        """
        Flag features the node must not have, and disallowed values of the
        features it may have. `allowed` maps each permitted feature name to
        the list of its permitted values.
        """
        for feature in node.feats:
            if feature not in allowed:
                self.bug(node, f'Feat{feature}NotAllowed')
            elif node.feats[feature] not in allowed[feature]:
                self.bug(node, f'Feat{feature}Value{node.feats[feature]}NotAllowed')

    def check_required_features(self, node, required):
        """Flag every feature from the `required` list of names that the node lacks."""
        for feature in required:
            if feature not in node.feats:
                self.bug(node, f'Feat{feature}Missing')

    def process_node(self, node):
        """
        This is a generic block, do nothing here. In a language-specific block
        based on this one, rules similar to the example below can be specified:

        # NOUNS ################################################################
        if node.upos == 'NOUN':
            self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity'])
            if node.feats['Gender'] == 'Masc':
                self.check_required_features(node, ['Animacy'])
            self.check_allowed_features(node, {
                'Gender': ['Masc', 'Fem', 'Neut'],
                'Animacy': ['Anim', 'Inan'],
                'Number': ['Sing', 'Dual', 'Plur'],
                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
                'Polarity': ['Pos', 'Neg'],
                'Foreign': ['Yes']})
        #...
        # THE REST: NO FEATURES ################################################
        else:
            self.check_allowed_features(node, {})
        """
        return
+ +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. + if node.feats['Foreign'] == 'Yes': + pass + # NOUNS AND PROPER NOUNS ############################################### + elif re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'NumType': ['Ord'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 
'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': + rf = ['PronType'] + else: # not reflexive + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] + if node.feats['Number'] == 'Sing': + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) 
+ #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) + self.check_allowed_features(node, { + 'PronType': ['Art'], + 'Definite': ['Ind'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['PronType']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Frac'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + # Unlike other forms, the imperative 
distinguishes politeness. + # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Polite']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Polite': ['Infm', 'Form'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Pot', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. 
+ #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes']}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {'Abbr': ['Yes'], 'Typo': ['Yes']}) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py new file mode 100644 index 00000000..bd63ee7d --- /dev/null +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -0,0 +1,94 @@ +""" +Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms. +Based on the form of the surface token and on the information provided in +the lemmas and UPOS, tries to reconstruct the forms of individual words. 
+
+"""
+from udapi.core.block import Block
+import re
+import logging
+
+
+class AddFormsInMwt(Block):
+    """Guess forms of syntactic words within a multiword token."""
+
+    def process_node(self, node):
+        if node.form == '_' and node.multiword_token:
+            mwt = node.multiword_token
+            # Many multiword tokens consist of NOUN + ADP. Beware: The adposition
+            # may have a form different from its lemma. It happens with possessive
+            # postpositions चा, चे, which distinguish the gender and number of
+            # the possessed entity.
+            if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos):
+                # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'.
+                if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची':
+                    mwt.words[1].lemma = 'चा'
+                    # चा (cā) ... Masc Sing
+                    # ची (cī) ... Fem Sing, Neut Plur
+                    # चे (ce) ... Neut Sing, Masc Plur
+                    # च्या (cyā) ... Fem Plur
+                    # चं (caṁ) ... ?
+                    m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form)
+                    # The resulting form is different with personal pronouns.
+                    # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā)
+                    # तुझी (tujhī), तुझे (tujhe)
+                    # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā)
+                    # त्याचं (tyācaṁ)
+                    m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form)
+                    if m:
+                        if node == mwt.words[0]:
+                            node.form = m.group(1)
+                        else:
+                            node.form = m.group(2)
+                    elif m2:
+                        if node == mwt.words[0]:
+                            node.form = m2.group(1)
+                        else:
+                            node.form = 'च' + m2.group(2)
+                    else:
+                        logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma))
+                elif mwt.words[1].lemma == 'वरती':
+                    m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form)
+                    if m:
+                        if node == mwt.words[0]:
+                            node.form = m.group(1)
+                        else:
+                            node.form = m.group(2)
+                    else:
+                        logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'चा' + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): + # Compound postpositions where the middle word is the possessive 'चा'. + # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'. + if re.match(r'^(चा|च्या)$', mwt.words[1].lemma): + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + elif node == mwt.words[1]: + node.form = m.group(2) + node.lemma = 'चा' + else: + node.form = m.group(3) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + elif node == mwt.words[1]: + node.form = 'च' + m2.group(2) + node.lemma = 'चा' + else: + node.form = m2.group(3) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." 
% (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[1].lemma)) + else: + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) diff --git a/udapi/block/ud/printfixed.py b/udapi/block/ud/printfixed.py new file mode 100644 index 00000000..313943bb --- /dev/null +++ b/udapi/block/ud/printfixed.py @@ -0,0 +1,104 @@ +""" +Block PrintFixed prints occurrences of fixed multiword expressions in UD. It +can be run twice in a row, first collecting known fixed expressions and then +also reporting other occurrences of these expressions where they are not +annotated as fixed. + +Usage: +udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt +udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class PrintFixed(Block): + """ + Print fixed multiword expressions. + """ + + def __init__(self, only_forms=False, known_expressions=None, **kwargs): + """ + Create the PrintFixed block. + + Parameters: + only_forms=1: print the word forms but not tags and other info; + This can be used to create the list of known forms that we want to + identify even if they are not annotated as fixed. 
+ known_expressions: the name of the text file with the expressions + """ + super().__init__(**kwargs) + self.only_forms = only_forms + self.known_expressions = {} + self.first_words = {} + self.max_length = 2 + if known_expressions: + fh = open(known_expressions, 'r', encoding='utf-8') + n = 0 + for expression in fh.readlines(): + expression = expression.replace('\n', '') + if expression in self.known_expressions: + self.known_expressions[expression] += 1 + else: + self.known_expressions[expression] = 1 + logging.info("Read known fixed expression '%s'" % expression) + n += 1 + words = expression.split(' ') + first_word = words[0] + self.first_words[first_word] = 1 + length = len(words) + if length > self.max_length: + self.max_length = length + logging.info('Read %d known fixed expressions.' % n) + + def process_node(self, node): + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + if len(fixed_children) > 0: + # Fixed children are always to the right of of the parent. But there + # may be other nodes in between that are not fixed children (for + # example, there may be punctuation that is attached to one of the + # fixed nodes). + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + while n != fixed_children[-1]: + n = n.next_node + if n.parent == node and n.udeprel == 'fixed': + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + else: + list_of_forms.append('X') + list_of_tags.append('X') + forms = ' '.join(list_of_forms) + tags = ' '.join(list_of_tags) + if self.only_forms: + print(forms) + else: + print("%s / %s / %s" % (forms, tags, node.deprel)) + else: + # If this is not the first word of a fixed expression, check whether + # something that looks like a known fixed expression starts here. + # Note that it is also possible that a known expression starts here + # but only a subset is actually marked as such; we currently do not + # account for this. 
+ if node.form.lower() in self.first_words: + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + for i in range(self.max_length - 1): + n = n.next_node + if not n: + break + ###!!! At present we cannot identify known expressions with gaps ('X'). + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + forms = ' '.join(list_of_forms) + if forms in self.known_expressions: + if self.only_forms: + print(forms) + else: + tags = ' '.join(list_of_tags) + print("%s / %s / NOT FIXED" % (forms, tags)) + break diff --git a/udapi/block/ud/pt/addhyphenmwt.py b/udapi/block/ud/pt/addhyphenmwt.py new file mode 100644 index 00000000..9492b1a2 --- /dev/null +++ b/udapi/block/ud/pt/addhyphenmwt.py @@ -0,0 +1,37 @@ +"""Block ud.pt.AddHyphenMwt for transforming hyphen compounds into multiword tokens in Portuguese-GSD. + +See https://github.com/UniversalDependencies/UD_Portuguese-GSD/issues/39 +""" +from udapi.core.block import Block + +class AddHyphenMwt(Block): + + def _ok(self, token): + # The hyphen in "al-Assad" perhaps should be kept as a separate word. 
+ return token.form.isalnum() and token.form.lower() != 'al' + + def process_tree(self, root): + tokens, i = root.token_descendants, 1 + while i+1 < len(tokens): + start_i = i-1 + if tokens[i].form == "-" and self._ok(tokens[i-1]) and self._ok(tokens[i+1]): + while i+3 < len(tokens) and tokens[i+2].form == "-" and self._ok(tokens[i+3]): + i += 2 + compound, words = tokens[start_i:i+2], [] + for token in compound: + words += token.words + heads = [w for w in words if w.parent not in words] + cuckolds = [w for w in words if w not in heads and any(c not in words for c in w.children)] + if len(heads) > 1: + for h in heads: + h.misc["ToDo"] = 'NonCatenaCompound' + elif cuckolds: + for c in cuckolds: + c.misc["ToDo"] = 'HasChildrenOutsideCompound' + else: + compound_form = "".join(t.form for t in compound) + for hyphen in compound[1::2]: + hyphen.remove() + root.create_multiword_token([w for w in words if w.form != '-'], compound_form) + root.text = None + i += 1 diff --git a/udapi/block/ud/pt/addmwt.py b/udapi/block/ud/pt/addmwt.py index 11ebfbbf..daa605b2 100644 --- a/udapi/block/ud/pt/addmwt.py +++ b/udapi/block/ud/pt/addmwt.py @@ -39,8 +39,8 @@ 'nisso': {'form': 'em isso', 'lemma': 'em este'}, 'nisto': {'form': 'em isto', 'lemma': 'em este', 'upos': 'ADP PRON', 'main': 1, 'shape': 'subtree'}, - 'no': {'form': 'em o', 'lemma': 'em o'}, - 'nos': {'form': 'em os', 'lemma': 'em o'}, + 'no': {'form': 'em o', 'lemma': 'em o'}, # PRON cases are excluded below + 'nos': {'form': 'em os', 'lemma': 'em o'}, # PRON cases are excluded below 'num': {'form': 'em um', 'lemma': 'em um'}, 'numa': {'form': 'em uma', 'lemma': 'em um'}, 'numas': {'form': 'em umas', 'lemma': 'em um'}, @@ -79,6 +79,11 @@ class AddMwt(udapi.block.ud.addmwt.AddMwt): def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + + # "no" can be either a contraction of "em o", or a pronoun + if node.form.lower() in ('no', 'nos') and node.upos == 
'PRON': + return + analysis = MWTS.get(node.form.lower(), None) # If the input is e.g.: diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py new file mode 100644 index 00000000..6fa73460 --- /dev/null +++ b/udapi/block/ud/ru/fixedeprels.py @@ -0,0 +1,279 @@ +"""Block to fix case-enhanced dependency relations in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'как' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('как_в:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. 
+ outermost = { + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'ежели': [], + 'если': [], + 'как': ['как_только'], + 'когда': [], + 'кроме_как': [], + 'менее_чем': [], + 'минус': [], + 'нежели': [], + 'плюс': [], + 'пока': [], + 'поскольку': [], + 'потому_что': [], + 'пусть': [], + 'равно_как': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'хотя': [], + 'чем': [], + 'что': [], + 'чтобы': [], + 'яко': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'versus': 'версус:nom', + 'loc': 'в:loc', + 'в_вид': 'в_виде:gen', + 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' + 'в_для': 'в:acc', + 'в_качество': 'в_качестве:gen', + 'в_отношение': 'в_отношении:gen', + 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level + 'в_связь_с': 'в_связи_с:ins', + 'в_случай_если': 'в_случае_если', + 'в_случай_когда': 'в_случае_когда', + 'в_соответствие_с': 'в_соответствии_с:ins', + 'в_течение': 'в_течение:gen', + 'в_то_быть': 'в:loc', + 'в_тот_время_как': 'в_то_время_как', + 'в_угода': 'в_угоду:dat', + 'в_ход': 'в_ходе:gen', + 'вблизи': 'вблизи:gen', + 'взамен': 'взамен:gen', + 'вместо': 'вместо:gen', + 'во_глава': 'во_главе_с:ins', + 'во_глава_с': 'во_главе_с:ins', + 'во_избежание': 'во_избежание:gen', + 'возле': 'возле:gen', + 'вокруг': 'вокруг:gen', + 'вплоть_до': 'вплоть_до:gen', + 'вроде': 'вроде:gen', + 'выше': 'выше:gen', + 'для': 'для:gen', + 'для_в': 'для:gen', + 'до_то_как': 'до:gen', # до того, как ... 
+ 'за_исключение': 'за_исключением:gen', + 'из_более_чем': 'из:gen', + 'к': 'к:dat', + 'ко': 'ко:dat', + 'коли_скоро': 'коль_скоро', + 'кроме': 'кроме:gen', + 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' + 'на_вперед': 'на:acc', + 'над': 'над:ins', # at least I have not encountered any genuine example of accusative + 'насчет': 'насчет:gen', + 'несмотря_на': 'несмотря_на:acc', + 'ниже': 'ниже:gen', + 'около': 'около:gen', + 'от_до': 'от:gen', + 'от_от': 'от:gen', + 'от_с': 'от:gen', + 'относительно': 'относительно:gen', + 'перед': 'перед:ins', + 'по_мера': 'по_мере:gen', + 'по_мера_то_как': 'по_мере_того_как', + 'по_отношение_ко?': 'по_отношению_к:dat', + 'по_повод': 'по_поводу:gen', + 'по_сравнение_с': 'по_сравнению_с:ins', + 'помимо': 'помимо:gen', + 'порядка': 'порядка:gen', + 'после': 'после:gen', + 'посредством_как': 'посредством:gen', + 'при': 'при:loc', + 'при_помощь': 'при_помощи:gen', + 'при_условие_что': 'при_условии_что', + 'про': 'про:acc', + 'против': 'против:gen', + 'с_более_чем': 'с:gen', + 'с_во_глава': 'с:ins', + 'с_на': 'с:par', + 'с_помощь': 'с_помощью:gen', + 'с_тем': 'с:ins', + 'с_тот_пора_как': 'с_тех_пор_как', + 'с_что': 'с:ins', + 'свыше': 'свыше:gen', + 'со_сторона': 'со_стороны:gen', + 'согласно': 'согласно:dat', + 'спустя': 'спустя:acc', + 'среди': 'среди:gen', + 'среди_в': 'среди:gen', + 'так_чтобы': 'чтобы', + 'тем_между': 'между:ins', + 'у': 'у:gen', + 'у_без': 'у:gen', + 'через': 'через:acc', + 'чтоб': 'чтобы' + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. 
+ prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Russian basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers. + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. + edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) + # Some markers should be discarded only if they occur as clause markers (acl, advcl). + edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) + # If the case marker starts with 'столько', remove this part. + # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. + # Similarly, 'то' occurs in 'то...то' and should be removed. 
+ edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. + m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or partitive are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or locative are possible. Pick locative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + continue + # Unlike in Czech, 'над' seems to allow only instrumental and not accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or instrumental are possible. Pick accusative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Dative, accusative or locative are possible. Pick dative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':dat' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick instrumental. + edep['deprel'] = m.group(1)+':'+m.group(2)+':ins' + continue + if re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. 
+ edep['deprel'] = 'nmod' + elif edep['deprel'] == 'nmod:loc': + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:voc': + edep['deprel'] = 'nmod:nom' + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py new file mode 100644 index 00000000..1b603e96 --- /dev/null +++ b/udapi/block/ud/ru/fixtoest.py @@ -0,0 +1,35 @@ +"""Block to fix annotation of то есть in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixToEst(Block): + + def process_node(self, node): + """ + In the converted data from Kira, the fixed expression "то есть" ("that is") + is treated as a subordinator and attached as "mark", which later makes it + part of complex enhanced relation labels. I believe that this analysis is + wrong and that it will be better to label these expressions as "cc". + """ + if node.udeprel == 'mark' and node.lemma == 'то': + if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0: + self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc') + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. 
If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py index e796bf0d..04c9fffb 100644 --- a/udapi/block/ud/setspaceafter.py +++ b/udapi/block/ud/setspaceafter.py @@ -13,10 +13,11 @@ class SetSpaceAfter(Block): """Block for heuristic setting of the SpaceAfter=No MISC attribute.""" - def __init__(self, not_after='¡¿([{„', not_before='.,;:!?}])', fix_text=True, **kwargs): + def __init__(self, not_after='¡ ¿ ( [ { „ /', not_before='. , ; : ! ? } ] ) / ?? ??? !! !!! ... …', + fix_text=True, extra_not_after='', extra_not_before='', **kwargs): super().__init__(**kwargs) - self.not_after = not_after - self.not_before = not_before + self.not_after = (not_after + ' ' + extra_not_after).split(' ') + self.not_before = (not_before + ' ' + extra_not_before).split(' ') self.fix_text = fix_text self.changed = False @@ -26,7 +27,7 @@ def process_tree(self, root): self.changed = False # Undirected double quotes are ambiguous. - # If there is an even number of quotes in a sentence, supposed they are not nested + # If there is an even number of quotes in a sentence, suppose they are not nested # and treat odd-indexed ones as opening and even-indexed ones as closing. # Otherwise (odd number, e.g. when quoting multiple sentences), don't remove any space. 
matching_quotes = not bool(count_of_form['"'] % 2) @@ -36,22 +37,25 @@ def process_tree(self, root): # Some languages use directed „quotes“ and some “quotes”, # so the symbol “ (U+201C) is ambiguous and we heuristically check for presence of „. if count_of_form['„']: - not_before += '“' + not_before += ['“'] else: - not_after += '“' + not_after += ['“'] for i, node in enumerate(nodes[:-1]): next_form = nodes[i + 1].form if node.form in self.not_after or next_form in not_before: self.mark_no_space(node) - if matching_quotes and node.form == '"': - if odd_indexed_quote: + if node.form == '"': + if matching_quotes: + if odd_indexed_quote: + self.mark_no_space(node) + elif i: + self.mark_no_space(nodes[i - 1]) + odd_indexed_quote = not odd_indexed_quote + elif i==0: self.mark_no_space(node) - elif i: - self.mark_no_space(nodes[i - 1]) - odd_indexed_quote = not odd_indexed_quote - if matching_quotes and nodes[-1].form == '"': + if nodes[-1].form == '"': self.mark_no_space(nodes[-2]) if self.fix_text and self.changed: diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index c5321221..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,10 @@ class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" def process_tree(self, root): + # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid. 
"""
Block SetTranslation for setting of sentence-level translation (the attribute
text_en for English translation) from a separate text file (one sentence per
line). For example, one can export the original sentences using write.SentencesHtml,
then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain
text editor, save them as translations.txt and import them using this block.

Usage:
udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu

Author: Dan Zeman
"""
from udapi.core.block import Block
import re
import logging

class SetTranslation(Block):
    """
    Set text_en to the next available translation.
    """

    def __init__(self, file, overwrite=False, **kwargs):
        """
        Create the SetTranslation block.

        Parameters:
        file: the name of the text file with the translations (one sentence per line)
        overwrite=1: set the translation even if the sentence already has one
            (default: do not overwrite existing translations)
        """
        super().__init__(**kwargs)
        self.file = file
        # Close the file as soon as the lines are read (the original code left
        # the handle open). Strip the line terminators: a trailing '\n' would
        # otherwise become part of the comment text and corrupt the CoNLL-U
        # output when the comments are re-joined with '\n'.
        with open(self.file, 'r', encoding='utf-8') as fh:
            self.trlines = [line.rstrip('\n') for line in fh]
        self.nlines = len(self.trlines)
        self.iline = 0
        self.overwrite = overwrite

    def process_tree(self, tree):
        """Attach the next translation line as a 'text_en' sentence comment."""
        if self.iline < self.nlines:
            translation = self.trlines[self.iline]
            self.iline += 1
            comments = []
            if tree.comment:
                comments = tree.comment.split('\n')
            # Find an existing text_en comment, if any.
            i_tr = -1
            for i, comment in enumerate(comments):
                # The initial '#' character has been stripped.
                if re.match(r'\s*text_en\s*=', comment):
                    i_tr = i
                    break
            if i_tr >= 0:
                if self.overwrite:
                    comments[i_tr] = ' text_en = ' + translation
            else:
                comments.append(' text_en = ' + translation)
            tree.comment = '\n'.join(comments)
        elif self.iline == self.nlines:
            logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines)
            # Bump the counter past nlines so the warning is issued only once.
            self.iline += 1
"""Block to fix case-enhanced dependency relations in Slovak."""
from udapi.core.block import Block
import re

class FixEdeprels(Block):

    # Secondary prepositions sometimes have the lemma of the original part of
    # speech. We want the grammaticalized form instead. List even those that
    # will have the same lexical form, as we also want to check the morphological
    # case. And include all other prepositions that have unambiguous morphological
    # case, even if they are not secondary.
    # NOTE: insertion order matters — process_node() takes the first key that matches.
    unambiguous = {
        'a_hoci': 'hoci',
        'ako': 'ako',  # remove morphological case
        'ako_na': 'ako',
        'ako_z': 'ako',
        'akoby_z': 'z:gen',
        'akže': 'ak',
        'ani_keby': 'keby',
        'ani_keď': 'keď',
        'až_keď': 'keď',
        'do': 'do:gen',
        'k': 'k:dat',
        'kto': 'kým',  ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator.
        'mimo': 'mimo:gen',
        'na_rozdiel_od': 'na_rozdiel_od:gen',
        'na_základ': 'na_základe:gen',
        'od': 'od:gen',
        'pod_vplyv': 'pod_vplyvom:gen',
        'pomoc': 'pomocou:gen',
        'pre': 'pre:acc',
        'prostredníctvom': 'prostredníctvom:gen',
        'prv_ako': 'ako',
        's': 's:ins',
        's_cieľ': 's_cieľom',  # no case, used with infinitives (advcl)
        's_dôraz_na': 's_dôrazom_na:acc',
        's_ohľad_na': 's_ohľadom_na:acc',
        's_pomoc': 's_pomocou:gen',
        'smer_k': 'smerom_k:dat',
        'spoločne_s': 'spoločne_s:ins',
        'spolu_s': 'spolu_s:ins',
        'v_dôsledok': 'v_dôsledku:gen',
        'v_meno': 'v_mene:gen',
        'v_oblasť': 'v_oblasti:gen',
        'v_porovnanie_s': 'v_porovnaní_s:ins',
        'v_porovnaniu_s': 'v_porovnaní_s:ins',
        'v_priebeh': 'v_priebehu:gen',
        'v_prípad': 'v_prípade:gen',
        'v_prospech': 'v_prospech:gen',
        'v_rámec': 'v_rámci:gen',
        'v_spolupráca_s': 'v_spolupráci_s:ins',
        'v_súlad_s': 'v_súlade_s:ins',
        'v_súvislosť_s': 'v_súvislosti_s:ins',
        'v_ústrety': 'v_ústrety:dat',
        'v_vzťah_k': 'vo_vzťahu_k:dat',
        'v_závislosť_na': 'v_závislosti_na:loc',
        'vzhľad_na': 'vzhľadom_na:acc',
        'z': 'z:gen',
        'z_hľadisko': 'z_hľadiska:gen',
        'začiatkom': 'začiatkom:gen'
    }

    def process_node(self, node):
        """
        Occasionally the edeprels automatically derived from the Slovak basic
        trees do not match the whitelist. For example, the noun is an
        abbreviation and its morphological case is unknown.
        """
        for edep in node.deps:
            # Only case-bearing relation types are of interest.
            if not re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']):
                continue
            solved = False
            # All secondary prepositions have only one fixed morphological case
            # they appear with, so we can replace whatever case we encounter
            # with the correct one.
            for prep, replacement in self.unambiguous.items():
                m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+prep+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
                if m:
                    edep['deprel'] = m.group(1)+':'+replacement
                    solved = True
                    break
            # The following prepositions have more than one morphological case
            # available. Thanks to the Case feature on prepositions, we can
            # identify the correct one.
            if not solved:
                m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel'])
                if m:
                    # The following is only partial solution. We will not see
                    # some children because they may be shared children of coordination.
                    prepchildren = [c for c in node.children if c.lemma == m.group(2)]
                    if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
                        edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower()
                        solved = True
            # If we failed to identify the case of the preposition in the
            # preceding steps, pick a default. It applies mostly to 'o'
            # with wrongly split time values.
            if not solved:
                m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel'])
                if m:
                    edep['deprel'] = m.group(1)+':o:acc'
                    solved = True
                m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel'])
                if m:
                    edep['deprel'] = m.group(1)+':'+m.group(2)+':loc'
                    solved = True
            # Some cases do not occur with nominal modifiers without preposition.
            # If we see them, chances are that it is the same-case modifier,
            # and the same case just happens to be the one we see. For vocatives,
            # it is also possible that they have been confused with nominatives.
            if not solved:
                m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel'])
                if m:
                    edep['deprel'] = m.group(1)
                    solved = True
            # Annotation and conversion errors.
            if not solved:
                # Povedal som jej „na zdorovie“.
                if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie':
                    self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp')
                    solved = True

    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
        '''
        Modifies the incoming relation of a node both in the basic tree and in
        the enhanced graph. If the node does not yet depend in the enhanced
        graph on the current basic parent, the new relation will be added without
        removing any old one. If the node already depends multiple times on the
        current basic parent in the enhanced graph, all such enhanced relations
        will be removed before adding the new one.
        '''
        old_parent = node.parent
        node.parent = parent
        node.deprel = deprel
        node.deps = [x for x in node.deps if x['parent'] != old_parent]
        new_edep = {}
        new_edep['parent'] = parent
        new_edep['deprel'] = edeprel
        node.deps.append(new_edep)
+ """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. + It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. 
+ if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node 
+ last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 094f181a..44575e0c 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -23,7 +23,7 @@ class SplitUnderscoreTokens(Block): Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4. """ - def __init__(self, deprel=None, default_deprel='flat', **kwargs): + def __init__(self, deprel=None, default_deprel='flat', lemma='split', **kwargs): """Create the SplitUnderscoreTokens block instance. Args: @@ -31,14 +31,21 @@ def __init__(self, deprel=None, default_deprel='flat', **kwargs): Most common values are: flat, fixed, compound. Default=None. default_deprel: Which deprel to use for the newly created nodes if the heuristics in `deprel_for()` method fail. Default=flat. + lemma: What to do with the lemmas? + - 'split' (the default) means to split them on underscores as well + (and warn in case of a different number of underscores than in the form). 
+ - 'form' means to copy the forms to the lemmas """ super().__init__(**kwargs) self.deprel = deprel self.default_deprel = default_deprel + self.lemma = lemma def process_node(self, node): if node.form != '_' and '_' in node.form: forms = node.form.split('_') + if self.lemma == 'form': + node.lemma = node.form lemmas = node.lemma.split('_') if len(forms) != len(lemmas): logging.warning("Different number of underscores in %s and %s, skipping.", diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py new file mode 100644 index 00000000..952644f8 --- /dev/null +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -0,0 +1,46 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Uyghur.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + # Sometimes there is a double error: it should not be auxiliary, it is + # attached as aux but it is not tagged AUX. So we only look at the deprel. + if node.udeprel == 'aux': + # بەر/بار = give (used with actions done for the benefit of somebody) + # چىق = go out + # چىقىش = come out + # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # باق = do ever? + # ئۆت = pass + # كۆرۈش = see + # باشلى = start + # يەت = be enough + # قايت = return + # چۈش = fall down + # قىل = do + # چاپ = jump + # قورق = fear + # كەلتۈر = cause + # كىر = enter + # _ ... 
some putative auxiliaries do not even have a lemma + if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py new file mode 100644 index 00000000..87279dc1 --- /dev/null +++ b/udapi/block/ud/yue/lemmatize.py @@ -0,0 +1,43 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + '𡃁仔': '笭仔', + '仲': '重', + '企': '徛', + '係咪': '係', + '出嚟': '出唻', + '可': '可以', + '啦': '喇', + '㗎喇': '㗎嘑', + '喇': '嘑', + '嚟': '唻', + '就嚟': '就唻', + '死𡃁妹': '死笭妹', + '老豆': '老頭', + '蚊': '緡', + '蛋撻': '蛋澾', + '返嚟': '返唻', + '過嚟人': '過唻人', + '過嚟': '過唻' + } + + def process_node(self, node): + """ + Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + + For Cantonese, lemmatization includes normalization of some characters. + These are the few cases where lemma differs from the surface form. 
+ """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py new file mode 100644 index 00000000..abacf29f --- /dev/null +++ b/udapi/block/ud/zh/lemmatize.py @@ -0,0 +1,81 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def __init__(self, rewrite='empty', **kwargs): + """ + Create the ud.zh.Lemmatize block instance. + + Args: + rewrite=empty: set the lemma if it was empty so far; do not touch the rest + rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest + rewrite=all: set the lemma regardless of what it was previously + """ + super().__init__(**kwargs) + if not re.match(r'^(empty|form|all)$', rewrite): + raise ValueError("Unexpected value of parameter 'rewrite'") + self.rewrite = rewrite + + # dictionary: form --> lemma + lemma = { + # The plural suffix -men. + '我們': '我', # trad + '我们': '我', # simp + '他們': '他', # trad + '他们': '他', # simp + '它們': '它', # trad + '它们': '它', # simp + '牠們': '牠', # trad + '她們': '她', # trad + '她们': '她', # simp + '人們': '人', # trad + '人们': '人' # simp + } + + def process_node(self, node): + """ + Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + """ + if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + # Lemmatize negated verbs to their affirmative forms. 
"""Block to add missing lemmas in cases where it seems obvious what the lemma should be."""
from udapi.core.block import Block
import logging
import re

class Lemmatize(Block):

    def __init__(self, rewrite='empty', **kwargs):
        """
        Create the ud.zh.Lemmatize block instance.

        Args:
        rewrite=empty: set the lemma if it was empty so far; do not touch the rest
        rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest
        rewrite=all: set the lemma regardless of what it was previously
        """
        super().__init__(**kwargs)
        if not re.match(r'^(empty|form|all)$', rewrite):
            raise ValueError("Unexpected value of parameter 'rewrite'")
        self.rewrite = rewrite

    # dictionary: form --> lemma
    lemma = {
        # The plural suffix -men.
        '我們': '我',  # trad
        '我们': '我',  # simp
        '他們': '他',  # trad
        '他们': '他',  # simp
        '它們': '它',  # trad
        '它们': '它',  # simp
        '牠們': '牠',  # trad
        '她們': '她',  # trad
        '她们': '她',  # simp
        '人們': '人',  # trad
        '人们': '人'   # simp
    }

    def process_node(self, node):
        """
        Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization
        of Sino-Tibetan languages is pretty straightforward most of the time,
        as the lemma typically equals to the actual word form.
        """
        # A lemma counts as missing when it is empty, or when it is '_' while
        # the form is not '_' and the token is not marked as a typo (this is
        # the grouping the original operator precedence produced).
        missing = node.lemma == '' or (node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes')
        if self.rewrite == 'empty' and not missing:
            return
        if self.rewrite == 'form' and not (node.lemma == node.form or missing):
            return
        # Lemmatize negated verbs to their affirmative forms.
        # 不是 bùshì = not be
        # 沒有 没有 méiyǒu = not exist
        # 沒能 没能 méinéng = cannot
        # 未能 wèinéng = cannot
        # Lemmatize question verbs to their base forms.
        # 要不要 yàobùyào = do (you) want?
        # 有没有 yǒuméiyǒu = do (you) have?
        # Verbs that are derived from the copula and tagged as the copula need
        # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi).
        # 亦為 亦为 yìwèi = also
        # 則為 则为 zéwèi = then
        # 更為 更为 gèngwèi = more
        # 認為 认为 rènwéi = think, believe
        # 以為 以为 yǐwéi = think, believe
        # 以爲 以为 yǐwéi = think, believe
        if re.match(r'^(AUX|VERB)$', node.upos):
            neg_m = re.match(r'^([不没沒未])(.+)$', node.form)
            question_m = re.match(r'^(.+)([不没沒未])\1$', node.form)
            copula_m = re.search(r'([是爲為为])', node.form)
            if neg_m:
                node.lemma = neg_m.group(2)
                node.feats['Polarity'] = 'Neg'
            elif question_m:
                node.lemma = question_m.group(1)
                node.feats['Mood'] = 'Int'
            elif copula_m:
                node.lemma = copula_m.group(1)
                # Normalize the variant copula character 爲 to 為.
                if node.lemma == '爲':
                    node.lemma = '為'
            elif node.form in self.lemma:
                node.lemma = self.lemma[node.form]
            else:
                node.lemma = node.form
'models/udpipe/2.0/catalan-ud-2.0-conll17-170315.udpipe', - 'zh': 'models/udpipe/2.0/chinese-ud-2.0-conll17-170315.udpipe', - 'hr': 'models/udpipe/2.0/croatian-ud-2.0-conll17-170315.udpipe', - 'cs_cac': 'models/udpipe/2.0/czech-cac-ud-2.0-conll17-170315.udpipe', - 'cs_cltt': 'models/udpipe/2.0/czech-cltt-ud-2.0-conll17-170315.udpipe', - 'cs': 'models/udpipe/2.0/czech-ud-2.0-conll17-170315.udpipe', - 'da': 'models/udpipe/2.0/danish-ud-2.0-conll17-170315.udpipe', - 'nl_lassysmall': 'models/udpipe/2.0/dutch-lassysmall-ud-2.0-conll17-170315.udpipe', - 'nl': 'models/udpipe/2.0/dutch-ud-2.0-conll17-170315.udpipe', - 'en_lines': 'models/udpipe/2.0/english-lines-ud-2.0-conll17-170315.udpipe', - 'en_partut': 'models/udpipe/2.0/english-partut-ud-2.0-conll17-170315.udpipe', - 'en': 'models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe', - 'et': 'models/udpipe/2.0/estonian-ud-2.0-conll17-170315.udpipe', - 'fi_ftb': 'models/udpipe/2.0/finnish-ftb-ud-2.0-conll17-170315.udpipe', - 'fi': 'models/udpipe/2.0/finnish-ud-2.0-conll17-170315.udpipe', - 'fr_partut': 'models/udpipe/2.0/french-partut-ud-2.0-conll17-170315.udpipe', - 'fr_sequoia': 'models/udpipe/2.0/french-sequoia-ud-2.0-conll17-170315.udpipe', - 'fr': 'models/udpipe/2.0/french-ud-2.0-conll17-170315.udpipe', - 'gl_treegal': 'models/udpipe/2.0/galician-treegal-ud-2.0-conll17-170315.udpipe', - 'gl': 'models/udpipe/2.0/galician-ud-2.0-conll17-170315.udpipe', - 'de': 'models/udpipe/2.0/german-ud-2.0-conll17-170315.udpipe', - 'got': 'models/udpipe/2.0/gothic-ud-2.0-conll17-170315.udpipe', - 'el': 'models/udpipe/2.0/greek-ud-2.0-conll17-170315.udpipe', - 'he': 'models/udpipe/2.0/hebrew-ud-2.0-conll17-170315.udpipe', - 'hi': 'models/udpipe/2.0/hindi-ud-2.0-conll17-170315.udpipe', - 'hu': 'models/udpipe/2.0/hungarian-ud-2.0-conll17-170315.udpipe', - 'id': 'models/udpipe/2.0/indonesian-ud-2.0-conll17-170315.udpipe', - 'ga': 'models/udpipe/2.0/irish-ud-2.0-conll17-170315.udpipe', - 'it_partut': 
'models/udpipe/2.0/italian-partut-ud-2.0-conll17-170315.udpipe', - 'it': 'models/udpipe/2.0/italian-ud-2.0-conll17-170315.udpipe', - 'ja': 'models/udpipe/2.0/japanese-ud-2.0-conll17-170315.udpipe', - 'kk': 'models/udpipe/2.0/kazakh-ud-2.0-conll17-170315.udpipe', - 'ko': 'models/udpipe/2.0/korean-ud-2.0-conll17-170315.udpipe', - 'la_ittb': 'models/udpipe/2.0/latin-ittb-ud-2.0-conll17-170315.udpipe', - 'la_proiel': 'models/udpipe/2.0/latin-proiel-ud-2.0-conll17-170315.udpipe', - 'la': 'models/udpipe/2.0/latin-ud-2.0-conll17-170315.udpipe', - 'lv': 'models/udpipe/2.0/latvian-ud-2.0-conll17-170315.udpipe', - 'no_bokmaal': 'models/udpipe/2.0/norwegian-bokmaal-ud-2.0-conll17-170315.udpipe', - 'no_nynorsk': 'models/udpipe/2.0/norwegian-nynorsk-ud-2.0-conll17-170315.udpipe', - 'cu': 'models/udpipe/2.0/old_church_slavonic-ud-2.0-conll17-170315.udpipe', - 'fa': 'models/udpipe/2.0/persian-ud-2.0-conll17-170315.udpipe', - 'pl': 'models/udpipe/2.0/polish-ud-2.0-conll17-170315.udpipe', - 'pt_br': 'models/udpipe/2.0/portuguese-br-ud-2.0-conll17-170315.udpipe', - 'pt': 'models/udpipe/2.0/portuguese-ud-2.0-conll17-170315.udpipe', - 'ro': 'models/udpipe/2.0/romanian-ud-2.0-conll17-170315.udpipe', - 'ru_syntagrus': 'models/udpipe/2.0/russian-syntagrus-ud-2.0-conll17-170315.udpipe', - 'ru': 'models/udpipe/2.0/russian-ud-2.0-conll17-170315.udpipe', - 'sk': 'models/udpipe/2.0/slovak-ud-2.0-conll17-170315.udpipe', - 'sl_sst': 'models/udpipe/2.0/slovenian-sst-ud-2.0-conll17-170315.udpipe', - 'sl': 'models/udpipe/2.0/slovenian-ud-2.0-conll17-170315.udpipe', - 'es_ancora': 'models/udpipe/2.0/spanish-ancora-ud-2.0-conll17-170315.udpipe', - 'es': 'models/udpipe/2.0/spanish-ud-2.0-conll17-170315.udpipe', - 'sv_lines': 'models/udpipe/2.0/swedish-lines-ud-2.0-conll17-170315.udpipe', - 'sv': 'models/udpipe/2.0/swedish-ud-2.0-conll17-170315.udpipe', - 'tr': 'models/udpipe/2.0/turkish-ud-2.0-conll17-170315.udpipe', - 'uk': 'models/udpipe/2.0/ukrainian-ud-2.0-conll17-170315.udpipe', - 'ur': 
'models/udpipe/2.0/urdu-ud-2.0-conll17-170315.udpipe', - 'ug': 'models/udpipe/2.0/uyghur-ud-2.0-conll17-170315.udpipe', - 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', + 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'grc': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_perseus': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_proiel': 'models/udpipe/2.4/ancient_greek-proiel-ud-2.4-190531.udpipe', + 'ar': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe', + 'ar_padt': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe', + 'hy': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe', + 'hy_armtdp': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe', + 'eu': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe', + 'eu_bdt': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe', + 'be': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe', + 'be_hse': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe', + 'bg': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe', + 'bg_btb': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe', + 'ca': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe', + 'ca_ancora': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe', + 'zh': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe', + 'zh_gsd': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe', + 'lzh': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe', + 'lzh_kyoto': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe', + 'cop': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe', + 'cop_scriptotium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe', + 'hr': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe', + 'hr_set': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe', + 'cs': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe', + 
'cs_pdt': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe', + 'cs_cac': 'models/udpipe/2.4/czech-cac-ud-2.4-190531.udpipe', + 'cs_cltt': 'models/udpipe/2.4/czech-cltt-ud-2.4-190531.udpipe', + 'cs_fictree': 'models/udpipe/2.4/czech-fictree-ud-2.4-190531.udpipe', + 'da': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe', + 'da_ddt': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe', + 'nl': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe', + 'nl_alpino': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe', + 'nl_lassysmall': 'models/udpipe/2.4/dutch-lassysmall-ud-2.4-190531.udpipe', + 'en': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe', + 'en_ewt': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe', + 'en_gum': 'models/udpipe/2.4/english-gum-ud-2.4-190531.udpipe', + 'en_lines': 'models/udpipe/2.4/english-lines-ud-2.4-190531.udpipe', + 'en_partut': 'models/udpipe/2.4/english-partut-ud-2.4-190531.udpipe', + 'et_edt': 'models/udpipe/2.4/estonian-edt-ud-2.4-190531.udpipe', + 'et_ewt': 'models/udpipe/2.4/estonian-ewt-ud-2.4-190531.udpipe', + 'fi': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe', + 'fi_tdt': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe', + 'fi_ftb': 'models/udpipe/2.4/finnish-ftb-ud-2.4-190531.udpipe', + 'fr_gsd': 'models/udpipe/2.4/french-gsd-ud-2.4-190531.udpipe', + 'fr_partut': 'models/udpipe/2.4/french-partut-ud-2.4-190531.udpipe', + 'fr_sequoia': 'models/udpipe/2.4/french-sequoia-ud-2.4-190531.udpipe', + 'fr_spoken': 'models/udpipe/2.4/french-spoken-ud-2.4-190531.udpipe', + 'gl_ctg': 'models/udpipe/2.4/galician-ctg-ud-2.4-190531.udpipe', + 'gl_treegal': 'models/udpipe/2.4/galician-treegal-ud-2.4-190531.udpipe', + 'de': 'models/udpipe/2.4/german-gsd-ud-2.4-190531.udpipe', + 'got': 'models/udpipe/2.4/gothic-proiel-ud-2.4-190531.udpipe', + 'el': 'models/udpipe/2.4/greek-gdt-ud-2.4-190531.udpipe', + 'he': 'models/udpipe/2.4/hebrew-htb-ud-2.4-190531.udpipe', + 'hi': 'models/udpipe/2.4/hindi-hdtb-ud-2.4-190531.udpipe', + 
'hu': 'models/udpipe/2.4/hungarian-szeged-ud-2.4-190531.udpipe', + 'id': 'models/udpipe/2.4/indonesian-gsd-ud-2.4-190531.udpipe', + 'ga': 'models/udpipe/2.4/irish-idt-ud-2.4-190531.udpipe', + 'it_isdt': 'models/udpipe/2.4/italian-isdt-ud-2.4-190531.udpipe', + 'it_partut': 'models/udpipe/2.4/italian-partut-ud-2.4-190531.udpipe', + 'it_postwita': 'models/udpipe/2.4/italian-postwita-ud-2.4-190531.udpipe', + 'it_vit': 'models/udpipe/2.4/italian-vit-ud-2.4-190531.udpipe', + 'ja': 'models/udpipe/2.4/japanese-gsd-ud-2.4-190531.udpipe', + 'ko_gsd': 'models/udpipe/2.4/korean-gsd-ud-2.4-190531.udpipe', + 'ko_kaist': 'models/udpipe/2.4/korean-kaist-ud-2.4-190531.udpipe', + 'la_ittb': 'models/udpipe/2.4/latin-ittb-ud-2.4-190531.udpipe', + 'la_perseus': 'models/udpipe/2.4/latin-perseus-ud-2.4-190531.udpipe', + 'la_proiel': 'models/udpipe/2.4/latin-proiel-ud-2.4-190531.udpipe', + 'lv': 'models/udpipe/2.4/latvian-lvtb-ud-2.4-190531.udpipe', + 'lt_alksnis': 'models/udpipe/2.4/lithuanian-alksnis-ud-2.4-190531.udpipe', + 'lt_hse': 'models/udpipe/2.4/lithuanian-hse-ud-2.4-190531.udpipe', + 'mt': 'models/udpipe/2.4/maltese-mudt-ud-2.4-190531.udpipe', + 'mr': 'models/udpipe/2.4/marathi-ufal-ud-2.4-190531.udpipe', + 'sme': 'models/udpipe/2.4/north_sami-giella-ud-2.4-190531.udpipe', + 'no_bokmaal': 'models/udpipe/2.4/norwegian-bokmaal-ud-2.4-190531.udpipe', + 'no_nynorsklia': 'models/udpipe/2.4/norwegian-nynorsklia-ud-2.4-190531.udpipe', + 'no_nynorsk': 'models/udpipe/2.4/norwegian-nynorsk-ud-2.4-190531.udpipe', + 'cu': 'models/udpipe/2.4/old_church_slavonic-proiel-ud-2.4-190531.udpipe', + 'fro': 'models/udpipe/2.4/old_french-srcmf-ud-2.4-190531.udpipe', + 'orv': 'models/udpipe/2.4/old_russian-torot-ud-2.4-190531.udpipe', + 'fa': 'models/udpipe/2.4/persian-seraji-ud-2.4-190531.udpipe', + 'pl_lfg': 'models/udpipe/2.4/polish-lfg-ud-2.4-190531.udpipe', + 'pl_pdb': 'models/udpipe/2.4/polish-pdb-ud-2.4-190531.udpipe', + 'pt_bosque': 'models/udpipe/2.4/portuguese-bosque-ud-2.4-190531.udpipe', 
+ 'pt_gsd': 'models/udpipe/2.4/portuguese-gsd-ud-2.4-190531.udpipe', + 'ro_nonstandard': 'models/udpipe/2.4/romanian-nonstandard-ud-2.4-190531.udpipe', + 'ro_rrt': 'models/udpipe/2.4/romanian-rrt-ud-2.4-190531.udpipe', + 'ru_gsd': 'models/udpipe/2.4/russian-gsd-ud-2.4-190531.udpipe', + 'ru_syntagrus': 'models/udpipe/2.4/russian-syntagrus-ud-2.4-190531.udpipe', + 'ru_taiga': 'models/udpipe/2.4/russian-taiga-ud-2.4-190531.udpipe', + 'sr': 'models/udpipe/2.4/serbian-set-ud-2.4-190531.udpipe', + 'sk': 'models/udpipe/2.4/slovak-snk-ud-2.4-190531.udpipe', + 'sl_ssj': 'models/udpipe/2.4/slovenian-ssj-ud-2.4-190531.udpipe', + 'sl_sst': 'models/udpipe/2.4/slovenian-sst-ud-2.4-190531.udpipe', + 'es_ancora': 'models/udpipe/2.4/spanish-ancora-ud-2.4-190531.udpipe', + 'es_gsd': 'models/udpipe/2.4/spanish-gsd-ud-2.4-190531.udpipe', + 'sv_lines': 'models/udpipe/2.4/swedish-lines-ud-2.4-190531.udpipe', + 'sv_talbanken': 'models/udpipe/2.4/swedish-talbanken-ud-2.4-190531.udpipe', + 'ta': 'models/udpipe/2.4/tamil-ttb-ud-2.4-190531.udpipe', + 'te': 'models/udpipe/2.4/telugu-mtg-ud-2.4-190531.udpipe', + 'tr': 'models/udpipe/2.4/turkish-imst-ud-2.4-190531.udpipe', + 'uk': 'models/udpipe/2.4/ukrainian-iu-ud-2.4-190531.udpipe', + 'ur': 'models/udpipe/2.4/urdu-udtb-ud-2.4-190531.udpipe', + 'ug': 'models/udpipe/2.4/uyghur-udt-ud-2.4-190531.udpipe', + 'vi': 'models/udpipe/2.4/vietnamese-vtb-ud-2.4-190531.udpipe', + 'wo': 'models/udpipe/2.4/wolof-wtb-ud-2.4-190531.udpipe', } @@ -74,13 +125,14 @@ class Base(Block): """Base class for all UDPipe blocks.""" # pylint: disable=too-many-arguments - def __init__(self, model=None, model_alias=None, - tokenize=True, tag=True, parse=True, **kwargs): - """Create the udpipe.En block object.""" + def __init__(self, model=None, model_alias=None, online=False, + tokenize=True, tag=True, parse=True, resegment=False, + ranges=False, delete_nodes=False, **kwargs): super().__init__(**kwargs) - self.model, self.model_alias = model, model_alias + self.model, 
self.model_alias, self.online = model, model_alias, online self._tool = None - self.tokenize, self.tag, self.parse = tokenize, tag, parse + self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment + self.ranges, self.delete_nodes = ranges, delete_nodes @property def tool(self): @@ -90,24 +142,59 @@ def tool(self): if not self.model: if not self.model_alias: raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!') - self.model = KNOWN_MODELS[self.model_alias] - self._tool = UDPipe(model=self.model) + if self.online: + self.model = self.model_alias + else: + self.model = KNOWN_MODELS[self.model_alias] + if self.online: + self._tool = UDPipeOnline(model=self.model) + else: + if not UDPIPE_AVAILABLE: + raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1") + self._tool = UDPipe(model=self.model) return self._tool - def process_tree(self, root): - tok, tag, par = self.tokenize, self.tag, self.parse - if tok and tag and par: - return self.tool.tokenize_tag_parse_tree(root) - if not tok and tag and par: - return self.tool.tag_parse_tree(root) - # TODO - # return $self->tool->tokenize_tag_parse_tree($root) if $tok && $tag && $par; - # return $self->tool->tokenize_tag_tree($root) if $tok && $tag && !$par; - # return $self->tool->tokenize_tree($root) if $tok && !$tag && !$par; - # return $self->tool->tag_parse_tree($root) if !$tok && $tag && $par; - # return $self->tool->tag_tree($root) if !$tok && $tag && !$par; - # return $self->tool->parse_tree($root) if !$tok && !$tag && $par; - raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + def process_document(self, doc): + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return + old_bundles = doc.bundles + new_bundles = [] + for bundle in old_bundles: + for tree in 
bundle: + new_bundles.append(bundle) + if self._should_process_tree(tree): + if self.delete_nodes: + for subroot in tree.children: + subroot.remove() + if tok: + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, + tag=tag, parse=par, ranges=ranges) + if self.resegment and len(new_trees) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + for i, new_tree in enumerate(new_trees[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") + new_tree.zone = tree.zone + new_bundle.add_tree(new_tree) + new_bundles.append(new_bundle) + elif not tok and not reseg and (tag or par): + self.tool.tag_parse_tree(tree, tag=tag, parse=par) + elif not tok and reseg and not tag and not par: + sentences = self.tool.segment_text(tree.text) + if len(sentences) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + tree.text = sentences[0] + for i, sentence in enumerate(sentences[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") + new_tree = new_bundle.create_tree(zone=tree.zone) + new_tree.text = sentence + new_bundles.append(new_bundle) + else: + raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}") + doc.bundles = new_bundles ''' Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD diff --git a/udapi/block/udpipe/cs.py b/udapi/block/udpipe/cs.py new file mode 100644 index 00000000..743efcb7 --- /dev/null +++ b/udapi/block/udpipe/cs.py @@ -0,0 +1,10 @@ +"""Block udpipe.Cs for tagging and parsing Czech.""" +from udapi.block.udpipe.base import Base + + +class Cs(Base): + """Tag and parse Czech.""" + + def __init__(self, **kwargs): + """Create the udpipe.Cs block object.""" + super().__init__(model_alias='cs', **kwargs) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index b814b80d..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -29,7 +29,8 @@ class Eval(Block): 
# pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - expand_code=True, **kwargs): + coref_mention=None, coref_entity=None, empty_nodes=False, + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -37,10 +38,14 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle self.after_bundle = after_bundle + self.coref_mention = coref_mention + self.coref_entity = coref_entity + self.empty_nodes = empty_nodes self.expand_code = expand_code self.count = collections.Counter() @@ -66,11 +71,21 @@ def process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node or self.mwt: for bundle in doc.bundles: # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) + if self.coref_entity or self.coref_mention: + for entity in doc.coref_entities: + if self.coref_entity: + this = entity + exec(self.expand_eval_code(self.coref_entity)) + if self.coref_mention: + for mention in entity.mentions: + this = mention + exec(self.expand_eval_code(self.coref_mention)) + def process_bundle(self, bundle): # Extract variables, so they can be used in eval code document = doc = bundle.document @@ -82,7 +97,7 @@ def process_bundle(self, bundle): if self.bundle: exec(self.expand_eval_code(self.bundle)) - if self.tree or self.node: + if self.tree or self.node or self.mwt: trees = bundle.trees for tree in trees: if self._should_process_tree(tree): @@ -102,10 +117,16 @@ def process_tree(self, 
tree): exec(self.expand_eval_code(self.tree)) if self.node: - for node in tree.descendants(): + nodes = tree.descendants_and_empty if self.empty_nodes else tree.descendants + for node in nodes: this = node exec(self.expand_eval_code(self.node)) + if self.mwt: + for mwt in tree.multiword_tokens: + this = mwt + exec(self.expand_eval_code(self.mwt)) + def process_start(self): if self.start: exec(self.expand_eval_code(self.start)) diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index 6d4118d6..811973ec 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -30,7 +30,7 @@ class Filter(Block): def __init__(self, # pylint: disable=too-many-arguments delete_tree=None, delete_tree_if_node=None, delete_subtree=None, keep_tree=None, keep_tree_if_node=None, keep_subtree=None, - mark=None, **kwargs): + keep_node=None, mark=None, **kwargs): """Create the Filter block object. Args: @@ -56,6 +56,10 @@ def __init__(self, # pylint: disable=too-many-arguments If no node in the tree was marked (i.e. only the root without any children remained), the whole tree will be deleted. + `keep_node`: Python expression to be evaluated for each node and if False, + the node will be deleted and its children rehanged to its parent. + Multiple nodes can be deleted (or kept) this way. + `mark`: a string or None. This makes sense only with `keep_tree_if_node`, where the matched nodes are marked with `Mark=` in `node.misc`, so they will be highlighted if printed with `write.TextModeTrees`. Default=None. 
@@ -71,6 +75,7 @@ def __init__(self, # pylint: disable=too-many-arguments self.keep_tree = keep_tree self.keep_tree_if_node = keep_tree_if_node self.keep_subtree = keep_subtree + self.keep_node = keep_node self.mark = mark def process_tree(self, tree): # pylint: disable=too-many-branches @@ -118,8 +123,17 @@ def process_tree(self, tree): # pylint: disable=too-many-branches kept_subtrees.append(node) if not kept_subtrees: tree.remove() + return else: for node in kept_subtrees: node.parent = root for orig_subroot in [n for n in root.children if n not in kept_subtrees]: orig_subroot.remove() + + if self.keep_node is not None: + nodes_to_delete = [node for node in tree.descendants if not eval(self.keep_node)] + if nodes_to_delete == tree.descendants: + tree.remove() + return + for node in nodes_to_delete: + node.remove(children='rehang') diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index e05afe76..e1ea838c 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -5,9 +5,12 @@ insert "util.FindBug block=" into the scenario, e.g. to debug ``second.Block``, use -udapy first.Block util.FindBug block=second.Block > bug.conllu + udapy first.Block util.FindBug block=second.Block > bug.conllu This will create the file bug.conllu with the bundle, which caused the bug. + +The second.Block can have any parameters, e.g. + udapy first.Block util.FindBug block=second.Block param1=value1 param2=value2 > bug.conllu """ import copy import logging @@ -20,24 +23,31 @@ class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" - def __init__(self, block, first_error_only=True, **kwargs): - """Args: block, first_error_only""" - super().__init__(**kwargs) + def __init__(self, block, first_error_only=True, + files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, + **kwargs): + """Args: block, first_error_only. 
+ All other parameters (which are not parameters of BaseWriter) + will be passed to the block being inspected. + """ + super().__init__(files, filehandle, docname_as_file, encoding, newline, overwrite) self.block = block self.first_error_only = first_error_only + self._kwargs = kwargs def process_document(self, document): sub_path, class_name = _parse_block_name(self.block) module = "udapi.block." + sub_path + "." + class_name.lower() try: - command = "from " + module + " import " + class_name + " as b" + command = "from " + module + " import " + class_name + " as B" logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "B(**self._kwargs)" logging.debug("Trying to evaluate this: %s", command) new_block = eval(command) # pylint: disable=eval-used diff --git a/udapi/block/util/joinsentence.py b/udapi/block/util/joinsentence.py new file mode 100644 index 00000000..578f3865 --- /dev/null +++ b/udapi/block/util/joinsentence.py @@ -0,0 +1,77 @@ +""" +Block util.JoinSentence will join a given sentence with the preceding one. +""" +import logging +from udapi.core.block import Block + +class JoinSentence(Block): + """ + Joins a sentence with the preceding one. There are two ways how to indicate + the sentences that this block should process. + + Method 1: Parameter sent_id provides the id of the sentence that should be + merged with the preceding one. At most one sentence pair from the input will + be merged, even if there are multiple sentences with the given id. + + Method 2: A MISC attribute can be specified that, if found, will trigger + joining of the current sentence to the previous one. With this approach, + multiple sentence pairs can be merged during one run. 
+ """ + + def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be appended to the previous one + misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + if misc_name: + if sent_id: + logging.fatal('Cannot combine misc_value with sent_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + self.sent_id = sent_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + previous_tree = None + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to join all zones but we do not try to do it at present. + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + # The MISC attribute we are looking for should logically occur + # on the first node of the sentence but we can take it from any node. 
+ join_commands = [n for n in root.descendants if n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if join_commands: + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove from the node the MISC attribute that triggered the sentence joining. + for n in join_commands: + n.misc[self.misc_name] = '' + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + else: + previous_tree = root + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + root = bundle.get_tree() + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + # We have found our sentence. No need to process the rest of the document. + break diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py index c57f7443..bcb4f894 100644 --- a/udapi/block/util/mark.py +++ b/udapi/block/util/mark.py @@ -15,7 +15,7 @@ class Mark(Block): udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R """ - def __init__(self, node, mark=1, add=True, **kwargs): + def __init__(self, node, mark=1, mark_attr="Mark", add=True, print_stats=False, empty=False, **kwargs): """Create the Mark block object. Args: @@ -24,17 +24,36 @@ def __init__(self, node, mark=1, add=True, **kwargs): `mark`: the node will be marked with `Mark=` in `node.misc`. Default=1. + `mark_attr`: use this MISC attribute name instead of "Mark". + `add`: should we keep existing Mark|ToDo|Bug? Default=True.
+ + `print_stats`: print the total number of marked nodes to stdout at process_end + + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.mark = mark + self.mark_attr = mark_attr self.node = node self.add = add + self.print_stats = print_stats + self._marked = 0 + self.empty = empty def process_node(self, node): if eval(self.node): - node.misc['Mark'] = self.mark + node.misc[self.mark_attr] = self.mark + self._marked += 1 elif not self.add: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] + + def process_empty_node(self, empty_node): + if self.empty: + self.process_node(empty_node) + + def process_end(self): + if self.print_stats: + print(f'util.Mark marked {self._marked} nodes') diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 22a7a03e..e102ca9c 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -1,5 +1,7 @@ """util.MarkDiff is a special block for marking differences between parallel trees.""" +import collections import difflib +import pprint from udapi.core.block import Block @@ -7,13 +9,43 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, **kwargs): - """Create the Mark block object.""" + mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False, + align=False, align_attr='Align', **kwargs): + """Create the Mark block object. + Params: + gold_zone: Which of the zones should be treated as gold? + (The changes are interpreted as from a "pred"=predicted zone into the gold zone.) + attributes: Which node attributes should be considered when searching for diffs? + The tree topology, i.e. node parent is always considered. + mark: What value should be used in `node.misc['Mark']` of the differing nodes? + mark_attr: use this MISC attribute name instead of "Mark". 
+ Use mark_attr=0 to prevent marking diffs in MISC. + add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, + so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. + print_stats: How many lines of statistics should be printed? -1 means all. + ignore_parent: ignore differences in dependency parents + align: store word alignment, possible values are False (no alignment stored, the default) + "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord, + "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and + "both", i.e. both from-pred and from-gold. + If only forms should be considered for inducing the word alignment, + you should use "util.MarkDiff attributes='form' ignore_parent=1 align=1". + Only one-to-one alignment is supported. + align_attr: use this MISC attribute name instead of "Align". + """ super().__init__(**kwargs) self.gold_zone = gold_zone self.attrs = attributes.split(',') self.mark = mark + self.mark_attr = mark_attr self.add = add + self.print_stats = print_stats + self.ignore_parent = ignore_parent + self.align = align + self.align_attr = align_attr + self.stats = collections.Counter() + if not mark_attr and not align and not print_stats: + raise ValueError('mark_attr=0 does not make sense without align or print_stats') def process_tree(self, tree): gold_tree = tree.bundle.get_tree(self.gold_zone) @@ -21,17 +53,17 @@ def process_tree(self, tree): return if not self.add: for node in tree.descendants + gold_tree.descendants: - del node.misc['Mark'] + del node.misc[self.mark_attr] del node.misc['ToDo'] del node.misc['Bug'] pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants # Make sure both pred and gold trees are marked, even if one has just deleted nodes. 
- if len(pred_nodes) != len(gold_nodes): - tree.add_comment('Mark = %s' % self.mark) - gold_tree.add_comment('Mark = %s' % self.mark) - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in pred_nodes] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_nodes] + if len(pred_nodes) != len(gold_nodes) and self.mark_attr: + tree.add_comment(f'{self.mark_attr} = {self.mark}') + gold_tree.add_comment(f'{self.mark_attr} = {self.mark}') + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes] matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False) diffs = list(matcher.get_opcodes()) @@ -41,14 +73,45 @@ def process_tree(self, tree): if edit in {'equal', 'replace'}: for i in range(pred_lo, pred_hi): alignment[i] = i - pred_lo + gold_lo + if self.align in ("both", "from-pred"): + pred_nodes[i].misc[self.align_attr] = i - pred_lo + gold_lo + 1 + if self.align in ("both", "from-gold"): + gold_nodes[i - pred_lo + gold_lo].misc[self.align_attr] = i + 1 for diff in diffs: edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: - p_node.misc['Mark'] = self.mark - g_node.misc['Mark'] = self.mark + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['ONLY-PARENT-CHANGED'] += 1 + if self.mark_attr: + p_node.misc[self.mark_attr] = self.mark + g_node.misc[self.mark_attr] = self.mark else: - for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: - node.misc['Mark'] = self.mark + if self.mark_attr: + for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: + node.misc[self.mark_attr] = self.mark + if self.print_stats: + if edit == 'replace': + # first n nodes are treated as aligned, the rest 
is treated as ADDED/DELETED + n = min(pred_hi - pred_lo, gold_hi - gold_lo) + for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]): + for attr in self.attrs: + p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) + if p_value != g_value: + self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['PARENT-CHANGED'] += 1 + pred_lo, gold_lo = pred_lo + n, gold_lo + n + for node in gold_nodes[gold_lo:gold_hi]: + self.stats['ADD-WORD'] += 1 + self.stats['ADD-LEMMA: ' + node.lemma] += 1 + for node in pred_nodes[pred_lo:pred_hi]: + self.stats['DELETE-WORD'] += 1 + self.stats['DELETE-LEMMA: ' + node.lemma] += 1 + + def process_end(self): + if self.print_stats: + how_many = None if self.print_stats in (-1, '-1') else self.print_stats + for edit, count in self.stats.most_common(how_many): + print(f'{count:4} {edit}') diff --git a/udapi/block/util/markmwtbugsatnodes.py b/udapi/block/util/markmwtbugsatnodes.py new file mode 100644 index 00000000..ebc2ef4e --- /dev/null +++ b/udapi/block/util/markmwtbugsatnodes.py @@ -0,0 +1,25 @@ +"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes. + Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.""" + +from udapi.core.block import Block + +class MarkMwtBugsAtNodes(Block): + """ + If a node belongs to a multiword token and the MWT has Bug in MISC, copy + the Bug to the node so that filtering trees with bugs works. + The same bug note will be copied to all nodes in the MWT. + """ + + ###!!! Do we want to do the same thing also with ToDo attributes? 
+ def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def process_node(self, node): + if node.multiword_token: + if node.multiword_token.misc['Bug']: + self.bug(node, node.multiword_token.misc['Bug']) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..4cce4ab8 --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,97 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block +from pathlib import Path + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", + sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
+ `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. + `start_sent_id`: the first sent_id number + `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. + `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. + `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. + `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + self.sent_id = sent_id + self.empty_node_ord = empty_node_ord + self.next_sent_id = start_sent_id + self.sent_id_prefix = sent_id_prefix + self.sent_id_from_filename = sent_id_from_filename + self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc + self.newdoc_from_filename = newdoc_from_filename + if sent_id_reset_at_newdoc and not sent_id_from_filename: + raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") + if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: + self.sent_id = True + + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + is_newdoc = any(tree.newdoc for tree in bundle.trees) + if self.newdoc_from_filename and is_newdoc: + tree = next(tree for tree in bundle.trees if tree.newdoc) + tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem + if self.sent_id: + if self.sent_id_reset_at_newdoc and is_newdoc: + self.next_sent_id = 1 + prefix = self.sent_id_prefix + if self.sent_id_from_filename: + prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix + bundle.bundle_id = prefix + str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + if self.empty_node_ord: + node_ord, empty_ord = 0, 0 + for node in tree.descendants_and_empty: + if node.is_empty(): + empty_ord += 1 + old_empty_ord, new_empty_ord = str(node.ord), f"{node_ord}.{empty_ord}" + if old_empty_ord != new_empty_ord: + # Make sure all nodes in this sentence have deserialized enhanced deps. + for n in tree.descendants_and_empty: + n.deps + node.ord = new_empty_ord + else: + empty_ord = 0 + node_ord = node.ord + for node in tree.descendants: + self.process_node(node) + + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py index 39ebe6e9..383510b6 100644 --- a/udapi/block/util/resegmentgold.py +++ b/udapi/block/util/resegmentgold.py @@ -1,9 +1,11 @@ """util.ResegmentGold is a block for sentence alignment and re-segmentation of two zones.""" import logging +import unicodedata from udapi.core.block import Block from udapi.core.mwt import MWT from udapi.core.root import Root +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} class ResegmentGold(Block): """Sentence-align two zones (gold and pred) and resegment the pred zone. 
@@ -18,28 +20,41 @@ def __init__(self, gold_zone='gold', **kwargs): super().__init__(**kwargs) self.gold_zone = gold_zone + @staticmethod + def _strip_spaces(string): + return ''.join(filter(lambda c: unicodedata.category(c) != "Zs", string)) + def process_document(self, document): if not document.bundles: return pred_trees = self.extract_pred_trees(document) + was_subroot = set() + for pred_tree in pred_trees: + for n in pred_tree.children: + was_subroot.add(n) for bundle_no, bundle in enumerate(document.bundles): g_tree = bundle.trees[0] p_tree = pred_trees.pop() - g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '') - p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '') + g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants)) + p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants)) + g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars)) + p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars)) if g_chars == p_chars: bundle.add_tree(p_tree) continue # Make sure that p_tree contains enough nodes. 
+ moved_roots = [] while len(p_chars) < len(g_chars): if not pred_trees: raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars)) new_p_tree = pred_trees.pop() - p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '') + p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants)) + moved_roots.extend(new_p_tree.children) p_tree.steal_nodes(new_p_tree.descendants) - self.choose_root(p_tree, g_tree) + self.choose_root(p_tree, was_subroot, g_tree) + if not p_chars.startswith(g_chars): raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s' % (g_tree.sent_id, p_chars, g_chars)) @@ -51,7 +66,7 @@ def process_document(self, document): p_chars = '' tokens = p_tree.token_descendants for index, token in enumerate(tokens): - p_chars += token.form.replace(' ', '') + p_chars += self._strip_spaces(token.form) if len(p_chars) > len(g_chars): logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id) # E.g. 
gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word @@ -61,7 +76,8 @@ def process_document(self, document): if index + 1 == len(tokens): next_p_tree = Root(zone=p_tree.zone) pred_trees.append(next_p_tree) - next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):]) + next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):], + misc='Rehanged=Yes') bundle.add_tree(p_tree) break else: @@ -76,9 +92,18 @@ def process_document(self, document): words.extend(token.words) else: words.append(token) + for word in words: + if word in was_subroot: + del word.misc['Rehanged'] + if word.parent is not p_tree and word.parent not in words: + if word.udeprel in FUNCTIONAL: + word.parent.misc['FuncChildMissing'] = 'Yes' + for child in word.children: + if child not in words and child.udeprel in FUNCTIONAL: + word.misc['FuncChildMissing'] = 'Yes' next_p_tree.steal_nodes(words) - self.choose_root(p_tree, g_tree) - self.choose_root(next_p_tree, document.bundles[bundle_no + 1].trees[0]) + self.choose_root(p_tree, was_subroot, g_tree) + self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0]) pred_trees.append(next_p_tree) bundle.add_tree(p_tree) break @@ -103,12 +128,19 @@ def extract_pred_trees(self, document): return pred_trees @staticmethod - def choose_root(p_tree, g_tree): - """Prevent multiple roots, which are forbidden in the evaluation script.""" - p_subroots = p_tree.children - if len(p_subroots) > 1: - g_subroot_form = g_tree.children[0] - p_subroot = next((n for n in p_subroots if n.form == g_subroot_form), p_subroots[0]) - for false_subroot in (n for n in p_subroots if n != p_subroot): - false_subroot.parent = p_subroot - false_subroot.deprel = 'wrong-' + false_subroot.deprel + def choose_root(p_tree, was_subroot, g_tree): + """Prevent multiple roots, which are forbidden in CoNLL-U.""" + possible_subroots = [n for n in p_tree.children if n in was_subroot] + if possible_subroots: + the_subroot = possible_subroots[0] 
+ g_subroot = g_tree.children[0] + possible_subroots = sorted([n for n in possible_subroots if n.form == g_subroot.form], + key=lambda n: abs(n.ord - g_subroot.ord)) + the_subroot = possible_subroots[0] if possible_subroots else the_subroot + else: + the_subroot = p_tree.children[0] + the_subroot.misc['Rehanged'] = 'Yes' + for subroot in p_tree.children: + if subroot is not the_subroot: + subroot.parent = the_subroot + subroot.misc['Rehanged'] = 'Yes' diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index aa7131b7..9a895b88 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -51,7 +51,7 @@ class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" - def __init__(self, node, n=5, stats=STATS, **kwargs): + def __init__(self, node, n=5, stats=STATS, empty=False, **kwargs): """Args: `node`: Python expression to be evaluated for each node and if True, the node will be considered "matching". @@ -62,6 +62,7 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): `children` = number of children nodes, `p_lemma` = lemma of a parent node, etc). See `udapi.core.Node.get_attrs` for a full list of statistics. 
+ `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.node = node @@ -73,11 +74,13 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): self.match[stat] = Counter() self.every[stat] = Counter() self.overall = Counter() + self.empty = empty def process_tree(self, root): self.overall['trees'] += 1 tree_match = False - for node in root.descendants: + nodes = root.descendants_and_empty if self.empty else root.descendants + for node in nodes: matching = self.process_node(node) self.overall['nodes'] += 1 if matching: diff --git a/udapi/block/util/splitsentence.py b/udapi/block/util/splitsentence.py new file mode 100644 index 00000000..b6ca57d8 --- /dev/null +++ b/udapi/block/util/splitsentence.py @@ -0,0 +1,134 @@ +""" +Block util.SplitSentence will split a given sentence at a given token. +""" +import logging +from udapi.core.block import Block +from udapi.core.root import Root + +class SplitSentence(Block): + """ + If the sent_id of the current sentence matches the parameter, splits the + sentence into two. The first token of the second sentence is also given as + a parameter. + + Alternatively, a MISC attribute can be specified that triggers sentence + splitting at the given token. With this approach, multiple sentence splits + can be performed during one run. + """ + + def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be split (new ids will have A and B appended) + word_id: which word should be the first word of the second sentence (tokens and words will be renumbered) + misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split + MISC attributes that have triggered sentence split will be removed from their node. 
+ """ + super().__init__(**kwargs) + if misc_name: + if sent_id or word_id: + logging.fatal('Cannot combine misc_value with sent_id or word_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + if not word_id: + logging.fatal('Missing parameter word_id') + self.sent_id = sent_id + self.word_id = word_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to split all zones but we do not try to do it at present. + # (The zones may be translations to other languages and it is not likely that we would + # want to split each translation at the same position.) + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if split_points: + # Create as many new bundles as there are split points. + n_new = len(split_points) + current_bid = bundle.bundle_id + idletter = 'B' # a letter will be added to bundle ids to distinguish them + for i in range(n_new): + new_bundle = document.create_bundle() + new_bundle.bundle_id = current_bid + idletter + new_root = Root(zone='') + new_bundle.add_tree(new_root) + # Identify nodes to move to the new bundle. 
+ first_node_id = split_points[i].ord + if i < n_new - 1: + next_first_node_id = split_points[i+1].ord + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id] + else: + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id] + new_root.steal_nodes(nodes_to_move) + self.make_zeros_roots(new_root) + new_root.text = new_root.compute_text() + # The new bundle was created at the end of the document. + # Move it to the position right after the current bundle. + document.bundles.pop() + document.bundles.insert(bundle_no + i + 1, new_bundle) + idletter = chr(ord(idletter) + 1) + # Remove from the node the MISC attribute that triggered the sentence split. + split_points[i].misc[self.misc_name] = '' + # Update the id of the current bundle, fix its zero-dependents and recompute sentence text. + bundle.bundle_id += 'A' + self.make_zeros_roots(root) + root.text = root.compute_text() + # Update the bundle numbers of the new bundles and all bundles after them. + updated_no = bundle_no + 1 + for b in document.bundles[(bundle_no+1):]: + b.number = updated_no + updated_no += 1 + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + root = bundle.get_tree() + nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id] + if len(nodes_to_move) == 0: + logging.fatal('No nodes to move to the new sentence; word_id may be out of range') + # Create a new bundle at the end of the current document. + new_bundle = document.create_bundle() + # Move the new bundle to the position right after the current bundle. 
+ new_bundle_no = bundle_no + 1 + document.bundles.pop() + document.bundles.insert(new_bundle_no, new_bundle) + updated_no = new_bundle_no + for b in document.bundles[new_bundle_no:]: + b.number = updated_no + updated_no += 1 + new_bundle.bundle_id = bundle.bundle_id + 'B' + bundle.bundle_id += 'A' + new_root = Root(zone='') + new_bundle.add_tree(new_root) + new_root.steal_nodes(nodes_to_move) + # The steal_nodes() method does not make sure that all nodes newly attached + # to the artificial root have the 'root' relation. Fix it. + self.make_zeros_roots(root) + self.make_zeros_roots(new_root) + # Update the sentence text attributes of the new sentences. + root.text = root.compute_text() + new_root.text = new_root.compute_text() + # We have found our sentence. No need to process the rest of the document. + break + + def make_zeros_roots(self, root): + """ + The steal_nodes() method does not make sure that all nodes newly attached + to the artificial root have the 'root' relation. Fix it. + """ + n_root = 0 + for n in root.descendants: + if n.parent.is_root(): + n.deprel = 'root' + n_root += 1 + if n_root > 1: + logging.warning('More than one 0:root relation in newly segmented sentence %s.' % root.bundle.bundle_id) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 403daf5f..9920d0b6 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -5,10 +5,16 @@ class Wc(Block): """Special block for printing statistics (word count etc).""" - def __init__(self, **kwargs): - """Create the Wc block object.""" + def __init__(self, tsv=False, **kwargs): + """Create the Wc block object. 
+ + Params: + tsv: print just tab-separated-values (trees, words, tokens, MWTs, empty nodes) + """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 + self.tsv = tsv def process_tree(self, tree): self.trees += 1 @@ -17,10 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc or tree == tree.document[0].trees[0]: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): - print('%8d trees\n%8d words' % (self.trees, self.words)) - if self.mwts: - print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) - if self.empty: - print('%8d empty nodes' % self.empty) + if self.tsv: + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) + else: + print('%8d trees\n%8d words' % (self.trees, self.words)) + if self.mwts: + print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) + if self.empty: + print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 6c2dc314..ad647477 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -1,7 +1,7 @@ """Conllu class is a a writer of files in the CoNLL-U format.""" +import json from udapi.core.basewriter import BaseWriter - class Conllu(BaseWriter): """A writer of files in the CoNLL-U format.""" @@ -11,65 +11,144 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.print_text = print_text self.print_empty_trees = print_empty_trees - # A list of Conllu columns. 
- self.node_attributes = ["ord", "form", "lemma", "upos", "xpos", - "feats", "parent", "deprel", "raw_deps", "misc"] - def process_tree(self, tree): # pylint: disable=too-many-branches - nodes = tree.descendants + empty_nodes = tree.empty_nodes + if empty_nodes: + nodes = sorted(tree._descendants + empty_nodes) + else: + nodes = tree._descendants # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0 # we need to skip the whole tree (including possible comments). if not nodes and not self.print_empty_trees: return + # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual + # value of the attribute and make note on which line (i_*) they were present. + comment_lines = tree.comment.splitlines() + i_newdoc, i_newpar, i_sent_id, i_text, i_global_entity = -1, -1, -1, -1, -1 + for i, c_line in enumerate(comment_lines): + if c_line == '$SENT_ID': + i_sent_id = i + comment_lines[i] = ' sent_id = ' + tree.sent_id if self.print_sent_id else None + elif c_line == '$TEXT': + i_text = i + if self.print_text: + if tree.text is None: + comment_lines[i] = ' text = ' + tree.compute_text() + else: + comment_lines[i] = ' text = ' + tree.text.replace('\n', '').replace('\r', '').rstrip() + elif c_line == '$NEWDOC': + i_newdoc = i + if self.print_sent_id and tree.newdoc: + comment_lines[i] = ' newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else '') + else: + comment_lines[i] = None + elif c_line == '$NEWPAR': + i_newpar = i + if self.print_sent_id and tree.newpar: + comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '') + else: + comment_lines[i] = None + elif c_line == '$GLOBAL.ENTITY': + i_global_entity = i + ge = tree.document.meta.get('global.Entity') + if ge: + comment_lines[i] = ' global.Entity = ' + ge + else: + comment_lines[i] = None + + # Now print the special comments: global.columns, newdoc, newpar, sent_id and text. 
+ # If these comments were already present in tree.comment (as marked with the placeholders), + # keep them at their original position and print also all comment lines preceding them. + # It they were missing, try to print them at the correct position. + printed_i = -1 + if comment_lines and comment_lines[0].startswith(' global.columns'): + printed_i += 1 + print('#' + comment_lines[printed_i]) if self.print_sent_id: if tree.newdoc: - value = ' id = ' + tree.newdoc if tree.newdoc is not True else '' - print('# newdoc' + value) + if i_newdoc == -1: + print('# newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else '')) + else: + while printed_i < i_newdoc: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + ge = tree.document.meta.get('global.Entity') + if ge: + if i_global_entity == -1: + print('# global.Entity = ' + ge) + else: + while printed_i < i_global_entity: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) if tree.newpar: - value = ' id = ' + tree.newpar if tree.newpar is not True else '' - print('# newpar' + value) - print('# sent_id = ' + tree.address()) + if i_newpar == -1: + print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')) + else: + while printed_i < i_newpar: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + if i_sent_id == -1: + print('# sent_id = ' + tree.sent_id) + else: + while printed_i < i_sent_id: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + if self.print_text and i_text == -1: + print('# text = ' + (tree.compute_text() if tree.text is None else tree.text.replace('\n', '').replace('\r', '').rstrip())) - if self.print_text: - print("# text = " + tree.get_sentence()) + for c_line in comment_lines[printed_i + 1:]: + if c_line: + print('#' + c_line) - comment = tree.comment - if comment: - comment = comment.rstrip() - print('#' + comment.replace('\n', 
'\n#')) + # Special-purpose json_* comments should always be at the end of the comment block. + if tree.json: + for key, value in sorted(tree.json.items()): + print(f"# json_{key} = {json.dumps(value, ensure_ascii=False, sort_keys=True)}") last_mwt_id = 0 - empty_nodes = list(tree.empty_nodes) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 for node in nodes: - mwt = node.multiword_token - if mwt and node.ord > last_mwt_id: - last_mwt_id = mwt.words[-1].ord - print('\t'.join([mwt.ord_range(), - mwt.form if mwt.form is not None else '_', - '_\t_\t_\t_\t_\t_\t_', str(mwt.misc)])) - values = [getattr(node, attr_name) for attr_name in self.node_attributes] - values = ['_' if v is None else str(v) for v in values] - try: - values[6] = str(node.parent.ord) - except AttributeError: - values[6] = '0' - print('\t'.join(values)) - if node.ord == next_empty_ord: - empty = empty_nodes.pop(0) - values = [str(getattr(empty, a)) for a in self.node_attributes] - values[6] = '_' - values[7] = '_' - print('\t'.join(values)) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 + mwt = node._mwt + if mwt and node._ord > last_mwt_id: + print('\t'.join((mwt.ord_range, + '_' if mwt.form is None else mwt.form, + '_\t_\t_', + '_' if mwt._feats is None else str(mwt.feats), + '_\t_\t_', + '_' if mwt._misc is None else str(mwt.misc)))) + last_mwt_id = mwt.words[-1]._ord - # Empty sentences are not allowed in CoNLL-U, + if node._parent is None: + head = '_' # Empty nodes + else: + try: + head = str(node._parent._ord) + except AttributeError: + head = '0' + + print('\t'.join('_' if v is None else v for v in + (str(node._ord), node.form, node.lemma, node.upos, node.xpos, + '_' if node._feats is None else str(node.feats), head, node.deprel, + node.raw_deps, '_' if node._misc is None else str(node.misc)))) + + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we 
will print an artificial node, so we can print the comments. - if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) print("") + + def before_process_document(self, document): + """Print doc_json_* headers.""" + super().before_process_document(document) + if document.json: + for key, value in sorted(document.json.items()): + print("# doc_json_%s = %s" + % (key, json.dumps(value, ensure_ascii=False, sort_keys=True))) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..1d5d4716 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,478 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference. + +When using lazy loading of documents (infinite scrolling), +modern browsers don't allow JavaScript to load files from a local file system +("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been +blocked by CORS policy: Cross origin requests are only supported for protocol schemes: +http, data, chrome, chrome-extension, https.") + +The recommended solution is to start a local web server, e.g. using + python -m http.server +and browse http://0.0.0.0:8000/my.html. 
+ +Non-recommended solution is to run + google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html +""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter +import udapi.block.write.html +import gzip +import sys +import os +import re + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +HTYPES = 'PROPN NOUN PRON VERB DET OTHER'.split() + +HEADER = ''' + +Udapi CorefUD viewer + + +''' + +CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} +.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} +i.empty {color: gray; border: 3px outset gray; padding: 1px;} 
+.sentence .singleton {border-style: dotted;} +.crossing:before {content: "!"; display: block; background: #ffd500;} +.active {border: 1px solid red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} +.sent_id {display: none; background: #ddd; border-radius: 3px;} +''' + +SCRIPT_BASE = ''' +function add_mention_listeners(mentions){ + mentions.click(function(e) { + let was_selected = $(this).hasClass("selected"); + $(".m").removeClass("selected"); + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + e.stopPropagation(); + }); + mentions.hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} + ); +} +add_mention_listeners($(".m")); + +window.onhashchange = function() { + $(".m").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + +async function load_doc(doc_num) { + loading_now = true; + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ + if (! load_fail_reported) { + load_fail_reported = true; + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." 
+ + " Run a web server 'python -m http.server'\\n" + + "error = " + error); + } + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + $('.eid').toggle($('#show-eid')[0].checked); + $('.etype').toggle($('#show-etype')[0].checked); + $('.sent_id').toggle($('#show-sent_id')[0].checked); + $('.showtree').toggle($('#show-trees')[0].checked); + $('.m').toggleClass('nocolor', ! $('#show-color')[0].checked); + $('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked); + $('.norm').toggle($('#show-norm')[0].checked); + $('.head').toggleClass('nobold', ! $('#show-heads')[0].checked); + $('.empty').toggle($('#show-empty')[0].checked); + $('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked); + $('.par').toggle($('#show-pars')[0].checked); + $('h1').toggle($('#show-docs')[0].checked); + $('.m').toggleClass('htype',$('#htype')[0].checked) + loading_now = false; +} + +var docs_loaded = 1; +var load_fail_reported = false; +var loading_now = false; +add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True + $(el).prepend('🆔' + el.dataset.id + ''); +} +function load_more() { + if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) { + docs_loaded += 1; + load_doc(docs_loaded); + } +} +$(window).scroll(load_more); +const resizeObserver = new ResizeObserver(entries =>load_more()); +resizeObserver.observe(document.body); +''' + +SCRIPT_SHOWTREE = ''' +function show_tree_in_tdiv(tdiv, doc_number, index){ + tdiv.treexView([docs_json[doc_number][index]]); + $("\n' + ) + + # The first ud_doc will be printed to the main html file. 
+ self.process_ud_doc(ud_docs[0], 1) + print('') # id=main + + # Other ud_docs will be printed into separate files (so they can be loaded lazily) + orig_stdout = sys.stdout + try: + for i, ud_doc in enumerate(ud_docs[1:], 2): + sys.stdout = gzip.open(f"{self.docs_dir}/doc{i}.html.gz", 'wt') + self.process_ud_doc(ud_doc, i) + sys.stdout.close() + finally: + sys.stdout = orig_stdout + + print(f'') + print('') + + def _start_subspan(self, subspan, crossing=False): + m = subspan.mention + e = m.entity + classes = f'{_dom_esc(e.eid)} {self._mention_ids[m]} {e.etype or "other"} m' + title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' + classes += f" {m.head.upos if m.head.upos in HTYPES else 'OTHER'}" + title += f'\nhead-upos={m.head.upos}' + if self.colors: + classes += f' {self._entity_colors[e]}' + if all(w.is_empty() for w in subspan.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + if crossing: + classes += ' crossing' + title += '\ncrossing' + if m.other: + title += f'\n{m.other}' + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{_dom_esc(e.eid)}" ' + # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"), + # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl". + if self.rtl: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + else: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + subspans = [] + for mention in mentions: + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) + + if tree.newdoc: + print(f'


{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: + print('
') + opened, prev_node_mention = [], True + rtl = ' dir="rtl"' if self.rtl else "" + print(f'

') + for node in nodes_and_empty: + if not prev_node_mention and subspans and subspans[-1].words[0] == node: + print('', end='') + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + self._start_subspan(subspan) + opened.append(subspan) + + if not opened and prev_node_mention: + print('', end='') + prev_node_mention = True if opened else False + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + if self.rtl: + print('', end='') + else: + print('', end='') + opened.pop() + + # Two mentions are crossing iff their spans have non-zero intersection, + # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2). + # Let's visualize this (simplified) as + # ......... + # i.e. let's split mention e2 into two subspans which are next to each other. + # Unfortunatelly, we cannot mark now both crossing mentions using html class "crossing" + # (opening tags are already printed), so we'll mark only the second part of the second mention. + endings = [x for x in opened if x.words[-1] == node] + if endings: + new_opened, brokens, found_crossing = [], [], False + for subspan in opened: + if subspan.words[-1] == node: + found_crossing = True + elif found_crossing: + brokens.append(subspan) + else: + new_opened.append(subspan) + opened = new_opened + print('' * (len(endings) + len(brokens)), end='') + for broken in brokens: + self._start_subspan(broken, True) + opened.append(subspan) + + if not node.no_space_after: + print(' ', end='') + + if not prev_node_mention: + print('', end='') + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + + +# id needs to be a valid DOM querySelector +# so it cannot contain [#./:] and maybe more, +# so let's substitute all [^\w\d-] to be on the safe side. +# DOM IDs cannot start with a digit, so prepend e.g. "n" if needed. +def _dom_esc(string): + if string[0].isdecimal(): + string = 'n' + string + return re.sub(r'[^\w\d-]', '_', string) + +def _id(node): + if node is None: + return 'null' + return _dom_esc(node.address()) + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index ec33b0fd..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -14,7 +14,7 @@ class Html(BaseWriter): For offline use, we need to download first three JavaScript libraries:: wget https://code.jquery.com/jquery-2.1.4.min.js - wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js + wget https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4r/FileSaver.min.js wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js udapy write.Html path_to_js=. < file.conllu > file.html firefox file.html @@ -34,7 +34,7 @@ class Html(BaseWriter): This block is based on `Treex::View `_ but takes a different approach. `Treex::View` depends on (older version of) - `Valence` (Perl interface to `Electron `_) + `Valence` (Perl interface to `Electron `_) and comes with a script `view-treex`, which takes a treex file, converts it to json behind the scenes (which is quite slow) and displays the json in a Valence window. 
@@ -65,7 +65,7 @@ def __init__(self, path_to_js='web', **kwargs): def process_document(self, doc): if self.path_to_js == 'web': jquery = 'https://code.jquery.com/jquery-2.1.4.min.js' - fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js' + fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js' js_t_v = 'https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js' else: jquery = self.path_to_js + '/jquery-2.1.4.min.js' @@ -79,16 +79,32 @@ def process_document(self, doc): print('\n') print('
') + + def print_doc_json(self, doc): + print('[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -101,24 +117,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + print(']') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py new file mode 100644 index 00000000..49f9beb0 --- /dev/null +++ b/udapi/block/write/oldcorefud.py @@ -0,0 +1,58 @@ +"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.write.conllu + +class OldCorefUD(udapi.block.write.conllu.Conllu): + + def process_document(self, doc): + if not doc.coref_entities: + logging.warning("Using write.OldCorefUD on a document without any coreference 
annotation") + + # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. + attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + del doc.meta['global.Entity'] + + # doc._eid_to_entity is a dict, which is insertion ordered in Python 3.7+. + # The insertion order is sorted according to CorefEntity.__lt__ (see few lines above). + # However, new entities could be added meanwhile or some entities edited, + # so we need to sort the entities again before storing to MISC. + # We also need to mare sure entity.mentions are sorted in each entity + # because the ordering of entities is defined by the first mention in each entity. + # Ordering of mentions within a entity can be changed when e.g. changing the span + # of a given mention or reordering words within a sentence and in such events + # Udapi currently does not automatically update the ordering of entities. 
+ for entity in doc.coref_entities: + entity._mentions.sort() + for entity in sorted(doc.coref_entities): + for mention in entity.mentions: + head = mention.head + if head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = entity.eid + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = entity.etype + if mention._bridging: + head.misc["Bridging" + index_str] = ','.join(f'{l.target.eid}:{l.relation}' for l in sorted(mention.bridging)) + if entity.split_ante: + serialized = ','.join((c.eid for c in sorted(entity.split_ante))) + head.misc["SplitAnte" + index_str] = serialized + if mention.other: + head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') + + super().process_document(doc) diff --git a/udapi/block/write/sdparse.py b/udapi/block/write/sdparse.py index 60b78d6d..13487738 100644 --- a/udapi/block/write/sdparse.py +++ b/udapi/block/write/sdparse.py @@ -29,7 +29,7 @@ class Sdparse(BaseWriter): Notes: The original `Stanford dependencies format - `_ + `_ allows explicit specification of the root dependency, e.g. `root(ROOT-0, makes-8)`. However, this is not allowed by Brat, so this writer does not print it. diff --git a/udapi/block/write/sentences.py b/udapi/block/write/sentences.py index 60eb6bec..70553d7d 100644 --- a/udapi/block/write/sentences.py +++ b/udapi/block/write/sentences.py @@ -3,13 +3,14 @@ class Sentences(BaseWriter): - """A writer of plain-text sentences (one per line). + """A writer of plain-text sentences (one sentence per line). 
Usage: udapy write.Sentences if_missing=empty < my.conllu > my.txt + udapy write.Sentences newdoc=1 newpar=1 < my.conllu > my.txt """ - def __init__(self, if_missing='detokenize', **kwargs): + def __init__(self, if_missing='detokenize', newdoc=None, newpar=None, **kwargs): """Create the Sentences writer block. Parameters: @@ -18,9 +19,21 @@ def __init__(self, if_missing='detokenize', **kwargs): * `empty`: print an empty line * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` * `fatal`: raise an exception + newdoc: What to do if `root.newdoc` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) + newpar: What to do if `root.newpar` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) """ super().__init__(**kwargs) self.if_missing = if_missing + self.newdoc = newdoc + self.newpar = newpar def process_tree(self, tree): + if self.newdoc and tree.newdoc and tree.bundle.number > 1: + print() + if self.newpar and tree.newpar and tree.bundle.number > 1: + print() print(tree.get_sentence(self.if_missing)) diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. + + Parameters: + if_missing: What to do if `root.text` is `None`? 
(default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. + * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n
    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 54680e88..a8a7ab3d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,8 +1,10 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys import colorama +import collections from termcolor import colored from udapi.core.basewriter import BaseWriter @@ -12,6 +14,7 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... @@ -20,7 +23,7 @@ class TextModeTrees(BaseWriter): - """An ASCII pretty printer of dependency trees. + r"""An ASCII pretty printer of dependency trees. .. code-block:: bash @@ -29,7 +32,7 @@ class TextModeTrees(BaseWriter): In scenario (examples of other parameters):: - write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 + write.TextModeTrees indent=2 print_sent_id=0 print_sentence=1 layout=align write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 This block prints dependency trees in plain-text format. @@ -47,7 +50,7 @@ class TextModeTrees(BaseWriter): 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No 11 . . PUNCT . _ 2 punct _ _ - will be printed (with the default parameters) as:: + will be printed (with the default parameters plus hints=0) as:: ─┮ │ ╭─╼ I PRON nsubj @@ -62,7 +65,52 @@ class TextModeTrees(BaseWriter): │ ╰─┶ boxer NOUN acl:relcl ╰─╼ . PUNCT punct - Some non-projective trees cannot be printed witout crossing edges. + With ``layout=compact``, the output will be (note the nodes "today" and ","):: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . 
PUNCT punct + + With ``layout=align-words``, the output will be:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + And finally with ``layout=align``:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + Some non-projective trees cannot be printed without crossing edges. TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: ─┮ @@ -71,52 +119,69 @@ class TextModeTrees(BaseWriter): ╰─┶ 3 │ ╰─╼ 4 - By default parameter ``color=auto``, so if the output is printed to the console + With ``color=auto`` (which is the default), if the output is printed to the console (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted - (by reveresing the background and foreground colors). + (by reversing the background and foreground colors). This block's method `process_tree` can be called on any node (not only root), - which is useful for printing subtrees using ``node.print_subtree()``, + which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. 
+ For use in LaTeX, you can insert the output of this block (without colors) + into ``\begin{verbatim}...\end{verbatim}``, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preamble:: + + \usepackage{pmboxdraw} + \DeclareUnicodeCharacter{256D}{\textSFi} %╭ + \DeclareUnicodeCharacter{2570}{\textSFii} %╰ + SEE ALSO :py:class:`.TextModeTreesHtml` """ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', - print_undef_as='_', print_doc_meta=True, print_comments=False, - mark='ToDo|ToDoOrigText|Bug|Mark', marked_only=False, hints=True, **kwargs): + print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, + print_mwt=False, mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, + layout='classic', **kwargs): """Create new TextModeTrees block object. Args: - print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? - print_sentence: Print plain-text detokenized sentence on a line above each tree? - add_empty_line: Print an empty line after each tree? - indent: Number of characters to indent node depth in the tree for better readability. - minimize_cross: Minimize crossings of edges in non-projective trees? - Trees without crossings are subjectively more readable, but usually - in practice also "deeper", that is with higher maximal line length. - color: Print the node attribute with ANSI terminal colors? - Default = 'auto' which means that color output only if the output filehandle - is interactive (console). Each attribute is assigned a color (the mapping is - tested on black background terminals and can be changed only in source code). - If you plan to pipe the output (e.g. to "less -R") and you want the colors, - you need to set explicitly color=1, see the example in Synopsis. - attributes: A comma-separated list of node attributes which should be printed. 
Possible - values are ord, form, lemma, upos, xpos, feats, deprel, deps, misc. - print_undef_as: What should be printed instead of undefined attribute values (if any)? - print_doc_meta: Print `document.meta` metadata before each document? - print_comments: Print comments (other than sent_id and text)? - mark: a regex. If `re.search(mark + '=', str(node.misc))` the node is highlighted. - If `print_comments and re.search(r'^ (%s) = ' % mark, root.comment, re.M)` - the comment is highlighted. - Empty string means no highlighting. Default = 'ToDo|ToDoOrigText|Bug|Mark'. - marked_only: print only trees containing one or more marked nodes/comments. Default=False. - hints: use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes - or follows its parent. Default=True. If False, plain ├ is used in both cases. + print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? + print_text: Print plain-text detokenized sentence on a line above each tree? + add_empty_line: Print an empty line after each tree? + indent: Number of characters to indent node depth in the tree for better readability. + minimize_cross: Minimize crossings of edges in non-projective trees? + Trees without crossings are subjectively more readable, but usually + in practice also "deeper", that is with higher maximal line length. + color: Print the node attribute with ANSI terminal colors? + Default = 'auto' which means that color output only if the output filehandle + is interactive (console). Each attribute is assigned a color (the mapping is + tested on black background terminals and can be changed only in source code). + If you plan to pipe the output (e.g. to "less -R") and you want the colors, + you need to set explicitly color=1, see the example in Synopsis. + attributes: A comma-separated list of node attributes which should be printed. Possible + values are ``ord``, ``form``, ``lemma``, ``upos``, ``xpos``, ``feats``, ``deprel``, ``deps``, ``misc``. 
+ print_undef_as: What should be printed instead of undefined attribute values (if any)? + print_doc_meta: Print ``document.meta`` metadata before each document? + print_comments: Print comments (other than ``sent_id`` and ``text``)? + print_empty: Print empty nodes? Default=True + print_mwt: Print multi-word tokens? Default=False + mark: A regex pattern. If ``re.search(mark + '=', str(node.misc))`` matches, the node is highlighted. + If ``print_comments`` and ``re.search(r'^ %s = ' % mark, root.comment, re.M)`` matches, + the comment is highlighted. Empty string means no highlighting. + Default = ``'(ToDo|ToDoOrigText|Bug|Mark)'``. + marked_only: Print only trees containing one or more marked nodes/comments. Default ``False``. + hints: Use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes + or follows its parent. Default ``True``. If ``False``, plain ├ is used in both cases. + layout: Tree layout style: + + - ``'classic'`` (default): shows word attributes immediately next to each node + - ``'compact'``: never print edges after (right to) words even in non-projectivities + - ``'align-words'``: like ``'compact'`` but all first attributes (forms by default) are aligned + - ``'align'``: like ``'align-words'`` but all attributes are aligned in columns """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -128,8 +193,11 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_undef_as = print_undef_as self.print_doc_meta = print_doc_meta self.print_comments = print_comments + self.print_empty = print_empty + self.print_mwt = print_mwt self.mark = mark self.marked_only = marked_only + self.layout = layout # _draw[is_bottommost][is_topmost] line = '─' * indent @@ -149,14 +217,14 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.mark_re, self.comment_mark_re = None, None if mark is not None and mark != '': self.mark_re = re.compile(mark + '=') - 
self.comment_mark_re = re.compile(r'^ (%s) = ' % mark, re.M) + self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M) self._index_of = [] - self._gaps = [] + self._gaps = collections.Counter() self.lines = [] self.lengths = [] # We want to be able to call process_tree not only on root node, - # so this block can be called from node.print_subtree(**kwargs) + # so this block can be called from node.print_draw(**kwargs) # on any node and print its subtree. Thus, we cannot assume that # allnodes[idx].ord == idx. Instead of node.ord, we'll use index_of[node.ord], # which is its index within the printed subtree. @@ -171,28 +239,36 @@ def _compute_gaps(self, node): self._gaps[node.ord] = rmost - lmost - descs return lmost, rmost, descs + 1 - def should_print_tree(self, root): + def should_print_tree(self, root, allnodes): """Should this tree be printed?""" if not self.marked_only: return True - if any(self.is_marked(n) for n in root.descendants(add_self=1)): + if any(self.is_marked(n) for n in allnodes): return True if not self.print_comments or root.comment is None or self.mark_re is None: return False return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" - allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root): + if self.print_empty: + if root.is_root() and not self.print_mwt: + allnodes = [root] + root.descendants_and_empty + else: + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] + allnodes.extend(empty) + allnodes.sort() + else: + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + if not force_print and not self.should_print_tree(root, allnodes): return - self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} + self._index_of = {allnodes[i].ord_range if allnodes[i].is_mwt() else 
allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) self.lengths = [0] * len(allnodes) # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = [0, ] * (1 + len(root.root.descendants)) self._compute_gaps(root) # Precompute lines for printing @@ -211,21 +287,39 @@ def process_tree(self, root): botmost = idx == max_idx if idx_node is node: self._add(idx, self._draw[botmost][topmost]) - self.add_node(idx, node) + if self.layout == 'classic': + self.add_node(idx, node) else: - if idx_node.parent is not node: + if idx_node.is_mwt() or idx_node.parent is not node: self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: - self._add(idx, self._space[idx < node.ord][topmost or botmost]) + precedes_parent = idx < self._index_of[node.ord] + self._add(idx, self._space[precedes_parent][topmost or botmost]) if idx_node.is_leaf(): self._add(idx, self._horiz) - self.add_node(idx, idx_node) + if self.layout == 'classic': + self.add_node(idx, idx_node) else: stack.append(idx_node) # sorting the stack to minimize crossings of edges if self.minimize_cross: - stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) + stack.sort(key=lambda x: -self._gaps[x.ord]) + + if self.layout == 'classic': + for idx, node in enumerate(allnodes): + if node.is_empty() or node.is_mwt(): + self.add_node(idx, node) + else: + columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] + for col_attrs in columns_attrs: + self.attrs = col_attrs + max_length = max(self.lengths) + for idx, node in enumerate(allnodes): + if self.layout.startswith('align'): + self._add(idx, ' ' * (max_length - self.lengths[idx])) + self.add_node(idx, node) + self.attrs = [a for sublist in columns_attrs for a in sublist] # Print headers (if required) and the tree itself self.print_headers(root) @@ -256,11 +350,16 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 
'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. + os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text @@ -268,14 +367,18 @@ def _add(self, idx, text): def add_node(self, idx, node): """Render a node with its attributes.""" - if not node.is_root(): + if node.is_mwt() or not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) + marked = self.is_marked(node) if self.color: - marked = self.is_marked(node) for i, attr in enumerate(self.attrs): values[i] = self.colorize_attr(attr, values[i], marked) - self.lines[idx] += ' ' + ' '.join(values) + if not self.color and marked: + self.lines[idx] += ' **' + ' '.join(values) + '**' + self.lengths[idx] += 4 + else: + self.lines[idx] += ' ' + ' '.join(values) def is_marked(self, node): """Should a given node be highlighted?""" diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 75a39a97..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. """ - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. 
@@ -35,9 +35,14 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? + zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). + whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -53,8 +58,8 @@ def before_process_document(self, document): print('%s = %s' % (key, value)) def after_process_document(self, document): - super().after_process_document(document) print("\n\n") + super().after_process_document(document) def add_node(self, idx, node): if not node.is_root(): @@ -82,3 +87,27 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + # Don't print
    if no tree will be printed in this bundle. + marked_trees = [] + for tree in bundle: + if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + if self.whole_bundle: + marked_trees = bundle + print("") + for tree in marked_trees: + print("") + print("
    ") + self.process_tree(tree, force_print=True) + print("
    ") + else: + super().process_bundle(bundle) diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 19480e23..40071739 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -10,6 +10,8 @@ class Tikz(BaseWriter): Usage:: udapy write.Tikz < my.conllu > my.tex + # or for 2D tree-like rendering + udapy write.Tikz as_tree=1 < my.conllu > my.tex pdflatex my.tex xdg-open my.pdf @@ -26,13 +28,19 @@ class Tikz(BaseWriter): `_ for details. + With ``as_tree=1``, there are two options how to visualize deprels: + either as labels positioned on the edges by uncommenting the relevant style definition, + or by adding ``deprel`` to the list of attributes, so deprels are above/below the words. + The latter is the default because the edge labels need manual tweaks to prevent overlapping. + Alternatives: * use `write.TextModeTrees` and include it in verbatim environment in LaTeX. * use `write.Html`, press "Save as SVG" button, convert to pdf and include in LaTeX. """ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, - attributes='form,upos', **kwargs): + attributes=None, as_tree=False, comment_attribute=None, + enhanced=False, **kwargs): """Create the Tikz block object. Args: @@ -41,20 +49,45 @@ def __init__(self, print_sent_id=True, print_text=True, print_preambule=True, print_preambule: surround each document with LaTeX preambule (`documentclass` etc) and `end{document}` (default=True) attributes: comma-separated list of node attributes to print (each on a separate line). + as_tree: boolean - should print it as a 2D tree? + comment_attribute: which attribute to print as a string under each graph (e.g. text_en) + enhanced: boolean - print the enhanced graph below the sentence, too? 
""" super().__init__(**kwargs) self.print_sent_id = print_sent_id self.print_text = print_text self.print_preambule = print_preambule - self.node_attributes = attributes.split(',') + if attributes is not None: + self.node_attributes = attributes.split(',') + elif as_tree: + self.node_attributes = 'form,upos,deprel'.split(',') + else: + self.node_attributes = 'form,upos'.split(',') + self.as_tree = as_tree + self.comment_attribute = comment_attribute + if as_tree and enhanced: + raise ValueError("The enhanced graph cannot be printed as a tree") + self.enhanced = enhanced def before_process_document(self, doc): super().before_process_document(doc) if self.print_preambule: - print(r'\documentclass{article}') + print(r'\documentclass[multi=dependency]{standalone}') print(r'\usepackage[T1]{fontenc}') print(r'\usepackage[utf8]{inputenc}') print(r'\usepackage{tikz-dependency}') + if self.as_tree: + print(r'\tikzset{depedge/.style = {blue,thick}, %,<-') + print(r' deplabel/.style = {opacity=0, %black, fill opacity=0.9, text opacity=1,') + print(r' % yshift=4pt, pos=0.1, inner sep=0, fill=white, font={\scriptsize}') + print(r' },') + print(r' depnode/.style = {draw,circle,fill,blue,inner sep=1.5pt},') + print(r' depguide/.style = {dashed,gray},') + print(r'}') + print(r'\newlength{\deplevel}\setlength{\deplevel}{8mm}') + print(r'\newlength{\depskip}\setlength{\depskip}{4mm}') + print(r'\newcommand{\deptrans}[1]{\node (t) at (\matrixref.south)[yshift=-1mm]' + " {``#1''};}") print(r'\begin{document}') def after_process_document(self, doc): @@ -63,6 +96,9 @@ def after_process_document(self, doc): logging.info('Use pdflatex to compile the output') super().after_process_document(doc) + def _tex_escape(self, string): + return string.replace('_', r'\_').replace('$', '\$').replace('[', '$[$').replace(']', '$]$') + def process_tree(self, tree): print(r'\begin{dependency}') print(r'\begin{deptext}') @@ -81,8 +117,7 @@ def process_tree(self, tree): lines = ['' for _ in 
self.node_attributes] for node in nodes: - values = [str(getattr(node, attr_name)) for attr_name in self.node_attributes] - values = [v if v != '_' else r'\_' for v in values] + values = [self._tex_escape(v) for v in node.get_attrs(self.node_attributes)] max_len = max(len(value) for value in values) for index, value in enumerate(values): if node.ord > 1: @@ -91,10 +126,37 @@ def process_tree(self, tree): for line in lines: print(line + r' \\') print(r'\end{deptext}') - for node in nodes: - if node.parent.is_root(): - print(r'\deproot{%d}{root}' % node.ord) - else: - print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.as_tree: + depths = [n._get_attr('depth') for n in nodes] + max_depth = max(depths) + for node in nodes: + print(r'\node (w%d) [yshift=\depskip+%s\deplevel,depnode] at (\wordref{1}{%d}) {};' + % (node.ord, max_depth - depths[node.ord - 1], node.ord)) + for node in nodes: + print(r'\draw[depguide] (w%d)--(\wordref{1}{%d});' % (node.ord, node.ord), end='') + if node.parent.is_root(): + print('') + else: + print(r' \draw[depedge] (w%d)--node[deplabel] {%s} (w%d);' + % (node.ord, node.deprel, node.parent.ord)) + else: + for node in nodes: + if node.parent.is_root(): + print(r'\deproot{%d}{root}' % node.ord) + else: + print(r'\depedge{%d}{%d}{%s}' % (node.parent.ord, node.ord, node.deprel)) + if self.enhanced: + for dep in node.deps: + if dep['parent'].is_root(): + print(r'\deproot[edge below]{%d}{root}' % node.ord) + else: + print(r'\depedge[edge below]{%d}{%d}{%s}' % (dep['parent'].ord, node.ord, dep['deprel'])) + if self.comment_attribute and tree.comment: + start_pos = tree.comment.find(self.comment_attribute + ' = ') + if start_pos != -1: + start_pos += len(self.comment_attribute) + 3 + end_pos = tree.comment.find('\n', start_pos) + print(r'\deptrans{' + tree.comment[start_pos:end_pos]) + print(r'\end{dependency}') - print('') # empty line marks a new paragraph in LaTeX + print('') # empty line marks a new paragraph in 
LaTeX, but multi=dependency causes newpage diff --git a/udapi/block/write/vislcg.py b/udapi/block/write/vislcg.py index 569b1056..acdf1e80 100644 --- a/udapi/block/write/vislcg.py +++ b/udapi/block/write/vislcg.py @@ -64,10 +64,7 @@ def process_tree(self, tree): # Print the line with forms and optional upos tags and feats. for token in tree.token_descendants: print('"<%s>"' % self._escape(token.form)) - try: - words = token.words - except AttributeError: - words = [token] + words = token.words print('\t' + self._node(words[0])) for nonfirst_mwt_word in words[1:]: print('\t\t' + self._node(nonfirst_mwt_word)) diff --git a/udapi/cli.py b/udapi/cli.py new file mode 100755 index 00000000..de55f8cb --- /dev/null +++ b/udapi/cli.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +import os +import gc +import sys +import atexit +import logging +import argparse + +from udapi.core.run import Run + +# Parse command line arguments. +argparser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + usage="udapy [optional_arguments] scenario", + epilog="See http://udapi.github.io", + description="udapy - Python interface to Udapi - API for Universal Dependencies\n\n" + "Examples of usage:\n" + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\n" + " udapy -T < sample.conllu | less -R\n" + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\n") +argparser.add_argument( + "-q", "--quiet", action="store_true", + help="Warning, info and debug messages are suppressed. 
Only fatal errors are reported.") +argparser.add_argument( + "-v", "--verbose", action="store_true", + help="Warning, info and debug messages are printed to the STDERR.") +argparser.add_argument( + "-s", "--save", action="store_true", + help="Add write.Conllu to the end of the scenario") +argparser.add_argument( + "-T", "--save_text_mode_trees", action="store_true", + help="Add write.TextModeTrees color=1 to the end of the scenario") +argparser.add_argument( + "-H", "--save_html", action="store_true", + help="Add write.TextModeTreesHtml color=1 to the end of the scenario") +argparser.add_argument( + "-A", "--save_all_attributes", action="store_true", + help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") +argparser.add_argument( + "-C", "--save_comments", action="store_true", + help="Add print_comments=1 (to be used after -T and -H)") +argparser.add_argument( + "-M", "--marked_only", action="store_true", + help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") +argparser.add_argument( + "-N", "--no_color", action="store_true", + help="Add color=0 to the end of the scenario, this overrides color=1 of -T and -H") +argparser.add_argument( + "-X", "--extra", action="append", + help="Add a specified parameter (or a block name) to the end of the scenario\n" + "For example 'udapy -TNX attributes=form,misc -X layout=align < my.conllu'") +argparser.add_argument( + "--gc", action="store_true", + help="By default, udapy disables Python garbage collection and at-exit cleanup\n" + "to speed up everything (especially reading CoNLL-U files). In edge cases,\n" + "when processing many files and running out of memory, you can disable this\n" + "optimization (i.e. enable garbage collection) with 'udapy --gc'.") +argparser.add_argument( + 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") + + +# Process and provide the scenario. 
+def main(argv=None): + args = argparser.parse_args(argv) + + # Set the level of logs according to parameters. + if args.verbose: + level = logging.DEBUG + elif args.quiet: + level = logging.CRITICAL + else: + level = logging.INFO + + logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', + level=level) + + # Global flag to track if an unhandled exception occurred + _unhandled_exception_occurred = False + + def _custom_excepthook(exc_type, exc_value, traceback): + global _unhandled_exception_occurred + _unhandled_exception_occurred = True + + # Call the default excepthook to allow normal error reporting + sys.__excepthook__(exc_type, exc_value, traceback) + + # Override the default excepthook + sys.excepthook = _custom_excepthook + + # Disabling garbage collections makes the whole processing much faster. + # Similarly, we can save several seconds by partially disabling the at-exit Python cleanup + # (atexit hooks are called in reversed order of their registration, + # so flushing stdio buffers etc. will be still done before the os._exit(0) call). + # See https://instagram-engineering.com/dismissing-python-garbage-collection-at-instagram-4dca40b29172 + # Is it safe to disable GC? + # OS will free the memory allocated by this process after it ends anyway. + # The udapy wrapper is aimed for one-time tasks, not a long-running server, + # so in a typical case a document is loaded and almost no memory is freed before the end. + # Udapi documents have a many cyclic references, so running GC is quite slow. + if not args.gc: + gc.disable() + # When an exception/error has happened, udapy should exit with a non-zero exit code, + # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors). 
+ # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` + # because the Python has already exited the exception-handling block + # (the exception/error has been already reported and sys.exc_info()[0] is None). + # We thus keep record whether _unhandled_exception_occurred. + atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) + atexit.register(sys.stderr.flush) + if args.save: + args.scenario = args.scenario + ['write.Conllu'] + if args.save_text_mode_trees: + args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] + if args.save_html: + args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] + if args.save_all_attributes: + args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] + if args.save_comments: + args.scenario = args.scenario + ['print_comments=1'] + if args.marked_only: + args.scenario = args.scenario + ['marked_only=1'] + if args.no_color: + args.scenario = args.scenario + ['color=0'] + if args.extra: + args.scenario += args.extra + + runner = Run(args) + # udapy is often piped to head etc., e.g. 
+ # `seq 1000 | udapy -s read.Sentences | head` + # Let's prevent Python from reporting (with distracting stacktrace) + # "BrokenPipeError: [Errno 32] Broken pipe" + try: + runner.execute() + except BrokenPipeError: + pass + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 1f732568..c3bcf918 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -1,7 +1,7 @@ """BaseReader is the base class for all reader blocks.""" +import gc import re import logging - from udapi.core.block import Block from udapi.core.files import Files @@ -12,13 +12,15 @@ class BaseReader(Block): """Base class for all reader blocks.""" # pylint: disable=too-many-arguments - def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) - self.files = Files(filenames=files) + if filehandle is not None: + files = None + self.files = Files(filenames=files, filehandle=filehandle, encoding=encoding) self.zone = zone self.bundles_per_doc = bundles_per_doc - self.encoding = encoding self._buffer = None self.finished = False self.sent_id_filter = None @@ -27,6 +29,16 @@ def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8', logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 + # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. 
+ # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it here, in the reader. + # The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, + # which reads all the trees in a file at once, but it does not have access to the document instance, + # it just returns a sequence of trees (which may be split into multiple documents if `bundles_per_doc` is set). + # So `read.Conllu` cannot store the `global.Entity` in `document.meta['global.Entity']` where it belongs. + self._global_entity = None @staticmethod def is_multizone_reader(): @@ -59,8 +71,8 @@ def next_filehandle(self): """Go to the next file and retrun its filehandle.""" return self.files.next_filehandle() - def read_tree(self, document=None): - """Load one (more) tree from self.files and return its root. + def read_tree(self): + """Load one (more) tree from self.filehandle and return its root. This method must be overriden in all readers. Usually it is the only method that needs to be implemented. @@ -68,113 +80,245 @@ def read_tree(self, document=None): """ raise NotImplementedError("Class %s doesn't implement read_tree" % self.__class__.__name__) - def filtered_read_tree(self, document=None): + def read_trees(self): + """Load all trees from self.filehandle and return a list of their roots. + + This method may be overriden in a reader if a faster alternative to read_tree() is needed. + The implementation in this base clases raises `NotImplementedError`. + """ + raise NotImplementedError("Class %s doesn't implement read_trees" % self.__class__.__name__) + + def filtered_read_tree(self): """Load and return one more tree matching the `sent_id_filter`. This method uses `read_tree()` internally. This is the method called by `process_document`. 
""" - tree = self.read_tree(document) + tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) - tree = self.read_tree(document) + if tree.newdoc: + skipped_newdoc = tree.newdoc + tree = self.read_tree() - # pylint: disable=too-many-branches,too-many-statements - # Maybe the code could be refactored, but it is speed-critical, - # so benchmarking is needed because calling extra methods may result in slowdown. - def process_document(self, document): - orig_bundles = document.bundles[:] - last_bundle_id = '' - bundle = None - - # There may be a tree left in the buffer when reading the last doc. - if self._buffer: - # TODO list.pop(0) is inefficient, use collections.deque.popleft() - bundle = orig_bundles.pop(0) if orig_bundles else document.create_bundle() - bundle.add_tree(self._buffer) - if self._buffer.newdoc and self._buffer.newdoc is not True: - document.meta["docname"] = self._buffer.newdoc - self._buffer = None - - filehandle = self.filehandle - if filehandle is None: + def try_fast_load(self, document): + """Try to use self.read_trees() if possible and return True, otherwise False.""" + if document.bundles or self.bundles_per_doc or self.sent_id_filter or self.split_docs: + return False + if self.filehandle is None: filehandle = self.next_filehandle() if filehandle is None: self.finished = True - return + return True + logging.info(f"Reading {self.files.filename}") - trees_loaded = 0 while True: - root = self.filtered_read_tree(document) - if root is None: - if trees_loaded == 0 and self.files.has_next_file(): - filehandle = self.next_filehandle() + try: + trees = self.read_trees() + except NotImplementedError: + return False + + 
document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + if root is None: continue - self.finished = not self.files.has_next_file() - break - add_to_the_last_bundle = 0 - trees_loaded += 1 - - if self.ignore_sent_id: - root.sent_id = None - if root.sent_id is not None: - parts = root.sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - - if self.zone != 'keep': - root.zone = self.zone - - # The `# newdoc` comment in CoNLL-U marks a start of a new document. - if root.newdoc: - if not bundle and root.newdoc is not True: - document.meta["docname"] = root.newdoc - if bundle and self.split_docs: - self._buffer = root - if orig_bundles: - logging.warning("split_docs=1 but the doc had contained %d bundles", - len(orig_bundles)) - self.finished = False - return + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 + add_to_the_last_bundle = False - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number: - self._buffer = root - if orig_bundles: - logging.warning("bundles_per_doc=%d but the doc had contained %d bundles", - self.bundles_per_doc, len(orig_bundles)) - return + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone - if orig_bundles: - # TODO list.pop(0) is inefficient, 
use collections.deque.popleft() - bundle = orig_bundles.pop(0) - if last_bundle_id and last_bundle_id != bundle.bundle_id: - logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.', - bundle.bundle_id, last_bundle_id) - else: + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: bundle = document.create_bundle() if last_bundle_id != '': bundle.bundle_id = last_bundle_id - bundle.add_tree(root) + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True + return True - # If bundles_per_doc is set and we have read the specified number of bundles, - # we should end the current document and return. - # However, if the reader supports reading multiple zones, we can never know - # if the current bundle has ended or there will be another tree for this bundle. - # So in case of multizone readers we need to read one extra tree - # and store it in the buffer (and include it into the next document). - if self.bundles_per_doc and self.bundles_per_doc == bundle.number \ - and not self.is_multizone_reader(): + # pylint: disable=too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. + def process_document(self, document): + # Temporarily disabling garbage collection makes the loading much faster. + gc_was_enabled = gc.isenabled() + gc.disable() + try: + if self.try_fast_load(document): return + orig_bundles = document.bundles[:] + bundle, last_bundle_id = None, '' + + # There may be a tree left in the buffer when reading the last doc. 
+ if self._buffer: + root = self._buffer + self._buffer = None + if orig_bundles: + bundle = orig_bundles.pop(0) + else: + bundle = document.create_bundle() + if root._sent_id is not None: + bundle.bundle_id = root._sent_id.split('/', 1)[0] + bundle.add_tree(root) + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc + document.meta['global.Entity'] = self._global_entity + document.meta['loaded_from'] = self.filename + + filehandle = self.filehandle + if filehandle is None: + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return + logging.info(f"Reading {self.files.filename}") + + trees_loaded = 0 + while True: + root = self.filtered_read_tree() + if root is None: + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): + filehandle = self.next_filehandle() + logging.info(f"Reading {self.files.filename}") + continue + self.finished = not self.files.has_next_file() + break + if trees_loaded == 0: + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. 
+ if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + + add_to_the_last_bundle = False + trees_loaded += 1 + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + + if self.zone != 'keep': + root.zone = self.zone + + # The `# newdoc` comment in CoNLL-U marks a start of a new document. + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + if not bundle and root.newdoc is not True: + document.meta["docname"] = root.newdoc + if bundle and self.split_docs: + self._buffer = root + if orig_bundles: + logging.warning("split_docs=1 but the doc had contained %d bundles", + len(orig_bundles)) + self.finished = False + return + self._docs_loaded += 1 + + # assign new/next bundle to `bundle` if needed + if not bundle or not add_to_the_last_bundle: + if self.bundles_per_doc and bundle and self.bundles_per_doc == bundle.number: + self._buffer = root + if orig_bundles: + logging.warning("bundles_per_doc=%d but the doc had contained %d bundles", + self.bundles_per_doc, len(orig_bundles)) + return + + if orig_bundles: + bundle = orig_bundles.pop(0) + if last_bundle_id and last_bundle_id != bundle.bundle_id: + logging.warning('Mismatch in bundle IDs: %s vs %s. Keeping the former one.', + bundle.bundle_id, last_bundle_id) + else: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + # If bundles_per_doc is set and we have read the specified number of bundles, + # we should end the current document and return. 
+ # However, if the reader supports reading multiple zones, we can never know + # if the current bundle has ended or there will be another tree for this bundle. + # So in case of multizone readers we need to read one extra tree + # and store it in the buffer (and include it into the next document). + if self.bundles_per_doc and self.bundles_per_doc == bundle.number \ + and not self.is_multizone_reader(): + return + + # Running garbage collector now takes about 0.36s for a 720k-words (68MiB) conllu file + # but it makes further processing (where new objects are created) much faster, + # e.g. 0.85s when creating 65k new nodes. + # If garbage collection was already disabled (e.g. in udapy), everything is even faster + # (but no memory with cyclic references is ever freed before the process exits) + # and in that case we don't want to enable gc here. + finally: + if gc_was_enabled: + gc.enable() + gc.collect() + + def read_documents(self): + """Load all documents of this reader and return them as a list.""" + # udapi.core.document imports udapi.block.read.conllu because of doc.load_conllu(filename) + # and udapi.block.read.conllu loads this module (udapi.core.basereader), + # so we cannot load udapi.core.document at the beginning of this module. 
+ from udapi.core.document import Document + docs = [] + while not self.finished: + doc = Document() + self.apply_on_document(doc) + docs.append(doc) + return docs diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 36edd1aa..071ec124 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,23 +1,40 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os +from pathlib import Path +import udapi.core.coref from udapi.core.block import Block from udapi.core.files import Files class BaseWriter(Block): - """Base class for all reader blocks.""" + """Base class for all writer blocks.""" - def __init__(self, files='-', docname_as_file=False, encoding='utf-8', newline='\n', **kwargs): + def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files - self.files = Files(filenames=files) + self.orig_stdout = sys.stdout + if filehandle is not None: + files = None + self.orig_files = '' + self.files = Files(filenames=files, filehandle=filehandle) self.encoding = encoding self.newline = newline self.docname_as_file = docname_as_file if docname_as_file and files != '-': raise ValueError("docname_as_file=1 is not compatible with files=" + files) + self.overwrite = overwrite + if overwrite and files != '-': + raise ValueError("overwrite=1 is not compatible with files=" + files) + if overwrite and docname_as_file: + raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep + self.path = path @property def filename(self): @@ -34,29 +51,62 @@ def next_filename(self): return self.files.next_filename() def before_process_document(self, document): + if document: + udapi.core.coref.store_coref_to_misc(document) + if self.orig_files == 
'': + logging.info('Writing to filehandle.') + sys.stdout = self.files.filehandle + return + old_filehandle = sys.stdout if self.orig_files == '-': if self.docname_as_file: docname = document.meta.get('docname', None) if docname is not None: logging.info('Writing to file %s.', docname) - sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) + sys.stdout = self._open(docname) else: logging.warning('docname_as_file=1 but the document contains no docname') + elif self.overwrite or self.path: + docname = document.meta.get('loaded_from', None) + if docname is not None: + if self.path: + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) + logging.info('Writing to file %s.', docname) + sys.stdout = self._open(docname) + else: + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: - sys.stdout = sys.__stdout__ - return - - old_filehandle = sys.stdout - if old_filehandle.fileno != sys.stdout.fileno: + sys.stdout = self.orig_stdout + else: + filename = self.next_filename() + if filename is None: + raise RuntimeError('There are more documents to save than filenames given (%s)' + % self.orig_files) + elif filename == '-': + logging.info('Writing to stdout.') + sys.stdout = self.orig_stdout + else: + logging.info('Writing to file %s.', filename) + sys.stdout = self._open(filename) + if old_filehandle not in (sys.stdout, self.orig_stdout): old_filehandle.close() - filename = self.next_filename() - if filename is None: - raise RuntimeError('There are more documents to save than filenames given (%s)' - % self.orig_files) - elif filename == '-': - logging.info('Writing to 
stdout.') - sys.stdout = sys.__stdout__ - else: - logging.info('Writing to file %s.', filename) - sys.stdout = open(filename, 'wt', encoding=self.encoding, newline=self.newline) + def _open(self, filename): + Path(filename).parent.mkdir(parents=True, exist_ok=True) + return open(filename, 'wt', encoding=self.encoding, newline=self.newline) + + def after_process_document(self, document): + sys.stdout.flush() + if sys.stdout != self.orig_stdout: + sys.stdout.close() + sys.stdout = self.orig_stdout diff --git a/udapi/core/block.py b/udapi/core/block.py index 453b1d65..d293df61 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,12 +1,37 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect +def not_overridden(method): + method.is_not_overridden = True + return method class Block(object): - """The smallest processing unit for processing Universal Dependencies data.""" + """The smallest processing unit for processing Universal Dependencies data. - def __init__(self, zones='all'): + Parameters: + zones: which zone to process (default="all") + if_empty_tree: what to do when encountering a tree with no nodes. + Possible values are: process (default), skip, skip_warn, fail, delete. + """ + + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones + self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -16,27 +41,98 @@ def process_end(self): """A hook method that is executed after processing all UD data""" pass + @not_overridden def process_node(self, _): """Process a UD node""" - raise Exception("No processing activity defined in block " + str(self)) + pass + @not_overridden + def process_empty_node(self, _): + """Process an empty node (in enhanced dependencies)""" + pass + + @not_overridden def process_tree(self, tree): """Process a UD tree""" + # tree.descendants is slightly slower than tree._descendants (0.05s per iterating over 700k words), + # but it seems safer to iterate over a copy of the list of nodes. + # If a user calls parent.create_child().shift_before_node(parent) in process_node, + # it may end up in endless cycle (because the same node is processed again - Python for cycle remembers the position). for node in tree.descendants: self.process_node(node) + @not_overridden def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: if self._should_process_tree(tree): self.process_tree(tree) + def run(self, document): + self.process_start() + self.apply_on_document(document) + self.process_end() + + def apply_on_document(self, document): + self.before_process_document(document) + self.process_document(document) + self.after_process_document(document) + def process_document(self, document): """Process a UD document""" - for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) - self.process_bundle(bundle) + # Calling document.coref_entities is expensive because + # it needs to deserialize coref_entities from the MISC attributes. + # If no block in a scenario needs to process coreference entities/mentions, + # the deserialization does not need to be done. 
+ # So we need to detect if any of the methods process_coref_entity and process_coref_mention + # has been overriden (without calling them, which could have adverse side effects). + # Let's use method annotations for this. + p_entity = not hasattr(self.process_coref_entity, 'is_not_overridden') + p_mention = not hasattr(self.process_coref_mention, 'is_not_overridden') + p_bundle = not hasattr(self.process_bundle, 'is_not_overridden') + p_tree = not hasattr(self.process_tree, 'is_not_overridden') + p_node = not hasattr(self.process_node, 'is_not_overridden') + p_empty_node = not hasattr(self.process_empty_node, 'is_not_overridden') + if not any((p_entity, p_mention, p_bundle, p_tree, p_node, p_empty_node)): + raise Exception("No processing activity defined in block " + self.block_name()) + + if p_entity or p_mention: + for entity in document.coref_entities: + if p_entity: + self.process_coref_entity(entity) + else: + for mention in entity.mentions: + self.process_coref_mention(mention) + + if p_bundle or p_tree or p_node or p_empty_node: + for bundle_no, bundle in enumerate(document.bundles, 1): + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') + if p_bundle: + self.process_bundle(bundle) + else: + for tree in bundle: + if self._should_process_tree(tree): + if p_tree: + self.process_tree(tree) + else: + if p_node: + for node in tree.descendants: + self.process_node(node) + if p_empty_node: + for empty_node in tree.empty_nodes: + self.process_empty_node(empty_node) + + @not_overridden + def process_coref_entity(self, entity): + """This method is called on each coreference entity in the document.""" + for mention in entity.mentions: + self.process_coref_mention(mention) + + @not_overridden + def process_coref_mention(self, mention): + """This method is called on each coreference mention in the document.""" + pass def before_process_document(self, document): """This method is called before each 
process_document.""" @@ -47,6 +143,20 @@ def after_process_document(self, document): pass def _should_process_tree(self, tree): + if self.if_empty_tree != 'process' and not tree.descendants: + if self.if_empty_tree == 'skip': + return False + elif self.if_empty_tree == 'delete': + tree.remove() + return False + elif self.if_empty_tree == 'skip_warn': + logging.warning("Tree %s is empty", tree) + return False + elif self.if_empty_tree == 'fail': + raise Exception("Tree %s is empty" % tree) + else: + raise ValueError("Unknown value for if_empty_tree: " + + self.if_empty_tree) if self.zones == 'all': return True if self.zones == '' and tree.zone == '': diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index ffffa565..0a637f01 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -3,6 +3,7 @@ import re from udapi.core.root import Root +from udapi.block.write.textmodetrees import TextModeTrees VALID_ZONE_REGEX = re.compile("^[a-z-]*(_[A-Za-z0-9-]+)?$") @@ -31,17 +32,21 @@ def bundle_id(self): @bundle_id.setter def bundle_id(self, bundle_id): self._bundle_id = bundle_id - for tree in self.trees: - tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access + if len(self.trees) == 1 and self.trees[0].zone == '': + self.trees[0]._sent_id = bundle_id + else: + for tree in self.trees: + tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access def __str__(self): - if self.bundle_id is None: + if self._bundle_id is None: return 'bundle without id' - return "bundle id='%s'" % self.bundle_id + return f"bundle id='{self._bundle_id}'" def __iter__(self): return iter(self.trees) + @property def document(self): """Returns the document in which the bundle is contained.""" return self._document @@ -67,7 +72,7 @@ def has_tree(self, zone=''): def create_tree(self, zone=None): """Return the root of a newly added tree with a given zone.""" root = Root() - root.zone = zone + root._zone = zone self.add_tree(root) return root @@ 
-84,16 +89,39 @@ def check_zone(self, new_zone): def add_tree(self, root): """Add an existing tree to the bundle.""" if root.zone is None: - root.zone = '' + root._zone = '' self.check_zone(root.zone) + if self._bundle_id: + root._sent_id = self._bundle_id + if root.zone: + root._sent_id += '/' + root.zone root.bundle = self self.trees.append(root) + doc_json = root.json.get('__doc__') + if doc_json: + self._document.json.update(doc_json) + del root.json['__doc__'] return root def remove(self): """Remove a bundle from the document.""" self._document.bundles = [bundle for bundle in self._document.bundles if bundle != self] + for i, bundle in enumerate(self._document.bundles): + bundle.number = i def address(self): """Return bundle_id or '?' if missing.""" - return self.bundle_id if self.bundle_id is not None else '?' + return self._bundle_id if self._bundle_id is not None else '?' + + def draw(self, **kwargs): + """Pretty print the trees using TextModeTrees.""" + TextModeTrees(**kwargs).process_bundle(self) + + @property + def nodes(self): + """An iterator over all nodes (excluding empty nodes) in all trees in this bundle.""" + for tree in self: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). + for node in tree.descendants: + yield node diff --git a/udapi/core/coref.py b/udapi/core/coref.py new file mode 100644 index 00000000..aa27e6a7 --- /dev/null +++ b/udapi/core/coref.py @@ -0,0 +1,1086 @@ +"""Classes for handling coreference. + +# CorefUD 1.0 format implementation details + +## Rules for ordering "chunks" within `node.misc['Entity']` +Entity mentions are annotated using "chunks" stored in `misc['Entity']`. +Chunks are of three types: +1. opening bracket, e.g. `(e1-person` +2. closing bracket, e.g. `e1-person)` +3. single-word span (both opening and closing), e.g. 
`(e1-person)` + +The `Entity` MISC attribute contains a sequence of chunks +without any separators, e.g. `Entity=(e1-person(e2-place)` +means opening `e1` mention and single-word `e2` mention +starting on a given node. + +### Crossing mentions +Two mentions are crossing iff their spans have non-empty intersection, +but neither is a subset of the other, e.g. `e1` spanning nodes 1-3 +and `e2` spanning 2-4 would be represented as: +``` +1 ... Entity=(e1 +2 ... Entity=(e2 +3 ... Entity=e1) +4 ... Entity=e2) +``` +This may be an annotation error and we may forbid such cases in future annotation guidelines, +but in CorefUD 0.2, there are thousands of such cases (see https://github.com/ufal/corefUD/issues/23). + +It can even happen that one entity ends and another starts at the same node: `Entity=e1)(e2` +For this reason, we need + +**Rule1**: closing brackets MUST always precede opening brackets. +Otherwise, we would get `Entity=(e2e1)`, which could not be parsed. + +Note that we cannot have same-entity crossing mentions in the CorefUD 1.0 format, +so e.g. if we substitute `e2` with `e1` in the example above, we'll get +`(e1`, `e1)`, `(e1`, `e1)`, which will be interpreted as two non-overlapping mentions of the same entity. + +### Nested mentions +One mention (span) can be often embedded within another mention (span). +It can happen that both these mentions correspond to the same entity (i.e. are in the same cluster), +for example, "` sold the world>`". +It can even happen that both mentions start at the same node, e.g. "`< w3>`" (TODO: find nice real-world examples). +In such cases, we need to make sure the brackets are well-nested: + +**Rule2**: when opening multiple brackets at the same node, longer mentions MUST be opened first. + +This is important because +- The closing bracket has the same form for both mentions of the same entity - it includes just the entity ID (`eid`). +- The opening-bracket annotation contains other mention attributes, e.g. head index. 
+- The two mentions may differ in these attributes, e.g. the "``" mention's head may be w3. +- When breaking Rule2, we would get +``` +1 w1 ... Entity=(e1-person-1(e1-person-3 +2 w2 ... Entity=e1) +3 w3 ... Entity=e1) +``` +which would be interpreted as if the head of the "``" mention is its third word, which is invalid. + +### Other rules + +**Rule3**: when closing multiple brackets at the same node, shorter mentions SHOULD be closed first. +See Rule4 for a single exception from this rule regarding crossing mentions. +I'm not aware of any problems when breaking this rule, but it seems intuitive +(to make the annotation well-nested if possible) and we want to define some canonical ordering anyway. +The API should be able to load even files breaking Rule3. + +**Rule4**: single-word chunks SHOULD follow all opening brackets and precede all closing brackets if possible. +When considering single-word chunks as a subtype of both opening and closing brackets, +this rule follows from the well-nestedness (and Rule2). +So we should have `Entity=(e1(e2)` and `Entity=(e3)e1)`, +but the API should be able to load even `Entity=(e2)(e1` and `Entity=e1)(e3)`. + +In case of crossing mentions (annotated following Rule1), we cannot follow Rule4. +If we want to add a single-word mention `e2` to a node with `Entity=e1)(e3`, +it seems intuitive to prefer Rule2 over Rule3, which results in `Entity=e1)(e3(e2)`. +So the canonical ordering will be achieved by placing single-word chunks after all opening brackets. +The API should be able to load even `Entity=(e2)e1)(e3` and `Entity=e1)(e2)(e3`. + +**Rule5**: ordering of same-span single-word mentions +TODO: I am not sure here. We may want to forbid such cases or define canonical ordering even for them. +E.g. `Entity=(e1)(e2)` vs. `Entity=(e2)(e1)`. + +**Rule6**: ordering of same-start same-end multiword mentions +TODO: I am not sure here. 
+These can be either same-span multiword mentions (which may be forbidden) +or something like +``` +1 w1 ... Entity=(e1(e2[1/2]) +2 w2 ... +3 w3 ... Entity=(e2[2/2])e1) +``` +where both `e1` and `e2` start at w1 and end at w3, but `e2` is discontinuous and does not contain w2. +If we interpret "shorter" and "longer" in Rule2 and Rule3 as `len(mention.words)` +(and not as `mention.words[-1].ord - mention.words[0].ord`), +we get the canonical ordering as in the example above. + +""" +import re +import functools +import collections +import collections.abc +import copy +import logging +import bisect + +@functools.total_ordering +class CorefMention(object): + """Class for representing a mention (instance of an entity).""" + __slots__ = ['_head', '_entity', '_bridging', '_words', '_other'] + + def __init__(self, words, head=None, entity=None, add_word_backlinks=True): + if not words: + raise ValueError("mention.words must be non-empty") + self._head = head if head else words[0] + self._entity = entity + if entity is not None: + entity._mentions.append(self) + self._bridging = None + self._other = None + self._words = words + if add_word_backlinks: + for new_word in words: + if not new_word._mentions or not entity or self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions.append(self) + new_word._mentions.sort() + + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + + def __lt__(self, another): + """Does this mention precedes (word-order wise) `another` mention? + + This method defines a total ordering of all mentions + (within one entity or across different entities). 
+ The position is primarily defined by the first word in each mention. + If two mentions start at the same word, + their order is defined by their length (i.e. number of words) + -- the shorter mention follows the longer one. + + In the rare case of two same-length mentions starting at the same word, but having different spans, + their order is defined by the order of the last word in their span. + For example precedes . + + The order of two same-span mentions is currently defined by their eid. + There should be no same-span (or same-subspan) same-entity mentions. + """ + #TODO: no mention.words should be handled already when loading + if not self._words: + self._words = [self._head] + if not another._words: + another._words = [another._head] + + if self._words[0] is another._words[0]: + if len(self._words) > len(another._words): + return True + if len(self._words) < len(another._words): + return False + if self._words[-1].precedes(another._words[-1]): + return True + if another._words[-1].precedes(self._words[-1]): + return False + return self._entity.eid < another._entity.eid + return self._words[0].precedes(another._words[0]) + + @property + def other(self): + if self._other is None: + self._other = OtherDualDict() + return self._other + + @other.setter + def other(self, value): + if self._other is None: + self._other = OtherDualDict(value) + else: + self._other.set_mapping(value) + + @property + def head(self): + return self._head + + @head.setter + def head(self, new_head): + if self._words and new_head not in self._words: + raise ValueError(f"New head {new_head} not in mention words") + self._head = new_head + + @property + def entity(self): + return self._entity + + @entity.setter + def entity(self, new_entity): + if self._entity is not None: + original_entity = self._entity + original_entity._mentions.remove(self) + if not original_entity._mentions: + logging.warning(f"Original entity {original_entity.eid} is now empty.") + self._entity = new_entity + 
bisect.insort(new_entity._mentions, self) + + @property + def bridging(self): + if not self._bridging: + self._bridging = BridgingLinks(self) + return self._bridging + + # TODO add/edit bridging + + @property + def words(self): + # Words in a sentence could have been reordered, so we cannot rely on sorting self._words in the setter. + # The serialization relies on storing the opening bracket in the first word (and closing in the last), + # so we need to make sure the words are always returned sorted. + # TODO: benchmark updating the order of mention._words in node.shift_*() and node.remove(). + self._words.sort() + return self._words + + @words.setter + def words(self, new_words): + if new_words and self.head not in new_words: + raise ValueError(f"Head {self.head} not in new_words {new_words} for {self._entity.eid}") + kept_words = [] + # Make sure each word is included just once and they are in the correct order. + new_words = sorted(list(set(new_words))) + for old_word in self._words: + if old_word in new_words: + kept_words.append(old_word) + else: + old_word._mentions.remove(self) + self._words = new_words + for new_word in new_words: + if new_word not in kept_words: + if not new_word._mentions or self > new_word._mentions[-1]: + new_word._mentions.append(self) + else: + new_word._mentions.append(self) + new_word._mentions.sort() + + @property + def span(self): + return nodes_to_span(self._words) + + @span.setter + def span(self, new_span): + self.words = span_to_nodes(self._head.root, new_span) + + def __str__(self): + """String representation of the CorefMention object: Mention.""" + return f"Mention<{self._entity._eid}: {self._head}>" + + def remove(self): + for word in self._words: + word._mentions.remove(self) + self._entity._mentions.remove(self) + + +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, 
words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + return self.mention < another.mention + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + +CHARS_FORBIDDEN_IN_ID = "-=| \t()" + + +@functools.total_ordering +class CorefEntity(object): + """Class for representing all mentions of a given entity.""" + __slots__ = ['_eid', '_mentions', 'etype', 'split_ante'] + + def __init__(self, eid, etype=None): + self._eid = None # prepare the _eid slot + self.eid = eid # call the setter and check the ID is valid + self._mentions = [] + self.etype = etype + self.split_ante = [] + + def __lt__(self, another): + """Does this CorefEntity precede (word-order wise) `another` entity? + + This method defines a total ordering of all entities + by the first mention of each entity (see `CorefMention.__lt__`). + If one of the entities has no mentions (which should not happen normally), + there is a backup solution (see the source code). + If entity IDs are not important, it is recommended to use block + `corefud.IndexClusters` to re-name entity IDs in accordance with this entity ordering. + """ + if not self._mentions or not another._mentions: + # Entities without mentions should go first, so the ordering is total. + # If both entities are missing mentions, let's use eid, so the ordering is stable. 
+ if not self._mentions and not another._mentions: + return self._eid < another._eid + return not self._mentions + return self._mentions[0] < another._mentions[0] + + @property + def eid(self): + return self._eid + + @eid.setter + def eid(self, new_eid): + if any(x in new_eid for x in CHARS_FORBIDDEN_IN_ID): + raise ValueError(f"{new_eid} contains forbidden characters [{CHARS_FORBIDDEN_IN_ID}]") + self._eid = new_eid + + @property + def eid_or_grp(self): + root = self._mentions[0].head.root + meta = root.document.meta + if 'GRP' in meta['global.Entity'] and meta['_tree2docid']: + docid = meta['_tree2docid'][root] + if self._eid.startswith(docid): + return self._eid.replace(docid, '', 1) + else: + logging.warning(f"GRP in global.Entity, but eid={self._eid} does not start with docid={docid}") + return self._eid + + @property + def mentions(self): + return self._mentions + + def create_mention(self, head=None, words=None, span=None): + """Create a new CoreferenceMention object within this CorefEntity. + + Args: + head: a node where the annotation about this CorefMention will be stored in MISC. + The head is supposed to be the linguistic head of the mention, + i.e. the highest node in the dependency tree, + but if such information is not available (yet), + it can be any node within the `words`. + If no head is specified, the first word from `words` will be used instead. + words: a list of nodes of the mention. + This argument is optional, but if provided, it must contain the head. + The nodes can be both normal nodes or empty nodes. + span: an alternative way how to specify `words` + using a string such as "3-5,6,7.1-7.2". + (which means, there is an empty node 5.1 and normal node 7, + which are not part of the mention). + At most one of the args `words` and `span` can be specified. 
+ """ + if words and span: + raise ValueError("Cannot specify both words and span") + if head and words and head not in words: + raise ValueError(f"Head {head} is not among the specified words") + if head is None and words is None: + raise ValueError("Either head or words must be specified") + if head is None: + head = words[0] + + mention = CorefMention(words=[head], head=head, entity=self) + if words: + mention.words = words + if span: + mention.span = span + self._mentions.sort() + return mention + + # TODO or should we create a BridgingLinks instance with a fake src_mention? + def all_bridging(self): + for m in self._mentions: + if m._bridging: + for b in m._bridging: + yield b + + def __str__(self): + """String representation of the CorefEntity object: Entity.""" + first_mention_head = self._mentions[0].head.form if self._mentions else "" + return f"Entity<{self._eid}: {first_mention_head}>" + + +# BridgingLink +# Especially the relation should be mutable, so we cannot use +# BridgingLink = collections.namedtuple('BridgingLink', 'target relation') +# TODO once dropping support for Python 3.6, we could use +# from dataclasses import dataclass +# @dataclass +# class DataClassCard: +# target: CorefEntity +# relation: str +class BridgingLink: + __slots__ = ['target', 'relation'] + + def __init__(self, target, relation=''): + self.target = target + self.relation = '' if relation is None else relation + + def __lt__(self, another): + if self.target == another.target: + return self.relation < another.relation + return self.target < another.target + + +class BridgingLinks(collections.abc.MutableSequence): + """BridgingLinks class serves as a list of BridgingLink tuples with additional methods. 
+ + Example usage: + >>> bl = BridgingLinks(src_mention) # empty links + >>> bl = BridgingLinks(src_mention, [(c12, 'part'), (c56, 'subset')]) # from a list of tuples + >>> (bl8, bl9) = BridgingLinks.from_string('c12>> for entity, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {entity.eid}") + >>> print(str(bl)) # c12>> bl('part').targets == [c12] + >>> bl('part|subset').targets == [c12, c56] + >>> bl.append((c57, 'funct')) + """ + + @classmethod + def from_string(cls, string, entities, node, strict=True, tree2docid=None): + """Return a sequence of BridgingLink objects representing a given string serialization. + The bridging links are also added to the mentions (`mention.bridging`) in the supplied `entities`, + so the returned sequence can be usually ignored. + If `tree2docid` parameter is provided (mapping trees to document IDs used as prefixes in eid), + the entity IDs in the provided string are interpreted as "GRP", i.e. as document-wide IDs, + which need to be prefixed by the document IDs, to get corpus-wide unique "eid". 
+ """ + src_str2bl = {} + for link_str in string.split(','): + try: + trg_str, src_str = link_str.split('<') + except ValueError as err: + _error(f"invalid Bridge {link_str} {err} at {node}", strict) + continue + relation = '' + if ':' in src_str: + src_str, relation = src_str.split(':', 1) + if trg_str == src_str: + _error(f"Bridge cannot self-reference the same entity {trg_str} at {node}", strict) + if tree2docid: + src_str = tree2docid[node.root] + src_str + trg_str = tree2docid[node.root] + trg_str + bl = src_str2bl.get(src_str) + if not bl: + bl = entities[src_str].mentions[-1].bridging + src_str2bl[src_str] = bl + if trg_str not in entities: + entities[trg_str] = CorefEntity(trg_str) + bl._data.append(BridgingLink(entities[trg_str], relation)) + return src_str2bl.values() + + def __init__(self, src_mention, value=None, strict=True): + self.src_mention = src_mention + self._data = [] + self.strict = strict + if value is not None: + if isinstance(value, collections.abc.Sequence): + for v in value: + if v[0] is src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + v[0].eid, strict) + self._data.append(BridgingLink(v[0], v[1])) + else: + raise ValueError(f"Unknown value type: {type(value)}") + self.src_mention._bridging = self + super().__init__() + + def __getitem__(self, key): + return self._data[key] + + def __len__(self): + return len(self._data) + + # TODO delete backlinks of old links, dtto for SplitAnte + def __setitem__(self, key, new_value): + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) + self._data[key] = BridgingLink(new_value[0], new_value[1]) + + def __delitem__(self, key): + del self._data[key] + + def insert(self, key, new_value): + if new_value[0] is self.src_mention._entity: + _error("Bridging cannot self-reference the same entity: " + new_value[0].eid, self.strict) + self._data.insert(key, BridgingLink(new_value[0], 
new_value[1])) + + def __str__(self): + # TODO in future link.relation should never be None, 0 nor "_", so we could delete the below. + return ','.join(f'{l.target.eid_or_grp}<{self.src_mention.entity.eid_or_grp}{":" + l.relation if l.relation not in (None, "_", "") else ""}' for l in sorted(self._data)) + + def __call__(self, relations_re=None): + """Return a subset of links contained in this list as specified by the args. + Args: + relations: only links with a relation matching this regular expression will be returned + """ + if relations_re is None: + return self + return BridgingLinks(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + + @property + def targets(self): + """Return a list of the target entities (without relations).""" + return [link.target for link in self._data] + + def _delete_targets_without_mentions(self, warn=True): + for link in self._data: + if not link.target.mentions: + if warn: + logging.warning(f"Entity {link.target.eid} has no mentions, but is referred to in bridging of {self.src_mention.entity.eid}") + self._data.remove(link) + + +def _error(msg, strict): + if strict: + raise ValueError(msg) + logging.error(msg) + + +RE_DISCONTINUOUS = re.compile(r'^([^[]+)\[(\d+)/(\d+)\]') +# When converting doc-level GRP IDs to corpus-level eid IDs, +# we need to assign each document a short ID/number (document names are too long). +# These document numbers must be unique even when loading multiple files, +# so we need to store the highest number generated so far here, at the Python module level. 
+highest_doc_n = 0 + +def load_coref_from_misc(doc, strict=True): + global highest_doc_n + entities = {} + unfinished_mentions = collections.defaultdict(list) + discontinuous_mentions = collections.defaultdict(list) + global_entity = doc.meta.get('global.Entity') + was_global_entity = True + if not global_entity: + was_global_entity = False + global_entity = 'eid-etype-head-other' + doc.meta['global.Entity'] = global_entity + tree2docid = None + if 'GRP' in global_entity: + tree2docid, docid = {}, "" + for bundle in doc: + for tree in bundle: + if tree.newdoc or docid == "": + highest_doc_n += 1 + docid = f"d{highest_doc_n}." + tree2docid[tree] = docid + doc.meta['_tree2docid'] = tree2docid + elif 'eid' not in global_entity: + raise ValueError("No eid in global.Entity = " + global_entity) + fields = global_entity.split('-') + + for node in doc.nodes_and_empty: + misc_entity = node.misc["Entity"] + if not misc_entity: + continue + + if not was_global_entity: + raise ValueError(f"No global.Entity header found, but Entity= annotations are presents") + + # The Entity attribute may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # chunks = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for chunk in chunks: + opening, closing = (chunk[0] == '(', chunk[-1] == ')') + chunk = chunk.strip('()') + # 1. invalid + if not opening and not closing: + logging.warning(f"Entity {chunk} at {node} has no opening nor closing bracket.") + # 2. 
closing bracket + elif not opening and closing: + # closing brackets should include just the ID, but GRP needs to be converted to eid + if tree2docid: + # TODO delete this legacy hack once we don't need to load UD GUM v2.8 anymore + if '-' in chunk: + if not strict and global_entity.startswith('entity-GRP'): + chunk = chunk.split('-')[1] + else: + _error("Unexpected closing eid " + chunk, strict) + chunk = tree2docid[node.root] + chunk + + # closing discontinuous mentions + eid, subspan_idx = chunk, None + if chunk not in unfinished_mentions: + m = RE_DISCONTINUOUS.match(chunk) + if not m: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + try: + mention, head_idx = unfinished_mentions[eid].pop() + except IndexError as err: + raise ValueError(f"Mention {chunk} closed at {node}, but not opened.") + last_word = mention.words[-1] + if node.root is not last_word.root: + # TODO cross-sentence mentions + if strict: + raise ValueError(f"Cross-sentence mentions not supported yet: {chunk} at {node}") + else: + logging.warning(f"Cross-sentence mentions not supported yet: {chunk} at {node}. 
Deleting.") + entity = mention.entity + mention.words = [] + entity._mentions.remove(mention) + if not entity._mentions: + del entities[entity.eid] + for w in node.root.descendants_and_empty: + if last_word.precedes(w): + mention._words.append(w) + w._mentions.append(mention) + if w is node: + break + if head_idx and (subspan_idx is None or subspan_idx == total_subspans): + try: + mention.head = mention.words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " + f"closed at {node} with words={mention.words}", strict) + if not strict and head_idx > len(mention.words): + mention.head = mention.words[-1] + if subspan_idx and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"Closing mention {mention.entity.eid} at {node}, but it has unfinished nested mentions ({m.words})", 1) + + # 3. opening or single-word + else: + eid, etype, head_idx, other = None, None, None, OtherDualDict() + for name, value in zip(fields, chunk.split('-')): + if name == 'eid': + eid = value + elif name == 'GRP': + eid = tree2docid[node.root] + value + elif name == 'etype' or name == 'entity': # entity is an old name for etype used in UD GUM 2.8 and 2.9 + etype = value + elif name == 'head': + try: + head_idx = int(value) + except ValueError as err: + _error(f"Non-integer {value} as head index in {chunk} in {node}: {err}", strict) + head_idx = 1 + elif name == 'other': + if other: + new_other = OtherDualDict(value) + for k,v in other.values(): + new_other[k] = v + other = new_other + else: + other = OtherDualDict(value) + else: + other[name] = value + if eid is None: + raise ValueError("No eid in " + chunk) + subspan_idx, total_subspans = None, '0' + if eid[-1] == ']': + m = RE_DISCONTINUOUS.match(eid) + if not m: + _error(f"eid={eid} ending with ], but not valid discontinuous mention ID ", strict) + else: + eid, subspan_idx, total_subspans = m.group(1, 2, 3) + + entity = 
entities.get(eid) + if entity is None: + if subspan_idx and subspan_idx != '1': + _error(f'Non-first subspan of a discontinuous mention {eid} at {node} does not have any previous mention.', 1) + entity = CorefEntity(eid) + entities[eid] = entity + entity.etype = etype + elif etype and entity.etype and entity.etype != etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype + # CorefEntity could be created first with "Bridge=" without any type + elif etype and entity.etype is None: + entity.etype = etype + + if subspan_idx and subspan_idx != '1': + opened = [pair[0] for pair in unfinished_mentions[eid]] + mention = next(m for m in discontinuous_mentions[eid] if m not in opened) + mention._words.append(node) + if closing and subspan_idx == total_subspans: + m = discontinuous_mentions[eid].pop() + if m is not mention: + _error(f"{node}: closing mention {mention.entity.eid} ({mention.words}), but it has an unfinished nested mention ({m.words})", 1) + try: + mention.head = mention._words[head_idx - 1] + except IndexError as err: + _error(f"Invalid head_idx={head_idx} for {mention.entity.eid} " + f"closed at {node} with words={mention._words}", 1) + else: + mention = CorefMention(words=[node], entity=entity, add_word_backlinks=False) + if other: + mention._other = other + if subspan_idx: + discontinuous_mentions[eid].append(mention) + node._mentions.append(mention) + + if not closing: + unfinished_mentions[eid].append((mention, head_idx)) + + + # Bridge, e.g. 
Entity=(e12-event|Bridge=e12 (e10) + # (e1(e2 --> (e1(e2(e10) + # e3)(e1(e2 --> e3)(e1(e2(e10) + if not orig_entity or orig_entity[-1] != ')': + firstword.misc['Entity'] += mention_str + ')' + # e4)e3) --> (e10)e4)e3) + elif '(' not in orig_entity: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # (e9)e4)e3) --> (e10)(e9)e4)e3) + elif any(c and c[0] == '(' and c[-1] != ')' for c in re.split(r'(\([^()]+\)?|[^()]+\))', orig_entity)): + firstword.misc['Entity'] += mention_str + ')' + # (e1(e2(e9) --> (e1(e2(e9)(e10) + # e3)(e1(e2(e9)--> e3)(e1(e2(e9)(e10) + else: + firstword.misc['Entity'] = mention_str + ')' + orig_entity + # Second, multi-word mentions. Opening brackets should follow closing brackets. + else: + firstword.misc['Entity'] += mention_str + eid = entity.eid + if tree2docid and 'GRP' in fields: + eid = re.sub(r'^d\d+\.', '', eid) + mention.words[-1].misc['Entity'] = eid + ')' + mention.words[-1].misc['Entity'] + + # Bridge=e1 lo else f"{lo}") + return ','.join(ranges) + + +# TODO fix code duplication with udapi.core.dualdict after making sure benchmarks are not slower +class OtherDualDict(collections.abc.MutableMapping): + """OtherDualDict class serves as dict with lazily synchronized string representation. + + >>> ddict = OtherDualDict('anacata:anaphoric,antetype:entity,nptype:np') + >>> ddict['mention'] = 'np' + >>> str(ddict) + 'anacata:anaphoric,antetype:entity,mention:np,nptype:np' + >>> ddict['NonExistent'] + '' + + This class provides access to both + * a structured (dict-based, deserialized) representation, + e.g. {'anacata': 'anaphoric', 'antetype': 'entity'}, and + * a string (serialized) representation of the mapping, e.g. `anacata:anaphoric,antetype:entity`. + There is a clever mechanism that makes sure that users can read and write + both of the representations which are always kept synchronized. + Moreover, the synchronization is lazy, so the serialization and deserialization + is done only when needed. 
This speeds up scenarios where access to dict is not needed. + + A value can be deleted with any of the following three ways: + >>> del ddict['nptype'] + >>> ddict['nptype'] = None + >>> ddict['nptype'] = '' + and it works even if the value was already missing. + """ + __slots__ = ['_string', '_dict'] + + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) + + def __str__(self): + if self._string is None: + serialized = [] + for name, value in sorted(self._dict.items(), key=lambda s: s[0].lower()): + if value is True: + serialized.append(name) + else: + serialized.append(f"{name}:{value}") + self._string = ','.join(serialized) if serialized else '' + return self._string + + def _deserialize_if_empty(self): + if not self._dict and self._string is not None and self._string != '': + for raw_feature in self._string.split(','): + namevalue = raw_feature.split(':', 1) + if len(namevalue) == 2: + name, value = namevalue + else: + name, value = namevalue[0], True + self._dict[name] = value + + def __getitem__(self, key): + self._deserialize_if_empty() + return self._dict.get(key, '') + + def __setitem__(self, key, value): + self._deserialize_if_empty() + self._string = None + if value is None or value == '': + self.__delitem__(key) + else: + value = value.replace(',', '%2C') # TODO report a warning? Escape also '|' and '-'? 
+ self._dict[key] = value + + def __delitem__(self, key): + self._deserialize_if_empty() + try: + del self._dict[key] + self._string = None + except KeyError: + pass + + def __iter__(self): + self._deserialize_if_empty() + return self._dict.__iter__() + + def __len__(self): + self._deserialize_if_empty() + return len(self._dict) + + def __contains__(self, key): + self._deserialize_if_empty() + return self._dict.__contains__(key) + + def clear(self): + self._string = '_' + self._dict.clear() + + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + + def set_mapping(self, value): + """Set the mapping from a dict or string. + + If the `value` is None, it is converted to storing an empty string. + If the `value` is a string, it is stored as is. + If the `value` is a dict (or any instance of `collections.abc.Mapping`), + its copy is stored. + Other types of `value` raise an `ValueError` exception. + """ + if value is None: + self.clear() + elif isinstance(value, str): + self._dict.clear() + self._string = value + elif isinstance(value, collections.abc.Mapping): + self._string = None + self._dict = dict(value) + else: + raise ValueError("Unsupported value type " + str(value)) diff --git a/udapi/core/document.py b/udapi/core/document.py index b64ee29c..5f2bdf0b 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -1,22 +1,60 @@ """Document class is a container for UD trees.""" +import io +import contextlib +import logging +import udapi.core.coref from udapi.core.bundle import Bundle - from udapi.block.read.conllu import Conllu as ConlluReader from udapi.block.write.conllu import Conllu as ConlluWriter - +from udapi.block.read.sentences import Sentences as SentencesReader +from udapi.block.write.textmodetrees import TextModeTrees class Document(object): """Document is a container for Universal Dependency trees.""" - def __init__(self): + def __init__(self, filename=None, **kwargs): + """Create a new Udapi document. 
+ + Args: + filename: load the specified file. + Only `*.conlu` (using `udapi.block.read.conllu`) + and `*.txt` (using `udapi.block.read.sentences`) filenames are supported. + No pre-processing is applied, so when loading the document from a *.txt file, + `Document("a.txt").nodes` will be empty and you need to run tokenization first. + You can pass additional parameters for `udapi.block.read.sentences` + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). + """ self.bundles = [] self._highest_bundle_id = 0 self.meta = {} + self.json = {} + self._eid_to_entity = None + if filename is not None: + if filename.endswith(".conllu"): + self.load_conllu(filename, **kwargs) + elif filename.endswith(".txt"): + reader = SentencesReader(files=[filename], **kwargs) + reader.apply_on_document(self) + else: + raise ValueError("Only *.conllu and *.txt are supported. Provided: " + filename) def __iter__(self): return iter(self.bundles) + def __getitem__(self, key): + return self.bundles[key] + + def __len__(self): + return len(self.bundles) + + def __str__(self): + """Pretty print the whole document using write.TextModeTrees.""" + fh = io.StringIO() + with contextlib.redirect_stdout(fh): + TextModeTrees(color=True).run(self) + return fh.getvalue() + def create_bundle(self): """Create a new bundle and add it at the end of the document.""" self._highest_bundle_id += 1 @@ -25,12 +63,107 @@ def create_bundle(self): bundle.number = len(self.bundles) return bundle - def load_conllu(self, filename): + def load_conllu(self, filename=None, **kwargs): """Load a document from a conllu-formatted file.""" - reader = ConlluReader(files=filename) - reader.process_document(self) + ConlluReader(files=[filename], **kwargs).process_document(self) def store_conllu(self, filename): """Store a document into a conllu-formatted file.""" - writer = ConlluWriter(files=filename) - writer.process_document(self) + ConlluWriter(files=[filename]).apply_on_document(self) + + def from_conllu_string(self, 
string): + """Load a document from a conllu-formatted string.""" + reader = ConlluReader(filehandle=io.StringIO(string)) + reader.apply_on_document(self) + + def to_conllu_string(self): + """Return the document as a conllu-formatted string.""" + fh = io.StringIO() + with contextlib.redirect_stdout(fh): + ConlluWriter().apply_on_document(self) + return fh.getvalue() + + @property + def trees(self): + """An iterator over all trees in the document.""" + for bundle in self: + for tree in bundle: + yield tree + + @property + def nodes(self): + """An iterator over all nodes (excluding empty nodes) in the document.""" + for bundle in self: + for tree in bundle: + # tree.descendants is slightly slower than tree._descendants, + # but it seems safer, see the comment in udapi.core.block.Block.process_tree(). + for node in tree.descendants: + yield node + + @property + def nodes_and_empty(self): + """An iterator over all nodes and empty nodes in the document.""" + for bundle in self: + for tree in bundle: + for node in tree.descendants_and_empty: + yield node + + def draw(self, **kwargs): + """Pretty print the trees using TextModeTrees.""" + TextModeTrees(**kwargs).run(self) + + def _load_coref(self): + """De-serialize coreference-related objects (CorefMention, CorefEntity). + + This internal method will be called automatically whenever any coref-related method is called. + It iterates through all nodes in the document and creates the objects based on the info in MISC + (stored in attributes Entity, SplitAnte, Bridge). + """ + if self._eid_to_entity is None: + udapi.core.coref.load_coref_from_misc(self) + + @property + def eid_to_entity(self): + """A dict mapping each eid (entity ID) to a CorefEntity object.""" + self._load_coref() + return self._eid_to_entity + + @property + def coref_clusters(self): + """DEPRECATED: A dict mapping eid to a CorefEntity object. + + Substitute `doc.coref_clusters.values()` and `list(doc.coref_clusters.values())` + with `doc.coref_entities`. 
+ Otherwise, substitute `doc.coref_clusters` with `doc.eid_to_entity`. + """ + logging.warning("coref_clusters is deprecated, use coref_entities or eid_to_entity instead.") + return self.eid_to_entity + + @property + def coref_entities(self): + """A list of all CorefEntity objects in the document.""" + self._load_coref() + return list(self._eid_to_entity.values()) + + @property + def coref_mentions(self): + """A sorted list of all CorefMention objects in the document.""" + self._load_coref() + all_mentions = [] + for entity in self._eid_to_entity.values(): + all_mentions.extend(entity.mentions) + all_mentions.sort() + return all_mentions + + def create_coref_entity(self, eid=None, etype=None): + self._load_coref() + if not eid: + counter = 1 + while self._eid_to_entity.get(f'e{counter}'): + counter += 1 + eid = f'e{counter}' + elif self._eid_to_entity.get(eid): + raise ValueError("Entity with eid=%s already exists", eid) + entity = udapi.core.coref.CorefEntity(eid, etype) + self._eid_to_entity[eid] = entity + return entity diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a79c0610..ba0129ed 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -45,7 +45,7 @@ def __str__(self): if value is True: serialized.append(name) else: - serialized.append('%s=%s' % (name, value)) + serialized.append(f"{name}={value}") self._string = '|'.join(serialized) if serialized else '_' return self._string diff --git a/udapi/core/files.py b/udapi/core/files.py index 29ad60e9..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -1,11 +1,13 @@ """Files is a helper class for iterating over filenames.""" import glob +import io import sys import os.path import bz2 import gzip import lzma +import itertools class Files(object): @@ -29,16 +31,23 @@ class Files(object): >>> filehandle = files.next_filehandle() """ - def __init__(self, filenames, encoding='utf-8'): - if isinstance(filenames, list): + def __init__(self, filenames=None, 
filehandle=None, encoding='utf-8'): + self.filehandle = None + self.file_number = 0 + self.encoding = encoding + if filehandle is not None: + self.filehandle = filehandle + if filenames is not None: + raise ValueError('Cannot specify both "filenames" and "filehandle"') + self.filenames = [''] + elif isinstance(filenames, list): self.filenames = filenames elif isinstance(filenames, str): + if filenames == '': + raise ValueError('Filenames (files=) cannot be an empty string') self.filenames = self.string_to_filenames(filenames) else: raise ValueError('Parameter "filenames" must be a list or str') - self.filehandle = None - self.encoding = encoding - self.file_number = 0 def string_to_filenames(self, string): """Parse a pattern string (e.g. '!dir??/file*.txt') and return a list of matching filenames. @@ -49,21 +58,14 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. 
- if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames - return [self._token_to_filenames(tok) for tok in string.replace(',', ' ').split()] + return list(itertools.chain.from_iterable(self._token_to_filenames(tok) + for tok in string.replace(',', ' ').split())) @staticmethod def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': @@ -74,7 +76,7 @@ def _token_to_filenames(token): if directory != '.': filenames = [f if f[0] != '/' else directory + '/' + f for f in filenames] else: - filenames = token + filenames = [token] return filenames @property @@ -104,7 +106,9 @@ def next_filehandle(self): if filename is None: fhandle = None elif filename == '-': - fhandle = sys.stdin + fhandle = io.TextIOWrapper(sys.stdin.buffer, encoding=self.encoding) + elif filename == '': + fhandle = self.filehandle else: filename_extension = filename.split('.')[-1] if filename_extension == 'gz': diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 289adcdb..00ba935c 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -1,46 +1,160 @@ """MWT class represents a multi-word token.""" from udapi.core.dualdict import DualDict - +from udapi.core.feats import Feats class MWT(object): """Class for representing multi-word tokens in UD trees.""" - __slots__ = ['words', 'form', '_misc', 'root'] + __slots__ = ['words', 'form', '_feats', '_misc', 'root'] - def __init__(self, words=None, form=None, misc=None, root=None): + def __init__(self, words=None, form=None, feats=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form - self._misc = DualDict(misc) + self._feats = Feats(feats) if feats and feats != '_' else None + self._misc = 
DualDict(misc) if misc and misc != '_' else None self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 + @property + def feats(self): + """Property `feats` in MWT should be used only for `Typo=Yes`. + + See https://universaldependencies.org/changes.html#typos-in-multiword-tokens + However, Udapi does not enforce this restriction and mwt.feats works exactly the same as node.feats. + """ + if self._feats is None: + self._feats = Feats() + return self._feats + + @feats.setter + def feats(self, value): + if self._feats is None: + self._feats = Feats(value) + else: + self._feats.set_mapping(value) + @property def misc(self): """Property for MISC attributes stored as a `DualDict` object. See `udapi.core.node.Node` for details. """ + if self._misc is None: + self._misc = DualDict() return self._misc @misc.setter def misc(self, value): - self._misc.set_mapping(value) + if self._misc is None: + self._misc = DualDict(value) + else: + self._misc.set_mapping(value) + @property def ord_range(self): """Return a string suitable for the first column of CoNLL-U.""" + self.words.sort() return "%d-%d" % (self.words[0].ord, self.words[-1].ord) def remove(self): """Delete this multi-word token (but keep its words).""" for word in self.words: word._mwt = None # pylint: disable=W0212 - self.root.multiword_tokens = [tok for tok in self.root.multiword_tokens if tok != self] + self.root.multiword_tokens.remove(self) def address(self): """Full (document-wide) id of the multi-word token.""" return self.root.address + '#' + self.ord_range + @staticmethod + def is_mwt(): + """Is this a multi-word token? + + Returns always True. + False is returned only by instances of the Node class. + """ + return True + + @property + def no_space_after(self): + """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`.""" + return self.misc["SpaceAfter"] == "No" + + @staticmethod + def is_empty(): + """Is this an Empty node? 
+ + Returns always False because multi-word tokens cannot be empty nodes. + """ + return False + + @staticmethod + def is_leaf(): + """Is this a node/mwt without any children? + + Returns always True because multi-word tokens cannot have children. + """ + return True + + def _get_attr(self, name): # pylint: disable=too-many-return-statements + if name == 'form': + return self.form + if name == 'ord': + return self.ord_range + if name in ('edge', 'children', 'siblings', 'depth'): + return 0 + if name == 'feats_split': + return str(self.feats).split('|') + if name == 'misc_split': + return str(self.misc).split('|') + if name.startswith('feats['): + return self.feats[name[6:-1]] + if name.startswith('misc['): + return self.misc[name[5:-1]] + return '' + + def get_attrs(self, attrs, undefs=None, stringify=True): + """Return multiple attributes or pseudo-attributes, possibly substituting empty ones. + + MWTs do not have children nor parents nor prev/next nodes, + so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing). + Other pseudo-attributes (e.g. dir) return always the string "". + The only relevant pseudo-attributes are + feats_split and misc_split: a list of name=value formatted strings. + The `ord` attribute returns actually `mwt.ord_range`. + + Args: + attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``. + undefs: A value to be used instead of None for empty (undefined) values. 
+ stringify: Apply `str()` on each value (except for None) + """ + values = [] + for name in attrs: + nodes = [self] + if name[1] == '_': + nodes, name = [], name[2:] + for node in (n for n in nodes if n is not None): + if name in {'feats_split', 'misc_split'}: + values.extend(node._get_attr(name)) + else: + values.append(node._get_attr(name)) + + if undefs is not None: + values = [x if x is not None else undefs for x in values] + if stringify: + values = [str(x) if x is not None else None for x in values] + return values + + @property + def _ord(self): + self.words.sort() + return self.words[0]._ord + # TODO: node.remove() should check if the node is not part of any MWT -# TODO: mwt.words.append(node) and node.shift* should check if the MWT does not contain gaps +# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported +# TODO: Make mwt._words private and provide a setter +# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().) +# TODO: words.setter and node.shift* should check if the MWT does not contain gaps # and is still multi-word -# TODO: check if one word is not included in multiple multi-word tokens +# TODO: Make sure mwt.words are always sorted (even after node.shift*). +# TODO: Check if one word is not included in multiple multi-word tokens. diff --git a/udapi/core/node.py b/udapi/core/node.py index 8e42e991..c6a7a26a 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -1,10 +1,13 @@ """Node class and related classes and functions. -In addition to class `Node`, this module contains class `ListOfNodes` +In addition to class `Node`, this module contains also helper classes +`CycleError`, `EmptyNode`, `OrdTuple` and `ListOfNodes` and function `find_minimal_common_treelet`. 
""" import logging +import functools +import udapi.core.coref from udapi.block.write.textmodetrees import TextModeTrees from udapi.core.dualdict import DualDict from udapi.core.feats import Feats @@ -20,16 +23,17 @@ # The set of public attributes/properties and methods of Node was well-thought. # pylint: disable=too-many-instance-attributes,too-many-public-methods - +@functools.total_ordering class Node(object): """Class for representing nodes in Universal Dependency trees. Attributes `form`, `lemma`, `upos`, `xpos` and `deprel` are public attributes of type `str`, so you can use e.g. `node.lemma = node.form`. - `node.ord` is a int type public attribute for storing the node's word order index, + `node.ord` is a int type property for storing the node's word-order index, but assigning to it should be done with care, so the non-root nodes have `ord`s 1,2,3... It is recommended to use one of the `node.shift_*` methods for reordering nodes. + Note that `EmptyNode`s (subclass of `Node`) have decimal ords (and no `shift_*` methods). For changing dependency structure (topology) of the tree, there is the `parent` property, e.g. `node.parent = node.parent.parent` and `node.create_child()` method. @@ -62,8 +66,9 @@ class Node(object): # TODO: Benchmark memory and speed of slots vs. classic dict. # With Python 3.5 split dict, slots may not be better. # TODO: Should not we include __weakref__ in slots? + # TODO: Benchmark using node._ord instead node.ord in this file __slots__ = [ - 'ord', # Word-order index of the node (root has 0). + '_ord', # Word-order index of the node (root has 0). 'form', # Word form or punctuation symbol. 'lemma', # Lemma of word form. 'upos', # Universal PoS tag. @@ -75,29 +80,60 @@ class Node(object): '_feats', # Morphological features as udapi.core.feats.Feats object. '_parent', # Parent node. '_children', # Ord-ordered list of child nodes. + '_root', # Technical root of the tree '_mwt', # Multi-word token in which this word participates. 
+ '_mentions', # List of udapi.core.coref.CorefMention objects whose span includes this node ] - def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments + def __init__(self, root, form=None, lemma=None, upos=None, # pylint: disable=too-many-arguments xpos=None, feats=None, deprel=None, misc=None): """Create a new node and initialize its attributes using the keyword arguments.""" - self.ord = None + self._root = root + self._ord = None self.form = form self.lemma = lemma self.upos = upos self.xpos = xpos - self._feats = Feats(feats) + self._feats = Feats(feats) if feats and feats != '_' else None self.deprel = deprel - self._misc = DualDict(misc) + self._misc = DualDict(misc) if misc and misc != '_' else None self._raw_deps = '_' self._deps = None self._parent = None self._children = list() self._mwt = None + self._mentions = list() def __str__(self): - """Pretty print of the Node object.""" - return "node<%s, %s>" % (self.address(), self.form) + """String representation of the Node object: .""" + return f"<{self.address()}, {self.form}>" + + def __repr__(self): + """String representation of the Node object: Node.""" + return f"Node<{self.address()}, {self.form}>" + + + @property + def root(self): + return self._root + + # ord is implemented as a property, so that it can be overriden in EmptyNode and Root + @property + def ord(self): + return self._ord + + @ord.setter + def ord(self, new_ord): + self._ord = new_ord + + def __lt__(self, other): + """Calling `nodeA < nodeB` is equivalent to `nodeA.ord < nodeB.ord`. + + Note that this does not work as expected for nodes from different trees + because `ord` is the word order within each sentence. + For comparing the word order across trees, use `nodeA.precedes(nodeB)` instead. 
+ """ + return self._ord < other._ord @property def udeprel(self): @@ -130,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. @@ -152,11 +196,16 @@ def feats(self): For details about the implementation and other methods (e.g. `node.feats.is_plural()`), see ``udapi.core.feats.Feats`` which is a subclass of `DualDict`. """ + if self._feats is None: + self._feats = Feats() return self._feats @feats.setter def feats(self, value): - self._feats.set_mapping(value) + if self._feats is None: + self._feats = Feats(value) + else: + self._feats.set_mapping(value) @property def misc(self): @@ -179,11 +228,16 @@ def misc(self): For details about the implementation, see ``udapi.core.dualdict.DualDict``. """ + if self._misc is None: + self._misc = DualDict() return self._misc @misc.setter def misc(self, value): - self._misc.set_mapping(value) + if self._misc is None: + self._misc = DualDict(value) + else: + self._misc.set_mapping(value) @property def raw_deps(self): @@ -192,12 +246,13 @@ def raw_deps(self): After the access to the raw enhanced dependencies, provide the serialization if they were deserialized already. """ - if self._deps is not None: - serialized_deps = [] - for secondary_dependence in self._deps: - serialized_deps.append('%d:%s' % (secondary_dependence[ - 'parent'].ord, secondary_dependence['deprel'])) - self._raw_deps = '|'.join(serialized_deps) + # TODO: node.deps.append(dep) should be hooked and + # mark the serialized cache dirty, i.e. self._raw_deps = None. 
+ # Afterwards, we can use the following optimization + #if self._raw_deps is not None: + # return self._raw_deps + if self._deps: + self._raw_deps = '|'.join(f"{p}:{r}" for p, r in sorted(set((d['parent'].ord, d['deprel']) for d in self._deps))) return self._raw_deps @raw_deps.setter @@ -205,9 +260,9 @@ def raw_deps(self, value): """Set serialized enhanced dependencies (the new value is a string). When updating raw secondary dependencies, - delete the current version of the deserialized data. + the current version of the deserialized data is deleted. """ - self._raw_deps = str(value) + self._raw_deps = value self._deps = None @property @@ -218,18 +273,26 @@ def deps(self): provide the deserialization of the raw data and save deps to the list. """ if self._deps is None: - # Obtain a list of all nodes in the dependency tree. - nodes = [self.root] + self.root.descendants() - # Create a list of secondary dependencies. self._deps = list() if self._raw_deps == '_': return self._deps + # Obtain a list of all nodes in the dependency tree. + nodes = [self._root] + self._root._descendants + for raw_dependency in self._raw_deps.split('|'): - head, deprel = raw_dependency.split(':') - parent = nodes[int(head)] + # Deprel itself may contain one or more ':' (subtypes). + head, deprel = raw_dependency.split(':', maxsplit=1) + # Empty nodes have to be located differently than normal nodes. + if '.' 
in head: + try: + parent = next(x for x in self._root.empty_nodes if str(x._ord) == head) + except StopIteration: + raise ValueError(f'Empty node with ord={head} not found') + else: + parent = nodes[int(head)] self._deps.append({'parent': parent, 'deprel': deprel}) return self._deps @@ -238,6 +301,7 @@ def deps(self): def deps(self, value): """Set deserialized enhanced dependencies (the new value is a list of dicts).""" self._deps = value + self._raw_deps = None @property def parent(self): @@ -254,35 +318,35 @@ def parent(self, new_parent): (from the list of original parent's children). """ # If the parent is already assigned, return. - if self.parent == new_parent: + if self._parent is new_parent: return - # The node itself couldn't be assigned as a parent. - if self == new_parent: - raise ValueError('Cannot set a node as its own parent (cycle are forbidden): %s' % self) - - # Check if the current Node is not an antecedent of the new parent. - climbing_node = new_parent - while not climbing_node.is_root(): - if climbing_node == self: - raise ValueError('Setting the parent of %s to %s would lead to a cycle.' - % (self, new_parent)) - climbing_node = climbing_node.parent + # Check for None new_parent and cycles. + if new_parent is None: + raise ValueError(f'Cannot set None as parent: {self}') + if new_parent.is_empty(): + raise ValueError(f'Cannot set EmptyNode as parent in basic dependencies: {self}') + if self is new_parent: + raise CycleError('Cannot set a node as its own parent (cycle are forbidden): %s', self) + if self._children and new_parent.is_descendant_of(self): + raise CycleError('Setting the parent of %s to %s would lead to a cycle.', self, new_parent) # Remove the current Node from the children of the old parent. # Forbid moving nodes from one tree to another using parent setter. 
if self._parent: - self._parent._children = [node for node in self.parent.children if node != self] - # TODO: .root is currently computed, so it is quite slow - old_root, new_root = self._parent.root, climbing_node - if old_root != new_root: + self._parent._children.remove(self) + if self._parent._root is not new_parent._root: raise ValueError('Cannot move nodes between trees with parent setter, ' 'use new_root.steal_nodes(nodes_to_be_moved) instead') # Set the new parent. self._parent = new_parent # Append the current node to the new parent children. - new_parent._children = sorted(new_parent.children + [self], key=lambda child: child.ord) + if not new_parent._children or self > new_parent._children[-1]: + new_parent._children.append(self) + else: + new_parent._children.append(self) + new_parent._children.sort() @property def children(self): @@ -303,17 +367,24 @@ def children(self): nodes2 = [n for n in node.children if n.ord > node.ord] nodes3 = [n for n in node.children if n.ord < node.ord] nodes4 = [n for n in node.children if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. """ return ListOfNodes(self._children, origin=self) @property - def root(self): - """Return the (technical) root node of the whole tree.""" - node = self - while node.parent: - node = node.parent - return node + def siblings(self): + """Return a list of dependency sibling nodes. + + When used as a property, `node.siblings` is just a shortcut for: + [n for n in node.parent.children if n!=node] + However, it is especially helpful when used as a method, + so e.g. `node.siblings(preceding_only=True)` stands for + [n for n in node.parent.children if n.ord < node.ord] + which is something else than + node.parent.children(preceding_only=True). + See the documentation of ListOfNodes for details. 
+ """ + return ListOfNodes([n for n in self._parent._children if n!=self], origin=self) @property def descendants(self): @@ -334,43 +405,86 @@ def descendants(self): nodes2 = [n for n in node.descendants if n.ord > node.ord] nodes3 = [n for n in node.descendants if n.ord < node.ord] nodes4 = [n for n in node.descendants if n.ord < node.ord] + [node] - See documentation of ListOfNodes for details. + See the documentation of ListOfNodes for details. """ - return ListOfNodes(sorted(self.unordered_descendants(), key=lambda n: n.ord), origin=self) + # The following code is equivalent to + # ListOfNodes(sorted(self.unordered_descendants()), origin=self) + # but it is faster because there is no extra copying of lists of nodes. + stack = list(self._children) + descendants = ListOfNodes(stack, origin=self) + while(stack): + n = stack.pop() + if n._children: + stack.extend(n._children) + descendants.extend(n._children) + descendants.sort() + return descendants def is_descendant_of(self, node): """Is the current node a descendant of the node given as argument?""" - climber = self.parent - while climber: - if climber == node: - return True - climber = climber.parent + if node and node._children: + climber = self._parent + while climber: + if climber is node: + return True + climber = climber._parent return False def create_child(self, **kwargs): """Create and return a new child of the current node.""" - new_node = Node(**kwargs) - new_node.ord = len(self.root._descendants) + 1 - self.root._descendants.append(new_node) - self.children.append(new_node) - new_node.parent = self + new_node = Node(root=self._root, **kwargs) + new_node._ord = len(self._root._descendants) + 1 + self._root._descendants.append(new_node) + self._children.append(new_node) + new_node._parent = self return new_node - def create_empty_child(self, **kwargs): - """Create and return a new empty node child of the current node.""" - new_node = Node(**kwargs) - self.root.empty_nodes.append(new_node) + def 
create_empty_child(self, deprel, after=True, **kwargs): + """Create and return a new empty node child of the current node. + + Args: + deprel: the enhanced dependency relation (required to be stored in DEPS) + form, lemma, upos, xpos, feats, misc: as in Node, the default is '_' + after: position the newly created empty node after this `node`? + If True (default), the `new_node.ord` will be `node.ord + 0.1`, + unless there is already an empty node with such ord, + in which case it will be `node.ord + 0.2` etc. + If False, the new node will be placed immediately before `node`. + """ + new_node = EmptyNode(root=self._root, **kwargs) + new_node.deps = [{'parent': self, 'deprel': deprel}] # self.enh_children.append(new_node) TODO # new_node.enh_parents.append(self) TODO + base_ord = self._ord if after else self._ord - 1 + new_ord = base_ord + 0.1 + for empty in self._root.empty_nodes: + if empty._ord > new_ord: + break + if empty._ord == new_ord: + if isinstance(new_ord, OrdTuple): + new_ord.increase() + elif new_ord == base_ord + 0.9: + new_ord = OrdTuple(base_ord, 10) + else: + new_ord = round(new_ord+0.1, 1) + new_node._ord = new_ord + if not self._root.empty_nodes or new_node > self._root.empty_nodes[-1]: + self._root.empty_nodes.append(new_node) + else: + self._root.empty_nodes.append(new_node) + self._root.empty_nodes.sort() return new_node # TODO: make private: _unordered_descendants def unordered_descendants(self): """Return a list of all descendants in any order.""" - descendants = [] - for child in self.children: - descendants.append(child) - descendants.extend(child.unordered_descendants()) + stack = list(self._children) + descendants = list(stack) + while(stack): + n = stack.pop() + if n._children: + stack.extend(n._children) + descendants.extend(n._children) return descendants @staticmethod @@ -382,8 +496,17 @@ def is_root(): """ return False + @staticmethod + def is_empty(): + """Is the current node an empty node? + + Returns False for all Node instances. 
+ True is returned only by instances of the EmptyNode subclass. + """ + return False + def remove(self, children=None): - """Delete this node and all its descendants. + """Delete this node (and all its descendants unlsess specified otherwise). Args: children: a string specifying what to do if the node has any children. @@ -392,115 +515,281 @@ def remove(self, children=None): `warn` means to issue a warning if any children are present and delete them. `rehang_warn` means to rehang and warn:-). """ - self.parent._children = [child for child in self.parent.children if child != self] - if children is not None and self.children: + self._parent._children.remove(self) + + # If there are any children, do the action specified in the "children" parameter. + if children is not None and self._children: if children.startswith('rehang'): - for child in self.children: - child.parent = self.parent + for child in self._children: + child._parent = self._parent + self._parent._children.extend(self._children) + self._parent._children.sort() + self._children.clear() if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) - self.root._update_ordering() - # TODO: make private: _shift - def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): + # When self is the only node being removed, it is faster to root._descendants.remove(self) + # and update the ords only where necessary (from self._ord further). + # When removing also its children+descendants, it is faster to recompute root._descendants + # and update all ords (computing leftmost descendant of self would be too slow). + if not self._children: + try: + self._root._descendants.remove(self) + except ValueError: + pass # self may be an already deleted node e.g. 
if n.remove() called twice + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + last_ord = 0 + for empty in self._root.empty_nodes: + if empty._ord > self._ord: + new_ord = round(empty._ord - 1, 1) + if new_ord <= last_ord: + new_ord = round(last_ord + 0.1, 1) + empty.ord = new_ord + last_ord = empty._ord + else: + # Remember the position of empty nodes, so we can reorder them as well. + empty_follows = None + if self._root.empty_nodes: + will_be_removed = self if children and children.startswith('rehang') else self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.is_empty(): + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + + # TODO nodes_to_remove = self.unordered_descendants() + # and mark all nodes as deleted, remove them from MWT and coref mentions + self._root._descendants = sorted(self._root.unordered_descendants()) + for (new_ord, node) in enumerate(self._root._descendants, 1): + node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
            # Reassign ords of empty nodes: each keeps its fractional part,
            # re-anchored to the nearest preceding surviving overt node.
            if empty_follows:
                last_ord = 0
                for empty in self._root.empty_nodes:
                    prev_nonempty = empty_follows[empty]
                    new_ord = round(prev_nonempty._ord + (empty._ord % 1), 1)
                    # Avoid collisions when several empty nodes end up after the
                    # same overt node: bump by 0.1 until strictly increasing.
                    while new_ord <= last_ord:
                        new_ord = round(new_ord + 0.1, 1)
                    last_ord, empty.ord = new_ord, new_ord

    def _shift_before_ord(self, reference_ord, without_children=False):
        """Internal method for changing word order.

        Moves this node (and, unless `without_children`, its subtree) so that it
        ends up immediately before position `reference_ord`, updating
        `root._descendants` in place and renumbering `_ord` attributes.
        """
        all_nodes = self._root._descendants
        empty_nodes = self._root.empty_nodes

        # Moving a single node can be faster than nodes_to_move = [self]
        if without_children or not self._children:
            my_ord = self._ord
            if reference_ord > my_ord + 1:
                # Rightward move: slide the skipped-over nodes one slot left.
                for i_ord in range(my_ord, reference_ord - 1):
                    all_nodes[i_ord - 1] = all_nodes[i_ord]
                    all_nodes[i_ord - 1]._ord = i_ord
                all_nodes[reference_ord - 2] = self
                self._ord = reference_ord - 1
                for en in empty_nodes:
                    if en._ord > my_ord and en._ord < reference_ord:
                        en._ord -= 1
            elif reference_ord < my_ord:
                # Leftward move: slide the skipped-over nodes one slot right.
                for i_ord in range(my_ord, reference_ord, -1):
                    all_nodes[i_ord - 1] = all_nodes[i_ord - 2]
                    all_nodes[i_ord - 1]._ord = i_ord
                all_nodes[reference_ord - 1] = self
                self._ord = reference_ord
                for en in empty_nodes:
                    # Empty nodes before the first overt token (ID=0.X) will be never moved this way.
                    # We cannot know whether the caller wanted to place the shifted node before or after them.
                    if en._ord < my_ord and en._ord > reference_ord:
                        en._ord += 1
            self._parent._children.sort()
            return

        # TODO: Updating ords of empty nodes is implemented only for the simple case above,
        # but it has to be implemented also for the complex case below!
        nodes_to_move = self.descendants(add_self=True)
        first_ord, last_ord = nodes_to_move[0]._ord, nodes_to_move[-1]._ord

        # If there are no "gaps" in nodes_to_move (e.g. when it is projective),
        # we can make the shifting a bit faster and simpler.
        if last_ord - first_ord + 1 == len(nodes_to_move):
            # First, move a node from position src_ord to position trg_ord RIGHT-ward.
            trg_ord, src_ord = last_ord, first_ord - 1
            while src_ord >= reference_ord:
                all_nodes[trg_ord - 1] = all_nodes[src_ord - 1]
                all_nodes[trg_ord - 1]._ord = trg_ord
                trg_ord, src_ord = trg_ord - 1, src_ord - 1
            # Second, move a node from position src_ord to position trg_ord LEFT-ward.
            trg_ord, src_ord = first_ord, last_ord + 1
            while src_ord < reference_ord:
                all_nodes[trg_ord - 1] = all_nodes[src_ord - 1]
                all_nodes[trg_ord - 1]._ord = trg_ord
                trg_ord, src_ord = trg_ord + 1, src_ord + 1
            # Third, move nodes_to_move to trg_ord RIGHT-ward.
            trg_ord = reference_ord if reference_ord < first_ord else trg_ord
            for node in nodes_to_move:
                all_nodes[trg_ord - 1], node._ord = node, trg_ord
                trg_ord += 1
            self._parent._children.sort()
            return

        # First, move a node from position src_ord to position trg_ord RIGHT-ward.
        # src_ord iterates decreasingly over nodes which are not moving.
        trg_ord, src_ord, mov_ord = last_ord, last_ord - 1, len(nodes_to_move) - 2
        while src_ord >= reference_ord:
            # Inner loop skips over nodes which belong to the moved subtree;
            # the while-else copies a non-moving node only when no break occurred.
            while all_nodes[src_ord - 1] is nodes_to_move[mov_ord]:
                mov_ord, src_ord = mov_ord - 1, src_ord - 1
                if src_ord < reference_ord:
                    break
            else:
                all_nodes[trg_ord - 1] = all_nodes[src_ord - 1]
                all_nodes[trg_ord - 1]._ord = trg_ord
                trg_ord, src_ord = trg_ord - 1, src_ord - 1

        # Second, move a node from position src_ord to position trg_ord LEFT-ward.
        # src_ord iterates increasingly over nodes which are not moving.
        trg_ord, src_ord, mov_ord = first_ord, first_ord + 1, 1
        while src_ord < reference_ord:
            while mov_ord < len(nodes_to_move) and all_nodes[src_ord - 1] is nodes_to_move[mov_ord]:
                mov_ord, src_ord = mov_ord + 1, src_ord + 1
                if src_ord >= reference_ord:
                    break
            else:
                all_nodes[trg_ord - 1] = all_nodes[src_ord - 1]
                all_nodes[trg_ord - 1]._ord = trg_ord
                trg_ord, src_ord = trg_ord + 1, src_ord + 1

        # Third, move nodes_to_move to trg_ord RIGHT-ward.
        trg_ord = reference_ord if reference_ord < first_ord else trg_ord
        for node in nodes_to_move:
            all_nodes[trg_ord - 1], node._ord = node, trg_ord
            trg_ord += 1
        self._parent._children.sort()

    def shift_after_node(self, reference_node, without_children=False, skip_if_descendant=False):
        """Shift this node after the reference_node.

        Args:
            without_children: shift just this node without its subtree?
            skip_if_descendant: return silently (instead of raising ValueError)
                when reference_node is a descendant of self.
        """
        if not without_children and reference_node.is_descendant_of(self):
            if skip_if_descendant:
                return
            raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.')
        self._shift_before_ord(reference_node._ord + 1, without_children=without_children)

    def shift_before_node(self, reference_node, without_children=False, skip_if_descendant=False):
        """Shift this node before the reference_node.

        Args:
            without_children: shift just this node without its subtree?
            skip_if_descendant: return silently (instead of raising ValueError)
                when reference_node is a descendant of self.
        """
        if reference_node.is_root():
            raise ValueError(f'Cannot shift a node before the root ({reference_node})')
        if not without_children and reference_node.is_descendant_of(self):
            if skip_if_descendant:
                return
            raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.')
        self._shift_before_ord(reference_node._ord, without_children=without_children)

    def shift_after_subtree(self, reference_node, without_children=False, skip_if_descendant=False):
        """Shift this node (and its subtree) after the subtree rooted by reference_node.

        Args:
            without_children: shift just this node without its subtree?
            skip_if_descendant: return silently (instead of raising ValueError)
                when reference_node is a descendant of self.
        """
        if not without_children and reference_node.is_descendant_of(self):
            if skip_if_descendant:
                return
            raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.')
        # Find the highest ord within reference_node's subtree (excluding self).
        ref_ord = reference_node._ord
        for node in reference_node.unordered_descendants():
            if node._ord > ref_ord and node is not self:
                ref_ord = node._ord
        self._shift_before_ord(ref_ord + 1, without_children=without_children)

    def shift_before_subtree(self, reference_node, without_children=0, skip_if_descendant=False):
        """Shift this node (and its subtree) before the subtree rooted by reference_node.

        Args:
            without_children: shift just this node without its subtree?
            skip_if_descendant: return silently (instead of raising ValueError)
                when reference_node is a descendant of self.
        """
        if reference_node.is_root():
            raise ValueError(f'Cannot shift a node before the root ({reference_node})')
        if not without_children and reference_node.is_descendant_of(self):
            if skip_if_descendant:
                return
            raise ValueError(f'{reference_node} is a descendant of {self}. Consider without_children=1.')
        # Find the lowest ord within reference_node's subtree (excluding self).
        ref_ord = reference_node._ord
        for node in reference_node.unordered_descendants():
            if node._ord < ref_ord and node is not self:
                ref_ord = node._ord
        self._shift_before_ord(ref_ord, without_children=without_children)

    @property
    def prev_node(self):
        """Return the previous node according to word order (root after ord 1, None for root)."""
        new_ord = self._ord - 1
        if new_ord < 0:
            return None
        if new_ord == 0:
            return self._root
        return self._root._descendants[new_ord - 1]

    @property
    def next_node(self):
        """Return the following node according to word order (None for the last node)."""
        # Note that all_nodes[n].ord == n+1
        try:
            return self._root._descendants[self._ord]
        except IndexError:
            return None

    def precedes(self, node):
        """Does this node precede another `node` in word order?

        This method handles correctly also nodes from different trees (but the same zone).
        If you have nodes from the same tree, it is faster and more elegant to use just `nodeA < nodeB`,
        which is equivalent to calling `nodeA.ord < nodeB.ord`.
        For sorting nodes from the same tree, you can use `nodes.sort()` or `sorted(nodes)`.
+ """ + if self._root is node._root: + return self._ord < node._ord + if self._root._zone != node._root._zone: + raise ValueError(f"Cannot compare word order across zones: {self} {node}") + if self._root._bundle._document is not node._root._bundle._document: + raise ValueError(f"Cannot compare word order across documents: {self} {node}") + return self._root._bundle.number < node._root._bundle.number def is_leaf(self): """Is this node a leaf, ie. a node without any children?""" - return not self.children + return not self._children def _get_attr(self, name): # pylint: disable=too-many-return-statements if name == 'dir': - if self.parent.is_root(): + if not self._parent or self._parent.is_root(): return 'root' - return 'left' if self.precedes(self.parent) else 'right' + return 'left' if self.precedes(self._parent) else 'right' if name == 'edge': - if self.parent.is_root(): + if not self._parent or self._parent.is_root(): return 0 - return self.ord - self.parent.ord + return self._ord - self._parent._ord if name == 'children': - return len(self.children) + return len(self._children) if name == 'siblings': - return len(self.parent.children) - 1 + return 0 if not self._parent else len(self._parent._children) - 1 if name == 'depth': value = 0 tmp = self - while not tmp.is_root(): - tmp = tmp.parent + while tmp and not tmp.is_root(): + tmp = tmp._parent value += 1 return value if name == 'feats_split': return str(self.feats).split('|') + if name == 'misc_split': + return str(self.misc).split('|') + if name.startswith('feats['): + return self.feats[name[6:-1]] + if name.startswith('misc['): + return self.misc[name[5:-1]] return getattr(self, name) def get_attrs(self, attrs, undefs=None, stringify=True): @@ -529,7 +818,7 @@ def get_attrs(self, attrs, undefs=None, stringify=True): for name in attrs: nodes = [self] if name.startswith('p_'): - nodes, name = [self.parent], name[2:] + nodes, name = [self._parent], name[2:] elif name.startswith('c_'): nodes, name = self.children, 
name[2:] elif name.startswith('l_'): @@ -537,7 +826,7 @@ def get_attrs(self, attrs, undefs=None, stringify=True): elif name.startswith('r_'): nodes, name = [self.next_node], name[2:] for node in (n for n in nodes if n is not None): - if name == 'feats_split': + if name in {'feats_split', 'misc_split'}: values.extend(node._get_attr(name)) else: values.append(node._get_attr(name)) @@ -571,8 +860,8 @@ def compute_text(self, use_mwt=True): for node in self.descendants(add_self=not self.is_root()): mwt = node.multiword_token if use_mwt and mwt: - if node.ord > last_mwt_id: - last_mwt_id = mwt.words[-1].ord + if node._ord > last_mwt_id: + last_mwt_id = mwt.words[-1]._ord string += mwt.form if mwt.misc['SpaceAfter'] != 'No': string += ' ' @@ -583,6 +872,11 @@ def compute_text(self, use_mwt=True): return string.rstrip() def print_subtree(self, **kwargs): + """deprecated name for draw()""" + logging.warning("node.print_subtree() is deprecated, use node.draw() instead.") + TextModeTrees(**kwargs).process_tree(self) + + def draw(self, **kwargs): """Print ASCII visualization of the dependency structure of this subtree. This method is useful for debugging. @@ -604,7 +898,7 @@ def address(self): e.g. s123/en_udpipe#4. If zone is empty, the slash is excluded as well, e.g. s123#4. """ - return '%s#%d' % (self.root.address() if self.root else '?', self.ord) + return f"{self._root.address() if self._root else '?'}#{self._ord}" @property def multiword_token(self): @@ -616,6 +910,18 @@ def multiword_token(self): """ return self._mwt + @property + def words(self): + """Return one-item list with this node. + + This property is there for compatibility with udapi.core.mwt.MWT.words. + So that it is possible to use code such as: + for token in root.token_descendants: + words = token.words + ... + """ + return [self] + def is_nonprojective(self): """Is the node attached to its parent non-projectively? 
@@ -626,13 +932,13 @@ def is_nonprojective(self): and the total number of nodes in the span. """ # Root and its children are always projective - parent = self.parent + parent = self._parent if not parent or parent.is_root(): return False # Edges between neighboring nodes are always projective. # Check it now to make it a bit faster. - ord1, ord2 = self.ord, parent.ord + ord1, ord2 = self._ord, parent._ord if ord1 > ord2: ord1, ord2 = ord2, ord1 distance = ord2 - ord1 @@ -640,7 +946,7 @@ def is_nonprojective(self): return False # Get all the descendants of parent that are in the span of the edge. - span = [n for n in parent.descendants if n.ord > ord1 and n.ord < ord2] + span = [n for n in parent.unordered_descendants() if n._ord > ord1 and n._ord < ord2] # For projective edges, span must include all the nodes between parent and self. return len(span) != distance - 1 @@ -653,17 +959,17 @@ def is_nonprojective_gap(self): - this node is within span of X, i.e. it is between (word-order-wise) X's leftmost descendant (or X itself) and X's rightmost descendant (or X itself). 
""" - ancestors = set() + ancestors = set([self]) node = self - while node.parent: + while node._parent: + node = node._parent ancestors.add(node) - node = node.parent - all_nodes = node.descendants - for left_node in all_nodes[:self.ord - 1]: - if self.precedes(left_node.parent) and left_node.parent not in ancestors: + all_nodes = node._descendants + for left_node in all_nodes[:self._ord - 1]: + if self.precedes(left_node._parent) and left_node._parent not in ancestors: return True - for right_node in all_nodes[self.ord:]: - if right_node.parent.precedes(node) and right_node.parent not in ancestors: + for right_node in all_nodes[self._ord:]: + if right_node._parent.precedes(self) and right_node._parent not in ancestors: return True return False @@ -672,7 +978,159 @@ def no_space_after(self): """Boolean property as a shortcut for `node.misc["SpaceAfter"] == "No"`.""" return self.misc["SpaceAfter"] == "No" + @property + def gloss(self): + """String property as a shortcut for `node.misc["Gloss"]`.""" + return self.misc["Gloss"] + + @gloss.setter + def gloss(self, new_gloss): + self.misc["Gloss"] = new_gloss + + @property + def coref_mentions(self): + self._root.bundle.document._load_coref() + return self._mentions + + @property + def coref_entities(self): + self._root.bundle.document._load_coref() + return [m.entity for m in self._mentions if m.entity is not None] + + # TODO: is this method useful? + def create_coref_entity(self, eid=None, etype=None, **kwargs): + doc = self._root.bundle.document + entity = doc.create_coref_entity(eid, etype) + entity.create_mention(head=self, **kwargs) + return entity + + @staticmethod + def is_mwt(): + """Is this a multi-word token? + + Returns False for all Node instances. + True is returned only by instances of the MWT class. 
+ """ + return False + +class CycleError(Exception): + '''A cycle in the dependency tree detected (or would be created).''' + def __init__(self, message, node1, node2=None): + self.message = message + self.node1 = node1 + self.node2 = node2 + super().__init__(message) + + def __str__(self): + if self.node2 is None: + return self.message % self.node1 + return self.message % (self.node1, self.node2) + +class EmptyNode(Node): + """Class for representing empty nodes (for ellipsis in enhanced UD).""" + + def is_empty(self): + """Return True for all EmptyNode instances.""" + return True + + @property + def parent(self): + return None + + @parent.setter + def parent(self, _): + """Attempts at setting parent of EmptyNode result in AttributeError exception.""" + raise AttributeError('EmptyNode cannot have a (basic-UD) parent.') + + # The ord getter is the same as in Node, but it must be defined, + # so that we can override the ord setter. + @property + def ord(self): + return self._ord + + @ord.setter + def ord(self, new_ord): + """Empty node's ord setter accepts float and str.""" + if isinstance(new_ord, str): + self._ord = float(new_ord) + elif isinstance(new_ord, float): + self._ord = new_ord + else: + raise ValueError('Only str and float are allowed for EmptyNode ord setter,' + f' but {type(new_ord)} was given.') + + def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): + """Attempts at changing the word order of EmptyNode result in NotImplemented exception.""" + raise NotImplemented('Empty nodes cannot be re-order using shift* methods yet.') + + def remove(self): + """Delete this empty node.""" + to_reorder = [e for e in self._root.empty_nodes if e._ord > self._ord and e._ord < self.ord+1] + for empty in to_reorder: + empty._ord = round(empty._ord - 0.1, 1) + try: + self._root.empty_nodes.remove(self) + except ValueError: + return # self may be an already deleted node e.g. 
if n.remove() called twice + for n in self._root.empty_nodes + self._root._descendants: + if n._deps: + n._deps = {(deprel, parent) for deprel, parent in n._deps if parent != self} + +@functools.total_ordering +class OrdTuple: + """Class for the rare case of 9+ consecutive empty nodes, i.e. ords x.10, x.11 etc. + + Ord 1.10 cannot be stored as float, which would result in 1.1. + We thus store it as a tuple (1,10) wrapped in OrdTuple, so that comparisons work, + e.g.: 1.9 < OrdTuple('1.10') < 2 + """ + __slots__ = ('_key') + + def __init__(self, string): + m = re.match(r'(\d+)\.(\d+)$', string) + if not m: + raise ValueError(f"Ord {string} does not match \\d+.\\d+") + major, minor = int(m.group(1)), int(m.group(2)) + if minor == 0: + raise ValueError(f"Ord {string} should be stored as int") + if minor < 10: + raise ValueError(f"Ord {string} should be stored as float") + self._key = (major, minor) + + def __repr__(self): + return f"{self._key[0]}.{self._key[1]}" + + def __eq__(self, other): + if isinstance(other, int): + return False + elif isinstance(other, float): + return self._key == (int(other), int(10*other - 10*int(other))) + elif isinstance(other, OrdTuple): + return self._key == other._key + else: + raise ValueError(f"OrdTuple cannot be compared with {type(other)}") + + def __lt__(self, other): + if isinstance(other, int): + return self._key < (other, 0) + elif isinstance(other, float): + return self._key < (int(other), int(10*other - 10*int(other))) + elif isinstance(other, OrdTuple): + return self._key < other._key + else: + raise ValueError(f"OrdTuple cannot be compared with {type(other)}") + + def increase(self): + """Increment the decimal part of this ord.""" + self._key = (self.key[0], self._key[1]+1) + +# Implementation note on ListOfNodes +# We could inherit from collections.abc.Sequence, store the list in self._data +# and implement __getitem__ and __len__ by delegating it to self._data. 
+# I thought it could be faster because we prevent copying of the list in super().__init__(iterable). +# In practice, it is slower because of the delegation: native list's __getitem__ is C-optimized. +# So let's just inherit from list. class ListOfNodes(list): """Helper class for results of node.children and node.descendants. @@ -693,7 +1151,9 @@ class ListOfNodes(list): nodes = node.children nodes = node.children() nodes = node.children(add_self=True, following_only=True) + nodes = node.descendants(add_self=True, add_mwt=True) """ + __slots__ = ('origin',) def __init__(self, iterable, origin): """Create a new ListOfNodes. @@ -705,18 +1165,28 @@ def __init__(self, iterable, origin): super().__init__(iterable) self.origin = origin - def __call__(self, add_self=False, following_only=False, preceding_only=False): + def __call__(self, add_self=False, following_only=False, preceding_only=False, add_mwt=False): """Returns a subset of nodes contained in this list as specified by the args.""" - if not add_self and not following_only and not preceding_only: - return self - result = list(self) if add_self: - result.append(self.origin) + self.append(self.origin) + self.sort() + result = self if preceding_only: - result = [x for x in result if x.ord <= self.origin.ord] + result = [x for x in result if x._ord <= self.origin._ord] if following_only: - result = [x for x in result if x.ord >= self.origin.ord] - return sorted(result, key=lambda node: node.ord) + result = [x for x in result if x._ord >= self.origin._ord] + if add_mwt: + new = [] + last_mwt_id = -1 + for node in result: + mwt = node.multiword_token + if mwt: + if node.ord > last_mwt_id: + last_mwt_id = mwt.words[-1].ord + new.append(mwt) + new.append(node) + result = new + return result def find_minimal_common_treelet(*args): @@ -735,7 +1205,7 @@ def find_minimal_common_treelet(*args): """ nodes = list(args) # The input nodes are surely in the treelet, let's mark this with "1". 
- in_treelet = {node.ord: 1 for node in nodes} + in_treelet = {node._ord: 1 for node in nodes} # Step 1: Find a node (`highest`) which is governing all the input `nodes`. # It may not be the lowest such node, however. @@ -754,14 +1224,14 @@ def find_minimal_common_treelet(*args): highest = None while len(nodes) > 1: node = nodes.pop(0) # TODO deque - parent = node.parent + parent = node._parent if parent is None: highest = node - elif in_treelet.get(parent.ord, False): - in_treelet[parent.ord] = 1 + elif in_treelet.get(parent._ord, False): + in_treelet[parent._ord] = 1 else: - new_nodes[parent.ord] = parent - in_treelet[parent.ord] = node + new_nodes[parent._ord] = parent + in_treelet[parent._ord] = node nodes.append(parent) # In most cases, `nodes` now contain just one node -- the one we were looking for. @@ -772,11 +1242,11 @@ def find_minimal_common_treelet(*args): # If the `highest` node is unsure, climb down using poiners stored in `in_treelet`. # All such nodes which were rejected as true members of the minimal common treelet # must be deleted from the set of newly added nodes `new_nodes`. - child = in_treelet[highest.ord] + child = in_treelet[highest._ord] while child != 1: - del new_nodes[highest.ord] + del new_nodes[highest._ord] highest = child - child = in_treelet[highest.ord] + child = in_treelet[highest._ord] # We return the root of the minimal common treelet plus all the newly added nodes. 
return (highest, new_nodes.values()) diff --git a/udapi/core/resource.py b/udapi/core/resource.py index 9e5923f1..da2ba561 100644 --- a/udapi/core/resource.py +++ b/udapi/core/resource.py @@ -2,6 +2,7 @@ import logging import urllib.request import os +from os.path import expanduser BASEURL = 'http://ufallab.ms.mff.cuni.cz/tectomt/share/data/' @@ -11,8 +12,10 @@ def require_file(path): if not os.path.isfile(path): raise IOError(path + " does not exist") return os.path.abspath(path) - udapi_data = os.environ.get('UDAPI_DATA', os.environ.get('HOME')) - full_path = udapi_data + '/' + path + udapi_data = os.environ.get('UDAPI_DATA', expanduser('~')) + if udapi_data is None: + raise IOError(f"Empty environment vars: UDAPI_DATA={os.environ.get('UDAPI_DATA')} HOME={expanduser('~')}") + full_path = os.path.join(udapi_data, path) if not os.path.isfile(full_path): logging.info('Downloading %s to %s', BASEURL + path, full_path) os.makedirs(os.path.dirname(full_path), exist_ok=True) diff --git a/udapi/core/root.py b/udapi/core/root.py index 56105872..15f31e58 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -1,7 +1,7 @@ """Root class represents the technical root node in each tree.""" import logging -from udapi.core.node import Node, ListOfNodes +from udapi.core.node import Node, EmptyNode, ListOfNodes from udapi.core.mwt import MWT # 7 instance attributes is too low (CoNLL-U has 10 columns) @@ -12,13 +12,13 @@ class Root(Node): """Class for representing root nodes (technical roots) in UD trees.""" __slots__ = ['_sent_id', '_zone', '_bundle', '_descendants', '_mwts', - 'empty_nodes', 'text', 'comment', 'newpar', 'newdoc'] + 'empty_nodes', 'text', 'comment', 'newpar', 'newdoc', 'json'] # pylint: disable=too-many-arguments def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): """Create new root node.""" # Call constructor of the parent object. 
- super().__init__() + super().__init__(root=self) self.ord = 0 self.form = '' @@ -30,6 +30,7 @@ def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): self.text = text self.newpar = newpar self.newdoc = newdoc + self.json = {} # TODO: or None and mask as {} in property reader&writer to save memory? self._sent_id = None self._zone = zone @@ -41,6 +42,13 @@ def __init__(self, zone=None, comment='', text=None, newpar=None, newdoc=None): @property def sent_id(self): """ID of this tree, stored in the sent_id comment in CoNLL-U.""" + if self._sent_id is not None: + return self._sent_id + zone = '/' + self.zone if self.zone else '' + if self._bundle is not None: + self._sent_id = self._bundle.address() + zone + else: + return '?' + zone return self._sent_id @sent_id.setter @@ -52,6 +60,21 @@ def sent_id(self, sent_id): self.zone = parts[1] self._sent_id = sent_id + def address(self): + """Full (document-wide) id of the root. + + The general format of root nodes is: + root.bundle.bundle_id + '/' + root.zone, e.g. s123/en_udpipe. + If zone is empty, the slash is excluded as well, e.g. s123. + If bundle is missing (could occur during loading), '?' is used instead. + Root's address is stored in CoNLL-U files as sent_id (in a special comment). + """ + return self.sent_id + + @property + def document(self): + return self._bundle._document + @property def bundle(self): """Return the bundle which this tree belongs to.""" @@ -72,6 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' + if self._bundle is not None: + self._sent_id = self._bundle.address() + slashzone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone + else: + self._sent_id = '?' + slashzone @property def parent(self): @@ -114,7 +144,7 @@ def remove(self, children=None): The default (None) is to delete them (and all their descendants). `warn` means to issue a warning. 
""" - if children is not None and self.children: + if children is not None and self._children: logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) self.bundle.trees = [root for root in self.bundle.trees if root != self] @@ -123,36 +153,42 @@ def shift(self, reference_node, after=0, move_subtree=0, reference_subtree=0): """Attempts at changing the word order of root result in Exception.""" raise Exception('Technical root cannot be shifted as it is always the first node') - def address(self): - """Full (document-wide) id of the root. + def create_empty_child(self, **kwargs): + """Create and return a new empty node within this tree. - The general format of root nodes is: - root.bundle.bundle_id + '/' + root.zone, e.g. s123/en_udpipe. - If zone is empty, the slash is excluded as well, e.g. s123. - If bundle is missing (could occur during loading), '?' is used instead. - Root's address is stored in CoNLL-U files as sent_id (in a special comment). - TODO: Make sure root.sent_id returns always the same string as root.address. + This root-specific implementation overrides `Node.create_empty_child()'. + It is faster because it does not set `deps` and `ord` of the newly created node. + It is up to the user to set up these attributes correctly. + It is used in `udapi.block.read.conllu` (where speed is important and thus, + only `raw_deps` are set up instead of `deps`). """ - zone = '/' + self.zone if self.zone else '' - if self._bundle is not None: - return self._bundle.address() + zone - elif self.sent_id is not None: - return self.sent_id + zone - else: - return '?' 
+ zone + new_node = EmptyNode(root=self, **kwargs) + self.empty_nodes.append(new_node) + return new_node # TODO document whether misc is a string or dict or it can be both - def create_multiword_token(self, words=None, form=None, misc=None): + def create_multiword_token(self, words=None, form=None, feats=None, misc=None): """Create and return a new multi-word token (MWT) in this tree. The new MWT can be optionally initialized using the following args. Args: words: a list of nodes which are part of the new MWT form: string representing the surface form of the new MWT - misc: misc attribute of the new MWT + misc: FEATS attribute of the new MWT (only `Typo=Yes` allowed there in UD guidelines) + misc: MISC attribute of the new MWT """ - mwt = MWT(words, form, misc, root=self) + # Nested or overlapping MWTs are not allowed in CoNLL-U, + # so first remove all previous MWTs containing any of words. + for w in words: + if w.multiword_token: + w.multiword_token.remove() + # Now, create the new MWT. + mwt = MWT(words, form, feats, misc, root=self) self._mwts.append(mwt) + if words[-1].misc["SpaceAfter"] == "No": + mwt.misc["SpaceAfter"] = "No" + for word in words: + word.misc["SpaceAfter"] = "" return mwt @property @@ -166,16 +202,6 @@ def multiword_tokens(self, mwts): """Set the list of all multi-word tokens in this tree.""" self._mwts = mwts - def _update_ordering(self): - """Update the ord attribute of all nodes. - - Update also the list of all tree nodes stored in root._descendants. - This method is automatically called after node removal or reordering. - """ - self._descendants = sorted(self.unordered_descendants(), key=lambda node: node.ord) - for (new_ord, node) in enumerate(self._descendants, 1): - node.ord = new_ord - def get_sentence(self, if_missing='detokenize'): """Return either the stored `root.text` or (if None) `root.compute_text()`. 
@@ -234,24 +260,29 @@ def token_descendants(self): result.append(node) return result + @property + def descendants_and_empty(self): + return sorted(self._descendants + self.empty_nodes) + def steal_nodes(self, nodes): """Move nodes from another tree to this tree (append).""" old_root = nodes[0].root for node in nodes[1:]: if node.root != old_root: raise ValueError("steal_nodes(nodes) was called with nodes from several trees") - nodes = sorted(nodes, key=lambda n: n.ord) + nodes = sorted(nodes) whole_tree = nodes == old_root.descendants new_ord = len(self._descendants) # pylint: disable=protected-access for node in nodes: new_ord += 1 node.ord = new_ord + node._root = self if not whole_tree: - for child in [n for n in node.children if n not in nodes]: + for child in [n for n in node._children if n not in nodes]: child._parent = old_root - old_root._children = sorted(old_root.children + [child], key=lambda n: n.ord) - node._children = [n for n in node.children if n in nodes] + old_root._children = sorted(old_root._children + [child]) + node._children = [n for n in node._children if n in nodes] if node.parent == old_root or (not whole_tree and node.parent not in nodes): node.parent._children = [n for n in node.parent._children if n != node] node._parent = self @@ -270,3 +301,39 @@ def steal_nodes(self, nodes): self.create_multiword_token(words=words, form=mwt.form, misc=mwt.misc) self._descendants += nodes # pylint: enable=protected-access + + def flatten(self, deprel='root'): + """Flatten the tree (i.e. attach all nodes to the root) and reset all deprels. + + This is equivalent to + for node in root.descendants: + node.parent = root + node.deprel = 'root' + but it is faster. 
+ """ + self._children = self._descendants[:] + for node in self._children: + node._parent = self + node._children.clear() + + @property + def prev_tree(self): + """Return the previous tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if num == 1: + return None + return doc.bundles[num - 2].get_tree(zone=self._zone) + + @property + def next_tree(self): + """Return the next tree (root) in the document (from the same zone).""" + doc = self._bundle._document + num = self._bundle.number + if len(doc.bundles) <= num - 1 or doc.bundles[num - 1] is not self._bundle: + num = doc.bundles.index(self._bundle) + 1 + if len(doc.bundles) <= num: + return None + return doc.bundles[num].get_tree(zone=self._zone) diff --git a/udapi/core/run.py b/udapi/core/run.py index f42f3f9d..6453641c 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,30 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + try: # ignore modules with compilation errors + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bnames = [c for c in dir(module) if c.lower() == sname] + if bnames: + blocks.append(f"{pname}.{bnames[0]}") + except: + pass + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
def _import_blocks(block_names, block_args):
    """Import the block classes and construct an instance of each.

    :param block_names: A list of block names (e.g. 'read.Conllu'
        or '.mypackage.myblock.MyBlock' for private modules).
    :param block_args: A list of dicts with parameters for each block constructor.
    :return: a list of (block_name, block_instance, args_string) triples.
    :rtype: list
    """
    import importlib

    blocks = []
    for block_id, block_name in enumerate(block_names):
        sub_path, class_name = _parse_block_name(block_name)

        # Private modules (outside udapi.block) are recognized by a dot at the beginning.
        if block_name.startswith('.'):
            module = block_name.lower()[1:]
        else:
            module = "udapi.block." + sub_path + "." + class_name.lower()

        # Import the module and look up the block class.
        # importlib is used instead of exec-ing a dynamically built import
        # statement: same effect, no exec/eval on user-provided strings.
        try:
            logging.debug("Trying to import module %s", module)
            imported_module = importlib.import_module(module)
            block_class = getattr(imported_module, class_name)
        except ModuleNotFoundError as err:
            package_name = ".".join(module.split(".")[:-1])
            package_blocks = _blocks_in_a_package(package_name)
            if not package_blocks:
                raise
            raise ModuleNotFoundError(
                f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n"
                f"Available blocks in {package_name} are:\n"
                + "\n".join(package_blocks)) from err
        except AttributeError as err:
            logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})")
            # Keep the exception type which `from module import class` used to raise.
            raise ImportError(f"cannot import name '{class_name}' from '{module}'") from err
        except Exception:
            logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})")
            raise

        # Construct the block instance (no eval needed).
        kwargs = block_args[block_id]
        new_block_instance = block_class(**kwargs)
        args = ' '.join(f"{k}={v}" for k, v in kwargs.items())
        blocks.append((block_name, new_block_instance, args))
    return blocks
# TODO: better implementation, included Scen
def scenario_string(self):
    """Return the scenario string."""
    scenario_lines = self.args.scenario
    return "\n".join(scenario_lines)


def create_block(block, **kwargs):
    """A factory function for creating new block instances (handy for IPython)."""
    imported = _import_blocks([block], [kwargs])
    _name, instance, _args = imported[0]
    return instance
+# json_mention_relations = [{"from": "s2#m1", "id": "r1", "to": "s2#m2", "type": "call"}] +# json_mentions = [{"id": "s2#m1", "label": "Jaroslav", "span": [1]}, {"id": "s2#m2", "label": "Babinský", "span": [4]}] +1 Jarda Jarda PROPN NNMS1-----A---- Animacy=Anim|Case=Nom|Gender=Masc|NameType=Giv|Number=Sing|Polarity=Pos 2 nsubj _ _ +2 telefonoval telefonovat VERB VpYS---XR-AA--- Aspect=Imp|Gender=Masc|Number=Sing|Polarity=Pos|Tense=Past|VerbForm=Part|Voice=Act 0 root _ _ +3 loupežníkovi loupežník NOUN NNMS3-----A---- Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing|Polarity=Pos 4 nmod _ _ +4 Babinskému Babinský PROPN NNMS3-----A---- Animacy=Anim|Case=Dat|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos 2 obj _ SpaceAfter=No +5 . . PUNCT Z:------------- _ 2 punct _ SpaceAfter=No + +# doc_json_entities = [{"id": "E3", "labels": ["Rumcajs"], "mentions": ["s3#m1"]}] +# newdoc +# sent_id = 3 +# text = Rumcajs je loupežník. +# json_mentions = [{"id": "s3#m1", "label": "Rumcajs", "span": [1]}] +1 Rumcajs Rumcajs PROPN NNMS1-----A---- Animacy=Anim|Case=Nom|Gender=Masc|NameType=Sur|Number=Sing|Polarity=Pos 3 nsubj _ _ +2 je být AUX VB-S---3P-AA--- Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act 3 cop _ _ +3 loupežník loupežník NOUN NNMS1-----A---- Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing|Polarity=Pos 0 root _ SpaceAfter=No +4 . . PUNCT Z:------------- _ 3 punct _ SpaceAfter=No + diff --git a/udapi/core/tests/data/fr-democrat-dev-sample.conllu b/udapi/core/tests/data/fr-democrat-dev-sample.conllu new file mode 100644 index 00000000..b3a85f80 --- /dev/null +++ b/udapi/core/tests/data/fr-democrat-dev-sample.conllu @@ -0,0 +1,60 @@ +# newdoc id = ungroupped-estrepublicain-2-066 +# global.Entity = eid-etype-head-other +# newpar id = ungroupped-estrepublicain-2-066-p0 +# sent_id = ungroupped-estrepublicain-2-066-p0-s1 +# text = Les allocations de décembre arrivent ! 
+1 Les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 2 det _ Entity=(e36772--2 +2 allocations allocation NOUN _ Gender=Fem|Number=Plur 5 nsubj _ _ +3 de de ADP _ _ 4 case _ _ +4 décembre décembre NOUN _ Gender=Masc|Number=Sing 2 nmod _ Entity=e36772) +5 arrivent arriver VERB _ Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +6 ! ! PUNCT _ _ 5 punct _ _ + +# newpar id = ungroupped-estrepublicain-2-066-p1 +# sent_id = ungroupped-estrepublicain-2-066-p1-s1 +# text = La Caisse d' Allocations familiales du Territoire de Belfort informe ses allocataires que le montant des prestations sera disponible sur les comptes bancaires ou postaux à partir du 8 janvier . +1 La le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ Entity=(e36773--2 +2 Caisse caisse NOUN _ Gender=Fem|Number=Sing 11 nsubj _ Entity=(e36774-organization-1 +3 d' de ADP _ _ 4 case _ _ +4 Allocations Allocations NOUN _ Gender=Fem|Number=Plur 2 nmod _ _ +5 familiales familial ADJ _ Gender=Fem|Number=Plur 4 amod _ _ +6-7 du _ _ _ _ _ _ _ _ +6 de de ADP _ _ 8 case _ _ +7 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 8 det _ Entity=(e36775--2 +8 Territoire territoire NOUN _ Gender=Masc|Number=Sing 2 nmod _ _ +9 de de ADP _ _ 10 case _ _ +10 Belfort Belfort PROPN _ _ 8 nmod _ Entity=e36775)e36774)e36773) +11 informe informer VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ +12 ses son DET _ Gender=Masc|Number=Plur|Poss=Yes|PronType=Prs 13 det _ Entity=(e36776--2(e36773--1) +13 allocataires allocataire NOUN _ Gender=Masc|Number=Plur 11 obj _ Entity=e36776) +14 que que PRON _ PronType=Rel 21 mark _ _ +15 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 16 det _ Entity=(e36777--2 +16 montant montant NOUN _ Gender=Masc|Number=Sing 21 nsubj _ _ +17-18 des _ _ _ _ _ _ _ _ +17 de de ADP _ _ 19 case _ _ +18 les le DET _ Definite=Def|Gender=Fem|Number=Plur|PronType=Art 19 det _ Entity=(e36778--2 +19 prestations prestation NOUN _ 
Gender=Fem|Number=Plur 16 nmod _ Entity=e36778)e36777) +20 sera être AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin 21 cop _ _ +21 disponible disponible ADJ _ Gender=Fem|Number=Sing 11 advcl _ _ +22 sur sur ADP _ _ 24 case _ _ +23 les le DET _ Definite=Def|Gender=Masc|Number=Plur|PronType=Art 24 det _ Entity=(e36779--2 +24 comptes compte NOUN _ Gender=Masc|Number=Plur 21 obl _ _ +25 bancaires bancaire ADJ _ Gender=Masc|Number=Plur 24 amod _ _ +26 ou ou CCONJ _ _ 27 cc _ _ +27 postaux postal ADJ _ Gender=Masc|Number=Plur 25 conj _ Entity=e36779) +28 à à ADP _ _ 33 case _ _ +29 partir partir VERB _ VerbForm=Inf 28 fixed _ _ +30-31 du _ _ _ _ _ _ _ _ +30 de de ADP _ _ 28 fixed _ _ +31 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 33 det _ Entity=(e36780--3 +32 8 8 NUM _ _ 33 nummod _ _ +33 janvier janvier NOUN _ Gender=Masc|Number=Sing 21 obl _ Entity=e36780) +34 . . PUNCT _ _ 11 punct _ _ + +# newdoc id = ungroupped-estrepublicain-2-005 +# global.Entity = eid-etype-head-other +# newpar id = ungroupped-estrepublicain-2-005-p0 +# sent_id = ungroupped-estrepublicain-2-005-p0-s1 +# text = Vitry-le-François +1 Vitry-le-François Vitry-le-François PROPN _ _ 0 root _ Entity=(e36781-place-1) + diff --git a/udapi/core/tests/external_tests.sh b/udapi/core/tests/external_tests.sh index 55ded49d..ac93cadb 100755 --- a/udapi/core/tests/external_tests.sh +++ b/udapi/core/tests/external_tests.sh @@ -1,3 +1,6 @@ #!/bin/bash +set -e -udapy read.Conllu files=data/UD_Czech_sample.conllu write.Conllu print_sent_id=0 print_text=0 > out.conllu && diff data/UD_Czech_sample.conllu out.conllu && rm out.conllu +udapy write.Conllu print_sent_id=0 print_text=0 < data/UD_Czech_sample.conllu > out.conllu && diff data/UD_Czech_sample.conllu out.conllu && rm out.conllu + +udapy -s read.Conllu files=data/babinsky.conllu split_docs=1 > out.conllu && diff data/babinsky.conllu out.conllu && rm out.conllu diff --git a/udapi/core/tests/test_coref.py 
#!/usr/bin/env python3

import os
import unittest
import udapi
from udapi.block.read.conllu import Conllu as ConlluReader


class TestCoref(unittest.TestCase):
    """Tests for loading and editing coreference annotation."""

    @staticmethod
    def _sample_path():
        # Shared fixture: a small sample of the French Democrat corpus.
        return os.path.join(os.path.dirname(__file__), 'data', 'fr-democrat-dev-sample.conllu')

    def test_load(self):
        path = self._sample_path()
        reader = ConlluReader(files=path, split_docs=True)
        documents = reader.read_documents()
        self.assertEqual(len(documents), 2)
        last_doc = documents[-1]
        last_doc.draw()
        entities = last_doc.coref_entities
        self.assertEqual(len(entities), 1)
        self.assertEqual(entities[0].eid, 'e36781')
        word = next(last_doc.nodes)
        self.assertEqual(len(word.coref_entities), 1)
        self.assertEqual(len(word.coref_mentions), 1)
        self.assertEqual(word.coref_entities[0], entities[0])
        self.assertEqual(last_doc.meta["loaded_from"], path)

    def test_edits(self):
        doc = udapi.Document(self._sample_path())
        word1 = next(doc.nodes)
        word2 = word1.next_node
        entity1 = doc.create_coref_entity(etype='person')
        self.assertEqual(entity1.etype, 'person')
        self.assertEqual(len(entity1.mentions), 0)

        m1 = entity1.create_mention(words=[word1])  # head will be automatically set to words[0]
        self.assertEqual(len(entity1.mentions), 1)
        self.assertEqual(m1, entity1.mentions[0])
        self.assertEqual(m1.entity, entity1)
        self.assertEqual(m1.head, word1)
        self.assertEqual(m1.words, [word1])
        self.assertEqual(m1.span, '1')

        m1.words = [word2, word1, word1]  # intentional duplicates and wrong order
        self.assertEqual(m1.words, [word1, word2])
        self.assertEqual(m1.span, '1-2')
        m1.head = word2
        self.assertEqual(m1.head, word2)

        m2 = entity1.create_mention(head=word2, span='1-3')  # mention.words will be filled according to the span
        self.assertEqual(len(entity1.mentions), 2)
        self.assertEqual(entity1.mentions[0], m2)  # 1-3 should go before 1-2
        self.assertEqual(entity1.mentions[1], m1)
        self.assertTrue(m2 < m1)
        self.assertEqual(m2.words, [word1, word2, word2.next_node])

        entity2 = doc.create_coref_entity()
        m1.entity = entity2
        self.assertEqual(m1.entity.eid, entity2.eid)
        m2.entity = entity2
        self.assertEqual(m2.entity.eid, entity2.eid)


def test_ids(self):
    """TestDocument.test_ids: bundle and tree addresses are 1-based positions."""
    doc = Document()
    first = doc.create_bundle()
    second = doc.create_bundle()
    self.assertEqual(first.address(), "1")
    self.assertEqual(second.address(), "2")
    self.assertEqual([b.bundle_id for b in doc], ["1", "2"])
    tree = first.create_tree()
    self.assertEqual(tree.address(), "1")


if __name__ == "__main__":
    unittest.main()
+ Tests the behaviour with empty nodes (with decimal ord, such as 0.1, 2.3 etc.) as well""" + + @classmethod + def setUpClass(cls): + cls.doc = Document() + cls.data = os.path.join(os.path.dirname(udapi.__file__), "core", "tests", "data", "enh_deps.conllu") + cls.doc.load_conllu(cls.data) + cls.tree = cls.doc.bundles[0].get_tree() + cls.nodes = cls.tree.descendants + cls.add_empty_node(cls.tree, 3) + + @staticmethod + def add_empty_node(tree, ord_before, decimal=1): + """Add an empty node to tree after the node with index `ord_before`. + Empty node will receive ord=`ord_before`.`decimal`""" + e = tree.create_empty_child() + e.ord = float('{}.{}'.format(ord_before, decimal)) + e.form = "E{}".format(e.ord) + + def test_datapath(self): + self.assertTrue(os.path.isfile(self.data)) + + def test_nodes(self): + self.assertEqual(6, len(self.nodes)) + + def test_ord_type(self): + self.assertIsNot(str, type(self.nodes[0].ord)) + + def test_create_empty(self): + writer = ConlluWriter() + writer.apply_on_document(self.doc) + # self.tree.draw() + self.assertGreater(len(self.tree.empty_nodes), 0) + + def test_regular_deps(self): + + n = self.nodes[0] + self.assertEqual("0:root|2:amod", n.raw_deps) + + def test_create_deps2empty(self): + e = self.tree.empty_nodes[0] + h = self.nodes[1] + d = self.nodes[5] + e.deps.append({'parent': h, 'deprel':'dep:e2h'}) + d.deps.append({'parent': e, 'deprel': 'dep:d2e'}) + self.assertEqual("2:dep:e2h", e.raw_deps, ) + self.assertEqual("3.1:dep:d2e|5:conj", d.raw_deps) + self.assertEqual(self.tree.descendants_and_empty, self.nodes[:3] + [e] + self.nodes[3:]) + + diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 52e94722..f2b64a3d 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -36,6 +36,8 @@ def test_topology(self): self.assertEqual(len(nodes[1].children), 3) self.assertEqual(len(nodes[1].children(add_self=True)), 4) self.assertEqual(len(nodes[1].children(add_self=1, 
following_only=1)), 3) + self.assertEqual(nodes[2].siblings, [nodes[0], nodes[3]]) + self.assertEqual(nodes[2].siblings(following_only=True), [nodes[3]]) self.assertEqual(nodes[0].next_node, nodes[1]) self.assertEqual(nodes[2].prev_node, nodes[1]) @@ -52,12 +54,19 @@ def test_topology(self): # ords and reorderings self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6]) + self.assertTrue(nodes[0].precedes(nodes[1])) + self.assertTrue(nodes[0] < nodes[1]) + self.assertFalse(nodes[0] > nodes[1]) + self.assertTrue(nodes[0] <= nodes[0]) nodes[0].shift_after_node(nodes[1]) self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6]) self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6]) + self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 4]) + nodes[3].shift_before_node(nodes[2]) + self.assertEqual([node.ord for node in nodes[1].children], [2, 3, 6]) - def test_print_subtree(self): - """Test print_subtree() method, which uses udapi.block.write.textmodetrees.""" + def test_draw(self): + """Test the draw() method, which uses udapi.block.write.textmodetrees.""" doc = Document() data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu') doc.load_conllu(data_filename) @@ -98,23 +107,23 @@ def test_print_subtree(self): try: sys.stdout = capture = io.StringIO() - root.print_subtree(color=False) + root.draw(color=False) self.assertEqual(capture.getvalue(), expected1) capture.seek(0) capture.truncate() - root.print_subtree(color=False, attributes='form,feats,misc', - print_sent_id=False, print_text=False) + root.draw(color=False, attributes='form,feats,misc', + print_sent_id=False, print_text=False) self.assertEqual(capture.getvalue(), expected2) capture.seek(0) capture.truncate() - root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0) + root3.draw(color=False, attributes='form', print_sent_id=0, print_text=0) self.assertEqual(capture.getvalue(), expected3) finally: 
def test_deprel(self):
    """Test getting and setting the dependency relation (deprel = udeprel ':' sdeprel)."""
    n = Node(root=None, deprel='acl:relcl')

    def check(deprel=None, udeprel=None, sdeprel=None):
        # Verify only the attributes the caller supplies (sdeprel='' is valid).
        if deprel is not None:
            self.assertEqual(n.deprel, deprel)
        if udeprel is not None:
            self.assertEqual(n.udeprel, udeprel)
        if sdeprel is not None:
            self.assertEqual(n.sdeprel, sdeprel)

    check(deprel='acl:relcl', udeprel='acl', sdeprel='relcl')
    n.udeprel = 'advcl'
    check(deprel='advcl:relcl')
    n.sdeprel = 'tcl'
    check(deprel='advcl:tcl')
    n.sdeprel = ''
    check(deprel='advcl', udeprel='advcl', sdeprel='')
    n.udeprel = 'nsubj'
    check(deprel='nsubj', udeprel='nsubj', sdeprel='')
    n.udeprel = 'nsubj:pass:outer'
    check(deprel='nsubj:pass:outer', udeprel='nsubj', sdeprel='pass:outer')


def test_empty_nodes(self):
    """Test creation of empty nodes and how their ord is changed when removing nodes."""
    root = Root()
    for i in range(3):
        root.create_child(form=f'node{i+1}')
    n1, n2, n3 = root.descendants()
    n3.parent = n2

    e1 = n1.create_empty_child('dep', after=False, form='e1')
    e2 = n1.create_empty_child('dep', after=False, form='e2')
    e3 = n1.create_empty_child('dep', after=True, form='e3')
    e4 = n1.create_empty_child('dep', after=True, form='e4')
    e5 = n2.create_empty_child('dep', after=False, form='e5')
    e6 = n1.create_empty_child('dep', after=True, form='e6')

    def assert_order(expected_nodes, expected_ords):
        # Both the node sequence and the renumbered ords must match.
        self.assertEqual(root.descendants_and_empty, expected_nodes)
        self.assertEqual([node.ord for node in root.descendants_and_empty], expected_ords)

    self.assertEqual(root.empty_nodes, [e1, e2, e3, e4, e5, e6])
    assert_order([e1, e2, n1, e3, e4, e5, e6, n2, n3],
                 [0.1, 0.2, 1, 1.1, 1.2, 1.3, 1.4, 2, 3])
    e5.remove()
    assert_order([e1, e2, n1, e3, e4, e6, n2, n3],
                 [0.1, 0.2, 1, 1.1, 1.2, 1.3, 2, 3])
    n1.remove()
    assert_order([e1, e2, e3, e4, e6, n2, n3],
                 [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2])
    e7 = n3.create_empty_child('dep', after=True, form='e7')
    assert_order([e1, e2, e3, e4, e6, n2, n3, e7],
                 [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 2.1])
    n2.remove()
    assert_order([e1, e2, e3, e4, e6, e7],
                 [0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
"""Wrapper for MorphoDiTa (more pythonic than ufal.morphodita)."""
from collections import namedtuple

from ufal.morphodita import Morpho, TaggedLemmasForms, TaggedLemmas  # pylint: disable=no-name-in-module
from udapi.core.resource import require_file

# One morphological analysis; `guesser` records whether the guesser was used.
FormInfo = namedtuple('FormInfo', 'form lemma tag guesser')


class MorphoDiTa:
    """Wrapper for MorphoDiTa."""

    def __init__(self, model):
        """Create the MorphoDiTa tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Morpho.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)

    def forms_of_lemma(self, lemma, tag_wildcard='?', guesser=True):
        """Return all forms (a list of FormInfo tuples) of a given lemma matching a given tag wildcard."""
        generated = TaggedLemmasForms()
        used_guesser = self.tool.generate(lemma, tag_wildcard, 1 if guesser else 0, generated)
        return [FormInfo(entry_form.form, entry.lemma, entry_form.tag, used_guesser)
                for entry in generated
                for entry_form in entry.forms]

    def analyze_form(self, form, guesser=True):
        """Return all lemma-tag analyses (a list of FormInfo tuples) of a given form."""
        analyses = TaggedLemmas()
        used_guesser = self.tool.analyze(form, 1 if guesser else 0, analyses)
        return [FormInfo(form, analysis.lemma, analysis.tag, used_guesser)
                for analysis in analyses]
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.
    """
    if ranges:
        raise ValueError('ranges=True is implemented only in the REST API version (add "online=1" to the udpipe block)')
    if root.children:
        raise ValueError('Tree already contained nodes before tokenization')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    segments = []
    while True:
        segment = Sentence()
        if not self.tokenizer.nextSentence(segment):
            break
        segments.append(segment)

    # If resegmentation was not required, we need to join the segments.
    if not resegment and len(segments) > 1:
        merged = segments[0]
        word_count = merged.words.size() - 1
        for extra in segments[1:]:
            for word_index in range(1, extra.words.size()):
                udpipe_word = extra.words[word_index]
                word_count += 1
                udpipe_word.id = word_count
                merged.words.append(udpipe_word)
        segments = [merged]

    # tagging and parsing
    if tag:
        for segment in segments:
            self.tool.tag(segment, Model.DEFAULT)
            if parse:
                self.tool.parse(segment, Model.DEFAULT)
    elif parse:
        raise ValueError('Combination parse=True tag=False is not allowed.')

    # converting UDPipe nodes to Udapi nodes
    trees = []
    target_root = root  # the first segment is stored into the provided root
    for segment in segments:
        if not target_root:
            target_root = Root()
        target_root.text = segment.getText() if resegment else root.text
        udapi_nodes = [target_root]
        heads = []
        udpipe_words = segment.words
        for word_index in range(1, udpipe_words.size()):
            udpipe_word = udpipe_words[word_index]
            child = target_root.create_child(
                form=udpipe_word.form, lemma=udpipe_word.lemma, upos=udpipe_word.upostag,
                xpos=udpipe_word.xpostag, feats=udpipe_word.feats, deprel=udpipe_word.deprel,
                misc=udpipe_word.misc,
            )
            if parse:
                heads.append(udpipe_word.head)
            udapi_nodes.append(child)
        # Attach nodes only after all of them exist (heads may point forward).
        if parse:
            for child, head in zip(udapi_nodes[1:], heads):
                child.parent = udapi_nodes[head]
        trees.append(target_root)
        target_root = None
    return trees


def segment_text(self, text):
    """Segment the provided text into sentences."""
    self.tokenizer.setText(text)
    sentences = []
    while True:
        segment = Sentence()
        if not self.tokenizer.nextSentence(segment):
            break
        sentences.append(segment.getText())
    return sentences
def perform_request_urlencoded(self, params, method="process"):
    """Perform a request using application/x-www-form-urlencoded to preserve LF newlines.

    This avoids CRLF normalization done by the email MIME serializer, ensuring that
    the content of the 'data' field retains Unix LF ("\n") exactly as provided.
    """
    body = urllib.parse.urlencode(params).encode("utf-8")
    headers = {"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"}
    url = f"{self.server}/{method}"

    try:
        with urllib.request.urlopen(urllib.request.Request(
                url=url, headers=headers, data=body)) as request:
            response = json.loads(request.read())
    except urllib.error.HTTPError as e:
        # The service reports details in the response body; show them before re-raising.
        print("An exception was raised during UDPipe '{}' REST request.\n"
              "The service returned the following error:\n"
              "  {}".format(method, e.fp.read().decode("utf-8")), file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
              "  {}".format(method, e.msg), file=sys.stderr)
        raise

    if "model" not in response or "result" not in response:
        raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method))

    return response["result"]
attr)) + + def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, ranges=False): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. + + If resegment=True, the returned list of Udapi trees may contain multiple trees. + If ranges=True, each token will contain `node.misc[TokenRange]` will contain character level 0-based ranges, e.g. `0:2`. + """ + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + if root.children: + raise ValueError('Tree already contained nodes before tokenization') + + # Tokenize and possibly segment the input text + params = {"model": self.model, "data": root.text, "tokenizer":"" if resegment else "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if ranges: + params["tokenizer"] = "presegmented;ranges" if resegment else "ranges" + out_data = self.perform_request_urlencoded(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + # The input "root" object must be the first item in "trees". 
+ for attr in ('_children', '_descendants', '_mwts', 'text', 'comment'): + setattr(root, attr, getattr(trees[0], attr)) + for node in root._children: + node._parent = root + for node in root._descendants: + node._root = root + trees[0] = root + return trees + + def segment_text(self, text): + """Segment the provided text into sentences returned as a Python list.""" + params = {"model": self.model, "data": text, "tokenizer":"", "output": "plaintext=normalized_spaces"} + return self.perform_request_urlencoded(params=params).rstrip().split("\n") + + def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=False, ranges=False): + """Delete all existing bundles and substitute them with those parsed by UDPipe.""" + if parse and not tag: + raise ValueError('Combination parse=True tag=False is not allowed.') + params = {"model": self.model, "tokenizer": "presegmented"} + if tag: + params["tagger"] = "" + if parse: + params["parser"] = "" + if resegment: + params["tokenizer"] = "" + if ranges: + params["tokenizer"] = "ranges" if resegment else "presegmented;ranges" + + #in_trees = [] + #for bundle in doc.bundles: + # assert(len(bundle.trees) == 1) + # in_trees.append(bundle.trees[0]) + if tokenize: + params["data"] = "\n".join(root.text for root in doc.trees) + "\n" + else: + params["input"] = "horizontal" + params["data"] = "\n".join(" ".join([n.form for n in root.descendants]) for root in doc.trees) + "\n" + + out_data = self.perform_request_urlencoded(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") + conllu_reader.files.filehandle = io.StringIO(out_data) + trees = conllu_reader.read_trees() + + bundles = list(reversed(doc.bundles)) + for tree in trees: + if bundles: + bundle = bundles.pop() + # TODO is this safe? + bundle.trees = [] + else: + bundle = doc.create_bundle() + bundle.add_tree(tree)