diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..9530d5c7 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,62 @@ +# Use the latest 2.1 version of CircleCI pipeline process engine. +# See: https://circleci.com/docs/2.0/configuration-reference +version: 2.1 + +# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. +# See: https://circleci.com/docs/2.0/orb-intro/ +orbs: + # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files + # Orb commands and jobs help you with common scripting around a language/tool + # so you don't have to copy and paste it everywhere. + # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python + python: circleci/python@1.5.0 + +# Define a job to be invoked later in a workflow. +# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + parameters: + python-version: + type: string + # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/ + # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub + # A list of available CircleCI Docker convenience images is available here: https://circleci.com/developer/images/image/cimg/python + # The executor is the environment in which the steps below will be executed - below it will use a Python container whose version is given by the python-version parameter + # The versions to test are listed in the workflow matrix at the end of this file + docker: + - image: cimg/python:<< parameters.python-version >> + # Checkout the code as the first step. This is a dedicated CircleCI step. 
+ # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. + # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt. + # Then run your tests! + # CircleCI will report the results back to your VCS provider. + steps: + - checkout + - python/install-packages: + pkg-manager: pip + - run: + name: Install Udapi + command: pip install ".[test]" + - run: mkdir -p test-results + - run: + name: Run pytest tests + command: pytest --junitxml=test-results/junit.xml -o junit_family=legacy + - store_test_results: + path: test-results + - run: + name: Color TextModeTrees + command: udapy read.Conllu files=udapi/core/tests/data/babinsky.conllu write.TextModeTrees color=1 + - run: + name: External tests + command: cd udapi/core/tests && ./external_tests.sh + + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
workflows: + test-matrix: + jobs: + - build-and-test: + matrix: + parameters: + python-version: ["3.9", "3.11", "3.13"] diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..0285eddb --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,70 @@ +# This workflow will upload a Python Package to PyPI when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: + - release-build + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + + # Dedicated environments with protections for publishing are strongly recommended. + # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules + environment: + name: pypi + # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: + url: https://pypi.org/p/udapi + # + # ALTERNATIVE: if your GitHub Release name is the PyPI project version string + # ALTERNATIVE: exactly, uncomment the following line instead: + # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ diff --git a/.gitignore b/.gitignore index a75e7c05..adc7bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.cache .idea +*.egg-info/ *.pyc -.cache +dist/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..8804cc4e --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,23 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Currently, RTD needs to select an OS with OpenSSL>=1.1.1 because of +# urllib3's dependence on that system library. (alternatively, pin urllib3<2) +# See https://github.com/urllib3/urllib3/issues/2168 +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: false + +python: + install: + - requirements: docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 7a8ded6c..00000000 --- a/.travis.yml +++ /dev/null @@ -1,10 +0,0 @@ -language: python -python: - - "3.3" - - "3.4" - - "3.5" -install: - - python setup.py install -script: - - python -m pytest - - cd udapi/core/tests && ./external_tests.sh diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 00000000..98e26605 --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,35 @@ +Udapi Change Log +---------------- +See https://github.com/udapi/udapi-python/commits/master for details. 
+ +0.5.1 2025-11-05 + - make udapy compatible with Python 3.13 + +0.5.0 2025-10-18 + - added mwt.feats + - added root.prev_tree and root.next_tree + - .github/workflows/python-publish.yml + - edits by Dan Zeman in block.ud.* + +0.4.0 2025-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + +0.3.0 2022-04-06 + - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) + - edits by Dan Zeman in block.ud.* + - Circle-CI (instead of Travis-CI) + +0.2.3 2021-02-23 + - support for enhanced dependencies and coreference + - requires Python 3.6+ due to f-strings + - speed-up (benchmark 40.5s -> 10.4s) + +0.2.2 2018-01-08 + - support for loading/storing documents from/to strings + - allow private modules (starting with dot instead of udapi.block) + - MorphoDiTa wrapper udapi/tool/morphodita.py + - root.sent_id returns always the same as root.address() + +0.2.1 2017-10-23 the first PyPI release diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..9cecc1d4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. 
You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. 
Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + {one line to give the program's name and a brief idea of what it does.} + Copyright (C) {year} {name of author} + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + {project} Copyright (C) {year} {fullname} + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/README.md b/README.md index 721d1354..36465c78 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,29 @@ # udapi-python Python framework for processing Universal Dependencies data -[![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Build Status](https://circleci.com/gh/udapi/udapi-python.svg?style=shield)](https://circleci.com/gh/udapi/udapi-python) +[![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) +[![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements -- You need Python 3.3 or higher. +- You need Python 3.9 or higher. +- It is recommended to install Udapi in a Python virtual environment. +- If you need the [ufal.udpipe](https://pypi.python.org/pypi/ufal.udpipe/) parser (to be used from Udapi) + install it (with `pip install --upgrade ufal.udpipe`). ## Install Udapi for developers -Let's clone the git repo to `~/udapi-python/`, install dependencies -and setup `$PATH` and `$PYTHONPATH` accordingly. +Let's clone the git repo e.g. to `~/udapi-python/` and make an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) ```bash cd git clone https://github.com/udapi/udapi-python.git -pip3 install --user -r udapi-python/requirements.txt -echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc -echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc -echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc -source ~/.bashrc # or open new bash +cd udapi-python +pip install -e . ``` ## Install Udapi for users -This is similar to the above, but installs Udapi to the standard (user) Python paths. +This is similar to the above, but installs Udapi from PyPI to the standard (user) Python paths. 
``` -pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git +pip install --upgrade udapi ``` Try `udapy -h` to check it is installed correctly. If it fails, make sure your `PATH` includes the directory where `pip3` installed the `udapy` script. diff --git a/bin/udapy b/bin/udapy index 03b9e3d2..83c7a6f2 100755 --- a/bin/udapy +++ b/bin/udapy @@ -1,70 +1,7 @@ #!/usr/bin/env python3 +"""Thin wrapper for backward compatibility. Calls udapi.cli.main().""" +import sys +from udapi.cli import main -import logging -import argparse - -from udapi.core.run import Run - -# Parse command line arguments. -argparser = argparse.ArgumentParser( - description='udapy - Python interface to Udapi - API for Universal Dependencies') -argparser.add_argument( - "-q", "--quiet", action="store_true", - help="Warning, info and debug messages are suppressed. Only fatal errors are reported.") -argparser.add_argument( - "-v", "--verbose", action="store_true", - help="Warning, info and debug messages are printed to the STDERR.") -argparser.add_argument( - "-s", "--save", action="store_true", - help="Add write.Conllu to the end of the scenario") -argparser.add_argument( - "-T", "--save_text_mode_trees", action="store_true", - help="Add write.TextModeTrees color=1 to the end of the scenario") -argparser.add_argument( - "-H", "--save_html", action="store_true", - help="Add write.TextModeTreesHtml color=1 to the end of the scenario") -argparser.add_argument( - "-A", "--save_all_attributes", action="store_true", - help="Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)") -argparser.add_argument( - "-C", "--save_comments", action="store_true", - help="Add print_comments=1 (to be used after -T and -H)") -argparser.add_argument( - "-M", "--marked_only", action="store_true", - help="Add marked_only=1 to the end of the scenario (to be used after -T and -H)") -argparser.add_argument( - "-N", "--no_color", action="store_true", - help="Add color=0 to the 
end of the scenario, this overrides color=1 of -T and -H") -argparser.add_argument( - 'scenario', nargs=argparse.REMAINDER, help="A sequence of blocks and their parameters.") - -args = argparser.parse_args() - -# Set the level of logs according to parameters. -if args.verbose: - level = logging.DEBUG -elif args.quiet: - level = logging.CRITICAL -else: - level = logging.INFO - -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=level) - -# Process and provide the scenario. if __name__ == "__main__": - if args.save: - args.scenario = args.scenario + ['write.Conllu'] - if args.save_text_mode_trees: - args.scenario = args.scenario + ['write.TextModeTrees', 'color=1'] - if args.save_html: - args.scenario = args.scenario + ['write.TextModeTreesHtml', 'color=1'] - if args.save_all_attributes: - args.scenario = args.scenario + ['attributes=form,lemma,upos,xpos,feats,deprel,misc'] - if args.save_comments: - args.scenario = args.scenario + ['print_comments=1'] - if args.marked_only: - args.scenario = args.scenario + ['marked_only=1'] - if args.no_color: - args.scenario = args.scenario + ['color=0'] - runner = Run(args) - runner.execute() + sys.exit(main()) diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain path to udapi-python. 
+py %~dp$PATH:0\udapy %* diff --git a/demo/python-demo.sh b/demo/python-demo.sh index aefa17cf..d83e51d9 100755 --- a/demo/python-demo.sh +++ b/demo/python-demo.sh @@ -3,4 +3,4 @@ export PATH=../bin:$PATH export PYTHONPATH=../:$PYTHONPATH -udapy read.Conllu filename=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu +udapy read.Conllu files=en-sample.conllu demo.RehangPrepositions write.Conllu > prepositions-up.conllu diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..a1d82581 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,4 @@ +_build +udapi.rst +udapi.*.rst +modules.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..17d5375a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Udapi +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 00000000..0857cc98 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,17 @@ +.. _api: + +================= +API Documentation +================= + +``udapi`` package +======================== + +.. automodule:: udapi + :members: + +------------------------ + +**Sub-modules** + +.. 
toctree:: modules diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..b7d0f6e5 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Udapi documentation build configuration file, created by +# sphinx-quickstart on Mon Mar 27 17:08:03 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = 'Udapi' +copyright = '2023, Martin Popel' +author = 'Martin Popel' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0' +# The full version, including alpha/beta/rc tags. +release = '3' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'alabaster' +import sphinx_rtd_theme +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Udapidoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Udapi.tex', 'Udapi Documentation', + 'Martin Popel', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'udapi', 'Udapi Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Udapi', 'Udapi Documentation', + author, 'Udapi', 'API and framework for processing Universal Dependencies', + 'Miscellaneous'), +] + + +def run_apidoc(_): + + cur_dir = os.path.abspath(os.path.dirname(__file__)) + print(cur_dir) + module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) + print(module) + + from sphinx.ext.apidoc import main + main(['--separate', '-o', cur_dir, module, '--force']) + +def setup(app): + app.connect('builder-inited', run_apidoc) + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..78a2d540 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. Udapi documentation master file, created by + sphinx-quickstart on Mon Mar 27 17:08:03 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Udapi's documentation! +================================= + +Udapi is a framework providing an API for processing +`Universal Dependencies `_ data. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + install + api + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 00000000..14f81527 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,21 @@ +.. _instalation: + +============ +Installation +============ + +You need Python 3.3 or higher, pip3 and git. + + +Let's clone the git repo to ``~/udapi-python/``, install dependencies +and setup ``$PATH`` and ``$PYTHONPATH`` accordingly: + +.. 
code-block:: bash + + cd + git clone https://github.com/udapi/udapi-python.git + pip3 install --user -r udapi-python/requirements.txt + echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc + echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc + echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc + source ~/.bashrc # or open new bash diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..a537f220 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +colorama>=0.4.6 +termcolor +ufal.udpipe +sphinx_rtd_theme diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..18d5c717 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "udapi" +version = "0.5.2" +description = "Python framework for processing Universal Dependencies data" +readme = "README.md" +requires-python = ">=3.9" +license = "GPL-3.0-or-later" +authors = [ + {name = "Martin Popel", email = "popel@ufal.mff.cuni.cz"} +] +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "colorama", + "termcolor", +] + +[project.urls] +Homepage = "https://github.com/udapi/udapi-python" + +[project.optional-dependencies] +test = ["pytest"] +udpipe = ["ufal.udpipe"] + +[project.scripts] +udapy = "udapi.cli:main" + +[tool.setuptools] +packages = {find = {}} +include-package-data = true diff --git a/requirements.txt b/requirements.txt index a994db47..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -colorama +colorama>=0.4.6 termcolor +ufal.udpipe diff --git a/setup.py b/setup.py deleted file mode 100644 index b9d0890b..00000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 - -from setuptools import setup, find_packages - -# python_requires is supported by pip only from November 2016, -# so 
let's check the Python version also the old way. -import sys -if sys.version_info < (3, 3): - raise SystemExit('Udapi requires Python 3.3 or higher.') - -setup( - name='udapi-python', - version='0.1', - description='Python framework for processing Universal Dependencies data', - author='Vincent Kriz', - author_email='kriz@ufal.mff.cuni.cz', - url='https://github.com/udapi/udapi-python', - packages=find_packages(), - scripts=['bin/udapy'], - tests_require=['pytest'], - install_requires=['colorama', 'termcolor'], - python_requires='>=3.3' -) diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 00000000..e079f8a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1 @@ +pytest diff --git a/tutorial/01-visualizing.ipynb b/tutorial/01-visualizing.ipynb new file mode 100644 index 00000000..70bea240 --- /dev/null +++ b/tutorial/01-visualizing.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "Udapi is an API and framework for processing [Universal Dependencies](http://universaldependencies.org/). In this tutorial, we will focus on the Python version of Udapi. Perl and Java versions are [available](http://udapi.github.io/) as well, but they are missing some of the features.\n", + "\n", + "Udapi can be used from the shell (e.g. Bash), using the wrapper script `udapy`. It can be also used as a library, from Python, IPython or Jupyter notebooks. We will show both of these ways bellow.\n", + "\n", + "This tutorial uses Details sections for extra info (if you want to know more or if you run into problems). You need to click on it to show its content.\n", + "
Details\n", + "It is a substitute for footnotes. The content may be long and showing it in the main text may be distracting.\n", + "
\n", + "\n", + "### Install (upgrade) Udapi\n", + "First, make sure you have the newest version of Udapi. If you have already installed Udapi [using git clone](https://github.com/udapi/udapi-python#install-udapi-for-developers), just run `git pull`. If you have not installed Udapi yet, run\n", + "
Details\n", + "
    \n", + "
  • The command below installs Udapi from GitHub (from the master branch). With pip3 install --user --upgrade udapi, you can install the last version released on PyPI (possibly older).\n", + "
  • The exclamation mark (!) in Jupyter or IPython means that the following command will be executed by the system shell (e.g. Bash).\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install --user --upgrade git+https://github.com/udapi/udapi-python.git\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, make sure you can run the command-line interface `udapy`, e.g. by printing the help message." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: udapy [optional_arguments] scenario\r\n", + "\r\n", + "udapy - Python interface to Udapi - API for Universal Dependencies\r\n", + "\r\n", + "Examples of usage:\r\n", + " udapy -s read.Sentences udpipe.En < in.txt > out.conllu\r\n", + " udapy -T < sample.conllu | less -R\r\n", + " udapy -HAM ud.MarkBugs < sample.conllu > bugs.html\r\n", + "\r\n", + "positional arguments:\r\n", + " scenario A sequence of blocks and their parameters.\r\n", + "\r\n", + "optional arguments:\r\n", + " -h, --help show this help message and exit\r\n", + " -q, --quiet Warning, info and debug messages are suppressed. 
Only fatal errors are reported.\r\n", + " -v, --verbose Warning, info and debug messages are printed to the STDERR.\r\n", + " -s, --save Add write.Conllu to the end of the scenario\r\n", + " -T, --save_text_mode_trees\r\n", + " Add write.TextModeTrees color=1 to the end of the scenario\r\n", + " -H, --save_html Add write.TextModeTreesHtml color=1 to the end of the scenario\r\n", + " -A, --save_all_attributes\r\n", + " Add attributes=form,lemma,upos,xpos,feats,deprel,misc (to be used after -T and -H)\r\n", + " -C, --save_comments Add print_comments=1 (to be used after -T and -H)\r\n", + " -M, --marked_only Add marked_only=1 to the end of the scenario (to be used after -T and -H)\r\n", + " -N, --no_color Add color=0 to the end of the scenario, this overrides color=1 of -T and -H\r\n", + "\r\n", + "See http://udapi.github.io\r\n" + ] + } + ], + "source": [ + "!udapy -h" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details: If the previous command fails with \"udapy: command not found\"\n", + "This means that Udapi is not properly installed. When installing Udapi with pip3 --user, it is installed into ~/.local/lib/python3.6/site-packages/udapi/ (or similar depending on your Python version) and the wrapper into ~/.local/bin. Thus you need to\n", + "
\n",
+    "export PATH=\"$HOME/.local/bin/:$PATH\"\n",
+    "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browse CoNLL-U files\n", + "### Get sample UD data\n", + "\n", + "Download and extract [ud20sample.tgz](http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz). There are just 100 sentences for each of the 70 treebanks (`sample.conllu`), plus 4 bigger files (`train.conllu` and `dev.conllu`) for German, English, French and Czech. For full UD ([2.0](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983) or [newer](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3424)), go to [Lindat](https://lindat.cz)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-12-01 07:53:37-- http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "Resolving ufal.mff.cuni.cz (ufal.mff.cuni.cz)... 195.113.20.52\n", + "Connecting to ufal.mff.cuni.cz (ufal.mff.cuni.cz)|195.113.20.52|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4670982 (4,5M) [application/x-gzip]\n", + "Saving to: ‘ud20sample.tgz.1’\n", + "\n", + "ud20sample.tgz.1 100%[===================>] 4,45M 1,49MB/s in 3,0s \n", + "\n", + "2020-12-01 07:53:40 (1,49 MB/s) - ‘ud20sample.tgz.1’ saved [4670982/4670982]\n", + "\n", + "/home/martin/udapi/python/notebook/sample\n" + ] + } + ], + "source": [ + "!wget http://ufal.mff.cuni.cz/~popel/udapi/ud20sample.tgz\n", + "!tar -xf ud20sample.tgz\n", + "%cd sample" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's choose one of the sample files and see the raw [CoNLL-U format](https://universaldependencies.org/format.html).\n", + "
Details: executing from Bash, IPython, Jupyter\n", + "
    \n", + "
  • If you see \"No such file or directory\" error, make sure you executed the previous cell. Note that the cd command is not prefixed by an exclamation mark because that would run in a sub-shell, which \"forgets\" the changed directory when finished. It is prefixed by a percent sign, which marks it as IPython magic.\n", + "
  • cat is another IPython magic command, this time an alias for the shell command of the same name (so you can prefix cat with an exclamation mark, if you prefer), which prints a given file. With automagic on, you can use it without the percent sign.\n", + "
  • In this tutorial, we use | head to show just the first 10 lines of the output (preventing thus big ipynb file size). You can ignore the \"cat: write error: Broken pipe\" warning.\n", + "
  • When using Jupyter, you can omit the | head because long outputs are automatically wrapped in a text box with a scrollbar.\n", + "
  • When running this from IPython or Bash, you can use a pager: less UD_Ancient_Greek/sample.conllu\n", + "
\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# newdoc id = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "1\tἐρᾷ\tἐράω\tVERB\tv3spia---\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t0\troot\t_\t_\r\n", + "2\tμὲν\tμέν\tADV\td--------\t_\t1\tadvmod\t_\t_\r\n", + "3\tἁγνὸς\tἁγνός\tADJ\ta-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t4\tnmod\t_\t_\r\n", + "4\tοὐρανὸς\tοὐρανός\tNOUN\tn-s---mn-\tCase=Nom|Gender=Masc|Number=Sing\t1\tnsubj\t_\t_\r\n", + "5\tτρῶσαι\tτιτρώσκω\tVERB\tv--ana---\tTense=Past|VerbForm=Inf|Voice=Act\t1\txcomp\t_\t_\r\n", + "6\tχθόνα\tχθών\tNOUN\tn-s---fa-\tCase=Acc|Gender=Fem|Number=Sing\t5\tobj\t_\tSpaceAfter=No\r\n", + "7\t,\t,\tPUNCT\tu--------\t_\t1\tpunct\t_\t_\r\n", + "cat: write error: Broken pipe\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files with `udapy -T`\n", + "While the CoNLL-U format was designed with readibility (by both machines and humans) on mind, it may be still a bit difficult to read and interpret by humans. Let's visualize the dependency tree structure using ASCII-art by piping the conllu file into `udapy -T`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-12-01 08:00:33,276 [ INFO] execute - No reader specified, using read.Conllu\n", + "2020-12-01 08:00:33,276 [ INFO] execute - ---- ROUND ----\n", + "2020-12-01 08:00:33,276 [ INFO] execute - Executing block Conllu\n", + "2020-12-01 08:00:33,305 [ INFO] execute - Executing block TextModeTrees\n", + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\n", + "loaded_from = -\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\n", + "─┮\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m\n", + " ┡─╼ \u001b[33mδὲ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m │\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " ┡───────────────┾ \u001b[33mλαμβάνει\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -T | 
head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details:\n", + "
    \n", + "
  • You may be used to see dependency trees where the root node is on the top and words are ordered horizontally (left to right). Here, the root is on left and words are ordered vertically (top to bottom).\n", + "
  • The colors are implemented using the colorama package and ANSI escape codes. When running this from IPython or Bash and using less, you need to instruct it to display the colors with -R:\n", + "\n", + "cat UD_Ancient_Greek/sample.conllu | udapy -T | less -R\n", + "\n", + "
  • You can also use udapy -T -N to disable the colors.\n", + "
  • udapy -q suppresses all Udapi messages (warnings, info, debug) printed on the standard error output, so only fatal errors are printed. By default only debug messages are suppressed, but these can be printed with udapy -v.\n", + "
  • But you already know this because you have read udapy -h, am I right?\n", + "
\n", + "
\n", + "\n", + "`udapy -T` is a shortcut for `udapy write.TextModeTrees color=1`, where `write.TextModeTrees` is a so-called *block* (a basic Udapi processing unit) and `color=1` is its parameter. See [the documentation](https://udapi.readthedocs.io/en/latest/udapi.block.write.html#module-udapi.block.write.textmodetrees) (or even [the source code](https://github.com/udapi/udapi-python/blob/master/udapi/block/write/textmodetrees.py) of `write.TextModeTrees` to learn about further parameters. Now, let's print also the LEMMA and MISC columns and display the columns vertically aligned using parameters `layout=align attributes=form,lemma,upos,deprel,misc`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "docname = tlg0008.tlg001.perseus-grc1.13.tb.xml\r\n", + "loaded_from = -\r\n", + "# sent_id = tlg0008.tlg001.perseus-grc1.13.tb.xml@1144\r\n", + "# text = ἐρᾷ μὲν ἁγνὸς οὐρανὸς τρῶσαι χθόνα, ἔρως δὲ γαῖαν λαμβάνει γάμου τυχεῖν·\r\n", + "─┮ \r\n", + " ╰─┮ \u001b[33mἐρᾷ\u001b[0m \u001b[36mἐράω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m _\u001b[0m\r\n", + " ┡─╼ \u001b[33mμὲν\u001b[0m \u001b[36mμέν\u001b[0m \u001b[31mADV\u001b[0m \u001b[34madvmod\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἁγνὸς\u001b[0m \u001b[36mἁγνός\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnmod\u001b[0m _\u001b[0m\r\n", + " ┡─┶ \u001b[33mοὐρανὸς\u001b[0m \u001b[36mοὐρανός\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─┮ \u001b[33mτρῶσαι\u001b[0m \u001b[36mτιτρώσκω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m _\u001b[0m\r\n", + " │ ╰─╼ \u001b[33mχθόνα\u001b[0m \u001b[36mχθών\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ┡─╼ \u001b[33m,\u001b[0m \u001b[36m,\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + " │ ╭─╼ \u001b[33mἔρως\u001b[0m \u001b[36mἔρως\u001b[0m 
\u001b[31mNOUN\u001b[0m \u001b[34mnsubj\u001b[0m _\u001b[0m\r\n", + " ┡─╼ │ \u001b[33mδὲ\u001b[0m \u001b[36mδέ\u001b[0m \u001b[31mCCONJ\u001b[0m \u001b[34mcc\u001b[0m _\u001b[0m\r\n", + " │ ┢─╼ \u001b[33mγαῖαν\u001b[0m \u001b[36mγαῖα\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " ┡───┾ \u001b[33mλαμβάνει\u001b[0m \u001b[36mλαμβάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mconj\u001b[0m _\u001b[0m\r\n", + " │ │ ╭─╼ \u001b[33mγάμου\u001b[0m \u001b[36mγάμος\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobj\u001b[0m _\u001b[0m\r\n", + " │ ╰─┶ \u001b[33mτυχεῖν\u001b[0m \u001b[36mτυγχάνω\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mxcomp\u001b[0m SpaceAfter=No\u001b[0m\r\n", + " ╰─╼ \u001b[33m·\u001b[0m \u001b[36m·\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m _\u001b[0m\r\n", + "\r\n" + ] + } + ], + "source": [ + "cat UD_Ancient_Greek/sample.conllu | udapy -q write.TextModeTrees color=1 layout=align attributes=form,lemma,upos,deprel,misc | head -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browse conllu files from IPython/Jupyter\n", + "So far, we were using Udapi only via its command-line interface `udapy`, which is handy, but not very Pythonic. So let's now use Udapi as a library and load the English conllu sample file into a document `doc` and visualize the sixth tree (i.e. `doc[5]` in zero-based indexing)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮\n", + " │ ╭─╼ \u001b[33mThe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ ╭─┶ \u001b[33mthird\u001b[0m \u001b[31mADJ\u001b[0m \u001b[34mnsubj:pass\u001b[0m\n", + " │ ┢─╼ \u001b[33mwas\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux\u001b[0m\n", + " │ ┢─╼ \u001b[33mbeing\u001b[0m \u001b[31mAUX\u001b[0m \u001b[34maux:pass\u001b[0m\n", + " ╰─┾ \u001b[33mrun\u001b[0m \u001b[31mVERB\u001b[0m \u001b[34mroot\u001b[0m\n", + " │ ╭─╼ \u001b[33mby\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ ┢─╼ \u001b[33mthe\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " ┡─┾ \u001b[33mhead\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mobl\u001b[0m\n", + " │ │ ╭─╼ \u001b[33mof\u001b[0m \u001b[31mADP\u001b[0m \u001b[34mcase\u001b[0m\n", + " │ │ ┢─╼ \u001b[33man\u001b[0m \u001b[31mDET\u001b[0m \u001b[34mdet\u001b[0m\n", + " │ │ ┢─╼ \u001b[33minvestment\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mcompound\u001b[0m\n", + " │ ╰─┶ \u001b[33mfirm\u001b[0m \u001b[31mNOUN\u001b[0m \u001b[34mnmod\u001b[0m\n", + " ╰─╼ \u001b[33m.\u001b[0m \u001b[31mPUNCT\u001b[0m \u001b[34mpunct\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "import udapi\n", + "doc = udapi.Document(\"UD_English/sample.conllu\")\n", + "doc[5].draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Details:\n", + "
    \n", + "
  • doc = udapi.Document(filename) is a shortcut for\n", + "
    \n",
    +    "import udapi.core.document\n",
    +    "doc = udapi.core.document.Document(filename)\n",
    +    "
    \n", + "
  • We can print the whole document using doc.draw().\n", + "
  • doc.draw(**kwargs) is a shortcut for creating a write.TextModeTrees block and applying it on the document:\n", + "
    \n",
    +    "import udapi.block.write.textmodetrees\n",
    +    "block = udapi.block.write.textmodetrees.TextModeTrees(**kwargs)\n",
    +    "block.run(doc)\n",
    +    "
    \n", + "
\n", + "
\n", + "\n", + "The `draw()` method takes the same parameters as the `write.TextModeTrees` block, so we can for example display only the node ID (aka `ord`, i.e. word-order index), form and [universal (morpho-syntactic) features](https://universaldependencies.org/u/feat/index.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0006\n", + "# text = The third was being run by the head of an investment firm.\n", + "─┮ \n", + " │ ╭─╼ \u001b[32m1\u001b[0m \u001b[33mThe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " │ ╭─┶ \u001b[32m2\u001b[0m \u001b[33mthird\u001b[0m Degree=Pos|NumType=Ord\u001b[0m\n", + " │ ┢─╼ \u001b[32m3\u001b[0m \u001b[33mwas\u001b[0m Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\u001b[0m\n", + " │ ┢─╼ \u001b[32m4\u001b[0m \u001b[33mbeing\u001b[0m VerbForm=Ger\u001b[0m\n", + " ╰─┾ \u001b[32m5\u001b[0m \u001b[33mrun\u001b[0m Tense=Past|VerbForm=Part|Voice=Pass\u001b[0m\n", + " │ ╭─╼ \u001b[32m6\u001b[0m \u001b[33mby\u001b[0m _\u001b[0m\n", + " │ ┢─╼ \u001b[32m7\u001b[0m \u001b[33mthe\u001b[0m Definite=Def|PronType=Art\u001b[0m\n", + " ┡─┾ \u001b[32m8\u001b[0m \u001b[33mhead\u001b[0m Number=Sing\u001b[0m\n", + " │ │ ╭─╼ \u001b[32m9\u001b[0m \u001b[33mof\u001b[0m _\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m10\u001b[0m \u001b[33man\u001b[0m Definite=Ind|PronType=Art\u001b[0m\n", + " │ │ ┢─╼ \u001b[32m11\u001b[0m \u001b[33minvestment\u001b[0m Number=Sing\u001b[0m\n", + " │ ╰─┶ \u001b[32m12\u001b[0m \u001b[33mfirm\u001b[0m Number=Sing\u001b[0m\n", + " ╰─╼ \u001b[32m13\u001b[0m \u001b[33m.\u001b[0m _\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "doc[5].draw(layout=\"align\", attributes=\"ord,form,feats\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Document representation in Udapi\n", + "\n", + "Udapi 
[document](https://github.com/udapi/udapi-python/blob/master/udapi/core/document.py) consists of a sequence of so-called *bundles*, mirroring a sequence of sentences in a typical natural language text.\n", + "\n", + "A [bundle](https://github.com/udapi/udapi-python/blob/master/udapi/core/bundle.py) corresponds to a sentence,\n", + "possibly in multiple versions or with different representations, such as sentence-tuples from parallel corpora, or paraphrases in the same language or alternative analyses (e.g. parses produced by different parsers). If there are more trees in a bundle, they must be distinguished by a so-called *zone* (a label which contains the language code).\n", + "\n", + "Each tree is represented by a special (artificial) [root](https://github.com/udapi/udapi-python/blob/master/udapi/core/root.py) node, which is added to the top of a CoNLL-U tree in the Udapi model. The root node bears the ID of a given tree/sentence (`sent_id`) and its word order (`ord`) is 0. Technically, Root is subclass of Node, with some extra methods.\n", + "\n", + "The [Node](https://github.com/udapi/udapi-python/blob/master/udapi/core/node.py) class corresponds to a node\n", + "of a dependency tree. It provides access to all the CoNLL-U-defined attributes (`ord`, `form`, `lemma`, `upos`, `xpos`, `feats`, `deprel`, `deps`, `misc`). 
There are methods for tree traversal (`parent`, `root`, `children`, `descendants`); word-order traversal (`next_node`, `prev_node`); tree manipulation (`parent` setter) including word-order changes (`shift_after_node(x)`, `shift_before_subtree(x)`, etc.); and utility methods: `is_descendant_of(x)`, `is_nonprojective()`, `precedes(x)`, `is_leaf()`, `is_root()`, `get_attrs([])`, `compute_text()`, `draw()`.\n", + "\n", + "## Exercise 1: Count prepositions and postpositions\n", + "[Prepositions and postpositions](https://en.wikipedia.org/wiki/Preposition_and_postposition) are together called *adpositions* and assigned the [ADP](https://universaldependencies.org/u/pos/ADP.html) universal part-of-speech tag (`upos`) in UD. Some languages (e.g. English) use mostly prepositions, others mostly postpositions.\n", + "* Do you know any English postpositions?\n", + "* Guess the typical adposition type (i.e. whether a given language uses more prepositions or postpositions) for at least 10 languages of your choice (from those in UD2.0).\n", + "* Complete the following code and find out how many prepositions and postpositions are in `UD_English/sample.conllu` (which has been loaded into `doc`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prepositions, postpositions = 0, 0\n", + "# Iterate over all nodes in the document (in all trees)\n", + "for node in doc.nodes:\n", + " if node.upos == \"ADP\":\n", + " # TODO: fix this code to actually distinguish prepositions and postpositions\n", + " prepositions += 1\n", + "# Print the results\n", + "prepositions, postpositions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't know how to proceed click on the following hints.\n", + "
Hint 1:\n", + "In some dependency grammars, adpositions govern noun (i.e. adposition is the *parent* of a given noun node). In other dependency grammars, adpositions depend on nouns (i.e. noun is the *parent* of a given adposition). Find out which style is being used by UD. Check the UD documentation or inspect some of the tree visualizations and guess.\n", + "
\n", + "
Hint 2:\n", + "See the Node documentation and find out how to obtain dependency parent and dependency children. Note that these are properties of a given node, rather than methods, so you should not write parentheses () after the property name.\n", + "
\n", + "
Hint 3:\n", + "doc.nodes iterates over all nodes in the document sorted by the word order, but this would be cumbersome to exploit. Find a method of Node to detect the relative word order of two nodes (within the same tree/sentence).\n", + "
\n", + "
Hint 4:\n", + "Use node.parent and node.precedes(another_node).\n", + "The latter is a shortcut for node.ord < another_node.ord.\n", + "
\n", + "
Solution:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\":\n",
+    "        if node.precedes(node.parent):\n",
+    "            prepositions += 1\n",
+    "        else:\n",
+    "            postpositions += 1\n",
+    "
\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2: Explore English postpositions\n", + "The previous exercise indicates there are 7 occurrences of postpositions in the English sample. Find these 7 occurrences and visualize them using `node.draw()`. Count which adpositions (`lemma`) with which dependency relations (`deprel`) are responsible for these occurrences. Recompute these statistics on the bigger English training data. Can you explain these occurrences? What are the reasons? Is any occurrence an annotation error?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For the statistics, you may find useful: count[\"any string\"] += 1\n", + "import collections\n", + "count = collections.Counter()\n", + "big_doc = udapi.Document(\"UD_English/train.conllu\")\n", + "\n", + "for node in doc.nodes:\n", + " # TODO detect postposition\n", + " pass\n", + "\n", + "# Print the statistics\n", + "count.most_common()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Solution 1:\n", + "
\n",
+    "for node in doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node):\n",
+    "        node.parent.draw()\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "
\n", + "
\n", + "
Hint 1:\n", + "We can see there are many particles of phrasal verbs, e.g. \"busted up\".\n", + "These seem to be correctly annotated as ADP according to the UD guidelines.\n", + "Let's filter out those cases and focus on the rest and let's switch to the big train data.\n", + "
\n", + "
Solution 2:\n", + "
\n",
+    "count = collections.Counter()\n",
+    "for node in big_doc.nodes:\n",
+    "    if node.upos == \"ADP\" and node.parent.precedes(node) and node.parent.upos != \"VERB\":\n",
+    "        count[node.lemma + \" \" + node.deprel] += 1\n",
+    "count.most_common()\n",
+    "
\n", + "Alternatively to node.parent.upos != \"VERB\",\n", + "you could also filter out node.deprel != \"compound:prt\",\n", + "or directly focus on node.deprel == \"case\"\n", + "
\n", + "
Partial answer:\n", + "Most of the occurrences are actually annotated correctly,\n", + "although they are not typically considered as postpositions.\n", + "For example, node.deprel == \"fixed\" is being used for multi-word adpositions,\n", + "such as \"because of\", where \"of\" depends on \"because\" for technical (and consistency) reasons,\n", + "but the whole multi-word adposition precedes its governing noun, so it is actually a multi-word preposition.\n", + "\n", + "What about the remaining occurrences, after filtering out node.deprel not in {\"compound:prt\", \"fixed\"}?\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next tutorial, 02-blocks.ipynb (not finished yet), we will explore several useful Udapi blocks, some of which may be handy when working further on Exercise 2 or similar tasks." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..425f7df5 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1,9 @@ +# Udapi tutorial + +To run this tutorial, install [Jupyter Notebook](https://jupyter.org/install.html) (or JupyterLab) and run `jupyter notebook` from this directory. + +Don't display the tutorial `ipynb` files on GitHub because it cannot render the collapsible Details, Hints and Solution sections, so you would miss important parts of the tutorial. 
+If you don't have Jupyter installed, you can display the tutorial with https://nbviewer.jupyter.org, using the following links: + +- [01-visualizing.ipynb](https://nbviewer.jupyter.org/github/udapi/udapi-python/blob/master/tutorial/01-visualizing.ipynb) +- 02-blocks.ipynb (not finished yet) diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..d27ff8c4 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..86d975b6 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/__init__.py b/udapi/__init__.py index e69de29b..6c281c0f 100644 --- a/udapi/__init__.py +++ b/udapi/__init__.py @@ -0,0 +1,3 @@ +from .core.document import Document +from .core.run import create_block +from .core.node import CycleError diff --git a/udapi/block/corefud/__init__.py b/udapi/block/corefud/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/corefud/bridgingclusters.py b/udapi/block/corefud/bridgingclusters.py new file mode 100644 index 00000000..30ac49a7 --- /dev/null +++ b/udapi/block/corefud/bridgingclusters.py @@ -0,0 +1,17 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class BridgingClusters(Block): + + def process_node(self,node): + + if 'Bridging' in node.misc and "+" in node.misc['BridgingAllTargetClusterTexts']: + print("SENTENCE : "+node.root.get_sentence()) + print("SOURCE MENTION: "+node.misc['MentionText']) + print("RELATION: "+node.misc['Bridging']) + print("TARGET MENTION: "+node.misc['BridgingTargetMentionText']) + print("TARGET CLUSTER: "+node.misc['BridgingAllTargetClusterTexts']) + print() + + diff --git a/udapi/block/corefud/concatmentionmisc.py b/udapi/block/corefud/concatmentionmisc.py new file mode 100644 index 00000000..74483368 --- /dev/null +++ 
b/udapi/block/corefud/concatmentionmisc.py @@ -0,0 +1,24 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class ConcatMentionMisc(Block): + """All MISC attributes named MentionMisc_... are concatenated into MentionMisc""" + + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + matchObj = re.match('MentionMisc_([^[]+)((\[\d+\])?)',attrname) + if matchObj: + innerattrib = matchObj.group(1) + index = matchObj.group(2) + + finalattr = 'MentionMisc'+index + value = node.misc[attrname].replace(",", "%2C") + + if finalattr not in node.misc: + node.misc[finalattr] = f'{innerattrib}:{value}' + else: + node.misc[finalattr] += f',{innerattrib}:{value}' + del node.misc[attrname] + diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..fc45540a --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,94 @@ +from udapi.core.block import Block +from collections import defaultdict, Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = defaultdict(Counter) + + def _report_stats(self, counter, header_id=None): + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counters = defaultdict(Counter) + empty_seqs = [] + empty_pars = [] + curr_seq = [] + 
curr_par = [] + is_empty_par = True + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") + newdoc = tree.newdoc + empty_seqs = [] + empty_pars = [] + curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") + + if self.report_per_file: + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") + + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) + + def process_end(self): + if self.report_total: + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, 
TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py new file mode 100644 index 00000000..5aaf94e7 --- /dev/null +++ b/udapi/block/corefud/delete.py @@ -0,0 +1,84 @@ +"""Delete coreference annotation (Entity|Bridge|SplitAnte) and optionally also empty nodes.""" + +from udapi.core.block import Block +import udapi.core.coref +import logging + +class Delete(Block): + + def __init__(self, coref=True, empty=False, misc=False, **kwargs): + """Args: + coref: delete coreference attributes in MISC, i.e (Entity|Bridge|SplitAnte) + empty: delete all empty nodes and references to them (from DEPS and MISC[Functor]) + misc: delete all attributes in MISC except for SpaceAfter + """ + super().__init__(**kwargs) + self.coref = coref + self.empty = empty + self.misc = misc + + def is_root_reachable_by_deps(self, node, parents_to_ignore=None): + """ Check if the root node is reachable from node, possibly after deleting the parents_to_ignore nodes. + """ + stack = [(node, [])] + while stack: + proc_node, path = stack.pop() + # root is reachable + if proc_node == node.root: + return True + # path forms a cycle, the root cannot be reached through this branch + if proc_node not in path: + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] not in parents_to_ignore: + # process the parent recursively + stack.append((dep['parent'], path + [proc_node])) + return False + + def _deps_ignore_nodes(self, node, parents_to_ignore): + """ Retrieve deps from the node, recursively ignoring specified parents. 
+ """ + newdeps = [] + stack = [(node, [])] + while stack: + proc_node, skipped_nodes = stack.pop() + if proc_node not in skipped_nodes: + for dep in proc_node.deps: + if dep['parent'] in parents_to_ignore: + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + else: + # keep deps with a parent that shouldn't be ignored + newdeps.append(dep) + # If no newdeps were found (because of a cycle), return the root. + return newdeps if newdeps else [{'parent': node.root, 'deprel': 'root'}] + + def process_document(self, doc): + # This block should work both with coreference loaded (deserialized) and not. + if self.coref: + doc._eid_to_entity = None + for root in doc.trees: + if self.empty: + for node in root.descendants: + # process only the nodes dependent on empty nodes + if '.' in node.raw_deps: + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + node.deps = self._deps_ignore_nodes(node, root.empty_nodes) + # This needs to be done even if '.' not in node.raw_deps. + if '.' 
in node.misc['Functor'].split(':')[0]: + del node.misc['Functor'] + root.empty_nodes = [] + + if self.coref or self.misc: + for node in root.descendants + root.empty_nodes: + if self.misc: + node.misc = 'SpaceAfter=No' if node.no_space_after else None + if self.coref: + node._mentions = [] + if not self.misc: + for attr in ('Entity', 'Bridge', 'SplitAnte'): + del node.misc[attr] diff --git a/udapi/block/corefud/fixcorefud02.py b/udapi/block/corefud/fixcorefud02.py new file mode 100644 index 00000000..1575cea6 --- /dev/null +++ b/udapi/block/corefud/fixcorefud02.py @@ -0,0 +1,56 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +NEW_ETYPE = { + "misc": "other", + "date": "time", + "loc": "place", + "location": "place", + "per": "person", + "org": "organization", + "_": "", + } + +class FixCorefUD02(Block): + """Fix errors in CorefUD 0.2 for release of CorefUD 1.0.""" + + def process_document(self, doc): + # For GUM + if doc.meta['global.Entity'] == 'entity-GRP-infstat-MIN-coref_type-identity': + doc.meta['global.Entity'] = 'eid-etype-head-other-infstat-minspan-identity' + + for entity in doc.coref_entities: + if entity.etype: + # Harmonize etype. + # If gen/spec is distinguished, store it in all mentions' other['gstype']. + etype = entity.etype.lower() + if etype.startswith('spec') or etype.startswith('gen'): + gstype = 'gen' if etype.startswith('gen') else 'spec' + for m in entity.mentions: + m.other['gstype'] = gstype + if etype == 'spec': + etype = 'other' + etype = etype.replace('gen', '').replace('spec', '').replace('.', '') + etype = NEW_ETYPE.get(etype, etype) + + # etype="APPOS" is used only in NONPUBL-CorefUD_English-OntoNotes. + # Apposition is a mention-based rather than entity-based attribute. + # We don't know which of the mentions it should be assigned, but let's expect all non-first. 
+ # UD marks appositions with deprel appos, so once someone checks it is really redunant, + # TODO we can delete the appos mention attribute. + if etype == 'appos': + etype = '' + for mention in entity.mentions[1:]: + mention.other['appos'] = '1' + entity.etype = etype + + for mention in entity.mentions: + # Harmonize bridge relation labels + for bridge in mention.bridging: + rel = bridge.relation.lower() + if rel.endswith('-inv'): + rel = 'i' + rel.replace('-inv', '') + rel = rel.replace('-', '') + rel = rel.replace('indirect_', '') + bridge.relation = rel diff --git a/udapi/block/corefud/fixentityacrossnewdoc.py b/udapi/block/corefud/fixentityacrossnewdoc.py new file mode 100644 index 00000000..61e5e4f6 --- /dev/null +++ b/udapi/block/corefud/fixentityacrossnewdoc.py @@ -0,0 +1,25 @@ +from udapi.core.block import Block +import udapi.core.coref +import logging + +class FixEntityAcrossNewdoc(Block): + """ + Fix the error reported by validate.py --coref: + "[L6 Coref entity-across-newdoc] Same entity id should not occur in multiple documents" + by making the entity IDs (eid) unique in each newdoc document. + + This block uses Udapi's support for loading GUM-like GRP document-wide IDs + (so the implementation is simple, although unnecessarily slow). + After applying this block, IDs of all entities are prefixed with document numbers, + e.g. "e45" in the 12th document changes to "d12.e45". + If you prefer simple eid, use corefud.IndexClusters afterwards. 
+ """ + + def process_document(self, doc): + if not doc.eid_to_entity: + logging.warning(f"No entities in document {doc.meta}") + udapi.core.coref.store_coref_to_misc(doc) + assert doc.meta["global.Entity"].startswith("eid") + doc.meta["global.Entity"] = "GRP" + doc.meta["global.Entity"][3:] + udapi.core.coref.load_coref_from_misc(doc) + doc.meta["global.Entity"] = "eid" + doc.meta["global.Entity"][3:] diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py new file mode 100644 index 00000000..b4a42a43 --- /dev/null +++ b/udapi/block/corefud/fixinterleaved.py @@ -0,0 +1,84 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class FixInterleaved(Block): + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ + + def __init__(self, same_entity_only=True, both_discontinuous=False, + crossing_only=False, nested_same_subspan=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.crossing_only = crossing_only + self.nested_same_subspan = nested_same_subspan + + def process_tree(self, tree): + mentions, deleted = set(), set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if mA in deleted or mB in deleted: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + + # Fully nested spans are OK, except for same-subspan. + sA, sB = set(mA.words), set(mB.words) + if (sA <= sB) or (sB <= sA): + if not self.nested_same_subspan: + continue + elif not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + + # Crossing or interleaved+crossing? 
+ elif self.crossing_only: + if not sA.intersection(sB): + continue + else: + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + + mA.words = list(sA.union(sB)) + for wb in sB: + try: + wb._mentions.remove(mB) + except ValueError: + pass + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass + deleted.add(mB) + + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. + sA = set(mA.words) + for mC in sorted(mentions): + if mC in deleted or mC is mA or mC is mB: + continue + if sA != set(mC.words): + continue + # So mA and mC have the same span and we need to delete one of them to fix it. + # We will delete mA because it has the artificially enlarged span, + # while mC is from the original annotation. + for wa in sA: + try: + wa._mentions.remove(mA) + except ValueError: + pass + try: + mA.entity.mentions.remove(mA) + except ValueError: + pass + break + deleted.add(mA) diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py new file mode 100644 index 00000000..bc8e6504 --- /dev/null +++ b/udapi/block/corefud/fixparentheses.py @@ -0,0 +1,31 @@ +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). 
+ If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/fixtovalidate.py b/udapi/block/corefud/fixtovalidate.py new file mode 100644 index 00000000..48a3608d --- /dev/null +++ b/udapi/block/corefud/fixtovalidate.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block + +class FixToValidate(Block): + """This block fixes the CorefUD data so that the final documents are valid conllu files.""" + + def _set_root_deprel(self, doc): + for root in doc.trees: + for node in root.children: + if node.deprel != "root": + node.deprel = "root" + + def _unset_root_deprel(self, doc): + for node in doc.nodes: + parent = node.parent + if node.deprel == "root" and parent is not None and not parent.is_root(): + #print("\t".join(['Non-0-root:', node.address(), node.upos, str(node.feats), node.parent.upos, str(node.parent.feats)])) + if parent.upos == "PUNCT" and parent.parent is not None: + node.parent = parent.parent + if node.upos == "CCONJ": + 
node.deprel = "cc" + elif node.upos == "ADJ" and parent.upos == "PROPN": + node.deprel = "amod" + elif node.upos == "NOUN" and parent.upos == "VERB": + node.deprel = "obl" + else: + node.deprel = "parataxis" + + def _space_before_pardoc(self, doc): + last_node = None + for i, tree in enumerate(doc.trees): + if i > 0: + if (tree.newdoc is not None or tree.newpar is not None) and last_node.no_space_after: + del last_node.misc["SpaceAfter"] + last_node = tree.descendants[-1] + + def process_document(self, doc): + self._set_root_deprel(doc) + self._unset_root_deprel(doc) + self._space_before_pardoc(doc) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py new file mode 100644 index 00000000..d6093ece --- /dev/null +++ b/udapi/block/corefud/guessspan.py @@ -0,0 +1,33 @@ +from udapi.core.block import Block + +class GuessSpan(Block): + """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" + + def process_coref_mention(self, mention): + mwords = mention.head.descendants(add_self=True) + # TODO add heuristics from corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. 
+ to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) diff --git a/udapi/block/corefud/gum2corefud.py b/udapi/block/corefud/gum2corefud.py new file mode 100644 index 00000000..bf6d798d --- /dev/null +++ b/udapi/block/corefud/gum2corefud.py @@ -0,0 +1,111 @@ +import re +import logging +from collections import defaultdict +from udapi.core.block import Block + +class Gum2CorefUD(Block): + + def process_tree(self, tree): + docname = tree.bundle.document.meta['docname'] + '_' + + eid_to_entity = tree.bundle.document._eid_to_entity + unfinished_mentions = defaultdict(list) + for node in tree.descendants: + misc_entity = node.misc['Entity'] + if not misc_entity: + continue + # Attribute Entity may contain multiple entities, e.g. + # Entity=(abstract-7-new-2-coref(abstract-3-giv:act-1-coref) + # means a start of entity id=7 and start&end (i.e. single-word mention) of entity id=3. + # The following re.split line splits this into + # entities = ["(abstract-7-new-2-coref", "(abstract-3-giv:act-1-coref)"] + entities = [x for x in re.split('(\([^()]+\)?|[^()]+\))', misc_entity) if x] + for entity in entities: + # GUM 2.9 uses global.Entity = entity-GRP-infstat-MIN-coref_type-identity + # but the closing tag is shortent just to GRP. 
+ opening, closing = (entity[0] == '(', entity[-1] == ')') + entity = entity.strip('()') + if not opening and not closing: + logging.warning(f"Entity {entity} at {node} has no opening nor closing bracket.") + elif not opening and closing: + name = docname + entity + if not unfinished_mentions[name]: + raise ValueError(f"Mention {name} closed at {node}, but not opened in the same tree.") + else: + mention = unfinished_mentions[name].pop() + mention.span = f'{mention.head.ord}-{node.ord}' + else: + attrs = entity.split('-') + if len(attrs) == 6: + etype, grp, infstat, minspan, ctype, wiki = attrs + elif len(attrs) == 5: + wiki = None + etype, grp, infstat, minspan, ctype = attrs + elif len(attrs) > 6: + logging.warning(f"Entity {entity} at {node} has more than 6 attributes.") + etype, grp, infstat, minspan, ctype, wiki = entity.split('-', maxsplit=5) + else: + raise ValueError(f"Less than 5 attributes in {entity} at {node}") + name = docname + grp + entity = eid_to_entity.get(name) + if entity is None: + entity = node.create_coref_entity(eid=name, etype=etype) + mention = entity.mentions[0] + mention.misc = f"Infstat:{infstat},MinSpan:{minspan},CorefType:{ctype}" + if wiki: + mention.misc += ',Wikification:' + wiki #.replace(',', '%2C') + else: + mention = entity.create_mention(head=node) + if closing: + mention.words = [node] + else: + unfinished_mentions[name].append(mention) + del node.misc['Entity'] + + misc_bridges = node.misc['Bridge'] + if misc_bridges: + # E.g. Entity=event-12|Bridge=12<124,12<125 + for misc_bridge in misc_bridges.split(','): + try: + trg_str, src_str = [docname + grp for grp in misc_bridge.split('<')] + except ValueError as err: + raise ValueError(f"{node}: {misc_bridge} {err}") + try: + trg_entity = eid_to_entity[trg_str] + src_entity = eid_to_entity[src_str] + except KeyError as err: + logging.warning(f"{node}: Cannot find entity {err}") + else: + mention = src_entity.mentions[-1] + # TODO: what relation should we choose for Bridging? 
+ # relation = f"{src_str.split('-')[0]}-{trg_str.split('-')[0]}" + relation = '_' + mention.bridging.append((trg_entity, relation)) + del node.misc['Bridge'] + + misc_split = node.misc['Split'] + if misc_split: + # E.g. Entity=(person-54)|Split=4<54,9<54 + src_str = docname + misc_split.split('<')[-1] + ante_entities = [] + for x in misc_split.split(','): + ante_str, this_str = [docname + grp for grp in x.split('<')] + if this_str != src_str: + raise ValueError(f'{node} invalid Split: {this_str} != {src_str}') + # logging.warning + # There are just three such cases in GUM and all are bugs, + # so let's ignore them entirely (the `else` clause will be skipped if exiting `for` w/ `break`). + # break + ante_entities.append(eid_to_entity[ante_str]) + else: + eid_to_entity[src_str].split_ante = ante_entities + del node.misc['Split'] + + for entity_name, mentions in unfinished_mentions.items(): + for mention in mentions: + logging.warning(f"Mention {name} opened at {mention.head}, but not closed in the same tree. Deleting.") + entity = mention.entity + mention.words = [] + entity._mentions.remove(mention) + if not entity._mentions: + del eid_to_entity[name] diff --git a/udapi/block/corefud/indexclusters.py b/udapi/block/corefud/indexclusters.py new file mode 100644 index 00000000..3f5d74d8 --- /dev/null +++ b/udapi/block/corefud/indexclusters.py @@ -0,0 +1,35 @@ +"""Block corefud.IndexClusters""" +from udapi.core.block import Block + + +class IndexClusters(Block): + """Re-index the coreference entity IDs (eid). The final entity IDs are of the "e" form, + where are ordinal numbers starting from the one specified by the `start` parameter. + This block can be applied on multiple documents within one udapy call. 
+ For example, to re-index eid in all conllu files in the current directory + (keeping the IDs unique across all the files), use: + `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1` + + Parameters: + ----------- + start : int + the starting index (default=1) + prefix : str + prefix of the IDs before the number (default="e") + """ + + def __init__(self, start=1, prefix='e'): + self.start = start + self.prefix = prefix + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + new_eid_to_entity = {} + for idx, entity in enumerate(entities, self.start): + new_eid = self.prefix + str(idx) + entity.eid = new_eid + new_eid_to_entity[new_eid] = entity + self.start = idx + 1 + doc._eid_to_entity = new_eid_to_entity diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py new file mode 100644 index 00000000..08296531 --- /dev/null +++ b/udapi/block/corefud/link2cluster.py @@ -0,0 +1,137 @@ +import logging +from udapi.core.block import Block + +class Link2Cluster(Block): + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. + + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). + delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. 
Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. + """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): + super().__init__(**kwargs) + self.id_attr = id_attr + self.ante_attr = ante_attr + self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if 
self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention + + def process_document(self, doc): + id2node = {} + links = [] + bridges = [] + for node in doc.nodes_and_empty: + this_id = node.misc[self.id_attr] + if this_id != '': + id2node[this_id] = node + ante_id = node.misc[self.ante_attr] + if ante_id != '': + if ante_id == this_id: + logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}") + else: + links.append([ante_id, this_id]) + if self.delete_orig_attrs: + for attr in (self.id_attr, self.ante_attr): + del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] + + # It seems faster&simpler to process the links in any order and implement entity merging, + # rather than trying to sort the links so that no entity merging is needed. + for ante_id, this_id in links: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if not this_node.coref_mentions and not ante_node.coref_mentions: + # None of the nodes is part of any mention/entity. Let's create them. + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) + elif this_node.coref_mentions and ante_node.coref_mentions: + # Both of the nodes are part of mentions in different entities. + # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). 
+ # While the official API supports "stealing" a single mention (m.entity = another_entity), + # the implementation below using _mentions and _entity is a bit faster. + e_ante, e_this = this_node.coref_entities[0], ante_node.coref_entities[0] + assert e_ante != e_this + for mention in e_ante.mentions: + mention._entity = e_this + e_this._mentions.extend(e_ante.mentions) + e_this._mentions.sort() + e_ante._mentions.clear() + else: + # Only one of the nodes is part of an entity. Let's add the second one to this entity. + if ante_node.coref_mentions: + self._new_mention(ante_node.coref_entities[0], this_node) + else: + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py new file mode 100644 index 00000000..92773dc2 --- /dev/null +++ b/udapi/block/corefud/load.py @@ -0,0 +1,12 @@ +from udapi.core.block import Block +import udapi.core.coref + +class Load(Block): + """Load coreference-related MISC attributes into memory. 
Allow lenient mode by strict=0.""" + + def __init__(self, strict=True): + self.strict = strict + + def process_document(self, doc): + if doc._eid_to_entity is None: + udapi.core.coref.load_coref_from_misc(doc, self.strict) diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py new file mode 100644 index 00000000..8064e67f --- /dev/null +++ b/udapi/block/corefud/markcrossing.py @@ -0,0 +1,39 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MarkCrossing(Block): + """Find mentions with crossing spans.""" + + def __init__(self, same_entity_only=False, continuous_only=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.continuous_only = continuous_only + self.print_form = print_form + self.log = log + self.mark = mark + self._logged = {} + + def _print(self, mention): + if self.print_form: + return ' '.join([w.form for w in mention.words]) + else: + return mention.span + + def process_node(self, node): + if len(node.coref_mentions) > 1: + for mA, mB in itertools.combinations(node.coref_mentions, 2): + if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.continuous_only and (',' in mA.span or ',' in mB.span): + continue + if self.mark: + node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + cross_id = node.root.sent_id + mA.span + mB.span + if cross_id not in self._logged: + self._logged[cross_id] = True + print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py new file mode 100644 index 00000000..c00f73b1 --- /dev/null +++ b/udapi/block/corefud/markinterleaved.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + 
+class MarkInterleaved(Block): + """Find mentions with interleaved spans.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if set(mA.words).intersection(set(mB.words)): + continue + if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]: + continue + if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]: + continue + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py new file mode 100644 index 00000000..8db8a657 --- /dev/null +++ b/udapi/block/corefud/marknested.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkNested(Block): + """Find nested mentions.""" + + def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False, + print_form=False, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = 
both_discontinuous + self.multiword_only = multiword_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if not (sA <= sB) and not (sB <= sA): + continue + if self.multiword_only and (len(sA) == 1 or len(sB) == 1): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. 
+ + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. 
+ nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB 
= ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue + if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py new file mode 100644 index 00000000..f3cfd7b3 --- /dev/null +++ b/udapi/block/corefud/marksamesubspan.py @@ -0,0 +1,45 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools + +class MarkSameSubSpan(Block): + """Find mentions with the same subspan.""" + + def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False, + log=True, mark=True, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + self.both_discontinuous = both_discontinuous + self.nested_only = nested_only + self.print_form = print_form + self.log = log + self.mark = mark + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + if len(mentions) > 1: + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span): + continue + sA, sB = set(mA.words), set(mB.words) + if self.nested_only and not (sA <= sB) and not (sB <= sA): + continue + if not set(mA.span.split(',')).intersection(set(mB.span.split(','))): + continue + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") diff --git 
a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py new file mode 100644 index 00000000..61b613cb --- /dev/null +++ b/udapi/block/corefud/mergesamespan.py @@ -0,0 +1,52 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +import logging + +class MergeSameSpan(Block): + """ + Multiple same-span mentions are considered invalid in CoNLL-U, whether they + belong to the same entity or not. If they occur, merge them into one. + Note: We currently do not have mentions across sentence boundaries in the + CorefUD data, so this block processes one sentence at a time. + """ + + def __init__(self, same_entity_only=False, **kwargs): + super().__init__(**kwargs) + self.same_entity_only = same_entity_only + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + for mA, mB in itertools.combinations(mentions, 2): + if self.same_entity_only and mA.entity != mB.entity: + continue + # Reduce non-determinism in which mention is removed: + # If the mentions belong to different entities, sort them by entity (entity) ids. + if mA.entity.eid > mB.entity.eid: + mA, mB = mB, mA + + sA, sB = set(mA.words), set(mB.words) + if sA != sB: + continue + + # If the mentions belong to different entities, we should merge the + # entities first, i.e., pick one entity as the survivor, move the + # mentions from the other entity to this entity, and remove the + # other entity. + if mA.entity != mB.entity: + logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}") + ###!!! TODO: As of now, changing the entity of a mention is not supported in the API. + #for m in mB.entity.mentions: + # m.entity = mA.entity + # Remove mention B. It may have been removed earlier because of + # another duplicate, that is the purpose of try-except. + ###!!! TODO: If we remove a singleton, we are destroying the entity. 
Then we must also handle possible bridging and split antecedents pointing to that entity! + mB.words = [] + try: + mB.entity.mentions.remove(mB) + except ValueError: + pass diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py new file mode 100644 index 00000000..dee358d6 --- /dev/null +++ b/udapi/block/corefud/miscstats.py @@ -0,0 +1,35 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStats(Block): + """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" + + def __init__(self, maxvalues=10, **kwargs): + + """Create the corefud.MiscStats + + Args: + maxvalues: the number of most frequent values + to be printed for each attribute. + + """ + super().__init__(**kwargs) + self.maxvalues = maxvalues + self.valuecounter = {} + self.totalcounter = Counter() + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + self.totalcounter[shortattrname] += 1 + + def process_end(self): + for attrname in self.valuecounter: + print() + print(attrname+"\t"+str(self.totalcounter[attrname])) + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + print("\t"+str(value)+"\t"+str(freq)) diff --git a/udapi/block/corefud/miscstatstex.py b/udapi/block/corefud/miscstatstex.py new file mode 100644 index 00000000..25d3751a --- /dev/null +++ b/udapi/block/corefud/miscstatstex.py @@ -0,0 +1,44 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class MiscStatsTex(Block): + """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field""" + + def __init__(self, maxvalues=10, **kwargs): + + """Create the corefud.MiscStats + + Args: + maxvalues: the number of most frequent values 
+ to be printed for each attribute. + + """ + super().__init__(**kwargs) + self.maxvalues = maxvalues + self.valuecounter = {} + self.totalcounter = Counter() + + def process_node(self,node): + for attrname in node.misc: + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if not shortattrname in self.valuecounter: + self.valuecounter[shortattrname] = Counter() + self.valuecounter[shortattrname][node.misc[attrname]] += 1 + self.totalcounter[shortattrname] += 1 + + def process_end(self): + for attrname in self.valuecounter: + + total = self.totalcounter[attrname] + distrvalues = [] + + for value,freq in self.valuecounter[attrname].most_common(self.maxvalues): + value = re.sub(r'_',r'\\_',value) + distrvalues.append(f'\\attr{{{str(value)}}} {100*freq/total:2.1f}~\\%') + + attrname = re.sub(r'_',r'\\_',attrname) + print(f" \\item attribute \\attr{{{attrname}}}, {total:,} occurrences, values: "+", ".join(distrvalues)) +# print(f" \\item attribute \\attr\{{attrname}\}, {str(total)} occurrences, distribution of values: "+", ".join(distrvalues)) + + diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py new file mode 100644 index 00000000..00a32e9f --- /dev/null +++ b/udapi/block/corefud/movehead.py @@ -0,0 +1,95 @@ +import logging +from collections import Counter +from udapi.core.block import Block +from udapi.core.node import find_minimal_common_treelet + +class MoveHead(Block): + """Block corefud.MoveHead moves the head to the highest node in each mention.""" + + def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs): + self.counter = Counter() + self.bugs = bugs + self.keep_head_if_possible = keep_head_if_possible + super().__init__(**kwargs) + + def _eparents(self, node): + if node._raw_deps != '_': + return [d['parent'] for d in node.deps] + if node.parent: + return [node.parent] + return [] + + def find_head(self, mention): + mwords = set(mention.words) + + # First, check the simplest case: no empty words and a treelet in basic 
dependencies. + basic_heads = [w for w in mention.words if not w.parent or not w.parent in mwords] + assert basic_heads + if len(basic_heads) == 1: + return basic_heads[0], 'treelet' + + # Second, check also enhanced dependencies (but only within basic_heads for simplicity). + enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))] + if not enh_heads: + enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))] + if not enh_heads: + return mention.head, 'cycle' + if len(enh_heads) == 1: + return enh_heads[0], 'treelet' + + # Third, find non-empty parents (ancestors in future) of empty nodes. + empty_nodes, non_empty = [], [] + for w in enh_heads: + (empty_nodes if w.is_empty() else non_empty).append(w) + if empty_nodes: + for empty_node in empty_nodes: + parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()] + if parents: + if parents[0] not in non_empty: + non_empty.append(parents[0]) + else: + # TODO we should climb up, but preventing cycles + # We could also introduce empty_node.nonempty_ancestor + if 'warn' in self.bugs: + logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}") + if 'mark' in self.bugs: + node.misc['Bug'] = 'no-parent-of-empty' + non_empty.sort() + + # Fourth, check if there is a node within the enh_heads governing all the mention nodes + # and forming thus a "gappy treelet", where the head is clearly the "highest" node. + (highest, added_nodes) = find_minimal_common_treelet(*non_empty) + if highest in enh_heads: + return highest, 'gappy' + if highest in mwords: + if 'warn' in self.bugs: + logging.warning(f"Strange mention {mention.head} with highest node {highest}") + if 'mark' in self.bugs: + highest.misc['Bug'] = 'highest-in-mwords' + mention.head.misc['Bug'] = 'highest-head' + + # Fifth, try to convervatively preserve the original head, if it is one of the possible heads. 
+ if self.keep_head_if_possible and mention.head in enh_heads: + return mention.head, 'nontreelet' + + # Finally, return the word-order-wise first head candidate as the head. + return enh_heads[0], 'nontreelet' + + def process_coref_mention(self, mention): + self.counter['total'] += 1 + if len(mention.words) < 2: + self.counter['single-word'] += 1 + else: + new_head, category = self.find_head(mention) + self.counter[category] += 1 + if new_head is mention.head: + self.counter[category + '-kept'] += 1 + else: + self.counter[category + '-moved'] += 1 + mention.head = new_head + + def process_end(self): + logging.info("corefud.MoveHead overview of mentions:") + total = self.counter['total'] + for key, value in self.counter.most_common(): + logging.info(f"{key:>16} = {value:6} ({100*value/total:5.1f}%)") diff --git a/udapi/block/corefud/printentities.py b/udapi/block/corefud/printentities.py new file mode 100644 index 00000000..7230c6a5 --- /dev/null +++ b/udapi/block/corefud/printentities.py @@ -0,0 +1,55 @@ +import re +import os.path +from udapi.core.block import Block +from collections import Counter, defaultdict + +class PrintEntities(Block): + """Block corefud.PrintEntities prints all mentions of a given entity.""" + + def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True, + aggregate_mentions=True, **kwargs): + """Params: + eid_re: regular expression constraining ID of the entities to be printed + min_mentions: print only entities with with at least N mentions + print_ranges: print also addressess of all mentions + (compactly, using the longest common prefix of sent_id) + mark_head: mark the head (e.g. 
as "red **car**") + """ + super().__init__(**kwargs) + self.eid_re = re.compile(str(eid_re)) if eid_re else None + self.min_mentions = min_mentions + self.print_ranges = print_ranges + self.mark_head = mark_head + self.aggregate_mentions = aggregate_mentions + + def process_document(self, doc): + if 'docname' in doc.meta: + print(f"Coref entities in document {doc.meta['docname']}:") + for entity in doc.coref_entities: + if self.eid_re and not self.eid_re.match(entity.eid): + continue + if len(entity.mentions) < self.min_mentions: + continue + print(f" {entity.eid} has {len(entity.mentions)} mentions:") + if self.aggregate_mentions: + counter = Counter() + ranges = defaultdict(list) + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + counter[forms] += 1 + if self.print_ranges: + ranges[forms].append(mention.head.root.address() + ':' +mention.span) + for form, count in counter.most_common(): + print(f"{count:4}: {form}") + if self.print_ranges: + if count == 1: + print(' ' + ranges[form][0]) + else: + prefix = os.path.commonprefix(ranges[form]) + print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})') + else: + for mention in entity.mentions: + forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words]) + print(' ' + forms) + if self.print_ranges: + print(f" {mention.head.root.address()}:{mention.span}") diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py new file mode 100644 index 00000000..d011f686 --- /dev/null +++ b/udapi/block/corefud/printmentions.py @@ -0,0 +1,186 @@ +import random +from collections import Counter +from udapi.core.block import Block +from udapi.block.write.textmodetreeshtml import TextModeTreesHtml +from udapi.block.write.textmodetrees import TextModeTrees + +class PrintMentions(Block): + """Print mentions with various properties.""" + + def 
__init__(self, continuous='include', almost_continuous='include', treelet='include', + forest='include', almost_forest='include', oneword='include', singleton='include', + empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, + print_total=True, print_should=True, + print_sent_id=True, print_text=True, add_empty_line=True, indent=1, + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', + print_undef_as='_', print_doc_meta=True, print_comments=False, + mark='(Mark)', hints=True, layout='classic', + **kwargs): + super().__init__(**kwargs) + self.continuous = self._convert(continuous) + self.almost_continuous = self._convert(almost_continuous) + self.treelet = self._convert(treelet) + self.forest = self._convert(forest) + self.almost_forest = self._convert(almost_forest) + self.oneword = self._convert(oneword) + self.singleton = self._convert(singleton) + self.empty = self._convert(empty) + + self.max_trees = max_trees + self.html = html + self.shuffle = shuffle + if shuffle: + random.seed(42) + self.print_other_forms = print_other_forms + self.print_total = print_total, + self.print_should = print_should, + print_class = TextModeTreesHtml if html else TextModeTrees + self.print_block = print_class( + print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, + minimize_cross=minimize_cross, color=color, attributes=attributes, + print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments, + mark=mark, hints=hints, layout=layout) + + def _convert(self, value): + if value in {'include', 'exclude', 'only'}: + return value + if value == 1: + return 'only' + if value == 0: + return 'exclude' + raise ValueError('unknown value ' + value) + + def before_process_document(self, document): + self.print_block.before_process_document(document) + + def after_process_document(self, document): + self.print_block.after_process_document(document) + + def _ok(self, 
condition, value): + if value == 'include': + return True + return (condition and value == 'only') or (not condition and value=='exclude') + + def _is_auxiliary_etc(self, node): + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': + return True + if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: + return True + return False + + def _is_forest(self, mention, mwords, almost): + for w in mention.words: + # UD unfortunatelly does not use the copula-as-head style for copula construction, + # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children. + # However, in the cop-as-head stule, only "my" would depend on "fault" (and should be part of the mention). + # It is difficult to tell apart which w.children are related to w and which to the copula. + # We thus ignore these cases completely (we expect any child is potentially related to the copula). + if any(ch.udeprel == 'cop' for ch in w.children): + continue + for ch in w.children: + if ch not in mwords: + if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). 
+ # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). + if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid + return False + return True + + def _is_almost_continuous(self, mention): + if ',' not in mention.span: + return True + nonempty = [w for w in mention.words if not w.is_empty()] + if not nonempty: + return True + mwords = set(mention.words) + gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords] + for gap_node in gap_nodes: + if not gap_node.is_empty(): + return False + return True + + def process_document(self, doc): + mentions = [] + for entity in doc.coref_entities: + if self._ok(len(entity.mentions) == 1, self.singleton): + mentions.extend(entity.mentions) + if self.shuffle: + random.shuffle(mentions) + else: + mentions.sort() + + seen_trees = 0 + for mention in mentions: + if not self._ok(len(mention.words) == 1, self.oneword): + continue + if not self._ok(',' not in mention.span, self.continuous): + continue + if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous): + continue + + empty_mwords = [w for w in mention.words if w.is_empty()] + if not self._ok(len(empty_mwords) > 0, self.empty): + continue + + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + if not self._ok(heads <= 1, self.treelet): + continue + if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest): + continue + if self.almost_forest != 'include' and not 
self._ok(self._is_forest(mention, mwords, True), self.almost_forest): + continue + + for w in mention.words: + w.misc['Mark'] = 1 + + seen_trees += 1 + if self.max_trees and seen_trees > self.max_trees: + if not self.print_total: + print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.') + return + else: + this_form = ' '.join([w.form for w in mention.words]) + print("# Mention = " + this_form) + if self.print_other_forms: + counter = Counter() + for m in mention.entity.mentions: + forms = ' '.join([w.form for w in m.words]) + if forms != this_form: + counter[forms] += 1 + if counter: + print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='') + for form, count in counter.most_common(self.print_other_forms): + print(f' "{form}"({count})', end='') + print() + self.print_block.process_tree(mention.head.root) + for w in mention.words: + del w.misc['Mark'] + + if self.print_total: + if self.max_trees and seen_trees > self.max_trees: + print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}') + diff --git a/udapi/block/corefud/removemisc.py b/udapi/block/corefud/removemisc.py new file mode 100644 index 00000000..f132aaed --- /dev/null +++ b/udapi/block/corefud/removemisc.py @@ -0,0 +1,18 @@ +from udapi.core.block import Block +import re + +class RemoveMisc(Block): + """Deleting all temporary attributes after primary conversions""" + + def __init__(self, attrnames='', **kwargs): + """ Arg: attrnames = comma-separated list of Misc attributes to be deleted""" + super().__init__(**kwargs) + self.attrs4deletion = set(attrnames.split(',')) + + def process_tree(self,root): + for node in root.descendants_and_empty: + for attrname in list(node.misc): + shortattrname = re.sub(r'\[\d+\]',r'',attrname) + if shortattrname in self.attrs4deletion: + del node.misc[attrname] + diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..4551873c --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. 
+ """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} diff --git a/udapi/block/corefud/singleparent.py b/udapi/block/corefud/singleparent.py new file mode 100644 index 00000000..ee9b1948 --- /dev/null +++ b/udapi/block/corefud/singleparent.py @@ -0,0 +1,47 @@ +"""If an empty node has multiple (enhanced-deps) parents, only the highest one is kept.""" +from udapi.core.block import Block +from collections import Counter +from udapi.core.node import find_minimal_common_treelet +import logging + +class SingleParent(Block): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._reasons = Counter() + + def process_tree(self, tree): + for empty in tree.empty_nodes: + self._reasons['_empty'] += 1 + if len(empty.deps) > 1: + self._reasons['_more-parents'] += 1 + parents = [d['parent'] for d in empty.deps] + nonempty_parents = [p for p in parents if not p.is_empty()] + if len(nonempty_parents) != len(parents): + self._reasons['empty-parent'] += 1 + #empty.misc['Mark'] = f"empty-parent:{empty.deps}" + logging.warning(f"Empty node {empty} has an empty parent.") + if not nonempty_parents: + empty.deps = [] + self._reasons['no-nonempty-parent'] += 1 + continue + (highest, added_nodes) = find_minimal_common_treelet(*nonempty_parents) + if highest in nonempty_parents: + self._reasons['one-governs'] += 1 + empty.deps = [d for d in empty.deps if d['parent'] is highest] + continue + nonempty_parents.sort(key=lambda n:n._get_attr('depth')) + if len(nonempty_parents)>1 and nonempty_parents[0]._get_attr('depth') == nonempty_parents[0]._get_attr('depth'): + self._reasons['same-depth'] += 1 + #empty.misc['Mark'] = f"same-depth:{empty.deps}" + else: + self._reasons['one-highest'] += 1 + #empty.misc['Mark'] = f"one-highest:{empty.deps}" + empty.deps = [d for d in empty.deps if d['parent'] is nonempty_parents[0]] + + def 
after_process_document(self, document): + message = "\n" + for k, v in self._reasons.most_common(): + message += f"{k}={v}\n" + #document.meta["bugs"] = message + logging.info(message) diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py new file mode 100644 index 00000000..527159e9 --- /dev/null +++ b/udapi/block/corefud/stats.py @@ -0,0 +1,305 @@ +from udapi.core.block import Block +from collections import Counter +import re + +class Stats(Block): + """Block corefud.Stats prints various coreference-related statistics.""" + + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, report_words_per_doc=False, report_entity_range=False, + selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', + exclude_singletons=False, exclude_nonsingletons=False, style='human', + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + highlight_docnames=None, + **kwargs): + super().__init__(**kwargs) + self.m_len_max = m_len_max + self.e_len_max = e_len_max + self.report_basics = report_basics + self.report_mentions = report_mentions + self.report_entities = report_entities + self.report_details = report_details + self.report_words_per_doc = report_words_per_doc + self.report_entity_range = report_entity_range + self.exclude_singletons = exclude_singletons + self.exclude_nonsingletons = exclude_nonsingletons + self.style = style + if style not in 'tex tex-table tex-doc human'.split(): + raise ValueError(f'Unknown style {style}') + self.per_doc = per_doc + self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc filename'.split(): + raise ValueError(f'Unknown style {style}') + self.docname = docname + self.docname_len = docname_len + self.highlight_docnames = highlight_docnames + self._header_printed = False + self._lines_printed = None + + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + 
self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + self.selected_upos = None if selected_upos == 'all' else selected_upos.split() + self.entity_ranges = [] + + def process_document(self, doc): + self.total_nodes += len(list(doc.nodes)) + self.counter['documents'] += 1 + node2docord, current_docord = {}, 0 + if self.report_entity_range: + for node in doc.nodes_and_empty: + node2docord[node] = current_docord + current_docord += 1 + + for entity in doc.coref_entities: + len_mentions = len(entity.mentions) + if len_mentions == 1: + self.singletons += 1 + if len_mentions == 1 and self.exclude_singletons: + continue + elif len_mentions > 1 and self.exclude_nonsingletons: + continue + if self.report_entity_range: + self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head]) + self.longest_entity = max(len_mentions, self.longest_entity) + self.counter['c_total_len'] += len_mentions + self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1 + + self.entities += 1 + if not self.report_mentions and not self.report_details: + continue + for mention in entity.mentions: + self.mentions += 1 + all_words = len(mention.words) + non_empty = len([w for w in mention.words if not w.is_empty()]) + self.m_words += all_words + self.longest_mention = max(non_empty, self.longest_mention) + self.counter['m_total_len'] += non_empty + self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1 + if self.report_details: + upos = 'other' + if not self.selected_upos or mention.head.upos in self.selected_upos: + upos = mention.head.upos + self.counter['m_head_upos_' + upos] += 1 + self.counter['m_with_empty'] += 1 if all_words > non_empty else 0 + self.counter['m_with_gaps'] += 1 if ',' in mention.span else 0 + heads, mwords = 0, set(mention.words) + for w in mention.words: + if w.parent: + heads += 0 if w.parent in mwords else 1 + else: + heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 + 
self.counter['m_nontreelet'] += 1 if heads > 1 else 0 + + if self.report_basics: + doc_words = 0 + for tree in doc.trees: + self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) + if tree.newdoc: + self.counter['newdocs'] += 1 + if doc_words > self.counter['max_words_per_doc']: + self.counter['max_words_per_doc'] = doc_words + doc_words = 0 + doc_words += len(tree.descendants) + + def after_process_document(self, doc): + if self.per_doc: + self.process_end(skip=False, doc=doc) + self.counter = Counter() + self.mentions = 0 + self.entities = 0 + self.singletons = 0 + self.total_nodes = 0 + self.longest_mention = 0 + self.longest_entity = 0 + self.m_words = 0 + self.entity_ranges = [] + + def process_end(self, skip=True, doc=None): + if not self._lines_printed: + self.print_header() + self._lines_printed = 0 + if self.per_doc: + if skip: + self.print_footer() + return + else: + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + if self.style.startswith('tex'): + if self.highlight_docnames and re.search(self.highlight_docnames, docname): + docname = r"\NEW " + docname + docname = docname.replace('_', r'\_') + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') + elif self.style.startswith('tex-'): + print(f"{self.counter['documents']:4} documents &") + self._lines_printed += 1 + + mentions_nonzero = 1 if self.mentions == 0 else self.mentions + entities_nonzero = 1 if self.entities == 0 else self.entities + total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes + + columns =[ ] + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:6,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:9,}"), + ('empty', f"{self.counter['empty']:7,}"),] + if self.report_words_per_doc: + columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"), + 
('words/doc', f"{self.counter['words']/self.counter['newdocs']:7,.0f}"),] + if self.report_entities: + columns += [('entities', f"{self.entities:7,}"), + ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), + ('longest_entity', f"{self.longest_entity:6}"), + ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")] + if self.report_entity_range: + self.entity_ranges.sort() + percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0 + columns += [('entity_range_95percentile', f"{percentile:6,}"),] + for i in range(1, self.e_len_max + 1): + percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero + columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}")) + if self.report_mentions: + columns += [('mentions', f"{self.mentions:7,}"), + ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"), + ('longest_mention', f"{self.longest_mention:6}"), + ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")] + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero + columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}")) + if self.report_details: + columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"), + ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"), + ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}"),] + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + for upos in upos_list: + columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}")) + + if self.style.startswith('tex'): + print(" &".join(c[1] for c in columns), end=" \\\\\n") + elif self.style == 'human': + for c in columns: + 
print(f"{c[0]:>15} = {c[1].strip():>10}") + if not self.per_doc: + self.print_footer() + elif self._lines_printed > self.max_rows_per_page: + self.print_footer(False) + self._lines_printed = 0 + + def print_header(self): + if not self.style.startswith('tex-'): + return + if self.style == 'tex-doc': + if self._lines_printed is None: + print(r'\documentclass[multi=mypage]{standalone}') + print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}') + print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}') + print(r'\title{Udapi coreference statistics}') + print(r'\begin{document}') + print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') + lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{text size} ' + lines[2] += r'& \MC{4}{total number of} ' + lines[3] += r'& docs & sents & words &empty n.' 
+ if self.report_words_per_doc: + lines[0] += "rr " + lines[1] += r'& & ' + lines[2] += r'&\MC{2}{words/doc}' + lines[3] += r'& max & avg ' + if self.report_entities: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{entities} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.report_entity_range: + lines[0] += "r " + lines[1] += r'& ' + lines[2] += r'& range ' + lines[3] += r'& p95 ' + if self.e_len_max: + for i in range(1, self.e_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}' + if self.report_mentions: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{mentions} ' + lines[2] += r'& total &per 1k &\MC{2}{length}' + lines[3] += r'& count & words & max & avg ' + if self.m_len_max: + for i in range(0, self.m_len_max + 1): + lines[0] += "r" + lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ") + lines[3] += r'& [\%] ' + lines[0] += " " + lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7 + if self.report_details: + lines[0] += "rrrr " + lines[1] += r'& \MC{3}{mention type} ' + lines[2] += r'&w/empty& w/gap&non-tree' + lines[3] += r'& [\%] ' * 3 + if self.selected_upos: + upos_list = self.selected_upos + ['other'] + else: + upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')] + lines[0] += "@{~}r" * len(upos_list) + lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}" + lines[2] += ''.join(f'&{upos:7}' for upos in upos_list) + lines[3] += r'& [\%] ' * len(upos_list) + lines[0] += r'@{}}\toprule' + last_col = 1 + lines[1] += r'\\' + lines[2] += r'\\' + lines[3] += r'\\\midrule' + if self.report_basics: + lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}' + lines[2] += r'\cmidrule(lr){2-5}' + last_col += 4 + if 
self.report_words_per_doc: + lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}' + last_col += 2 + if self.report_entities: + _cols = 5 if self.report_entity_range else 5 + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += _cols + if self.e_len_max: + last_col += self.e_len_max + lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' + if self.report_mentions: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 + if self.m_len_max: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}' + last_col += self.m_len_max + 1 + if self.report_details: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}" + lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}' + print("\n".join(lines)) + + def print_footer(self, end_doc=True): + if not self.style.startswith('tex-'): + return + print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}') + if self.style == 'tex-doc' and end_doc: + print(r'\end{document}') diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py new file mode 100644 index 00000000..99e8a046 --- /dev/null +++ b/udapi/block/demo/complexity.py @@ -0,0 +1,268 @@ +"""demo.Complexity prints statistics on syntactic complexity. +""" +from udapi.core.basewriter import BaseWriter +from collections import deque + + +def non_punct(nodes): + return [n for n in nodes if n.upos != 'PUNCT'] + + +def is_np(node): + return node.upos in ("NOUN", "PROPN") or (node.upos == "PRON" and node.feats["PronType"] == "Prs" and not node.feats["Poss"]) + + +def is_vp(node): + """E.g. prosili, naléhali a žadonili => 1 coordinated verb phrase, head “prosili”. + + [POS == “VERB”, [deprel == “conj”, POS == “VERB”]], unique coordination heads + TODO: zahrnout i non-VERB? 
+ - vznikla a byla přijata(conj,ADJ,parent=vznikla) + - je(cop,AUX) nešťastný(ADJ) a nechá(conj,VERB,parent=nešťastný) se nalákat + - "podařilo se to a dokladem(ClauseHead,NOUN,conj,parent=podařilo) je(cop,AUX,parent=dokladem)" + - omezit se jen na (či využít) ClauseHead, nebo zahrnout i non-finite verbs (koordinace infinitivů či příčestí)? + "stihl(ClauseHead) napsat(VerbForm=Inf) a publikovat(VerbForm=Inf)" ... napsat ani publikovat nejsou ClauseHead + "rozhodl se ukončit a ukazuje(ClauseHead,parent=ukončit)" správně by mělo být parent=rozhodl, ale parser dělá chyby. + - Parsing vůbec dělá mnoho chyb v koordinacích, takže je vhodné podmínky velmi omezit. + """ + return node.upos == "VERB" or node.misc["ClauseHead"] + + +def is_relcl(node): + """Is a given node a head of a relative clause? + + Unfortunatelly, UDPipe 2.4 produces just acl instead of acl:relcl. + """ + if node.deprel == 'acl:relcl': + return True + return node.udeprel == 'acl' and any('Rel' in c.feats['PronType'] for c in node.children) + + +def is_postponed_nom_mod(node): + """Is a given node a postponed nominal modifier? + + Silvie: [(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)), child with higher word order than parent + [deprel != “conj”, POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)] + + TODO: Tohle hledá v češtině zcela běžné jevy jako "vznik díla". Nechceme hledat něco jiného? 
+ """ + return node.udeprel != 'conj' and is_np(node) and node.parent.precedes(node) and is_np(node.parent) + + +def is_postponed_adj_mod(node): + # TODO můžeme rozlišovat holý přívlastek ("písní ruských") a rozvitý ("milenec známý z pozdějšího zpracování") + return node.parent.precedes(node) and is_np(node.parent) and node.upos == 'ADJ' #and not node.children + + +def is_complex_nominal(node): + """[(POS in {“NOUN”, “PROPN”} | POS == “PRON” & feats:PronType == “Prs” & !(feats:Poss==”Yes”)) 2x descendant [deprel != “conj”]] + TODO: punct, case, cc a dep taky ignorovat? + TODO: opravdu descendants a ne children? (descendants snadno roste nad všechny meze, je-li tam třeba vedlejší věta) + TODO: beztak bude chtít odfiltrovat copuly: "Jádrem tvorby jsou sbírky." - Jádrem má 3 děti. + TODO: a nezvýšit ten limit z 2x aspoň na 3x? + """ + return is_np(node) and len([n for n in node.descendants if n.deprel not in ('conj', 'punct', 'case', 'cc', 'dep', 'cop')]) > 1 + + +def is_finite_clause_head(node): + """Is a given node a head of a finite clause? + + Silvie: [(POS == „VERB“ & feats:Verbform == „Fin“ | Verbform == „Part“} ) ] OR [(POS in {„ADJ“, „NOUN“, „PROPN“}, [child POS == „AUX“)]] + - POS == „VERB“ je zbytečné, protože VerbForm=Part je nastaveno i u ADJ ("je nucen" apod.) + - child POS == „AUX“ zase matchuje i např. na "Vidím psa(NOUN), který je(AUX,acl,parent=psa) z dávné doby." + - adjectivized predicates (převažující(VerbForm=Part) básně) by neměly být určeny jako clause_head + + * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl, + but for better robustness we include these as well. + * Similarly "dep" and "orphan" are mostly parsing errors. + * TODO: by uncommenting the nsubj/csubj line, we find few more real clause heads, but also some false positives. 
+ """ + # TODO appos + if ((node.udeprel in {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl', 'parataxis', 'amod', 'dep', 'orphan'} + and is_finite_verb(node)) + #or any(c.udeprel in {'nsubj', 'csubj'} for c in node.children) + or (any(c.udeprel == 'cop' for c in node.children) and node.udeprel != 'xcomp')): + return True + xcomp_child = next((c for c in node.children if c.udeprel == 'xcomp'), None) + return xcomp_child and any(c.udeprel == 'cop' for c in xcomp_child.children) + + +# TODO: zahrnout i: bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf) +def is_finite_verb(node): + return (node.feats['VerbForm'] in {'Fin', 'Part'} and + (node.upos == 'VERB' or + node.upos == 'ADJ' and any(c.deprel == 'aux:pass' for c in node.children))) + + +def is_adjectivized_predicate(node): + """E.g. kouřící komín, zbitý kluk + + Silvie: [(POS == „ADJ“ & feats:VerbForm == „Part“), parent [POS in {„NOUN“, „PROPN“}] ] + - parent [POS in {„NOUN“, „PROPN“}] zamezí případům jako + "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti" + "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" asi chyba parsingu, mělo být parent=myšlení? + - dotaz naopak matchuje na "způsob, jakým jsou popsány", proto přidávám podmínku not node.misc["ClauseHead"] + """ + return (node.feats["VerbForm"] == "Part" + and node.upos == "ADJ" + and (node.parent.upos in {"NOUN","PROPN"} or (node.udeprel == "conj" and node.parent.upos == "ADJ")) + and not node.misc["ClauseHead"]) + + +def is_controlled_predicate(node): + """E.g. Mohli jsme odejít i zůstat. + + TODO: Chceme zahrnout i druhý a další člen koordinace, např. "stihl napsat a publikovat", + tedy node.udeprel == "conj" and node.parent.udeprel == "xcomp"? 
+ """ + return node.deprel == "xcomp" + +class Complexity(BaseWriter): + + def __init__(self, matches=False, **kwargs): + super().__init__(**kwargs) + self.matches = matches + + + def report(self, category, groups, expand_type='no'): + if self.matches: + for group in groups: + self.print_match(category, group, expand_type) + else: + print("\t" + str(len(groups)), end='') + + + def expand_subtree(self, nodes, expand_type): + if expand_type == 'no': + return nodes + if len(nodes) > 1: + raise Exception("expanding more than one node not implemented yet") + if expand_type == 'subtree': + return nodes[0].descendants(add_self=True) + #if expand_type == 'subtree_except_conj': + #result = nodes + #for child in group.children: + #if child.udeprel != 'conj': + #result.extend(child.descendants(add_self=True)) + #return = sorted(result) + if expand_type == 'subtree_within_clause': + stack = [n for n in nodes[0].children if n.udeprel != 'conj'] + while stack: + node = stack.pop() + if not node.misc["ClauseHead"]: + nodes.append(node) + stack.extend(node.children()) + return sorted(nodes) + raise ValueError("unknown expand value " + expand_type) + + + def print_match(self, category, group, expand_type='no'): + nodes = self.expand_subtree(group, expand_type) + lemmas = " ".join(n.lemma for n in nodes) + tags = " ".join(n.upos for n in nodes) + n_tokens = str(len(non_punct(nodes))) + print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens])) + + + def get_main_clauses(self, root): + main_heads = [] + for main_head in root.children: + main_heads.append(main_head) + main_heads.extend(n for n in main_head.children if n.udeprel == 'conj') + return [[n] for n in main_heads] + + + def get_coord_phrase(self, root, phrase_type_function): + results = [] + for node in root.descendants: + if phrase_type_function(node): + conjuncts = [n for n in node.children if n.udeprel == 'conj' and phrase_type_function(n)] + if conjuncts: + conjunctions = [] + for conj in conjuncts: + # 
TODO multiword conjunctions (udeprel=flat)? + conjunctions.extend([n for n in conj.children if n.udeprel == 'cc']) + results.append(sorted([node] + conjuncts + conjunctions)) + return results + + # TODO koordinace hlavních i vedlejších vět + def get_t_units(self, main_heads): + results = [] + for main_head in main_heads: + main_clause = [main_head] + dep_heads = [] + stack = main_head.children + while stack: + node = stack.pop() + if node.misc["ClauseHead"]: + dep_heads.append(node) + else: + main_clause.append(node) + stack.extend(node.children) + main_clause = sorted(main_clause) + + for dep_clause_head in dep_heads: + results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree')) + return results + + # TODO complex t-unit má jinou definici: 3 klauze + def get_complex_t_units(self, root): + results = [] + for node in root.descendants: + if node.deprel != 'root' and node.misc["ClauseHead"]: # TODO: exclude the main clause? + results += self.get_t_units([node]) + return results + + + def process_tree(self, root): + print("# " + root.text) + + allnodes = root.descendants + depth, clause_depth = {0: 0}, {0: 0} + queue = deque(root.children) + clause_heads = [] + while queue: + node = queue.popleft() + depth[node.ord] = depth[node.parent.ord] + 1 + clause_depth[node.ord] = clause_depth[node.parent.ord] + if is_finite_clause_head(node): + node.misc['ClauseHead'] = 1 + clause_heads.append(node) + clause_depth[node.ord] += 1 + queue.extend(node.children) + max_depth = sorted(depth.values())[-1] + max_clause_depth = sorted(clause_depth.values())[-1] + + t_units = self.get_t_units([n for n in root.children if n.deprel == 'root']) + total_t_units_length = sum(len(t_unit) for t_unit in t_units) + mean_t_unit_length = total_t_units_length / (len(t_units) or 1) # TODO co reportovat, když věta nemá žádné t-units? 
+ + if not self.matches: + print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='') + + self.report("clauses", [[n] for n in clause_heads], 'subtree') + self.report("adjectivized_predicates", [[n] for n in allnodes if is_adjectivized_predicate(n)]) + self.report("controlled_predicates", [[n] for n in allnodes if is_controlled_predicate(n)]) + self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause') + self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp)) + self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np)) + self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET"))) + self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV")) + self.report("t-units", t_units) + self.report("complex_t-units", self.get_complex_t_units(root)) + # TODO: najde "básně a písně" a "rychtář a rychtářka" UDPipe kdovíproč určil jako ADV a ADV. Zkontrolovat, máme-li nejlepší možný UDPipe model. + self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause') + self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)]) + self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)]) + self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)]) + + if not self.matches: + # TODO: pro total koordinace asi nemá smysl reportovat matches, jen total count? 
+ self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True)) + + nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE'] + brackets = str(len([n for n in allnodes if n.form == '('])) + dashes = str(len([n for n in allnodes if n.form in '-–—―'])) # hyphen, en-dash, em-dash, horizonatal bar + colons = str(len([n for n in allnodes if n.form == ':'])) + semicolons = str(len([n for n in allnodes if n.form == ';'])) + print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons])) diff --git a/udapi/block/demo/newspeak.py b/udapi/block/demo/newspeak.py new file mode 100644 index 00000000..6be2caf5 --- /dev/null +++ b/udapi/block/demo/newspeak.py @@ -0,0 +1,66 @@ +"""demo.Newspeak block for 1984-like newspeak-ization of Czech. + +This is just a demo/draft. + +Usage: + $ echo 'Nejhorší žena je lepší než nejlepší muž.' | \ + udapy -q read.Sentences udpipe.Cs demo.Newspeak write.Sentences + Převelenedobrá žena je veledobrá než převeledobrý muž. +""" +from udapi.core.block import Block +from udapi.tool.morphodita import MorphoDiTa + +ANTONYMS = { + 'špatný': 'dobrý', + 'pomalý': 'rychlý', + # 'muž': 'žena', this does not work because xpos contains gender, + # we would also need to exploit the parsing and change gender of all congruent adj children. 
+} + + +class Newspeak(Block): + """Change all comparatives to vele-x and superlatives to převele-x.""" + + def __init__(self, morphodita_path='models/morphodita/cs/', + morphodita_model='czech-morfflex-131112.dict', + **kwargs): + """Create the PreVele block object.""" + super().__init__(**kwargs) + self.morphodita = MorphoDiTa(model=morphodita_path + morphodita_model) + + def process_tree(self, tree): + + # apply process_node on all nodes + super().process_tree(tree) + + # Capitalize if needed + first_node = tree.descendants[0] + if tree.text[0].isupper() and not first_node.form[0].isupper(): + first_node.form = first_node.form[0].upper() + first_node.form[1:] + + # Recompute the sentence string + tree.text = tree.compute_text() + + def process_node(self, node): + antonym = ANTONYMS.get(node.lemma) + if antonym is not None: + if node.xpos[11] == 'N': + if node.form.lower().startswith('ne'): + node.lemma = antonym + node.xpos = node.xpos[:10] + 'A' + node.xpos[11:] + node.form = node.form[2:] + else: + forms = self.morphodita.forms_of_lemma(antonym, node.xpos) + if forms: + node.lemma = antonym + node.xpos = node.xpos[:10] + 'N' + node.xpos[11:] + node.form = 'ne' + forms[0].form + + degree = node.feats["Degree"] + if degree in ("Sup", "Cmp"): + new_xpos = node.xpos[:9] + '1' + node.xpos[10:] + forms = self.morphodita.forms_of_lemma(node.lemma, new_xpos) + if forms: + new_form = "vele" if degree == "Cmp" else "převele" + new_form += forms[0].form + node.form = new_form diff --git a/udapi/block/demo/rehangprepositions.py b/udapi/block/demo/rehangprepositions.py index 8d641b49..d25e29bc 100644 --- a/udapi/block/demo/rehangprepositions.py +++ b/udapi/block/demo/rehangprepositions.py @@ -4,6 +4,7 @@ class RehangPrepositions(Block): """This block takes all prepositions (upos=ADP) and rehangs them above their parent.""" + def process_node(self, node): if node.upos == "ADP": origparent = node.parent diff --git a/udapi/block/eval/__init__.py b/udapi/block/eval/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/eval/conll17.py b/udapi/block/eval/conll17.py new file mode 100644 index 00000000..61e86383 --- /dev/null +++ b/udapi/block/eval/conll17.py @@ -0,0 +1,288 @@ +r"""Block&script eval.Conll17 for evaluating LAS,UAS,etc as in CoNLL2017 UD shared task. + +This is a reimplementation of the CoNLL2017 shared task official evaluation script, +http://universaldependencies.org/conll17/evaluation.html + +The gold trees and predicted (system-output) trees need to be sentence-aligned +e.g. using `util.ResegmentGold`. +Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization. + +An example usage and output:: + + $ udapy read.Conllu zone=gold files=gold.conllu \ + read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \ + eval.Conll17 + Metric | Precision | Recall | F1 Score | AligndAcc + -----------+-----------+-----------+-----------+----------- + Words | 27.91 | 52.17 | 36.36 | 100.00 + UPOS | 27.91 | 52.17 | 36.36 | 100.00 + XPOS | 27.91 | 52.17 | 36.36 | 100.00 + Feats | 27.91 | 52.17 | 36.36 | 100.00 + Lemma | 27.91 | 52.17 | 36.36 | 100.00 + UAS | 16.28 | 30.43 | 21.21 | 58.33 + LAS | 16.28 | 30.43 | 21.21 | 58.33 + CLAS | 10.34 | 16.67 | 12.77 | 37.50 + + +For evaluating multiple systems and testsets (as in CoNLL2017) +stored in `systems/system_name/testset_name.conllu` you can use:: + + #!/bin/bash + SYSTEMS=`ls systems` + [[ $# -ne 0 ]] && SYSTEMS=$@ + set -x + set -e + for sys in $SYSTEMS; do + mkdir -p results/$sys + for testset in `ls systems/$sys`; do + udapy read.Conllu zone=gold files=gold/$testset \ + read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll17 print_results=0 print_raw=1 \ + > results/$sys/${testset%.conllu} + done + done + python3 `python3 -c 'import udapi.block.eval.conll17 as x; print(x.__file__)'` -r 100 + +The last line executes this block as a script and computes bootstrap resampling with 100 
resamples +(default=1000, it is recommended to keep the default or higher value unless testing the interface). +This prints the ranking and confidence intervals (95% by default) and also p-values for each +pair of systems with neighboring ranks. If the difference in LAS is significant +(according to a paired bootstrap test, by default if p < 0.05), +a line is printed between the two systems. + +The output looks like:: + + 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001 + ------------------------------------------------------------ + 2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001 + ------------------------------------------------------------ + 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001 + ------------------------------------------------------------ + 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001 + ------------------------------------------------------------ + 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001 + ------------------------------------------------------------ + 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001 + ------------------------------------------------------------ + 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002 + ------------------------------------------------------------ + 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001 + ------------------------------------------------------------ + 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003 + ------------------------------------------------------------ + 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448 + 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029 + ------------------------------------------------------------ + 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334 + 13. conll17-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003 + ------------------------------------------------------------ + 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062 + 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283 + 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121 + 17. 
LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445 + 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166 + 19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447 + 20. naistCL 67.49 ± 0.15 (67.34 .. 67.63) + +TODO: Bootstrap currently reports only LAS, but all the other measures could be added as well. +""" +import argparse +import difflib +import logging +import os +import random +import sys +from collections import Counter +from udapi.core.basewriter import BaseWriter + +CLAS_IGNORE = {'aux', 'case', 'cc', 'clf', 'cop', 'det', 'mark', 'punct'} + + +class Conll17(BaseWriter): + """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" + + def __init__(self, gold_zone='gold', print_raw=False, print_results=True, **kwargs): + """Args: + gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")? + print_raw - Print raw counts (pred, gold, Words, LAS) for each sentence. + This is useful for bootstrap resampling post-processing to get confidence intervals. + print_results - Print a table with overall results after all document are processed. 
+ """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.total_count = Counter() + self.print_raw = print_raw + self.print_results = print_results + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + pred_forms = [n.form.lower() for n in pred_nodes] + gold_forms = [n.form.lower() for n in gold_nodes] + matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False) + aligned = [] + for diff in matcher.get_opcodes(): + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit == 'equal': + aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi])) + align_map = {tree: gold_tree} + for p_node, g_node in aligned: + align_map[p_node] = g_node + + count = Counter() + count['pred'] = len(pred_nodes) + count['gold'] = len(gold_nodes) + count['Words'] = len(aligned) + count['pred_clas'] = len([n for n in pred_nodes if n.udeprel not in CLAS_IGNORE]) + count['gold_clas'] = len([n for n in gold_nodes if n.udeprel not in CLAS_IGNORE]) + count['alig_clas'] = len([n for _, n in aligned if n.udeprel not in CLAS_IGNORE]) + + for p_node, g_node in aligned: + for attr in ('UPOS', 'XPOS', 'Feats', 'Lemma'): + if p_node.get_attrs([attr.lower()]) == g_node.get_attrs([attr.lower()]): + count[attr] += 1 + if align_map.get(p_node.parent) == g_node.parent: + count['UAS'] += 1 + if p_node.udeprel == g_node.udeprel: + count['LAS'] += 1 + if g_node.udeprel not in CLAS_IGNORE: + count['CLAS'] += 1 + self.total_count.update(count) + + if self.print_raw: + scores = [str(count[s]) for s in ('pred', 'gold', 'Words', 'LAS')] + print(' '.join(scores)) + + def process_end(self): + if not self.print_results: + return + + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + metrics = ('Words', 'UPOS', 'XPOS', 'Feats', 'Lemma', 'UAS', 'LAS', 'CLAS') + print("Metric 
| Precision | Recall | F1 Score | AligndAcc") + print("-----------+-----------+-----------+-----------+-----------") + pred, gold = self.total_count['pred'], self.total_count['gold'] + alig = self.total_count['Words'] + for metric in metrics: + if metric == 'CLAS': + pred, gold = self.total_count['pred_clas'], self.total_count['gold_clas'] + alig = self.total_count['alig_clas'] + correct = self.total_count[metric] + precision = correct / pred if pred else 0 + recall = correct / gold if gold else 0 + alignacc = correct / alig if alig else 0 + fscore = 2 * correct / (pred + gold) if pred + gold else 0 + print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{:10.2f}".format( + metric, 100 * precision, 100 * recall, 100 * fscore, 100 * alignacc)) + + +def prec_rec_f1(correct, pred, gold, alig=0): + precision = correct / pred if pred else 0 + recall = correct / gold if gold else 0 + alignacc = correct / alig if alig else 0 + fscore = 2 * correct / (pred + gold) if pred + gold else 0 + return precision, recall, fscore, alignacc + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dir_results", "-d", default="results", help="directory with results") + parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples") + parser.add_argument("--confidence", "-c", default=95, help="use x-percent confidence interval") + parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets") + parser.add_argument("--systems", "-s", default='all', help="comma-separated systems") + parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time") + args = parser.parse_args() + res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence + alpha = (1 - conf/100) / 2 + index_lo = int(alpha * (resamples - 1)) + index_hi = resamples - 1 - index_lo + index_mid = int(resamples / 2) + if args.systems == 'all': + systems = os.listdir(res_dir) + else: + systems = args.systems.split(',') + if 
args.tests == 'all': + tests = set() + for system in systems: + tests.update(os.listdir(res_dir + '/' + system)) + tests = sorted(tests) + else: + tests = args.tests.split(',') + if args.randseed: + random.seed(args.randseed) + results = [] + + print('Loading...', file=sys.stderr) + for system in systems: + sys_results = [] + results.append(sys_results) + for i_test, test in enumerate(tests): + filename = '/'.join((res_dir, system, test)) + try: + with open(filename) as res_file: + sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file]) + except FileNotFoundError: + logging.warning(filename + ' not found') + samples = len(sys_results) + + print('Resampling...', file=sys.stderr) + boot_results = [] + for i_resample in range(resamples): + print(i_resample + 1, file=sys.stderr, end='\r') + resample_results = [] + boot_results.append(resample_results) + for i_system in range(len(systems)): + pred, gold, words, las = ([0] * len(tests) for _ in range(4)) + for _ in range(samples): + i_test, pre, gol, wor, la_ = random.choice(results[i_system]) + pred[i_test] += pre + gold[i_test] += gol + words[i_test] += wor + las[i_test] += la_ + fscore_sum = 0 + for i_test in range(len(tests)): + _prec, _rec, fscore, _aligacc = prec_rec_f1(las[i_test], pred[i_test], gold[i_test]) + fscore_sum += fscore + resample_results.append(fscore_sum / len(tests)) + print('\n', file=sys.stderr) + + sys_fscores = [] + for i_system, system in enumerate(systems): + sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)]) + final_results = [] + sys_sys_wins = [[0] * len(systems) for x in range(len(systems))] + for i_system, system in enumerate(systems): + for j_system in range(i_system): + for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]): + if i > j: + sys_sys_wins[i_system][j_system] += 1 + elif i < j: + sys_sys_wins[j_system][i_system] += 1 + fscores = sorted(sys_fscores[i_system]) + final_results.append([i_system, 
fscores[index_mid], fscores[index_lo], fscores[index_hi]]) + + sorted_systems = sorted(final_results, key=lambda x: -x[1]) + for rank, sys_results in enumerate(sorted_systems): + i_system, f1_mid, f1_lo, f1_hi = sys_results + if rank < len(systems) - 1: + j_worse_sys = sorted_systems[rank + 1][0] + p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1) + p_str = " p=%.3f" % p_value + else: + p_value, p_str = 1, "" + print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" % + (rank + 1, systems[i_system], + 100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str)) + if p_value < (1 - conf/100): + print('-' * 60) + + +if __name__ == "__main__": + main() diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py new file mode 100644 index 00000000..22f42a42 --- /dev/null +++ b/udapi/block/eval/conll18.py @@ -0,0 +1,337 @@ +r"""Block&script eval.Conll18 for evaluating LAS,UAS,etc as in CoNLL2018 UD shared task. + +This is a reimplementation of the CoNLL2018 shared task official evaluation script, +http://universaldependencies.org/conll18/evaluation.html + +The gold trees and predicted (system-output) trees need to be sentence-aligned +e.g. using `util.ResegmentGold`. +Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization. 
+ +An example usage and output:: + + $ udapy read.Conllu zone=gold files=gold.conllu \ + read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll18 + Metric | Precision | Recall | F1 Score | AligndAcc + -----------+-----------+-----------+-----------+----------- + Words | 27.91 | 52.17 | 36.36 | 100.00 + UPOS | 27.91 | 52.17 | 36.36 | 100.00 + XPOS | 27.91 | 52.17 | 36.36 | 100.00 + Feats | 27.91 | 52.17 | 36.36 | 100.00 + Lemma | 27.91 | 52.17 | 36.36 | 100.00 + UAS | 16.28 | 30.43 | 21.21 | 58.33 + LAS | 16.28 | 30.43 | 21.21 | 58.33 + CLAS | 10.34 | 16.67 | 12.77 | 37.50 + + +For evaluating multiple systems and testsets (as in CoNLL2018) +stored in `systems/system_name/testset_name.conllu` you can use:: + + #!/bin/bash + SYSTEMS=`ls systems` + [[ $# -ne 0 ]] && SYSTEMS=$@ + set -x + set -e + for sys in $SYSTEMS; do + mkdir -p results/$sys + for testset in `ls systems/$sys`; do + udapy read.Conllu zone=gold files=gold/$testset \ + read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \ + util.ResegmentGold \ + eval.Conll18 print_results=0 print_raw=LAS \ + > results/$sys/${testset%.conllu} + done + done + python3 `python3 -c 'import udapi.block.eval.conll18 as x; print(x.__file__)'` -r 100 + +The last line executes this block as a script and computes bootstrap resampling with 100 resamples +(default=1000, it is recommended to keep the default or higher value unless testing the interface). +This prints the ranking and confidence intervals (95% by default) and also p-values for each +pair of systems with neighboring ranks. If the difference in LAS is significant +(according to a paired bootstrap test, by default if p < 0.05), +a line is printed between the two systems. + +The output looks like:: + + 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001 + ------------------------------------------------------------ + 2. C2L2 74.88 ± 0.12 (74.77 .. 
75.01) p=0.001 + ------------------------------------------------------------ + 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001 + ------------------------------------------------------------ + 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001 + ------------------------------------------------------------ + 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001 + ------------------------------------------------------------ + 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001 + ------------------------------------------------------------ + 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002 + ------------------------------------------------------------ + 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001 + ------------------------------------------------------------ + 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003 + ------------------------------------------------------------ + 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448 + 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029 + ------------------------------------------------------------ + 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334 + 13. conll18-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003 + ------------------------------------------------------------ + 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062 + 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283 + 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121 + 17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445 + 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166 + 19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447 + 20. naistCL 67.49 ± 0.15 (67.34 .. 
67.63) +""" +import argparse +import difflib +import logging +import os +import random +import sys +from collections import Counter +from udapi.core.basewriter import BaseWriter + +CONTENT = {'nsubj', 'obj', 'iobj', 'csubj', 'ccomp', 'xcomp', 'obl', 'vocative', 'expl', + 'dislocated', 'advcl', 'advmod', 'discourse', 'nmod', 'appos', 'nummod', 'acl', + 'amod', 'conj', 'fixed', 'flat', 'compound', 'list', 'parataxis', 'orphan', 'goeswith', + 'reparandum', 'root', 'dep'} +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} +UNIV_FEATS = {'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr', 'Gender', 'Animacy', + 'Number', 'Case', 'Definite', 'Degree', 'VerbForm', 'Mood', 'Tense', 'Aspect', + 'Voice', 'Evident', 'Polarity', 'Person', 'Polite'} + +class Conll18(BaseWriter): + """Evaluate LAS, UAS, MLAS and BLEX.""" + + def __init__(self, gold_zone='gold', print_raw=False, print_results=True, print_counts=False, + **kwargs): + """Args: + gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")? + print_raw - Print raw counts (pred, gold, aligned, correct) for each sentence. + This is useful for bootstrap resampling post-processing to get confidence intervals. + The parameter print_raw specifies a given metric + (UAS, LAS, MLAS, BLEX, UPOS, XPOS, Feats, Lemma) or is 0 (or False) by default. + print_results - Print a table with overall results after all document are processed. + print_counts - Print counts of correct/gold/system instead of prec/rec/f1 for all metrics. 
+ """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.total_count = Counter() + self.print_raw = print_raw + self.print_results = print_results + self.print_counts = print_counts + + def _ufeats(self, feats): + return '|'.join(sorted(x for x in feats.split('|') if x.split('=', 1)[0] in UNIV_FEATS)) + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + pred_forms = [n.form.lower() for n in pred_nodes] + gold_forms = [n.form.lower() for n in gold_nodes] + matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False) + aligned = [] + for diff in matcher.get_opcodes(): + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit == 'equal': + aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi])) + align_map, feats_match = {tree: gold_tree}, {} + for p_node, g_node in aligned: + align_map[p_node] = g_node + feats_match[p_node] = self._ufeats(str(p_node.feats)) == self._ufeats(str(g_node.feats)) + + count = Counter() + count['pred'] = len(pred_nodes) + count['gold'] = len(gold_nodes) + count['Words'] = len(aligned) + count['pred_cont'] = len([n for n in pred_nodes if n.udeprel in CONTENT]) + count['gold_cont'] = len([n for n in gold_nodes if n.udeprel in CONTENT]) + count['alig_cont'] = len([n for _, n in aligned if n.udeprel in CONTENT]) + + for p_node, g_node in aligned: + count['UPOS'] += 1 if p_node.upos == g_node.upos else 0 + count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0 + count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0 + count['UFeats'] += 1 if feats_match[p_node] else 0 + if feats_match[p_node] and p_node.upos == g_node.upos and p_node.xpos == g_node.xpos: + count['AllTags'] += 1 + if align_map.get(p_node.parent) == g_node.parent and not p_node.misc['Rehanged']: + count['UAS'] += 1 + if p_node.udeprel == g_node.udeprel: + 
count['LAS'] += 1 + if g_node.udeprel in CONTENT: + count['CLAS'] += 1 + if g_node.lemma == '_' or g_node.lemma == p_node.lemma: + count['BLEX'] += 1 + if self._morpho_match(p_node, g_node, align_map, feats_match): + if not p_node.misc['FuncChildMissing']: + count['MLAS'] += 1 + self.total_count.update(count) + + if self.print_raw: + if self.print_raw in {'CLAS', 'BLEX', 'MLAS'}: + scores = [str(count[s]) for s in ('pred_cont', 'gold_cont', 'alig_cont', + self.print_raw)] + else: + scores = [str(count[s]) for s in ('pred', 'gold', 'Words', self.print_raw)] + print(' '.join(scores)) + + def _morpho_match(self, p_node, g_node, align_map, feats_match): + if p_node.upos != g_node.upos or not feats_match[p_node]: + return False + p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL and not c.misc['Rehanged']] + g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL] + if len(p_children) != len(g_children): + return False + for p_child, g_child in zip(p_children, g_children): + if align_map.get(p_child) != g_child: + return False + if p_child.udeprel != g_child.udeprel: + return False + if p_child.upos != g_child.upos or not feats_match[p_child]: + return False + return True + + def process_end(self): + if not self.print_results: + return + + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + metrics = ('Words', 'UPOS', 'XPOS', 'UFeats', 'AllTags', + 'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX') + if self.print_counts: + print("Metric | Correct | Gold | Predicted | Aligned") + else: + print("Metric | Precision | Recall | F1 Score | AligndAcc") + print("-----------+-----------+-----------+-----------+-----------") + for metric in metrics: + correct = self.total_count[metric] + if metric in {'CLAS', 'BLEX', 'MLAS'}: + pred, gold = self.total_count['pred_cont'], self.total_count['gold_cont'] + alig = self.total_count['alig_cont'] + else: + pred, gold = self.total_count['pred'], 
self.total_count['gold'] + alig = self.total_count['Words'] + if self.print_counts: + print("{:11}|{:10} |{:10} |{:10} |{:10}".format( + metric, correct, gold, pred, alig)) + else: + precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig) + alignacc = "{:10.2f}".format(100 * alignacc) if metric != 'Words' else "" + print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( + metric, 100 * precision, 100 * recall, 100 * fscore, alignacc)) + + +def prec_rec_f1(correct, pred, gold, alig=0): + precision = correct / pred if pred else 0 + recall = correct / gold if gold else 0 + alignacc = correct / alig if alig else 0 + fscore = 2 * correct / (pred + gold) if pred + gold else 0 + return precision, recall, fscore, alignacc + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dir_results", "-d", default="results", help="directory with results") + parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples") + parser.add_argument("--confidence", "-c", default=95, type=int, help="use x-percent confidence interval") + parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets") + parser.add_argument("--systems", "-s", default='all', help="comma-separated systems") + parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time") + args = parser.parse_args() + res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence + alpha = (1 - conf/100) / 2 + index_lo = int(alpha * (resamples - 1)) + index_hi = resamples - 1 - index_lo + index_mid = int(resamples / 2) + if args.systems == 'all': + systems = os.listdir(res_dir) + else: + systems = args.systems.split(',') + if args.tests == 'all': + tests = set() + for system in systems: + tests.update(os.listdir(res_dir + '/' + system)) + tests = sorted(tests) + else: + tests = args.tests.split(',') + if args.randseed: + random.seed(args.randseed) + results = [] + + print('Loading...', file=sys.stderr) + 
for system in systems: + sys_results = [] + results.append(sys_results) + for i_test, test in enumerate(tests): + filename = '/'.join((res_dir, system, test)) + try: + with open(filename) as res_file: + sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file]) + except FileNotFoundError: + logging.warning(filename + ' not found') + samples = len(sys_results) + + print('Resampling...', file=sys.stderr) + boot_results = [] + for i_resample in range(resamples): + print(i_resample + 1, file=sys.stderr, end='\r') + resample_results = [] + boot_results.append(resample_results) + for i_system in range(len(systems)): + pred, gold, words, correct = ([0] * len(tests) for _ in range(4)) + for _ in range(samples): + i_test, pre, gol, wor, corr = random.choice(results[i_system]) + pred[i_test] += pre + gold[i_test] += gol + words[i_test] += wor + correct[i_test] += corr + fscore_sum = 0 + for i_test in range(len(tests)): + _prec, _rec, fscore, _aligacc = prec_rec_f1(correct[i_test], pred[i_test], gold[i_test]) + fscore_sum += fscore + resample_results.append(fscore_sum / len(tests)) + print('\n', file=sys.stderr) + + sys_fscores = [] + for i_system, system in enumerate(systems): + sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)]) + final_results = [] + sys_sys_wins = [[0] * len(systems) for x in range(len(systems))] + for i_system, system in enumerate(systems): + for j_system in range(i_system): + for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]): + if i > j: + sys_sys_wins[i_system][j_system] += 1 + elif i < j: + sys_sys_wins[j_system][i_system] += 1 + fscores = sorted(sys_fscores[i_system]) + final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]]) + + sorted_systems = sorted(final_results, key=lambda x: -x[1]) + for rank, sys_results in enumerate(sorted_systems): + i_system, f1_mid, f1_lo, f1_hi = sys_results + if rank < len(systems) - 1: + j_worse_sys = 
sorted_systems[rank + 1][0] + p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1) + p_str = " p=%.3f" % p_value + else: + p_value, p_str = 1, "" + print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" % + (rank + 1, systems[i_system], + 100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str)) + if p_value < (1 - conf/100): + print('-' * 60) + + +if __name__ == "__main__": + main() diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py new file mode 100644 index 00000000..e4889770 --- /dev/null +++ b/udapi/block/eval/f1.py @@ -0,0 +1,230 @@ +"""Block eval.F1 for evaluating differences between sentences with P/R/F1. + +``eval.F1 zones=en_pred gold_zone=en_gold details=0`` +prints something like:: + + predicted = 210 + gold = 213 + correct = 210 + precision = 100.00% + recall = 98.59% + F1 = 99.29% + +``eval.F1 gold_zone=y attributes=form,upos focus='(?i:an?|the)_DET' details=4`` +prints something like:: + + === Details === + token pred gold corr prec rec F1 + the_DET 711 213 188 26.44% 88.26% 40.69% + The_DET 82 25 19 23.17% 76.00% 35.51% + a_DET 0 62 0 0.00% 0.00% 0.00% + an_DET 0 16 0 0.00% 0.00% 0.00% + === Totals === + predicted = 793 + gold = 319 + correct = 207 + precision = 26.10% + recall = 64.89% + F1 = 37.23% + +This block finds differences between nodes of trees in two zones +and reports the overall precision, recall and F1. +The two zones are "predicted" (on which this block is applied) +and "gold" (which needs to be specified with parameter ``gold``). + +This block also reports the number of total nodes in the predicted zone +and in the gold zone and the number of "correct" nodes, +that is predicted nodes which are also in the gold zone. +By default two nodes are considered "the same" if they have the same ``form``, +but it is possible to check also for other nodes' attributes +(with parameter ``attributes``). 
+ +As usual:: + + precision = correct / predicted + recall = correct / gold + F1 = 2 * precision * recall / (precision + recall) + +The implementation is based on finding the longest common subsequence (LCS) +between the nodes in the two trees. +This means that the two zones do not need to be explicitly word-aligned. +""" +from collections import Counter +import logging +import re + +from udapi.core.basewriter import BaseWriter + +# pylint: disable=too-many-instance-attributes,invalid-name + + +class F1(BaseWriter): + """Evaluate differences between sentences (in different zones) with P/R/F1. + + Args: + zones: Which zone contains the "predicted" trees? + Make sure that you specify just one zone. + If you leave the default value "all" and the document contains more zones, + the results will be mixed, which is most likely not what you wanted. + Exception: If the document conaints just two zones (predicted and gold trees), + you can keep the default value "all" because this block + will skip comparison of the gold zone with itself. + + gold_zone: Which zone contains the gold-standard trees? + + attributes: comma separated list of attributes which should be checked + when deciding whether two nodes are equivalent in LCS + + focus: Regular expresion constraining the tokens we are interested in. + If more attributes were specified in the ``attributes`` parameter, + their values are concatenated with underscore, so ``focus`` should reflect that + e.g. ``attributes=form,upos focus='(a|the)_DET'``. + For case-insensitive focus use e.g. ``focus='(?i)the'`` + (which is equivalent to ``focus='[Tt][Hh][Ee]'``). + + details: Print also detailed statistics for each token (matching the ``focus``). + The value of this parameter ``details`` specifies the number of tokens to include. + The tokens are sorted according to the sum of their *predicted* and *gold* counts. 
+ """ + + def __init__(self, gold_zone, attributes='form', focus=None, details=4, **kwargs): + """Create the eval.F1 block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attrs = attributes.split(',') + self.focus = None + if focus is not None: + self.focus = re.compile(focus) + self.details = details + self.correct, self.pred, self.gold = 0, 0, 0 + self.visited_zones = Counter() + if details: + self._common = Counter() + self._pred = Counter() + self._gold = Counter() + self._total = Counter() + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + self.visited_zones[tree.zone] += 1 + + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants] + + # lcs("abc", "acb") can be either "ab" or "ac". + # We want to prefer the LCS with the highest number of non-focused tokens. + # E.g. if focus="," then lcs("a,c", "ac,") should be "ac" and the comma should be evaluated + # as non-aligned, i.e. eval.F1 should return precision=recall=f1=0 for this sentence. 
+ if self.focus is None: + common = find_lcs(pred_tokens, gold_tokens) + else: + nf_pred_tokens = [x for x in pred_tokens if not self.focus.fullmatch(x)] + nf_gold_tokens = [x for x in gold_tokens if not self.focus.fullmatch(x)] + nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens) + i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] + while i < len(pred_tokens) and j < len(gold_tokens): + if c == len(nf_common): + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) + break + while nf_common[c] != pred_tokens[i]: + un_pred.append(pred_tokens[i]) + i += 1 + while nf_common[c] != gold_tokens[j]: + un_gold.append(gold_tokens[j]) + j += 1 + common += find_lcs(un_pred, un_gold) + un_pred, un_gold = [], [] + while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]: + i, j, c = i+1, j+1, c+1 + common = [x for x in common if self.focus.fullmatch(x)] + pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] + gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] + + self.correct += len(common) + self.pred += len(pred_tokens) + self.gold += len(gold_tokens) + + if self.details: + for x in common: + self._common[x] += 1 + for x in gold_tokens: + self._gold[x] += 1 + self._total[x] += 1 + for x in pred_tokens: + self._pred[x] += 1 + self._total[x] += 1 + + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + if not self.visited_zones: + logging.warning('Block eval.F1 was not applied to any zone. ' + 'Check the parameter zones=%s', self.zones) + elif len(self.visited_zones) > 1: + logging.warning('Block eval.F1 was applied to more than one zone %s. ' + 'The results are mixed together. 
Check the parameter zones=%s', + list(self.visited_zones.elements()), self.zones) + print('Comparing predicted trees (zone=%s) with gold trees (zone=%s), sentences=%d' + % (next(self.visited_zones.elements()), self.gold_zone, + self.visited_zones.most_common(1)[0][1])) + if self.details: + print('=== Details ===') + print('%-10s %5s %5s %5s %6s %6s %6s' + % ('token', 'pred', 'gold', 'corr', 'prec', 'rec', 'F1')) + tokens = self._total.most_common(self.details) + for token, _ in tokens: + _prec = self._common[token] / (self._pred[token] or 1) + _rec = self._common[token] / (self._gold[token] or 1) + _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1) + print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%' + % (token, self._pred[token], self._gold[token], self._common[token], + 100 * _prec, 100 * _rec, 100 * _f1)) + print('=== Totals ===') + + print("%-9s = %7d\n" * 3 + % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='') + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + f1 = 2 * precision * recall / ((precision + recall) or 1) + print("%-9s = %6.2f%%\n" * 3 + % ('precision', 100 * precision, 'recall', 100 * recall, 'F1', 100 * f1), end='') + + +# difflib.SequenceMatcher does not compute LCS, so let's implement it here +def find_lcs(x, y): + """Find longest common subsequence.""" + m, n = len(x), len(y) + if m == 0 or n == 0: + return [] + elif x[0] == y[0]: + i = 1 + while i < min(m, n) and x[i] == y[i]: + i += 1 + return x[:i] + (find_lcs(x[i:], y[i:]) if i < min(m, n) else []) + else: + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m - 1] == y[n - 1]: + lcs[index - 1] = x[m - 1] + m, n, index = m - 1, n - 1, index - 1 + elif C[m - 1][n] > C[m][n 
- 1]: + m -= 1 + else: + n -= 1 + return lcs diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py new file mode 100644 index 00000000..6923c1fb --- /dev/null +++ b/udapi/block/eval/parsing.py @@ -0,0 +1,40 @@ +"""Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens.""" +from udapi.core.basewriter import BaseWriter + + +class Parsing(BaseWriter): + """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" + + def __init__(self, gold_zone, **kwargs): + """Create the eval.Parsing block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.correct_las, self.correct_ulas, self.correct_uas, self.total = 0, 0, 0, 0 + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + if len(pred_nodes) != len(gold_nodes): + raise ValueError('The sentences do not match (%d vs. %d nodes)' + % (len(pred_nodes), len(gold_nodes))) + + self.total += len(pred_nodes) + for pred_node, gold_node in zip(pred_nodes, gold_nodes): + if pred_node.parent.ord == gold_node.parent.ord: + self.correct_uas += 1 + if pred_node.deprel == gold_node.deprel: + self.correct_las += 1 + if pred_node.udeprel == gold_node.udeprel: + self.correct_ulas += 1 + + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + print('nodes = %d' % self.total) + print('UAS = %6.2f' % (100 * self.correct_uas / self.total)) + print('LAS (deprel) = %6.2f' % (100 * self.correct_las / self.total)) + print('LAS (udeprel) = %6.2f' % (100 * self.correct_ulas / self.total)) diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and 
adposition. +""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? + 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 
'dovnitř+Gen': 'Ill', + 'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? + 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným 
+ 'místo+Loc': 'Sbs', # annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 
'v_jméno+Gen': 'Pur', + 'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 
'kontra+Nom': 'Adv', + 'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'zda': 'Cnd', + # coordinators + 'a': 'Conj', + 'i': 'Conj', + 'ani': 'Nnor', + 'nebo': 'Disj', + 'či': 'Disj', + 'ale': 'Advs', + 'avšak': 'Advs', + 'však': 'Advs', + 'nýbrž': 'Advs', + 'neboť': 'Reas', + 'tedy': 'Cnsq', + 'tak': 'Cnsq' + } + + def process_node(self, node): + """ + Derives a case value from preposition and morphological case. Stores it + as MSFCase in MISC. + """ + # Do not do anything for function words. + # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes) + # because their Case is only agreement feature inherited from the head noun. + if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']: + node.misc['MSFFunc'] = 'Yes' + return + elif node.udeprel in ['det', 'amod']: + node.misc['MSFFunc'] = 'No' + return + else: + node.misc['MSFFunc'] = 'No' + # Get all case markers (adpositions) attached to the current node. + adpositions = [] + for c in node.children: + if c.udeprel == 'case': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + adpositions.append(lemma) + # We assume that all features were copied from FEATS to MISC in mwe.MsfInit. + # They may have been further processed there, so we take the input from there. 
+ msfcase = node.misc['MSFCase'] + if adpositions: + adpostring = '_'.join(adpositions) + caseadpostring = adpostring + '+' + msfcase + if caseadpostring in self.adposmap: + msfcase = self.adposmap[caseadpostring] + else: + logging.warning(f"No Case value found for '{caseadpostring}'.") + msfcase = caseadpostring + # Omer wants to collect cases from both adpositions and subordinators + # but we will consider subordinators only if we do not have any case + # from morphology or adpositions. + if not msfcase: + subordinators = [] + for c in node.children: + if c.udeprel == 'mark': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + subordinators.append(lemma) + if subordinators: + subordstring = '_'.join(subordinators) + if subordstring in self.adposmap: + msfcase = self.adposmap[subordstring] + # To lump coordinators with all the above makes even less sense but for + # the moment we do it. + if not msfcase: + coordinators = [] + for c in node.children: + if c.udeprel == 'cc': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. 
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + coordinators.append(lemma) + if coordinators: + coordstring = '_'.join(coordinators) + if coordstring in self.adposmap: + msfcase = self.adposmap[coordstring] + node.misc['MSFCase'] = msfcase diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py new file mode 100644 index 00000000..fbdf73e5 --- /dev/null +++ b/udapi/block/msf/createabstract.py @@ -0,0 +1,45 @@ +""" +Morphosyntactic features (UniDive): +Create abstract nodes representing dropped arguments of predicates (if verbal +morphology signals that the subject is third person singular, and there is no +subject node, create an abstract node and copy the features there). +""" +from udapi.core.block import Block +import re + +class CreateAbstract(Block): + + def process_node(self, node): + """ + If a node has MSFVerbForm=Fin and at least one of the agreement features + MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these + features characterize the subject (this block is not suitable for languages + with polypersonal agreement). Check that the subject is present. If not, + create an abstract node to represent it. + """ + if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]): + # Current node is a finite predicate. Does it have a subject? If not, create an abstract one. + if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]): + # There could already be an abstract subject. We have to look for it in the enhanced graph. + if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]): + # Create an abstract subject. 
+ subject = node.create_empty_child('nsubj') + subject.upos = 'PRON' + subject.feats['PronType'] = 'Prs' + subject.misc['MSFPronType'] = 'Prs' + subject.feats['Case'] = 'Nom' + subject.misc['MSFCase'] = 'Nom' + for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']: + msf = 'MSF' + f + if node.misc[msf]: + subject.feats[f] = node.misc[msf] + subject.misc[msf] = node.misc[msf] + subject.misc['MSFFunc'] = 'No' + # Regardless of whether it had a subject or not, the agreement features + # should be removed from the verb. + ###!!! We also may want to check if the pre-existing subject has all the features. + node.misc['MSFNumber'] = '' + node.misc['MSFPerson'] = '' + node.misc['MSFGender'] = '' + node.misc['MSFAnimacy'] = '' + node.misc['MSFPolite'] = '' diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py new file mode 100644 index 00000000..ceca12af --- /dev/null +++ b/udapi/block/msf/init.py @@ -0,0 +1,53 @@ +""" +Morphosyntactic features (UniDive): +Initialization. Copies features from FEATS as MSF* attributes to MISC. +""" +from udapi.core.block import Block +import re + +class Init(Block): + + + def process_node(self, node): + """ + For every feature in FEATS, creates its MSF* counterpart in MISC. + """ + for f in node.feats: + # Only selected features will be copied. Certain features are not + # interesting for the morphosyntactic annotation. + if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']: + node.misc['MSF'+f] = node.feats[f] + # We are particularly interested in the Case feature but some nominals + # lack it (e.g. acronyms or numbers). If there is a preposition, it may + # indicate the expected case of the nominal. + if not node.feats['Case']: + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. 
+ adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + node.misc['MSFCase'] = adpositions[0].feats['Case'] + # If we did not find a preposition to help us, we may be able to read + # the case off an adjectival modifier or determiner. + if not node.misc['MSFCase']: + modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']] + if modifiers: + node.misc['MSFCase'] = modifiers[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if not node.misc['MSFCase']: + if node.udeprel == 'nsubj': + node.misc['MSFCase'] = 'Nom' + elif node.udeprel == 'obj': + node.misc['MSFCase'] = 'Acc' + # If the node contains Phrase features in MISC (periphrastic verb forms + # detected by Lenka's code), replace the MS features with them. + phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)] + for pf in phrasefeatures: + msf = pf + if msf == 'PhraseForm': + msf = 'MSFVerbForm' + else: + msf = re.sub(r"Phrase", 'MSF', pf) + node.misc[msf] = node.misc[pf] + node.misc[pf] = '' diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py new file mode 100644 index 00000000..22f68c9d --- /dev/null +++ b/udapi/block/msf/numphrase.py @@ -0,0 +1,36 @@ +""" +Morphosyntactic features (UniDive): +Case in Number Phrases like 'pět mužů' (five men) in Czech. +""" +from udapi.core.block import Block + +class NumPhrase(Block): + + + def process_node(self, node): + """ + Nouns with a 'nummod:gov' dependent are morphologically in genitive, + but the case of the whole phrase (number + counted noun) is different, + probably nominative or accusative. 
+ """ + quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']] + current_case = node.misc['MSFCase'] + if (current_case == 'Gen' or current_case == '') and quantifiers: + quantifier_case = quantifiers[0].misc['MSFCase'] + # The quantifier may lack the case feature (e.g. numbers expressed by digits) + # but we may be able to guess it from a preposition or other factors. + if quantifier_case == '': + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + quantifier_case = adpositions[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if quantifier_case == '': + if node.udeprel == 'nsubj': + quantifier_case = 'Nom' + elif node.udeprel == 'obj': + quantifier_case = 'Acc' + node.misc['MSFCase'] = quantifier_case diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py new file mode 100644 index 00000000..cf5a8f81 --- /dev/null +++ b/udapi/block/msf/phrase.py @@ -0,0 +1,168 @@ +""" +Morphosyntactic features (UniDive): +An abstract block as a base for derivation of blocks that discover periphrastic +verb forms and save them as Phrase features in MISC. This block provides the +methods that save the features in MISC. It is based on the Writer module by +Lenka Krippnerová. +""" +from udapi.core.block import Block +import logging + +class Phrase(Block): + + def __init__(self, feature_prefix='CW', **kwargs): + """ + Parameters: + feature_prefix (string) - The prefix of phrase features (e. g. 
'CW', 'Phrase'), default is 'CG' + """ + super().__init__(**kwargs) + self.feature_prefix = feature_prefix + + self.dictionary = { + 'person': f'{feature_prefix}Person', + 'number': f'{feature_prefix}Number', + 'mood': f'{feature_prefix}Mood', + 'tense': f'{feature_prefix}Tense', + 'voice': f'{feature_prefix}Voice', + 'aspect':f'{feature_prefix}Aspect', + 'form': f'{feature_prefix}Form', + 'reflex': f'{feature_prefix}Reflex', + 'polarity': f'{feature_prefix}Polarity', + 'gender': f'{feature_prefix}Gender', + 'animacy': f'{feature_prefix}Animacy', + 'ords': feature_prefix, + 'expl': f'{feature_prefix}Expl', + 'analytic': 'Analytic', + } + + # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation + # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. + self.negation_fixed = { + # Belarusian + 'ні' : ['раз'], + 'ня' : ['толькі'], + + # Upper Sorbian + 'nic' : ['naposledku'], + + # Polish + 'nie' : ['mało'], + + # Pomak + 'néma' : ['kak'], + + # Slovenian + 'ne' : ['le'], + + # Russian and Old East Slavic + 'не' : ['то', 'токмо'], + 'ни' : ['в', 'раз', 'шатко'], + 'нет' : ['нет'] + } + + def process_node(self, node): + """ + Override this in a derived class! 
+ """ + logging.fatal('process_node() not implemented.') + + + + def write_node_info(self, node, + tense = None, + person = None, + number = None, + mood = None, + voice = None, + form = None, + reflex = None, + polarity = None, + ords = None, + gender = None, + animacy = None, + aspect = None, + expl=None, + analytic=None): + arguments = locals() + del arguments['self'] # delete self and node from arguments, + del arguments['node'] # we want only grammatical categories + for key,val in arguments.items(): + if val != None: + node.misc[self.dictionary[key]] = val + + def has_fixed_children(self, node): + """ + Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed. + """ + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + + if fixed_children: + if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []): + return True + return False + + def get_polarity(self, nodes): + """ + Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes. + Returns an empty string if there are zero or more than one such nodes. + """ + neg_count = 0 + for node in nodes: + if node.feats['Polarity'] == 'Neg': + neg_count += 1 + + if neg_count == 1: + return 'Neg' + + # neg_count can be zero or two, in either case we want to return an empty string so that the PhrasePolarity attribute is not generated + else: + return '' + + def get_negative_particles(self, nodes): + """ + Returns a list of all negative particles found among the children + of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed. 
+ """ + neg_particles = [] + for node in nodes: + neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)] + if neg: + neg_particles += neg + return neg_particles + + + def get_is_reflex(self,node,refl): + if node.feats['Voice'] == 'Mid': + return 'Yes' + if len(refl) == 0: + return node.feats['Reflex'] + return 'Yes' + + def get_expl_type(self,node, refl): + if node.feats['Voice'] == 'Mid': + return 'Pv' + if not refl: + return '' + if refl[0].deprel == 'expl': + return 'Pv' + return refl[0].deprel.split(':')[1].capitalize() + + def is_expl_pass(self,refl): + if len(refl) == 0: + return False + return refl[0].deprel == 'expl:pass' + + def get_voice(self,node,refl): + voice = node.feats['Voice'] + if self.is_expl_pass(refl): + return 'Pass' + return voice + + def get_analytic_bool(self,node): + auxes = [x for x in node.children if x.udeprel == 'aux'] + + if auxes: + return 'Yes' + else: + return 'No' + diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py new file mode 100644 index 00000000..e169a2de --- /dev/null +++ b/udapi/block/msf/removefunc.py @@ -0,0 +1,17 @@ +""" +Morphosyntactic features (UniDive): +Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes). +""" +from udapi.core.block import Block + +class RemoveFunc(Block): + + + def process_node(self, node): + """ + Removes MSF* features if MSFFunc=Yes. 
+ """ + if node.misc['MSFFunc'] == 'Yes': + msfeats = [x for x in node.misc if x.startswith('MSF')] + for msf in msfeats: + node.misc[msf] = '' diff --git a/udapi/block/msf/romance/preprocessor.py b/udapi/block/msf/romance/preprocessor.py new file mode 100644 index 00000000..ad7aec1e --- /dev/null +++ b/udapi/block/msf/romance/preprocessor.py @@ -0,0 +1,20 @@ +from udapi.core.block import Block + +class Preprocessor(Block): + + + def process_node(self, node): + + # In Porttinari treebank, the negative adverb não is not marked with feat Polarity=Neg + if node.lemma == 'não' and node.upos == 'ADV': + node.feats['Polarity'] = 'Neg' + + if node.upos == 'ADV' and node.feats['PronType'] == 'Neg': + node.feats['PronType'] = '' + node.feats['Polarity'] = 'Neg' + + # In Romanian RRT treebank, there is no annotation of the voice feature + # Automatically assign passive voice + pass_auxes = [x for x in node.children if x.deprel == 'aux:pass'] + if pass_auxes: + node.feats['Voice'] = 'Pass' \ No newline at end of file diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py new file mode 100644 index 00000000..ed05fa89 --- /dev/null +++ b/udapi/block/msf/romance/romance.py @@ -0,0 +1,965 @@ +import udapi.block.msf.phrase +from enum import Enum + +AUXES_HAVE = ['ter', 'haber', 'avere'] +AUXES_BE = ['estar', 'essere'] +MODALS = ['poder', 'deber', 'querer', 'saber', # Spanish + Portuguese + 'potere', 'dovere', 'volere', 'sapere'] # Italian + +class Aspect(str, Enum): + ANT = 'Ant' + IMP = 'Imp' + IMPPROG = 'ImpProg' + PERF = 'Perf' + PERFPROG = 'PerfProg' + PROG = 'Prog' + PQP = 'Pqp' + PQPPROG = 'PqpProg' + +class Tense(str, Enum): + FUT = 'Fut' + FUTFUT = 'FutFut' + PAST = 'Past' + PASTFUT = 'PastFut' + PASTPRES = 'PastPres' + PRES = 'Pres' + +class Romance(udapi.block.msf.phrase.Phrase): + + def __init__(self, neg=True, **kwargs): + """ + Parameters: + neg (bool) - If True, process negation and generate the PhrasePolarity=Neg attribute. 
+ feature_prefix (string) - The prefix of phrase features (e. g. 'CG', 'Phrase'), default is 'CG' + """ + super().__init__(**kwargs) + self.neg = neg + + def process_node(self, node): + + if node.misc[self.feature_prefix] != '': + return + + cop = [x for x in node.children if x.udeprel == 'cop'] + + # only expl or expl:pv, no expl:impers or expl:pass + refl = [x for x in node.children if (x.lemma == 'se' or x.lemma == 'soi') and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass'] + + if refl: + expl='Pv' + else: + expl=None + + if cop: + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modal verbs among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + adp = [x for x in node.children if x.upos == 'ADP'] + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(cop[0], auxes, expl, polarity, phrase_ords, node) + else: + # no auxiliaries, only cop + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [c.ord for c in cop] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in adp] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_copulas(node, cop, expl, polarity, phrase_ords) + return + + if node.upos == 'VERB': #TODO maybe add "or node.feats['VerbForm'] == 'Part'"? 
+ + # find auxiliary verbs, modal verbs, and auxiliary verbs related to modals among the children of the content verb and separate them from each other + auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node) + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # infinitive with a subject is a subjunctive + subj = [x for x in node.children if x.udeprel == 'subj'] + if node.feats['VerbForm'] == 'Inf' and subj: + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + mood='Sub', + form='Fin', + tense=Tense.FUT.value, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + analytic=self.get_analytic_bool(node), + ords=[node.ord] + ) + return + + if modals: + # we consider modals themselves to be separate verb forms + self.process_modal_verbs(modals, modal_auxes, modal_neg) + + if not auxes: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node) + + + else: + # no passive auxiliaries + if not aux_pass: + polarity = '' + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(node, auxes, expl, polarity, phrase_ords, node) + + # head verb has only passive auxiliary and no more other auxiliaries + elif not auxes_without_pass: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + 
if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # TODO phrase-level features are currently determined based on the first passive auxiliary, but it can happen that there are more than one passive auxiliary + self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node) + + # head verb has passive auxiliary and also other auxiliaries + else: + polarity = '' + + if self.neg is True: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg] + if neg: + polarity = 'Neg' + else: + phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, expl, polarity, phrase_ords, node) + + def find_auxes_and_neg(self, node): + """ + Find all auxiliaries and negative adverbials among node.children and classifies them. + + Parameters: + node (udapi.core.node.Node): head word, look for auxiliaries in its children + + Returns: + tuple: a classification of auxiliaries consisting of: + - auxiliaries directly modifying the node, + - negative adverbs modifying the node, + - modal verbs, + - auxiliaries modifying a modal verb, + - negative adverbs modifying a modal verb. 
+ """ + + node_auxes = [] + node_neg = [] + modals = [] + modal_auxes = [] + modal_neg = [] + + for child in node.children: + if child.udeprel == 'aux': + if child.lemma in MODALS: + modals.append(child) + modal_auxes = node_auxes # auxiliaries found so far are assumed to modify the modal verb (they come before it) + node_auxes = [] + + modal_neg = node_neg + node_neg = [] + + else: + node_auxes.append(child) + + elif child.upos == 'ADV' and child.feats['Polarity'] == 'Neg': + node_neg.append(child) + + return node_auxes, node_neg, modals, modal_auxes, modal_neg + + def process_modal_verbs(self, modals, modal_auxes, modal_neg): + """ + Annotates modal verb forms with the Phrase* attributes. + The modal verbs are kept as a single verb form, without including the infinitive of the content word. + + Parameters: + modals (list): all modal verbs among the children of the head content verb (currently assumes there is only one.) + modal_auxes (list): auxiliaries of the modal verb(s) + modal_neg (list): negative adverbs of the modal verb(s) + + """ + if not modal_auxes: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [n.ord for n in modal_neg] + phrase_ords.sort() + + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0]) + self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0]) + + else: + polarity = '' + if self.neg is True: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + [n.ord for n in modal_neg] + if modal_neg: + polarity='Neg' + else: + phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + phrase_ords.sort() + + self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0]) + + def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node): + aspect = '' + tense = node.feats['Tense'] + + # phrase already annotated + if 
head_node.misc[self.feature_prefix] != '': + return + + xcomps = [x for x in node.children if x.udeprel == 'xcomp'] + if node.lemma in ['ir', 'aller', 'estar', 'ter'] and node.upos == 'VERB' and xcomps: + node.misc['PeriAux'] = 'Yes' + + voice = node.feats['Voice'] + auxes = [x for x in xcomps[0].children if x.udeprel == 'aux'] + aux_pass = [x for x in auxes if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass'] + + # European Portuguese: estar + a + Inf + if node.lemma == 'estar': + + if node.feats['Tense'] == 'Pres': + tense=Tense.PRES.value + aspect =Aspect.PROG.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + elif node.lemma == 'ter' and len(xcomps) > 1: + tense=Tense.PAST.value + aspect=Aspect.PROG.value + xcomps[0].misc['PeriAux'] = 'Yes' + + elif node.feats['Tense'] == 'Pres': + tense=Tense.FUT.value + + elif node.feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + elif node.feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + + elif node.feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + if auxes_without_pass: + if auxes[0].lemma == 'estar': + aspect += 'Prog' + if auxes[0].lemma == 'haber': + aspect += 'Perf' + + + + adp_a = [x for x in xcomps[-1].children if x.lemma == 'a' and x.udeprel == 'mark'] + cop = [x for x in xcomps[0].children if x.udeprel == 'cop'] + phrase_ords = [node.ord] + [x.ord for x in xcomps] + [x.ord for x in auxes] + [x.ord for x in cop] + if adp_a: + phrase_ords += [x.ord for x in adp_a] + + if aux_pass: + voice='Pass' + + phrase_ords.sort() + + self.write_node_info(xcomps[-1], + tense = tense, + number = node.feats['Number'], + person = node.feats['Person'], + aspect = aspect, + mood = 
node.feats['Mood'], + form = 'Fin', + voice=voice, + expl = expl, + polarity = polarity, + analytic='Yes', + ords=phrase_ords) + return + + def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node): + """ + Annotate simple verb forms or passive verb forms that contain only a passive auxiliary. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words of the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. + """ + + if node.misc['PeriAux'] != '': + return + + # Portuguese + # presente -> PhraseTense=Pres, PhraseAspect='' + # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro simple -> PhraseTense=Fut, PhraseAspect='' + + # Italian + # presente -> PhraseTense=Pres, PhraseAspect='' + # futuro semplice -> PhraseTense=Fut, PhraseAspect='' + + aspect = '' + tense = node.feats['Tense'] + form = node.feats['VerbForm'] + + if node.feats['Mood'] == 'Ind': + + # Portuguese + # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf + + # Spanish + # pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # pass remoto -> PhraseTense=Past, PhraseAspect=Perf + elif node.feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + + # Portuguese + # pretérito mais que perfeito 
simples -> PhraseTense=Past, PhraseAspect=Pqp + elif node.feats['Tense'] == 'Pqp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + else: + # viitorul popular/colocvial (intentional future) -> PhraseTense=Fut, PhraseAspect='' + o = [x for x in node.children if x.lemma == 'o' and x.upos == 'PART'] + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + + if o and sa: + tense = Tense.FUT.value + phrase_ords.append(o[0].ord) + phrase_ords.append(sa[0].ord) + + phrase_ords.sort() + + + + # Portuguese + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' + + # Spanish + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks? + + # Italian + # Congiuntivo presente -> PhraseTense=Pres, PhraseAspect='' + if node.feats['Mood'] == 'Sub': + + if node.feats['Tense'] == 'Past': + aspect=Aspect.IMP.value + + # Portuguese + # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + + # Spanish + # Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp + + # Italian + # Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Portuguese + # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + if node.feats['Mood'] == 'Cnd': + aspect='' + tense=Tense.PRES.value + + adp_en = [x for x in head_node.children if x.upos == 'ADP' and x.lemma == 'en' and x.udeprel == 'mark'] + if node.feats['VerbForm'] == 'Part' and adp_en: + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form = 'Ger' + + + self.write_node_info(head_node, + person=node.feats['Person'], + aspect=aspect, + 
number=node.feats['Number'], + mood=node.feats['Mood'], + form=form, + tense=tense, + gender=head_node.feats['Gender'], + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic=self.get_analytic_bool(head_node), + ords=phrase_ords + ) + + def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_ords, head_node): + """ + Annotate periphrastic verb forms with the Phrase* attributes. + + Parameters + node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary. + auxes (list[udapi.core.node.Node]): All auxiliaries except the passive auxiliaries. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. + head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase. 
+ """ + + # phrase already annotated + if head_node.misc[self.feature_prefix] != '': + return + + if len(auxes) == 1: + # Cnd + if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'): + + # Portuguese + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Portuguese + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Spanish + # Antepospretérito -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + + # Italian + # Condizionale passato -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + polarity=polarity, + voice=head_node.feats['Voice'], + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + 
tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if node.feats['VerbForm'] == 'Ger': + + # Portuguese + Spanish + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Portuguese + Spanish + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # Portuguese + Spanish + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Auxiliary 'ter' / 'haber' / 'avere' / 'essere' followed by a participle + if node.feats['VerbForm'] == 'Part': + + # Portuguese + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + + # Spanish + # Futuro compuesto antefuturo -> PhraseTense=Fut, PhraseAspect=Perf + + # Italian + # Futuro anteriore -> 
PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + form='Fin' + mood=auxes[0].feats['Mood'] + + adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark'] + if auxes[0].feats['VerbForm'] == 'Part' and adp_en: + tense=Tense.PAST.value + aspect='' + phrase_ords.append(adp_en[0].ord) + phrase_ords.sort() + form='Ger' + + + # Romanian + # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].lemma == 'avea': + tense = Tense.PAST.value + aspect = Aspect.PERF.value + form = 'Fin' + + # Spanish + # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf + + # Italian + # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Pres': + + # Portuguese + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub': + tense = Tense.PASTPRES.value + + # subjonctive mood not annotated in Romanian data + if auxes[0].lemma == 'fi': + mood='Sub' + else: + tense=Tense.PAST.value + + # Portuguese + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + + # Spanish + # pretérito pluscuamperfecto -> PhraseTense=Past, PhraseAspect=Pqp + + # Italian + # Trapassato prossimo -> PhraseTense=Past, PhraseAspect=Pqp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # Spanish + # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant + + # Italian + # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant + + # French + # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant + elif auxes[0].feats['Tense'] 
== 'Past': + tense=Tense.PAST.value + aspect = Aspect.ANT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=mood, + aspect=aspect, + form=form, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + return + + # auxiliary 'ir' or 'vrea' followed by infinitive + if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf': + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].lemma == 'vrea': + tense = Tense.FUT.value + aspect = '' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect='' + if auxes[0].lemma == 'avea' and node.feats['VerbForm'] == 'Inf': + tense=Tense.PRES.value + aspect='' + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + 
polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect='' + # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect='' + if node.feats['VerbForm'] == 'Fin': + sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART'] + + if sa: + phrase_ords.append(sa[0].ord) + phrase_ords.sort() + + tense=Tense.FUT.value + aspect='' + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=head_node.feats['Number'], + person=head_node.feats['Person'], + mood=head_node.feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + elif len(auxes) == 2: + # Romanian + # viitor anterior -> PhraseTense=Fut, PhraseAsoect=Perf + if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PERF.value, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # condițional-optativ perfect -> PhraseTense=Past + if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part': + + self.write_node_info(head_node, + tense=Tense.PAST.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood='Cnd', + form='Fin', + aspect='', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + return + + # Portuguese + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] 
== 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords) + + # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar + if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter cond + estar pp + gerund -> PhraseTense=Past, 
PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + aspect=Aspect.PQPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + polarity=polarity, + analytic='Yes', + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, expl, polarity, phrase_ords): + """ + Annotate non-verbal predicates with copula using the Phrase* attributes. + + This method is specialized for non-periphrastic copulas. + If any auxiliaries are present, process_periphrastic_verb_forms() is called instead. + + Parameters + node (udapi.core.node.Node): The non-verbal predicate that should receive the Phrase* attributes, i.e., the head of the phrase. + cop (list[udapi.core.node.Node]): The copula nodes. + expl (str): The value of the PhraseExpl attribute. + polarity (str): The value of the PhrasePolarity attribute. + phrase_ords (list[int]): The ord values of all member words in the verb form. 
+ """ + + # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb) + self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node) + self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node) + + # adjust PhraseAspect based on the lemma of the copula + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + node.misc['PeriAspect'] = Aspect.PERF.value + elif cop[0].lemma == 'estar': + node.misc['PeriAspect'] = Aspect.IMP.value \ No newline at end of file diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..9d15418f --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,97 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' 
in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + person = '3' + + for aux_verb in aux: + if aux_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + for cop_verb in cop: + if cop_verb.feats['Person'] != '': + person=aux_verb.feats['Person'] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=person, + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..32714630 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,94 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl), + analytic=self.get_analytic_bool(node) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass', + analytic=self.get_analytic_bool(node) + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = 
self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + tense=copVerb.feats['Tense'], + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + form='Conv', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + voice=self.get_voice(copVerb, refl), + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py new file mode 100644 index 00000000..9cc17717 --- /dev/null +++ b/udapi/block/msf/slavic/future.py @@ -0,0 +1,207 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects future tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Future(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # future tense for Serbian and Croatian + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')] + if node.upos != 'AUX' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice + cop = [x for x in node.children if x.deprel == 'cop'] + + phrase_nodes = [node] + refl + aux_other + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + if not cop: + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema + form='Fin', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + else: + prep = [x for x in node.children if x.upos == 'ADP'] + phrase_nodes += prep + phrase_ords += [x.ord for x in prep] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + return + + # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense + # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + + if node.feats['Tense'] == 'Pres' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # future tense of perfect verbs + # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data + # in some languages ​​(e.g. in Russian) these verbs have the Tense Fut, in others (e.g. 
in Czech) they have the Tense Pres + if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + form='Fin', + aspect='Perf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + # future tense of imperfect verbs and passive voice + # in some languages ​​the verb is in the infinitive, in some it is in the l-participle + # the condition node.upos == 'ADJ' is due to the passive voice - the n-participle is marked as ADJ, but the auxiliary verb is not cop, but aux + if node.upos == 'VERB' or node.upos == 'ADJ': + + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut'] + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + if aux: + auxVerb = aux[0] + self.write_node_info(node, + tense='Fut', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # simple future tense - e.g. 
in Serbian, the future tense can be formed by combining a verb with a full meaning and an auxiliary verb into one word, i.e. without an auxiliary verb + # or verbs like pojede, půjdeme... in Czech + + if not aux and node.feats['Tense'] == 'Fut': + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut'] + if cop: + copVerb = cop[0] + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Fut', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py new file mode 100644 index 00000000..5a30d05e --- /dev/null +++ b/udapi/block/msf/slavic/imperative.py @@ -0,0 +1,89 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects imperative verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in 
phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Imp', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py new file mode 100644 index 00000000..83bc0766 --- /dev/null +++ b/udapi/block/msf/slavic/infinitive.py @@ -0,0 +1,107 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects infinitive verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Infinitive(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB': + aux = [x for x in node.children if x.udeprel == 'aux'] + if not aux: # the list of auxiliary list must be empty - we don't want to mark infinitives which are part of any other phrase (for example the infinititive is part of the future tense in Czech) + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes == neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 
'Inf'] + if aux and not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Pass', + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + number=node.feats['Number'], + analytic=self.get_analytic_bool(node) + ) + return + + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + if cop and not aux_forb: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + voice=self.get_voice(cop[0], refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + + # there is a rare verb form called supine in Slovenian, it is used instead of infinitive as the argument of motion verbs + if node.feats['VerbForm'] == 'Sup': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Act', + form='Sup', + polarity=self.get_polarity(phrase_nodes), + 
expl=self.get_expl_type(node, refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py new file mode 100644 index 00000000..130d972d --- /dev/null +++ b/udapi/block/msf/slavic/past.py @@ -0,0 +1,212 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects past tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Past(udapi.block.msf.phrase.Phrase): + + def get_person_for_langs_with_simple_past(self, node, person): + """ + returns the person which is known from subject, languages with the simple past tense (e. g. Russian) do not express person in these verb forms + if the person was not taken from the subject, the third person would be filled in automatically due to languages ​​with a compound past but simple forms for the third person (e. g. Czech) + """ + subj = [x for x in node.children if x.udeprel == 'nsubj'] + if subj: + subj = subj[0] + if subj.feats['Person'] != '': + person = subj.feats['Person'] + return person + + def process_node(self, node): + + past_tenses = ['Past', 'Imp', 'Pqp'] + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)] + + # there is person 0 in Polish and Ukrainian which is for impersonal statements + # in Polish, verbs with Person=0 have also Tense=Past, in Ukrainian the tense is not specified + if node.feats['Person'] == '0': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood=node.feats['Mood'], + voice='Act', #In Polish, impersonal statements 
are annotated with Voice=Act. In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + # compound past tense + if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in ['Pres', '']] + aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + aux_pqp + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux: + person = aux[0].feats['Person'] + + elif not aux: + person = '3' + + if aux_pqp: + person = aux_pqp[0].feats['Person'] + + # in Slovenian, the participles are not annotated as Tense='Past', the Tense feature is missing here + # but in Bulgarian, there are cases where the participles are annotated as Tense='Imp' + tense = 'Past' + if node.feats['Tense'] == 'Imp': + tense = 'Imp' + if node.feats['Tense'] == 'Pqp': + tense = 'Pqp' + + self.write_node_info(node, + tense=tense, + person=person, + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + 
gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + # the past tense of some Slavic languages ​​is formed only by a verb without an auxiliary verb (e.g. Polish) + # or imperfect (special case of the past tense) e.g. in Bulgarian or Croatian + elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv': + + # the past tense is formed only by a content verb, not with an auxiliary + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + + if not aux_forb: + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + + + # passive + elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux_past_tense: + aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. 
the auxiliary 'jsem' in the phrase 'byl jsem přinucen' + + phrase_nodes = [node] + aux_past_tense + aux_pres_tense + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_pres_tense: + person = aux_pres_tense[0].feats['Person'] + person = self.get_person_for_langs_with_simple_past(node, person) + + self.write_node_info(node, + tense=aux_past_tense[0].feats['Tense'], + person=person, + number=aux_past_tense[0].feats['Number'], + mood='Ind', + voice='Pass', + form='Fin', + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + + else: + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if cop and not aux_cnd: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux_past_tense + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_past_tense: + person = aux_past_tense[0].feats['Person'] + + # In ru, be, uk, the person is not expressed in past tense and the verbform is Fin, not Part + if cop[0].feats['VerbForm'] == 'Fin': + person = '' + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + tense=cop[0].feats['Tense'], + person=person, + number=cop[0].feats['Number'], + mood='Ind', + voice=self.get_voice(cop[0], refl), + form='Fin', + expl=self.get_expl_type(node,refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + 
gender=cop[0].feats['Gender'], + animacy=cop[0].feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py new file mode 100644 index 00000000..0672812b --- /dev/null +++ b/udapi/block/msf/slavic/preprocessor.py @@ -0,0 +1,83 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block serves as a preprocessor for Slavic languages before the other blocks +are applied to detect periphrastic verb forms. It improves harmonization of +annotations across the treebanks by addressing some known divergences. +""" + +from udapi.core.block import Block + +class Preprocessor(Block): + + def process_node(self,node): + + # in Ukrainian the active verb forms are not marked as PhraseVoice=Act + if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '': + node.feats['Voice'] = 'Act' + + # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ + # we change the UPOS to ADJ when a participle expresses case + #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + # node.upos = 'ADJ' + + # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the other Slavic languages feats['Mood'] == 'Cnd' + if node.deprel == 'aux:cnd': + node.feats['Mood'] = 'Cnd' + + # unify polarities - some languages mark only Neg (Russian), some mark both Neg and Pos (Czech) + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + + # In Ukrainian, there is no explicit annotation of reflexive verbs + # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly + # We add the feature Voice=Mid to reflexive verbs + if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')): + node.feats['Voice'] = 'Mid' + + 
# Macedonian forms the future tense with the auxiliary word ќе, which is not marked in any way as taking part in forming the future tense + # likewise Bulgarian, with the auxiliary word ще + # Macedonian and Bulgarian + if node.feats['Tense'] == 'Pres': + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + if len(aux) == 1: + aux[0].feats['Tense'] = 'Fut' + + # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender + if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc': + subj = [x for x in node.children if x.udeprel == 'nsubj'] + + # for relative pronouns, only one gender is indicated + if len(subj) == 1: + conj = [x for x in subj[0].children if x.deprel == 'conj'] + if len(conj) == 0: + node.feats['Gender'] = subj[0].feats['Gender'] + node.feats['Number'] = subj[0].feats['Number'] + + # participles in passive are sometimes annotated as VERB, sometimes as ADJ + #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + # node.upos = 'ADJ' + + # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (e.g. 
Macedonian treebank) + # we add the Reflex=Yes feature + if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers': + node.feats['Reflex'] = 'Yes' + + # fixing the mistake in Macedonian treebank (mk_mtb-ud-test.conllu), in sent_id=other0010, there is personal pronoun 'ми' marked as expl:pv, it should be iobj + if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs': + node.deprel = '' + node.udeprel = 'iobj' + + # in Old Church Slavonic, there is feature Mood=Sub, but this is a notation for conditional mood + if node.feats['Mood'] == 'Sub': + node.feats['Mood'] = 'Cnd' + + # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation + if node.feats['VerbForm'] == 'Inf': + node.feats['Tense'] = '' + + # in the Russian SynTagRus corpus, the negative particles have no Polarity=Neg feature + if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod': + node.feats['Polarity'] = 'Neg' + + # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech + # but there are many verbs with no Aspect value, so the problem is still there diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py new file mode 100644 index 00000000..7521a08d --- /dev/null +++ b/udapi/block/msf/slavic/present.py @@ -0,0 +1,132 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects present tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + expl=self.get_expl_type(node,refl), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + analytic=self.get_analytic_bool(node) + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if 
x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + expl=self.get_expl_type(node, refl), + polarity=self.get_polarity(phrase_nodes), + analytic=self.get_analytic_bool(node), + ords=phrase_ords + ) diff --git a/udapi/block/mwe/normalize.py b/udapi/block/mwe/normalize.py new file mode 100644 index 00000000..e7ebf24f --- /dev/null +++ b/udapi/block/mwe/normalize.py @@ -0,0 +1,68 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and normalizes it so that the type is always annotated at the first word of + the expression.""" +from udapi.core.block import Block +import logging +import re + +class Normalize(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. 
+ # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves them back but makes sure that the type is annotated at the + first word of the expression (as opposed to the syntactic head or to + any other word). + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for n in nodes: + # Erase the previous MWE annotations so we can start from scratch. + n.misc['Mwe'] = '' + # There may be multiple MWEs this node is member of. + annotations = [] + for m in mwes_by_nodes[n.ord]: + if n.ord == mwes[m]['nodes'][0]: + annotations.append("%s:%s" % (m, mwes[m]['type'])) + else: + annotations.append(m) + if annotations: + n.misc['Mwe'] = ';'.join(annotations) diff --git a/udapi/block/mwe/possessives.py b/udapi/block/mwe/possessives.py new file mode 100644 index 00000000..0849a210 --- /dev/null +++ b/udapi/block/mwe/possessives.py @@ -0,0 +1,74 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC, + looks for dependent possessive pronouns and reports how they are treated.""" +from udapi.core.block import Block +import logging +import re + +class Possessives(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. 
+ The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. + # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then surveys the possessive pronouns. + """ + (mwes, mwes_by_nodes) = self.collect_mwes(root) + nodes = root.descendants + for m in mwes: + mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]] + mweheads = [x for x in mwenodes if not x.parent in mwenodes] + mwedescendantset = set() + for x in mweheads: + mwedescendantset = mwedescendantset.union(set(x.descendants)) + mwedescendants = list(sorted(mwedescendantset)) + # Is there a possessive pronoun? 
+ possprons = [x for x in mwedescendants if x.upos == 'PRON' and x.feats['Poss'] == 'Yes'] + inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]] + outpp = [x for x in possprons if not m in mwes_by_nodes[x.ord]] + observation = '' + if inpp and outpp: + observation = 'both' + elif inpp: + observation = 'in' + elif outpp: + observation = 'out' + if observation: + expression = ' '.join([x.form if m in mwes_by_nodes[x.ord] else '('+x.form+')' for x in mwedescendants]) + print(observation + ': ' + expression) diff --git a/udapi/block/mwe/tosubdeprels.py b/udapi/block/mwe/tosubdeprels.py new file mode 100644 index 00000000..3682c0c7 --- /dev/null +++ b/udapi/block/mwe/tosubdeprels.py @@ -0,0 +1,62 @@ +"""Block that takes PARSEME-like annotation of multiword expressions from MISC + and projects it to subtypes of dependency relation labels. The motivation is + that a parser could learn to predict the multiword expressions.""" +from udapi.core.block import Block +import logging +import re + +class ToSubDeprels(Block): + + def collect_mwes(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + The expected annotation is in the style of Parseme (see + https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download + the data from http://hdl.handle.net/11372/LRT-5124), except that there + are only ten columns and the annotation from the eleventh column is + copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause). + """ + nodes = root.descendants + mwes = {} # for each mwe id, its type and list of node ids + mwes_by_nodes = {} # for each node id, a list of mwe ids + for n in nodes: + mwes_by_nodes[n.ord] = [] + miscmwe = n.misc['Mwe'] + if miscmwe: + # A node may belong to multiple multiword expressions. + miscmwes = miscmwe.split(';') + for m in miscmwes: + # Either it is NUMBER:TYPE, or just NUMBER. + # Number identifies this MWE among all MWEs in the sentence. 
+ # Type is a main uppercase string (VID, LVC etc.), optionally + # followed by a subtype ('LVC.cause'). + # See https://gitlab.com/parseme/corpora/-/wikis/home + match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m) + if match: + number = match.group(1) + type = match.group(2) + if not number in mwes: + mwes[number] = {'nodes': [], 'type': ''} + if type: + mwes[number]['type'] = type + mwes[number]['nodes'].append(n.ord) + mwes_by_nodes[n.ord].append(number) + else: + logging.warning("Cannot parse Mwe=%s" % m) + return (mwes, mwes_by_nodes) + + def process_tree(self, root): + """ + Collects annotations of multiword expressions from MISC of the nodes. + Then saves the type of the MWE as a subtype of the deprels inside. + """ + nodes = root.descendants + (mwes, mwes_by_nodes) = self.collect_mwes(root) + # Now we hopefully know the type of every multiword expression in the sentence. + for n in nodes: + if mwes_by_nodes[n.ord]: + for m in mwes_by_nodes[n.ord]: + type = re.sub(r"\.", '', mwes[m]['type'].lower()) + # Add the MWE type to the DEPREL if the parent is also in the same MWE. + if n.parent.ord > 0 and m in mwes_by_nodes[n.parent.ord]: + n.deprel += ':' + type diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py new file mode 100644 index 00000000..4f5fc877 --- /dev/null +++ b/udapi/block/read/addbratann.py @@ -0,0 +1,230 @@ +"""Add Brat coreference annotation from *.ann files. + +So far, tested on French LitBank data only. 
+ +T12 HIST 362 366 qui +T13 HIST 349 362 une aventure +R1431 Coreference Arg1:T12 Arg2:T13 + +""" + +from udapi.core.block import Block +from udapi.core.files import Files +import logging +from bisect import bisect_left +import networkx as nx + +def _m(range_s, range_e, offset): + return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" + +class AddBratAnn(Block): + + def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True, + coref_attr="R", no_type_value='_Unsorted_', + **kwargs): + """Args: + files: file names with the coreference annotations (*.ann) + offset: what number to subtract from the character indices in the ann files + detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset + """ + super().__init__(**kwargs) + self.zone = zone + self.files = Files(filenames=files) + self.offset = offset + self.detect_bom = detect_bom + self.keep_mention_id = keep_mention_id + self.coref_attr = coref_attr + self.no_type_value = no_type_value + + def process_document(self, document): + + # Read all the important info from the *.ann file. + mentions, attrs, split_ante, clusters = {}, [], [], [] + ann_filehandle = self.files.next_filehandle() + offset = self.offset + if self.detect_bom: + txt_filename = self.files.filename.replace("ann", "txt") + with open(txt_filename, 'rb') as txt_fh: + raw_bytes = txt_fh.read(3) + if raw_bytes == b'\xef\xbb\xbf': + offset += 1 + + for line in ann_filehandle: + line = line.rstrip('\n') + if not "\t" in line: + logging.warning(f"Unexpected line without tabs: {line}") + elif line.startswith("T"): + # T13 HIST 349 362 une aventure + try: + mention_id, type_and_range, form = line.split("\t") + # Usually the range is two numbers, but it can be more, e.g. 
type_and_range="Abstract 605 653;654 703" + # Let's take the first and last number only.´ + parts = type_and_range.split() + ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1]) + + # If form ends with spaces, remove them and adjust range_e + stripped_form = form.rstrip(" ") + if form != stripped_form: + num_spaces = len(form) - len(stripped_form) + logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})") + form = stripped_form + range_e = range_e - num_spaces + + + mentions[mention_id] = [ne_type, range_s, range_e, form] + if self.keep_mention_id: + attrs.append(["mention_id", mention_id, mention_id]) + except Exception as e: + logging.warning(f"Unexpected mention line: {line}\n{e}") + elif line.startswith(self.coref_attr): + try: + cor_attr, mention_ids = line.rstrip().split("\t") + parts = mention_ids.split() + assert(parts[0] == "Coreference") + except Exception as e: + logging.warning(f"Unexpected coref line: '{line}'\n{e}") + clusters.append([p.split(":")[1] for p in parts[1:]]) + elif line.startswith("#"): + pass # Let's ignore annotators' comments + else: + logging.warning(f"Unexpected line in {self.files.filename}:\n{line}") + + # Some Brat ann files use link-based representation, e.g. + # R123 Coreference Arg1:T11 Arg2:T13 + # R124 Coreference Arg1:T12 Arg2:T14 + # R125 Coreference Arg1:T13 Arg2:T14 + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). + # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]] + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]] + # Note that if creating entities for link, in their original order, + # R123 and R125 would result in creating two entities and when hitting R125 + # we would need to merge them, i.e. delete one of them and move their mentions to the other. 
+ # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components. + coref_graph = nx.Graph() + for mention_ids in clusters: + coref_graph.add_node(mention_ids[0]) + for mention_id in mention_ids[1:]: + coref_graph.add_node(mention_id) + coref_graph.add_edge(mention_id, mention_ids[0]) + clusters = [list(component) for component in nx.connected_components(coref_graph)] + + # Create entity objects for non-singletons. + entity_map = {} + for mention_ids in clusters: + etype, etype_index = None, 0 + for index, m_id in enumerate(mention_ids): + if mentions[m_id][0] == self.no_type_value: + pass + elif etype is None: + etype, etype_index = mentions[m_id][0], index + elif etype != mentions[m_id][0]: + logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.") + if etype is None: + etype = "other" + entity = document.create_coref_entity(etype=etype) + for m_id in mention_ids: + if m_id in entity_map: + logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}") + else: + entity_map[m_id] = entity + + # Collect TokenRange (as pre-filled by UDPipe) for each token. + tokens, starts, ends = [], [], [] + for tree in document.trees: + for token in tree.token_descendants: + tokens.append(token) + range_s, range_e = token.misc["TokenRange"].split(":") + starts.append(int(range_s)) + ends.append(int(range_e)) + + # Create mention objects. + mention_map = {} + for mention_id, mention_values in mentions.items(): + + # Find Udapi tokens for each mention. 
+ ne_type, range_s, range_e, form = mention_values + index_s = bisect_left(starts, range_s - offset) + if starts[index_s] != range_s - offset and index_s > 0: + index_s -= 1 + index_e = bisect_left(ends, range_e - offset) + mtokens = tokens[index_s : index_e+1] + token_s, token_e = tokens[index_s], tokens[index_e] + + # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries. + # If the start token is a multi-word token (MWT), + # we can still try to find the proper word within the MWT. + ok_s, ok_e = True, True + if starts[index_s] != range_s - offset: + ok_s = False + if token_s.is_mwt(): + mtokens.pop(0) + first_form = form.split()[0] + new_start = ends[index_s] + for w in reversed(token_s.words): + mtokens = [w] + mtokens + new_start -= len(w.form) + if w.form == first_form or new_start < range_s - offset: + ok_s = True + break + + # similarly for the end token + if ends[index_e] != range_e - offset: + ok_e = False + if token_e.is_mwt(): + mtokens.pop() + last_form = form.split()[-1] + new_end = starts[index_e] + for w in token_e.words: + mtokens.append(w) + new_end += len(w.form) + if w.form == last_form or new_end > range_e - offset: + ok_e = True + break + + if not ok_s or not ok_e: + logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})" + f" crosses token boundaries: {token_s.misc} ({token_s.form}) " + f".. {token_e.misc} ({token_e.form})") + + # Project tokens (including MWTs) to words and check forms match. + words, udapi_form = [], "" + for token in mtokens: + words += token.words + udapi_form += token.form + if not token.no_space_after: + udapi_form += " " + udapi_form = udapi_form.rstrip() + if form != udapi_form: + logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'") + + # Make sure all words of the mention are in the same sentence. 
+ root = words[0].root + mwords = [words[0]] + for word in words[1:]: + if word.root is root: + mwords.append(word) + else: + logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.") + + # Create entities for singletons + if mention_id not in entity_map: + entity_map[mention_id] = document.create_coref_entity(etype=ne_type) + + # Create the Udapi mention object + mention = entity_map[mention_id].create_mention(words=mwords) + mention_map[mention_id] = mention + + # Fill-in the additional mention attributes. + for attr_name, mention_id, attr_value in attrs: + if mention_id in mention_map: + mention_map[mention_id].other[attr_name] = attr_value + + # Fill-in split antecedents + for arg1, arg2 in split_ante: + if arg1 in entity_map and arg2 in entity_map: + if entity_map[arg1] in entity_map[arg2].split_ante: + logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})") + else: + entity_map[arg2].split_ante.append(entity_map[arg1]) + else: + logging.warning(f"{arg1} or {arg2} not indexed in entity_map") diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py index 75c4ac7d..f676fbe7 100644 --- a/udapi/block/read/addsentences.py +++ b/udapi/block/read/addsentences.py @@ -2,7 +2,9 @@ from udapi.core.basereader import BaseReader # pylint: disable=abstract-method -# read_tree() does not need to be installed here +# read_tree() does not need to be implemented here + + class AddSentences(BaseReader): """A reader for adding plain-text sentences (one sentence per line) files. @@ -12,8 +14,17 @@ class AddSentences(BaseReader): `cat in.conllu | udapy -s read.Conllu read.AddSentences files=in.txt > merged.conllu` """ - def __init__(self, zone='', **kwargs): + def __init__(self, zone='', into='text', **kwargs): + """Args: + into: name of the comment-attribute where the sentence should be stored. Default = text. 
+ That is the sentence is stored in `root.text` and in CoNLL-U it will look like e.g. + `# text = John loves Mary.` + Any other name than "text" is stored to `root.comment`, so e.g. `into=english_text` + will result in a CoNLL-U with a comment line: + `# english_text = John loves Mary.` + """ super().__init__(zone=zone, **kwargs) + self.into = into @staticmethod def is_multizone_reader(): @@ -34,7 +45,10 @@ def process_document(self, document): for bundle in document.bundles: line = self.filehandle.readline() if line == '': - raise IOError('File does not have enoush lines') + raise IOError('File does not have enough lines') root = bundle.get_tree(zone=self.zone) - root.text = line.rstrip() + if self.into == 'text': + root.text = line.rstrip() + else: + root.comment += ' ' + self.into + " = " + line.rstrip() + "\n" self.finished = not self.files.has_next_file() diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py new file mode 100644 index 00000000..4d0b7771 --- /dev/null +++ b/udapi/block/read/addtext.py @@ -0,0 +1,59 @@ +"""read.AddText is a reader for adding word-wrapped plain-text to existing trees.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +import logging + +class AddText(BaseReader): + r"""A reader for plain-text files to be stored to existing trees. + + For example LitBank conll files are segmented to sentences and tokenized, + but the SpacesAfter attributes are missing. We need to load the original + (raw) texts, which are not tokenized and not segmented, only word-wrapped + (to 70 characters per line). + + Args: + add_newpar: add newpar CoNLL-U annotations on empty lines (and the beginning of file) + """ + def __init__(self, zone='', add_newpar=True, **kwargs): + super().__init__(zone=zone, **kwargs) + self.add_newpar = add_newpar + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. 
+ """ + return False + + def process_document(self, document): + filehandle = self.next_filehandle() + if filehandle is None: + self.finished = True + return + text = ''.join(self.filehandle.readlines()) + i, end, was_newpar = 0, len(text)-1, True + while i <= end and text[i].isspace(): + i += 1 + + for bundle in document.bundles: + root = bundle.get_tree(zone=self.zone) + if self.add_newpar and was_newpar: + root.newpar = True + was_newpar = False + for node in root.token_descendants: + if text[i:i+len(node.form)] == node.form: + i += len(node.form) + if i > end or text[i].isspace(): + del node.misc['SpaceAfter'] + was_newpar = i+1 < end and text[i+1] == '\n' and text[i] == '\n' + while i <= end and text[i].isspace(): + i += 1 + else: + node.misc['SpaceAfter'] = 'No' + was_newpar = False + else: + logging.warning('Node %s does not match text "%s"', node, text[i:i+20]) + return + root.text = root.compute_text() + self.finished = not self.files.has_next_file() diff --git a/udapi/block/read/ccv.py b/udapi/block/read/ccv.py new file mode 100644 index 00000000..eb449362 --- /dev/null +++ b/udapi/block/read/ccv.py @@ -0,0 +1,78 @@ +"""Ccv class is a reader for Corpus of Czech Verse json files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText +import json + +class Ccv(BaseReader): + r"""A reader for Corpus of Czech Verse json files. + + See https://github.com/versotym/corpusCzechVerse + Each verse (line) is stored as one tree (although it is quite often not a whole sentence). + Start of each stanza is marked with `newpar`. + Start of each poem is marked with `newdoc = [poem_id]`. + + Args: + tokenize: create nodes + """ + def __init__(self, tokenize=True, **kwargs): + self.tokenize = tokenize + self._cache = None + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. 
+ + This implementation returns always False. + """ + return False + + def read_tree(self): + if self._cache: + return self._cache.pop() + else: + trees = self.read_trees() + if not trees: + return None + self._cache = list(reversed(trees[1:])) + return trees[0] + + def read_trees(self): + if self.filehandle is None: + return None + poems = json.load(self.filehandle) + all_trees = [] + for poem in poems: + poem_trees = [] + for stanza in poem["body"]: + stanza_trees = [] + for line in stanza: + root = Root() + root.text = line["text"] + root.json["rhyme"] = line["rhyme"] + root.json["metre"] = line["metre"] + root.json["stress"] = line["stress"] + stanza_trees.append(root) + if self.tokenize: + words = [[]] + [[w] for w in line["words"]] + for index, puncts in line["punct"].items(): + for punct in puncts: + words[int(index)].append({"token": punct, "lemma": punct}) + for word in words: + for w in word: + node = root.create_child(form=w["token"], lemma=w["lemma"]) + if "morph" in w: + node.xpos = w["morph"] + node.misc["xsampa"] = w["xsampa"] + node.misc["phoebe"] = w["phoebe"] + SetSpaceAfterFromText.process_tree(None, root) + stanza_trees[0].newpar = True + poem_trees.extend(stanza_trees) + root = poem_trees[0] + root.newdoc = poem["poem_id"] + root.json["p_author"] = poem["p_author"] + root.json["b_author"] = poem["b_author"] + root.json["biblio"] = poem["biblio"] + all_trees.extend(poem_trees) + return all_trees diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py new file mode 100644 index 00000000..d0aef1ee --- /dev/null +++ b/udapi/block/read/conll.py @@ -0,0 +1,162 @@ +""""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009).""" +import json +import logging +import re + +import udapi.block.read.conllu +from udapi.core.root import Root +from udapi.core.node import Node + + +class Conll(udapi.block.read.conllu.Conllu): + """A reader of the CoNLL-U files.""" + + def __init__(self, separator='tab', + 
attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): + """Create the Conll reader object. + + This is a subclass of udapi.block.read.conllu.Conllu, + which adds support for arbitrary column names and thus supporting not only CoNLL-U, + but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats. + + Args: + separator: How are the columns separated? + Default='tab' is the only possibility in valid CoNLL-U files. + 'space' means one or more whitespaces (this does not allow forms with space). + 'doublespace' means two or more spaces. + attributes: comma-separated list of column names in the input files + (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc') + Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U). + For ignoring a column, use "_" as its name. + Column "ord" marks the column with 1-based word-order number/index (usually called ID). + Column "head" marks the column with dependency parent index (word-order number). + + For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use + `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_` + but note that attributes upos, feats and deprel will contain language-specific values, + not valid according to UD guidelines and a further conversion will be needed. + You will lose the projective_HEAD and projective_DEPREL attributes. + + For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`. + You will lose the predicted_* attributes and semantic/predicate annotation. + + TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]` + for feats which do not use the name1=value1|name2=value2 format. 
+ """ + super().__init__(**kwargs) + self.node_attributes = attributes.split(',') + self.separator = separator + + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. + + def parse_node_line(self, line, root, nodes, parents, mwts): + if self.separator == 'tab': + fields = line.split('\t') + elif self.separator == 'space': + fields = line.split() + elif self.separator == 'doublespace': + fields = re.split(' +', line) + else: + raise ValueError('separator=%s is not valid' % self.separator) + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' in fields[0]: + empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], misc=fields[9]) + empty.ord = fields[0] + empty.raw_deps = fields[8] # TODO + return + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, 
value) + + nodes.append(node) + + # Acknowledged code duplication with read.Conllu + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + parents = [0] + mwts = [] + for line in lines: + if line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes, parents, mwts) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). + if len(nodes) == 1: + return None + + # Empty sentences are not allowed in CoNLL-U, + # but if the users want to save just the sentence string and/or sent_id + # they need to create one artificial node and mark it with Empty=Yes. + # In that case, we will delete this node, so the tree will have just the (technical) root. + # See also udapi.block.write.Conllu, which is compatible with this trick. + if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': + nodes.pop() + root._children = [] + root._descendants = [] + + # Set dependency parents (now, all nodes of the tree are created). + for node_ord, node in enumerate(nodes[1:], 1): + try: + parent = nodes[parents[node_ord]] + except IndexError: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) + parent = root + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node._children: + climbing = parent._parent + while climbing: + if climbing is node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) + + # Create multi-word tokens. 
+ for fields in mwts: + range_start, range_end = fields[0].split('-') + words = nodes[int(range_start):int(range_end) + 1] + root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + + return root diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py new file mode 100644 index 00000000..2adbd00f --- /dev/null +++ b/udapi/block/read/conll2012.py @@ -0,0 +1,153 @@ +""""Conll2012 is a reader block for the coreference in CoNLL-2012 format. + +This implementation was tested on the LitBank files only +(and quickly on Portuguese Corref-PT and Summ-it++v2), so far. +LitBank does not use most of the columns, so the implementation +should be improved to handle other types of CoNLL-2012 files. +""" +import json +import logging +import re + +import udapi.block.read.conllu +from udapi.core.root import Root +from udapi.core.node import Node + +RE_BEGIN = re.compile(r'^#begin document ([^ ]+)') + +class Conll2012(udapi.block.read.conllu.Conllu): + """A reader of the Conll2012 files.""" + + def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs): + """Create the Conll2012 reader object. + + Args: + attributes: comma-separated list of column names in the input files + (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref' suitable for LitBank) + For ignoring a column, use "_" as its name. + Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based) + word-order number/index (usualy called ID). + For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'. + For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'. + For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'. + emptyval: a symbol that represents an empty value, especially in the coref column + (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2) + For FantasyCoref, use emptyval='-'. 
+ """ + super().__init__(**kwargs) + self.node_attributes = attributes.split(',') + self._docname = 'd' + self.emptyval = emptyval + + def parse_comment_line(self, line, root): + if line.startswith("#end document"): + return + match = RE_BEGIN.match(line) + if match: + docname = match.group(1) + # LitBank and FantasyCoref use e.g. + # #begin document (1023_bleak_house_brat); part 0 + if docname.startswith('(') and docname.endswith(');'): + docname = docname[1:-2] + # Summ-it++v2 uses e.g. + # #begin document /home/andre/Recursos-fontes/Summit/Summ-it_v3.0/corpusAnotado_CCR/CIENCIA_2002_22010/CIENCIA_2002_22010.txt + elif docname.startswith('/home/'): + docname = docname.split('/')[-1] + # Corref-PT-SemEval uses e.g. + # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml + docname = docname.replace('.txt', '').replace('.xml', '') + # FantasyCoref may use parentheses within the document ID e.g. + # #begin document (051_Fundevogel_(Bird-foundling)); part 000 + docname = docname.replace('(', '').replace(')', '') + + root.newdoc = docname + self._global_entity = 'eid-etype-head-other' + root.comment += '$GLOBAL.ENTITY\n' + self._docname = docname + else: + logging.warning(f"Unexpected comment line: {line}") + + def parse_node_line(self, line, root, nodes): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'docname': + # FantasyCoref may use parentheses within the document ID + value = value.replace('(', '').replace(')', '') + if value != self._docname: + logging.warning(f"Document name mismatch {value} != {self._docname}") + + # convert the zero-based 
index to one-based + # but Corref-PT uses a mix of one-based and zero-based + elif attribute_name == 'ord': + #setattr(node, 'ord', int(value) + 1) + if node.ord not in(int(value) + 1, int(value)): + logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}") + + elif attribute_name == 'coref': + if value and value != self.emptyval: + # LitBank always separates chunks by a vertical bar, e.g. (13)|10) + # Summ-it++v2 does not, e.g. (13)10) + if '|' in value: + chunks = value.split("|") + else: + chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', value) if x] + modified_entities = [] + escaped_docname = self._docname.replace("-", "") + for entity in chunks: + entity_num = entity.replace("(", "").replace(")","") + modified_entity = f"{escaped_docname}_e{entity_num}--1" + if entity.startswith("(") and entity.endswith(")"): + modified_entity = "(" + modified_entity + ")" + elif entity.startswith("("): + modified_entity = "(" + modified_entity + elif entity.endswith(")"): + modified_entity = f"{escaped_docname}_e{entity_num}" + ")" + + # to avoid parentheses clashes, put the entities with ")" first + if modified_entity.startswith("("): + modified_entities.append(modified_entity) + else: + modified_entities.insert(0, modified_entity) + node.misc['Entity'] = ''.join(modified_entities) + + elif attribute_name == 'form' or (attribute_name != '_' and value != '_'): + setattr(node, attribute_name, value) + nodes.append(node) + + def read_tree_from_lines(self, lines): + root = Root() + nodes = [root] + for line in lines: + if line == '': + pass + elif line[0] == '#': + self.parse_comment_line(line, root) + else: + self.parse_node_line(line, root, nodes) + + # If no nodes were read from the filehandle (so only root remained in nodes), + # we return None as a sign of failure (end of file or more than one empty line). 
+ if len(nodes) == 1: + return None + + return root + + def read_trees(self): + if self.max_docs: + raise NotImplementedError("TODO implement max_docs in read.Conll2012") + # Corref-PT does not put an empty line before #end document, + # so we need to split both on #end document and empty lines. + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+|\n#end document\n', self.filehandle.read()) if s] + + def read_tree(self): + raise NotImplementedError("TODO implement read_tree in read.Conll2012") diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8c80a779..e19cd676 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -1,66 +1,51 @@ """"Conllu is a reader block for the CoNLL-U files.""" +import json import logging import re from udapi.core.basereader import BaseReader from udapi.core.root import Root +from udapi.core.node import Node # Compile a set of regular expressions that will be searched over the lines. # The equal sign after sent_id was added to the specification in UD v2.0. # This reader accepts also older-style sent_id (until UD v2.0 treebanks are released). RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)') -RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') -RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') +RE_TEXT = re.compile(r'^# text\s*=\s*(.*)') +RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$') +RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)') +RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)') + class Conllu(BaseReader): """A reader of the CoNLL-U files.""" - def __init__(self, strict=False, separator='tab', - attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs): + def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs): """Create the Conllu reader object. Args: strict: raise an exception if errors found (default=False, i.e. 
a robust mode) - separator: How are the columns separated? - Default='tab' is the only possibility in valid CoNLL-U files. - 'space' means one or more whitespaces (this does not allow forms with space). - 'doublespace' means two or more spaces. - attributes: comma-separated list of column names in the input files - (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc') - Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U). - For ignoring a column, use "_" as its name. - Column "ord" marks the column with 1-based word-order number/index (usualy called ID). - Column "head" marks the column with dependency parent index (word-order number). - - For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use - `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_` - but note attributes that upos, feats and deprel will contain language-specific values, - not valid according to UD guidelines and a further conversion will be needed. - You will loose the projective_HEAD and projective_DEPREL attributes. - - For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`. - You will loose the predicted_* attributes and semantic/predicate annotation. - - TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]` - for feats which do not use the name1=value1|name2=value2 format. + empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root + or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued. 
+ fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range """ super().__init__(**kwargs) - self.node_attributes = attributes.split(',') self.strict = strict - self.separator = separator - + self.empty_parent = empty_parent + self.fix_cycles = fix_cycles - @staticmethod - def parse_comment_line(line, root): + def parse_comment_line(self, line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" sent_id_match = RE_SENT_ID.match(line) if sent_id_match is not None: root.sent_id = sent_id_match.group(1) + root.comment += '$SENT_ID\n' return text_match = RE_TEXT.match(line) if text_match is not None: root.text = text_match.group(1) + root.comment += '$TEXT\n' return pardoc_match = RE_NEWPARDOC.match(line) @@ -68,42 +53,89 @@ def parse_comment_line(line, root): value = True if pardoc_match.group(2) is None else pardoc_match.group(2) if pardoc_match.group(1) == 'newpar': root.newpar = value + root.comment += '$NEWPAR\n' else: root.newdoc = value + root.comment += '$NEWDOC\n' return - root.comment = root.comment + line[1:] + "\n" + json_match = RE_JSON.match(line) + if json_match is not None: + container = root.json + if json_match.group(1) == 'doc_': + if '__doc__' not in root.json: + root.json['__doc__'] = {} + container = root.json['__doc__'] + container[json_match.group(2)] = json.loads(json_match.group(3)) + return - # pylint: disable=too-many-locals,too-many-branches,too-many-statements - # Maybe the code could be refactored, but it is speed-critical, - # so benchmarking is needed because calling extra methods may result in slowdown. 
- def read_tree(self, document=None): + entity_match = RE_GLOBAL_ENTITY.match(line) + if entity_match is not None: + global_entity = entity_match.group(1) + if self._global_entity and self._global_entity != global_entity: + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") + self._global_entity = global_entity + root.comment += '$GLOBAL.ENTITY\n' + return + + root.comment += line[1:] + "\n" + + def read_trees(self): + if not self.max_docs: + # Valid CoNLL-U files must have sentences separated by a single empty line. + # However, some users have to work with invalid files e.g. ending with two empty lines. + # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow + # as s.split('\n\n') and this time is negligble + # relative to the main CoNLL-U parsing in read_tree_from_lines(). + return [self.read_tree_from_lines(s.split('\n')) for s in + re.split(r'\n\n+', self.filehandle.read()) if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. + trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + if tree: + trees.append(tree) + else: + lines.append(line) + return trees + + def read_tree(self): if self.filehandle is None: return None + lines = [] + for line in self.filehandle: + line = line.rstrip() + if line == '': + break + lines.append(line) + return self.read_tree_from_lines(lines) + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + # Maybe the code could be refactored, but it is speed-critical, + # so benchmarking is needed because calling extra methods may result in slowdown. 
+ def read_tree_from_lines(self, lines): root = Root() nodes = [root] parents = [0] mwts = [] - for line in self.filehandle: - line = line.rstrip() - if line == '': - break + for line in lines: if line[0] == '#': self.parse_comment_line(line, root) else: - if self.separator == 'tab': - fields = line.split('\t') - elif self.separator == 'space': - fields = line.split() - elif self.separator == 'doublespace': - fields = re.split(' +', line) - else: - raise ValueError('separator=%s is not valid' % self.separator) - if len(fields) != len(self.node_attributes): + fields = line.split('\t') + if len(fields) != 10: if self.strict: raise RuntimeError('Wrong number of columns in %r' % line) - fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + fields.extend(['_'] * (10 - len(fields))) # multi-word tokens will be processed later if '-' in fields[0]: mwts.append(fields) @@ -112,27 +144,32 @@ def read_tree(self, document=None): empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], xpos=fields[4], feats=fields[5], misc=fields[9]) empty.ord = fields[0] - empty.raw_deps = fields[8] # TODO + empty.raw_deps = fields[8] # TODO continue - node = root.create_child() - - # TODO slow implementation of speed-critical loading - for (n_attribute, attribute_name) in enumerate(self.node_attributes): - if attribute_name == 'head': - try: - parents.append(int(fields[n_attribute])) - except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': - logging.warning("Empty parent/head index in '%s'", line) - else: - raise exception - elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) - elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_': - setattr(node, attribute_name, fields[n_attribute]) + if fields[3] == '_': + fields[3] = None + if fields[4] == '_': + fields[4] = None + if fields[7] == '_': + fields[7] = None + + # 
ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc + node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3], + xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9]) + root._descendants.append(node) + node._ord = int(fields[0]) + if fields[8] != '_': + node.raw_deps = fields[8] + try: + parents.append(int(fields[6])) + except ValueError as exception: + if not self.strict and fields[6] == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception nodes.append(node) @@ -146,24 +183,49 @@ def read_tree(self, document=None): # they need to create one artificial node and mark it with Empty=Yes. # In that case, we will delete this node, so the tree will have just the (technical) root. # See also udapi.block.write.Conllu, which is compatible with this trick. - if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes': + if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes': nodes.pop() + root._children = [] + root._descendants = [] # Set dependency parents (now, all nodes of the tree are created). - # TODO: parent setter checks for cycles, but this is something like O(n*log n) - # if done for each node. It could be done faster if the whole tree is checked at once. - # Also parent setter removes the node from its old parent's list of children, - # this could be skipped here by not using `node = root.create_child()`. 
for node_ord, node in enumerate(nodes[1:], 1): try: - node.parent = nodes[parents[node_ord]] + parent = nodes[parents[node_ord]] except IndexError: - raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if self.fix_cycles: + logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}") + parent = root + else: + raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) + if node is parent: + if self.fix_cycles: + logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node) + parent = root + else: + raise ValueError(f"Detected a cycle: {node} attached to itself") + elif node._children: + climbing = parent._parent + while climbing: + if climbing is node: + if self.fix_cycles: + logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent) + parent = root + break + else: + raise ValueError(f"Detected a cycle: {node}") + climbing = climbing._parent + node._parent = parent + parent._children.append(node) # Create multi-word tokens. for fields in mwts: - range_start, range_end = fields[0].split('-') - words = nodes[int(range_start):int(range_end)+1] - root.create_multiword_token(words, form=fields[1], misc=fields[-1]) + try: + range_start, range_end = fields[0].split('-') + except ValueError: + logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}") + raise + words = nodes[int(range_start):int(range_end) + 1] + root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9]) return root diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. + +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). 
+ + This code has only been tested on Hungarian KorKor files for CorefUD so far. + However, in the end, it is not used there (xtsv files are used instead of conllup). + """ + import logging + import re + + import udapi.block.read.conll + from udapi.core.root import Root + from udapi.core.node import Node + + RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') + COLUMN_MAP = { + 'ID': 'ord', + } + NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + + class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overridden). + Default='autodetect' which means the column names will be loaded from the global.columns header. + For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). 
+ """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py new file mode 100644 index 00000000..73e05f3b --- /dev/null +++ b/udapi/block/read/oldcorefud.py @@ -0,0 +1,119 @@ +"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.read.conllu +from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks + +class OldCorefUD(udapi.block.read.conllu.Conllu): + + def __init__(self, replace_hyphen_in_id_with='', **kwargs): + """Create the read.OldCorefUD reader object. 
+ + Args: + replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId + The new format does not allow hyphens in eid (IDs of entities), + so we need to replace them. + """ + super().__init__(**kwargs) + self.replace_hyphen_in_id_with = replace_hyphen_in_id_with + self.orig2new = {} + self.new2orig = {} + + def _fix_id(self, cid): + if not cid or '-' not in cid: + return cid + new_cid = self.orig2new.get(cid) + if new_cid is None: + new_cid = cid.replace('-', self.replace_hyphen_in_id_with) + base, counter = new_cid, 1 + while new_cid in self.new2orig: + counter += 1 + new_cid = f"{base}{counter}" + self.new2orig[new_cid] = cid + self.orig2new[cid] = new_cid + return new_cid + + def process_document(self, doc, strict=True): + super().process_document(doc) + + eid_to_entity = {} + for node in doc.nodes_and_empty: + index, index_str = 0, "" + eid = node.misc["ClusterId"] + if not eid: + index, index_str = 1, "[1]" + eid = node.misc["ClusterId[1]"] + eid = self._fix_id(eid) + while eid: + entity = eid_to_entity.get(eid) + if entity is None: + entity = CorefEntity(eid) + eid_to_entity[eid] = entity + mention = CorefMention(words=[node], entity=entity) + if node.misc["MentionSpan" + index_str]: + mention.span = node.misc["MentionSpan" + index_str] + etype = node.misc["ClusterType" + index_str] + if etype: + if entity.etype is not None and etype != entity.etype: + logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + entity.etype = etype + + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention) + for link_str in bridging_str.split(','): + target, relation = link_str.split(':') + target = self._fix_id(target) + if target == eid: + _error("Bridging cannot self-reference the same entity: " + target, strict) + if target not in eid_to_entity: + eid_to_entity[target] = CorefEntity(target) + mention._bridging.append((eid_to_entity[target], relation)) + + 
split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma. + # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator. + for ante_str in split_ante_str.replace('+', ',').split(','): + ante_str = self._fix_id(ante_str) + if ante_str in eid_to_entity: + if ante_str == eid: + _error("SplitAnte cannot self-reference the same entity: " + eid, strict) + split_antes.append(eid_to_entity[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + ante_cl = CorefEntity(ante_str) + eid_to_entity[ante_str] = ante_cl + split_antes.append(ante_cl) + entity.split_ante = sorted(split_antes) + + # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas. + # We also need to escape forbidden characters. + mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',') + mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29') + index += 1 + index_str = f"[{index}]" + eid = self._fix_id(node.misc["ClusterId" + index_str]) + # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc. + # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__), + # not by the keys (eid). + # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to be insertion order. + for entity in eid_to_entity.values(): + if not entity._mentions: + _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict) + entity._mentions.sort() + doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())} + + # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted). 
+ attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + + +def _error(msg, strict): + if strict: + raise ValueError(msg) + logging.error(msg) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 14840a50..7487d580 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -4,7 +4,28 @@ class Sentences(BaseReader): - """A reader for plain-text sentences (one sentence per line) files.""" + r"""A reader for plain-text sentences (one sentence per line) files. + + Args: + ignore_empty_lines: if True, delete empty lines from the input. + Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. 
+ """ + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") + self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line + self.rstrip = rstrip + super().__init__(**kwargs) @staticmethod def is_multizone_reader(): @@ -18,8 +39,25 @@ def read_tree(self, document=None): if self.filehandle is None: return None line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: + while line in {'\n', '\r\n'}: + preceded_by_empty_line = True + line = self.filehandle.readline() + if line == '': + return None root = Root() - root.text = line.rstrip() + root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root + + # The first line in a file also marks a start of new document + def after_process_document(self, document): + if self.newdoc_if_empty_line: + document.bundles[0].trees[0].newdoc = True diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py new file mode 100644 index 00000000..161b6b6e --- /dev/null +++ b/udapi/block/read/text.py @@ -0,0 +1,74 @@ +"""Text class is a reader for word-wrapped plain-text files.""" +from udapi.core.basereader import BaseReader +from udapi.core.root import Root + + +class Text(BaseReader): + r"""A reader for plain-text files with sentences on one or more lines. + + Sentences are separated by one or more empty lines. + Newlines within sentences are substituted by a space. + + Args: + rstrip: a set of characters to be stripped from the end of each line. + Default='\r\n '. 
You can use rstrip='\n' if you want to preserve + any space or '\r' (Carriage Return) at end of line, + so that `udpipe.Base` keeps these characters in `SpacesAfter`. + As most blocks do not expect whitespace other than a space to appear + in the processed text, using this feature is at your own risk. + empty_line: how empty lines are handled. Default 'new_sentence' preserves + the current behaviour (empty lines mark sentence boundaries). Use + 'keep' to read the entire file content into a single sentence (tree), including + empty lines. Use 'newpar' to behave like 'new_sentence' but also set + `root.newpar = True` on each sentence. + """ + def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs): + if empty_line not in {'new_sentence', 'keep', 'newpar'}: + raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'") + self.rstrip = rstrip + self.empty_line = empty_line + super().__init__(**kwargs) + + @staticmethod + def is_multizone_reader(): + """Can this reader read bundles which contain more zones?. + + This implementation returns always False. + """ + return False + + def read_tree(self, document=None): + if self.filehandle is None: + return None + if self.empty_line == 'keep': + content = self.filehandle.read() + if content == '': + return None + root = Root() + root.text = content + return root + lines = [] + line = None + while True: + line = self.filehandle.readline() + # if readline() returns an empty string, the end of the file has been + # reached, while a blank line is represented by '\n' + # (or '\r\n' if reading a Windows file on Unix machine). 
+ if line == '': + if not lines: + return None + else: + break + elif line in {'\n', '\r\n'}: + if not lines: + continue + else: + break + else: + lines.append(line.rstrip(self.rstrip)) + + root = Root() + root.text = " ".join(lines) + if self.empty_line == 'newpar': + root.newpar = True + return root diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 3c5852d7..4c5a87ab 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -1,15 +1,14 @@ """Vislcg is a reader block the VISL-cg format.""" -import shlex - from udapi.core.basereader import BaseReader from udapi.core.root import Root + class Vislcg(BaseReader): """A reader of the VISL-cg format, suitable for VISL Constraint Grammer Parser.""" # TODO check validity and raise helpful exceptions if not valid # pylint: disable=too-many-branches - def read_tree(self, document=None): + def read_tree(self): if self.filehandle is None: return None @@ -22,28 +21,28 @@ def read_tree(self, document=None): if line == '': break if line[0] == '#': - # Are comments allowed in VISL-cg? 
+ root.comment += line[1:] + "\n" continue if line[0].isspace(): - line.lstrip(line) - node, parent_ord = self._node(line, root) + node, parent_ord = self._node(line.lstrip(), root) words.append(node) parents.append(parent_ord) - else: - if words: - words[0].form = form - if len(words) > 1: - split_forms = form.split() - if len(words) == len(split_forms): - for word, split_form in zip(words, split_forms): - word.form = split_form - else: - for word in words[1:]: - word.form = '_' - root.create_multiword_token(words, form=form) - words = [] - form = line[2:-2] + continue + + if words: + words[0].form = form + if len(words) > 1: + split_forms = form.split() + if len(words) == len(split_forms): + for word, split_form in zip(words, split_forms): + word.form = split_form + else: + for word in words[1:]: + word.form = '_' + root.create_multiword_token(words, form=form) + words = [] + form = line[2:-2] if words: words[0].form = form @@ -63,10 +62,15 @@ def read_tree(self, document=None): @staticmethod def _node(line, root): - fields = shlex.split(line) - lemma = fields[0] - xpos = fields[1] - feats_list = fields[2:-2] + # line contains "lemma" xpos feat1 feat2 .. featN @deprel #ord->parent.ord + # Lemma can contain spaces, but quotes within lemma are not escaped, + # so we cannot use fields = shlex.split(line) + # Let's hope that xpos, feats and deprel do not contain any quotes. + end_quote_pos = line.rfind('"') + lemma = line[1:end_quote_pos] + fields = line[end_quote_pos + 1:].split() + xpos = fields[0] + feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' deprel = fields[-2][1:] parent_ord = int(fields[-1].split('->')[1]) diff --git a/udapi/block/segment/merge.py b/udapi/block/segment/merge.py new file mode 100644 index 00000000..9ada45f1 --- /dev/null +++ b/udapi/block/segment/merge.py @@ -0,0 +1,46 @@ +"""Block segment.Merge""" +from udapi.core.block import Block + +class Merge(Block): + """"Re-segmenter merging selected sentences (trees). 
+ + This class merges sentences ending with semicolons, + but it can be used as a base class for merging based on different criteria + by overriding one of the `should_*` methods. + """ + + @staticmethod + def should_merge_tokens(first, second): + """Is there actually a sentence boundary between the first and second node?""" + if first.form[-1] == ';': + return True + return False + + def should_merge_bundles(self, first_bundle, second_bundle): + """Is there actually a sentence boundary between the first and second bundle?""" + first_tree = self._get_our_tree(first_bundle) + second_tree = self._get_our_tree(second_bundle) + return self.should_merge_tokens(first_tree.descendants[-1], second_tree.descendants[0]) + + + def _get_our_tree(self, bundle): + for tree in bundle: + if self._should_process_tree(tree): + return tree + raise ValueError("Bundle %s contains no tree to process." % bundle.address()) + + + def process_document(self, doc): + old_bundles = doc.bundles + prev_bundle = old_bundles[0] + new_bundles = [prev_bundle] + for bundle in old_bundles[1:]: + if self.should_merge_bundles(prev_bundle, bundle): + for tree in bundle: + prev_tree = prev_bundle.get_tree(tree.zone) + prev_tree.steal_nodes(tree.descendants) + prev_tree.text = prev_tree.compute_text() + else: + new_bundles.append(bundle) + prev_bundle = bundle + doc.bundles = new_bundles \ No newline at end of file diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py new file mode 100644 index 00000000..58be9b6d --- /dev/null +++ b/udapi/block/segment/simple.py @@ -0,0 +1,91 @@ +"""Block segment.Simple""" +from udapi.core.block import Block +from udapi.core.bundle import Bundle +import re + +class Simple(Block): + """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase. + The exceptions are: + 1) abbreviations of names, e.g. "A. Merkel" + 2) predefined list of nonfinal abbreviations, e.g. "e.g." 
+ + Parameters + ---------- + keep_spaces : bool + do not strip whitespaces from the `text` attribute of the sentences created by segmentation + """ + + def __init__(self, keep_spaces=False, **kwargs): + super().__init__(**kwargs) + self.keep_spaces = keep_spaces + + @staticmethod + def is_nonfinal_abbrev(token): + """Is a given token an abbreviation (without the final period) which cannot end a sentence?""" + if re.search('(např|e.g.)$', token): + return True + return False + + + def is_boundary(self, first, second): + """Is there a sentence boundary between the first and second token?""" + if not first or not second: + return False + if first[-1] in '"“»›)': + first = first[:-1] + if not first: + return False + if second[0] in '"„«¿¡‹(': + second = second[1:] + if not second: + return False + if not second[0].isupper() or second[0].isdigit(): + return False + if not first[-1] in '.!?': + return False + if first[-1] == '.': + # correctly count length in "„A. Merkel" + if first[0] in '"„«¿¡‹(': + first = first[1:] + if len(first) == 2 and first[0].isupper(): + return False + if self.is_nonfinal_abbrev(first[:-1]): + return False + return True + + + def segment_string(self, string): + """Return a list of sentences in a given string.""" + tokens = string.split(' ') + previous = tokens[0] + segments = [previous] + for token in tokens[1:]: + if self.is_boundary(previous, token): + if self.keep_spaces: + segments[-1] += ' ' + segments.append(token) + else: + segments[-1] += ' ' + token + previous = token + return segments + + + def process_document(self, doc): + old_bundles = doc.bundles + new_bundles = [] + for bundle in old_bundles: + new_bundles.append(bundle) + for tree in bundle: + if self._should_process_tree(tree): + if tree.children: + raise ValueError("Segmenting already tokenized text is not supported.") + sentences = self.segment_string(tree.text) + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + if len(sentences) > 1: + 
tree.text = sentences[0] + for i, sentence in enumerate(sentences[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i)) + new_bundle.create_tree(tree.zone).text = sentence + new_bundles.append(new_bundle) + doc.bundles = new_bundles diff --git a/udapi/block/tokenize/__init__.py b/udapi/block/tokenize/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py new file mode 100644 index 00000000..913dae61 --- /dev/null +++ b/udapi/block/tokenize/onwhitespace.py @@ -0,0 +1,97 @@ +"""Block tokenize.OnWhitespace""" +import re +from udapi.core.block import Block + + +class OnWhitespace(Block): + """Base tokenizer, splits on whitespaces, fills SpaceAfter=No. + + Use the parameter `keep_spaces=True` to preserve all whitespaces in the sentence + in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field. + It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following + whitespace is marked by `SpaceAfter=No` and a single following space results in no + whitespace-related markup. + If loading the text using `read.Sentences` and all whitespaces need to be preserved + (in order to be able to reconstruct the original document), the `read.Sentences` block + must be called with `rstrip=''`, `rstrip=\n` or `rstrip=\r\n` to prevent stripping the + trailing whitespace, e.g.:: + $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace keep_spaces=1 write.Conllu + + # sent_id = 1 + # text = Hello world + 1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s + 2 world _ _ _ _ 0 _ _ _ + Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is + followed by a single space. 
+ + Parameters + ---------- + keep_spaces : bool + preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default False) + """ + + escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'}) + + def __init__(self, keep_spaces=False, **kwargs): + super().__init__(**kwargs) + self.keep_spaces = keep_spaces + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return string.split() + + def process_tree(self, root): + if root.children: + raise ValueError('Tree %s is already tokenized.' % root) + #sentence = ' '.join(root.text.split()) + sentence = root.text + tokens = self.tokenize_sentence(sentence) + + # Check if there are any spaces before the first token + spaces_before = "" + m = re.match(r'\s+', sentence) + if m: + spaces_before = m.group(0) + sentence = sentence[len(spaces_before):] + + for i, token in enumerate(tokens, 1): + spaces_after = "" + + # The token (returned from tokenization) does not match the start of sentence. + # E.g. '. . . word' is tokenized as '... word'. + if not sentence.startswith(token): + # Let's delete the start of sentence anyway, + # using a non-greedy regex and the expected next token + # returned from the tokenization. + # my $next_token = $tokens[$i+1]; + # my ($first, $rest) = ($sentence =~ /^(.*?)(\Q$next_token\E.*)$/); + # $no_space_after = 1 if (defined $first && $first !~ /\s$/); + # $sentence = $rest if (defined $rest); + raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence)) + + # Delete the token from the begining of the sentence. 
+ sentence = sentence[len(token):] + + # Set the SpaceAfter and SpacesAfter properly + m = re.match(r'\s+', sentence) + if m is not None: + spaces_after = m.group(0) + sentence = sentence[len(spaces_after):] + + # normalize whitespace + if not self.keep_spaces: + spaces_before = "" + # spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0 + spaces_after = "" if not len(spaces_after) and len(sentence) else " " + + # create a new node + node = root.create_child(form=token) + node.ord = i + + if i == 1 and spaces_before: + node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table) + if not spaces_after: + node.misc["SpaceAfter"] = 'No' + elif spaces_after != " ": + node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table) diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py new file mode 100644 index 00000000..f7010d13 --- /dev/null +++ b/udapi/block/tokenize/simple.py @@ -0,0 +1,13 @@ +"""Block tokenize.Simple""" +import re + +from udapi.block.tokenize.onwhitespace import OnWhitespace + + +class Simple(OnWhitespace): + """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No.""" + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return re.findall(r'\w+|[^\w\s]', string) diff --git a/udapi/block/transform/__init__.py b/udapi/block/transform/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py new file mode 100644 index 00000000..581f5a6b --- /dev/null +++ b/udapi/block/transform/deproj.py @@ -0,0 +1,43 @@ +"""Block Deproj for deprojectivization of pseudo-projective trees à la Nivre & Nilsson (2005). + +See ud.transform.Proj for details. +TODO: implement also path and head+path strategies. 
+""" +from udapi.core.block import Block + + +class Deproj(Block): + """De-projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', label='misc', **kwargs): + """Create the Deproj block object.""" + super().__init__(**kwargs) + self.strategy = strategy + self.label = label + + def process_node(self, node): + if self.label == 'misc': + label = node.misc['pproj'] + elif self.label == 'deprel': + parts = node.sdeprel.split('+', 1) + if len(parts) == 2: + label = parts[1] + node.deprel = node.udeprel + (':' + parts[0] if parts[0] else '') + else: + label = '' + else: + raise(ValueError('Unknown parameter label=%s' % self.label)) + if label == '': + return + reconstructed_parent = self.head_strategy(node, label) + if reconstructed_parent: + node.parent = reconstructed_parent + + def head_strategy(self, node, label): + queue = [n for n in node.parent.children if n != node] # TODO deque + while queue: + adept = queue.pop(0) + if adept.udeprel == label: + return adept + queue.extend(adept.children) + return None diff --git a/udapi/block/transform/flatten.py b/udapi/block/transform/flatten.py new file mode 100644 index 00000000..d218ad27 --- /dev/null +++ b/udapi/block/transform/flatten.py @@ -0,0 +1,25 @@ +"""transform.Flatten block for flattening trees.""" +from udapi.core.block import Block + +class Flatten(Block): + """Apply `node.parent = node.root; node.deprel = 'root'` on all nodes.""" + + def __init__(self, oneroot=False, **kwargs): + """Args: + oneroot: only the first node will have deprel 'root'. + All other nodes will depend on the first node with deprel 'dep'. + This option makes the trees valid according to the validator. 
+ (default=False) + """ + super().__init__(**kwargs) + self.oneroot = oneroot + + def process_tree(self, tree): + for node in tree.descendants: + node.parent = node.root + node.deprel = 'root' + if self.oneroot: + first = tree.descendants[0] + for node in tree.descendants[1:]: + node.parent = first + node.deprel = 'dep' diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py new file mode 100644 index 00000000..6e284b4c --- /dev/null +++ b/udapi/block/transform/proj.py @@ -0,0 +1,64 @@ +"""Block Proj for (pseudo-)projectivization of trees à la Nivre & Nilsson (2005). + +See http://www.aclweb.org/anthology/P/P05/P05-1013.pdf. +This block tries to replicate Malt parser's projectivization: +http://www.maltparser.org/userguide.html#singlemalt_proj +http://www.maltparser.org/optiondesc.html#pproj-marking_strategy + +TODO: implement also path and head+path strategies. + +TODO: Sometimes it would be better (intuitively) +to lower the gap-node (if its whole subtree is in the gap +and if this does not cause more non-projectivities) +rather than to lift several nodes whose parent-edge crosses this gap. +We would need another label value (usually the lowering is of depth 1), +but the advantage is that reconstruction of lowered edges +during deprojectivization is simple and needs no heuristics. 
+""" +from udapi.core.block import Block + + +class Proj(Block): + """Projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', lifting_order='deepest', label='misc', **kwargs): + """Create the Proj block object.""" + super().__init__(**kwargs) + self.lifting_order = lifting_order + self.strategy = strategy + self.label = label + + def process_tree(self, tree): + nonprojs = [self.nonproj_info(n) for n in tree.descendants if n.is_nonprojective()] + for nonproj in sorted(nonprojs, key=lambda info: info[0]): + self.lift(nonproj[1]) + + def nonproj_info(self, node): + if self.lifting_order == 'shortest': + return (abs(node.ord - node.parent.ord), node) + orig_parent = node.parent + node.parent = node.parent.parent + depth = 1 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + node.parent = orig_parent + return (-depth, node) + + def lift(self, node): + orig_parent = node.parent + depth = 0 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + if depth == 0: + return + self.mark(node, orig_parent.udeprel) + + def mark(self, node, label): + if self.label == 'misc': + node.misc['pproj'] = label + elif self.label == 'deprel': + node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label) + else: + raise ValueError('Unknown parameter label=%s' % self.label) diff --git a/udapi/block/tutorial/__init__.py b/udapi/block/tutorial/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/tutorial/addarticles.py b/udapi/block/tutorial/addarticles.py new file mode 100644 index 00000000..96f0ba2f --- /dev/null +++ b/udapi/block/tutorial/addarticles.py @@ -0,0 +1,14 @@ +"""tutorial.AddArticles block template.""" +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. 
+from udapi.core.block import Block + +class AddArticles(Block): + """Heuristically insert English articles.""" + + def process_node(self, node): + if node.upos == "NOUN": + the = node.create_child(form="the", lemma="the", upos="DET", deprel="det") + the.shift_before_subtree(node) diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py new file mode 100644 index 00000000..97677d89 --- /dev/null +++ b/udapi/block/tutorial/addcommas.py @@ -0,0 +1,31 @@ +"""tutorial.AddCommas block template.""" +from udapi.core.block import Block + +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. + +class AddCommas(Block): + """Heuristically insert nodes for missing commas.""" + + def __init__(self, language='en', **kwargs): + super().__init__(**kwargs) + self.language = language + + def process_node(self, node): + # TODO: Your task: implement some heuristics + if self.should_add_comma_before(node): + comma = node.create_child(form=',', deprel='punct', upos='PUNCT') + comma.shift_before_node(node) + + def should_add_comma_before(self, node): + prev_node = node.prev_node + if prev_node is None: + return False + if self.language == 'en' and prev_node.lemma == 'however': + return True + if any(n.deprel == 'appos' for n in prev_node.children): + return True + + return False diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py new file mode 100644 index 00000000..9c4e131b --- /dev/null +++ b/udapi/block/tutorial/adpositions.py @@ -0,0 +1,35 @@ +"""tutorial.Adpositions block template. + +Example usage:: + + for a in */sample.conllu; do + printf '%50s ' $a; + udapy tutorial.Adpositions < $a; + done | tee results.txt + + # What are the English postpositions? 
+ cat UD_English/sample.conllu | udapy -TM util.Mark \ + node='node.upos == "ADP" and node.parent.precedes(node)' | less -R +""" +from udapi.core.block import Block + + +class Adpositions(Block): + """Compute the number of prepositions and postpositions.""" + + def __init__(self, **kwargs): + """Create the Adpositions block object.""" + super().__init__(**kwargs) + self.prepositions = 0 + self.postpositions = 0 + + def process_node(self, node): + # TODO: Your task: distinguish prepositions and postpositions + if node.upos == "ADP": + self.prepositions += 1 + + def process_end(self): + total = self.prepositions + self.postpositions or 1 + prep = 100 * self.prepositions / total + post = 100 * self.postpositions / total + print("prepositions %5.1f%%, postpositions %5.1f%%" % (prep, post)) diff --git a/udapi/block/tutorial/parse.py b/udapi/block/tutorial/parse.py new file mode 100644 index 00000000..db732a12 --- /dev/null +++ b/udapi/block/tutorial/parse.py @@ -0,0 +1,30 @@ +"""tutorial.Parse block template. + +Usage: +udapy read.Conllu zone=gold files=sample.conllu \ + read.Conllu zone=pred files=sample.conllu \ + transform.Flatten zones=pred \ + tutorial.Parse zones=pred \ + eval.Parsing gold_zone=gold \ + util.MarkDiff gold_zone=gold \ + write.TextModeTreesHtml marked_only=1 files=parse-diff.html +""" +# nickname = xy123 +# TODO: make up a unique nickname and edit the previous line +# if you want your results to be listed on the NPFL070 web (under that nickname). +# Delete the line if you don't want to listed on the web. 
+from udapi.core.block import Block + +class Parse(Block): + """Dependency parsing.""" + + def __init__(self, language='en', **kwargs): + super().__init__(**kwargs) + self.language = language + + def process_tree(self, root): + # TODO: Your task: implement a better heuristics than "right chain" + for node in root.descendants: + if node.next_node: + node.parent = node.next_node + node.deprel = 'root' diff --git a/udapi/block/tutorial/removecommas.py b/udapi/block/tutorial/removecommas.py new file mode 100644 index 00000000..a07e2bba --- /dev/null +++ b/udapi/block/tutorial/removecommas.py @@ -0,0 +1,13 @@ +"""tutorial.RemoveCommas helper block.""" +from udapi.core.block import Block + + +class RemoveCommas(Block): + """Delete all comma nodes and edit SpaceAfter and text accordingly.""" + + def process_tree(self, root): + for node in root.descendants: + if node.form == ",": + node.remove(children="rehang") + del node.prev_node.misc['SpaceAfter'] + root.text = root.compute_text() diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py new file mode 100644 index 00000000..e7eb3989 --- /dev/null +++ b/udapi/block/ud/addmwt.py @@ -0,0 +1,113 @@ +"""Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" +from udapi.core.block import Block +import logging + + +class AddMwt(Block): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def process_node(self, node): + analysis = self.multiword_analysis(node) + if analysis is None: + return + orig_attr = {} + for attr in 'form lemma upos xpos deprel'.split(): + orig_attr[attr] = getattr(node, attr) + orig_attr['feats'] = node.feats.copy() + orig_attr['misc'] = node.misc.copy() + # Defaults for the newly created MWT + mwt_misc = node.misc.copy() + mwt_form = node.form + + forms = analysis['form'].split() + main = analysis.get('main', 0) + parent = node if analysis.get('shape', '') == 'subtree' else node.parent + nodes = [] + for form in forms[0:main]: + new_node 
= parent.create_child(form=form) + new_node.shift_before_node(node) + nodes.append(new_node) + node.form = forms[main] + nodes.append(node) + for form in forms[main + 1:]: + new_node = parent.create_child(form=form) + new_node.shift_after_node(nodes[-1]) + nodes.append(new_node) + + if orig_attr['form'].isupper(): + for new_node in nodes: + new_node.form = new_node.form.upper() + elif orig_attr['form'][0].isupper(): + nodes[0].form = nodes[0].form.title() + + node.misc = None + for attr in 'lemma upos xpos feats deprel misc'.split(): + if attr in analysis: + values = analysis[attr].split() + for i, new_node in enumerate(nodes): + if len(values) <= i: + logging.warning("Attribute '%s' not supplied for word no. %d" % (attr, i)) + for attr in 'form lemma upos xpos feats deprel misc'.split(): + logging.warning("%s = %s" % (attr, analysis.get(attr, ''))) + if values[i] == '*': + setattr(new_node, attr, orig_attr[attr]) + # No MISC attribute should be duplicated on the word level and token level, + # so if copying MISC to a new_node, delete mwt_misc. + # However, SpaceAfter should be annotated only on the token level, + # so make sure it is not accidentally copied on the word level. + if attr == 'misc': + orig_attr['misc'].clear() + for a in 'SpaceAfter SpacesAfter SpacesBefore'.split(): + if new_node.misc[a]: + orig_attr['misc'][a] = new_node.misc[a] + del new_node.misc[a] + + elif attr == 'feats' and '*' in values[i]: + new_node.feats = values[i] + for feat_name, feat_value in list(new_node.feats.items()): + if feat_value == '*': + new_node.feats[feat_name] = orig_attr['feats'][feat_name] + else: + setattr(new_node, attr, values[i]) + + # Entity (coreference) annotation should be only on the word level, + # so make sure it does not stay on the token level. + if mwt_misc['Entity']: + nodes[0].misc['Entity'] = mwt_misc['Entity'] + del mwt_misc['Entity'] + + # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT. 
+ if node.multiword_token: + mwt_words = node.multiword_token.words + mwt_form = node.multiword_token.form + if node.multiword_token.misc: + mwt_misc.update(node.multiword_token.misc) + node.multiword_token.remove() + mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes + nodes = mwt_words + + mwt = node.root.create_multiword_token(words=nodes, form=mwt_form, misc=mwt_misc) + self.postprocess_mwt(mwt) + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token. + + An example return value is:: + + { + 'form': 'aby bych', + 'lemma': 'aby být', + 'upos': 'SCONJ AUX', + 'xpos': 'J,------------- Vc-S---1-------', + 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin', # _ means empty FEATS + 'deprel': '* aux', # * means keep the original deprel + 'main': 0, # which of the two words will inherit the original children (if any) + 'shape': 'siblings', # the newly created nodes will be siblings or alternatively + #'shape': 'subtree', # the main-indexed node will be the head + } + """ + raise NotImplementedError('multiword_analysis must be overriden in subclasses') + + def postprocess_mwt(self, mwt): + """Optional postprocessing of newly created MWTs.""" + pass diff --git a/udapi/block/ud/addpuncttype.py b/udapi/block/ud/addpuncttype.py new file mode 100644 index 00000000..f5f20e06 --- /dev/null +++ b/udapi/block/ud/addpuncttype.py @@ -0,0 +1,91 @@ +""" +Some UD treebanks use features PunctType and PunctSide that classify +punctuation symbols. This block can be used to add such features to data where +they are missing – the classification is mostly deterministic. If the input +data already contains such features, their values will be overwritten. +""" +from udapi.core.block import Block + +# TODO We need to know the language, there are many other quotation styles, +# e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”. 
+# Danish uses the French quotes, but switched: »X«. + +PUNCT_TYPES = { + '(': 'Brck', + ')': 'Brck', + '[': 'Brck', + ']': 'Brck', + '{': 'Brck', + '}': 'Brck', + '.': 'Peri', + '...': 'Elip', + '…': 'Elip', + ',': 'Comm', + ';': 'Semi', + ':': 'Colo', + '!': 'Excl', + '¡': 'Excl', # Spanish initial exclamation mark + '?': 'Qest', + '¿': 'Qest', # Spanish initial question mark + '/': 'Colo', # it is used this way in AnCora + '-': 'Dash', + '–': 'Dash', + '—': 'Dash', + '"': 'Quot', + "'": 'Quot', + '`': 'Quot', + '“': 'Quot', # opening English, closing Czech + '”': 'Quot', # closing English + '„': 'Quot', # opening Czech + '‘': 'Quot', # opening English, closing Czech + '’': 'Quot', # closing English + '‚': 'Quot', # opening Czech + '«': 'Quot', # opening French, closing Danish + '»': 'Quot', # closing French, opening Danish + '‹': 'Quot', + '›': 'Quot', + '《': 'Quot', # Korean, Chinese + '》': 'Quot', + '「': 'Quot', # Chinese, Japanese + '」': 'Quot', + '『': 'Quot', + '』': 'Quot' +} + +PUNCT_SIDES = { + '(': 'Ini', + ')': 'Fin', + '[': 'Ini', + ']': 'Fin', + '{': 'Ini', + '}': 'Fin', + '¡': 'Ini', # Spanish initial exclamation mark + '!': 'Fin', # but outside Spanish people may expect empty value + '¿': 'Ini', # Spanish initial question mark + '?': 'Fin', + '《': 'Ini', # Korean, Chinese + '》': 'Fin', + '「': 'Ini', # Chinese, Japanese + '」': 'Fin', + '『': 'Ini', + '』': 'Fin' +} + + +class AddPunctType(Block): + """Add features PunctType and PunctSide where applicable.""" + + def process_node(self, node): + # The two features apply only to PUNCT. If they already occur elsewhere, erase them. 
+ if node.upos != 'PUNCT': + node.feats['PunctType'] = '' + node.feats['PunctSide'] = '' + else: + if node.form in PUNCT_TYPES: + node.feats['PunctType'] = PUNCT_TYPES[node.form] + else: + node.feats['PunctType'] = '' + if node.form in PUNCT_SIDES: + node.feats['PunctSide'] = PUNCT_SIDES[node.form] + else: + node.feats['PunctSide'] = '' diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py new file mode 100644 index 00000000..a4b359ff --- /dev/null +++ b/udapi/block/ud/ar/fixedeprels.py @@ -0,0 +1,699 @@ +"""Block to fix case-enhanced dependency relations in Arabic.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'أَنَّ': [], + 'أَن': [], + 'إِنَّ': [], + 'إِذَا': [], + 'لَو': [], + 'حَيثُ': [], + 'مِثلَ': [], + 'لِأَنَّ': [], + 'كَمَا': [], +# 'فِي_حِينَ': [], + 'فَ': [] + } + + # Reduction and normalization of prepositions and conjunctions, including + # the derived and compound ones. The Latin transliterations are not really + # needed in the process. We include them here as documentation, but also + # to help the poor editor with rendering the lines. Ideally, each line + # should have left-to-right text at both the beginning and end. 
+ substitution = [ + {'target': ('min:gen', 'مِن:gen'), + 'sources': + [('ibtida min', 'اِبتِدَاء_مِن')] + }, + {'target': ('ʾiṯra:gen', 'إِثرَ:gen'), # ʾiṯra = right after + 'sources': + [('ʾiṯra', 'إِثرَ')] + }, + {'target': ('ʾaṯnāʾa:gen', 'أَثنَاءَ:gen'), # ʾaṯnāʾa = during + 'sources': + [('ʾaṯnāʾa', 'أَثنَاءَ')] + }, + {'target': ('ʾiḏ', 'إِذ'), # ʾiḏ = because + 'sources': + [('ʾiḏ', 'إِذ'), + ('ʾiḏ ʾanna', 'إِذ_أَنَّ')] + }, + {'target': ('ʾiḏā', 'إِذَا'), # ʾiḏā = if + 'sources': + [('ʾiḏā', 'إِذَا'), + ('ʾiḏā', 'إِذًا')] + }, + ] + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'اِبتِدَاء_مِن': 'مِن:gen', + 'إِثرَ': 'إِثرَ:gen', # ʾiṯra = right after + 'أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'إِذ': 'إِذ', # ʾiḏ = because + 'إِذ_أَنَّ': 'إِذ', # ʾiḏ ʾanna + 'إِذًا': 'إِذَا', + 'إِذَا': 'إِذَا', # remove morphological case; ʾiḏā = if + 'إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'أَلَّا': 'إِلَّا', + 'إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_إِذَا': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَن': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ': 'إِلَّا', # ʾillā = except, unless + 'إِلَّا_أَنَّ_هُوَ': 'إِلَّا', # ʾillā = except, unless + 'إِلَى': 'إِلَى:gen', # ʾilā = to + 'إِلَى_أَن': 'إِلَى:gen', + 'إِلَى_أَنَّ': 'إِلَى_أَنَّ', # until? that? 
+ 'إِلَى_أَنَّ_لَدَى': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن': 'إِلَى_أَنَّ', + 'إِلَى_أَنَّ_هُوَ_مِن_بَينَ': 'إِلَى_أَنَّ', + 'إِلَى_بَعدَ': 'إِلَى:gen', + 'إِلَى_بَينَ': 'إِلَى_بَينِ:gen', # ʾilā bayni = to between + 'إِلَى_جَانِب': 'إِلَى_جَانِبِ:gen', # ʾilā ǧānibi = beside + 'إِلَى_حَوَالَى': 'إِلَى:gen', # ila hawala = to around X + 'إِلَى_حَوَالَى_مِن': 'إِلَى:gen', # ila hawala min + 'إِلَى_حَيثُ': 'إِلَى:gen', + 'إِلَى_حِينَ': 'فِي_حِينِ', # during + 'إِلَى_خَارِجَ': 'إِلَى_خَارِجِ:gen', # ʾilā ḫāriǧi = out + 'إِلَى_فِي': 'إِلَى:gen', + 'إِلَى_قَبلَ': 'إِلَى_قَبلِ:gen', # ʾilā qabli = until before X (e.g. until one year ago) + 'إِلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'إِلَى_نَحوَ': 'إِلَى:gen', # to about N + 'أَمَّا': 'أَمَامَ:gen', + 'إِمَّا_لِ': 'لِ:gen', + 'أَمَامَ': 'أَمَامَ:gen', # ʾamāma = in front of + 'أَمَامَ_مِن': 'أَمَامَ:gen', + 'أَن': 'أَنَّ', # remove morphological case; ʾanna = that + 'أَنَّ': 'أَنَّ', # remove morphological case; ʾanna = that + 'إِن': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّ': 'إِنَّ', # remove morphological case; ʾinna = that + 'إِنَّمَا': 'إِنَّ', + 'إِيَّا': 'إِلَّا', + 'بِ': 'بِ:gen', # bi = for, with + 'بِ_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'بِ_إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards + 'بِ_اِستِثنَاء': 'بِاِستِثنَاءِ:gen', # biistiṯnāʾi = with exception of + 'بِ_اِسم': 'بِاِسمِ:gen', # biismi = in name of + 'بِ_إِضَافَة_إِلَى': 'بِاَلإِضَافَةِ_إِلَى:gen', # bi-al-ʾiḍāfati ʾilā = in addition to + 'بِ_إِضَافَة_إِلَى_أَنَّ': 'إِلَى_أَنَّ', + 'بِ_إِضَافَة_لِ': 'بِاَلإِضَافَةِ_إِلَى:gen', # in addition to + 'بِ_اِعتِبَار': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_أَنَّ': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِبَار_مِن': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to + 'بِ_اِعتِمَاد_عَلَى': 'بِاَلِاعتِمَادِ_عَلَى:gen', # 
bi-al-i-ʼʿtimādi ʿalā = depending on + 'بِ_إِلَى': 'بِ:gen', + 'بِ_أَنَّ': 'أَنَّ', # that + 'بِ_أَن': 'بِ:gen', + 'بِ_إِنَّ': 'بِ:gen', + 'بِ_أَنَّ_أَمَامَ': 'أَنَّ', # that + 'بِ_أَنَّ_لَا': 'أَنَّ', # that + 'بِ_أَنَّ_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هما_مِن': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ': 'أَنَّ', # that + 'بِ_أَنَّ_هُوَ_عَلَى': 'أَنَّ', # that + 'بِ_اِنطِلَاق': 'بِ:gen', + 'بِ_تَالِي_إِنَّ': 'بِ:gen', + 'بِ_تَعَاوُن_مَعَ': 'بِاَلتَّعَاوُنِ_مَعَ:gen', # bi-at-taʿāwuni maʿa = in cooperation with + 'بِ_تُهمَة': 'بِتُهمَةِ:gen', # bituhmati = on charges of + 'بِ_تَوَازِي_مَعَ': 'بِاَلتَّوَازِي_مَعَ:gen', # bi-at-tawāzī maʿa = in parallel with + 'بِ_ثُمَّ': 'بِ:gen', + 'بِ_جَانِب': 'بِجَانِبِ:gen', # biǧānibi = next to + 'بِ_جِهَة': 'بِ:gen', + 'بِ_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'بِ_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'بِ_حُضُور': 'فِي_حُضُورِ:gen', # together with + 'بِ_حَقّ': 'بِ:gen', + 'بِ_حُكم': 'بِ:gen', + 'بِ_حُلُول': 'بِ:gen', + 'بِ_حَوَالَى': 'بِ:gen', # bi hawala = with around X + 'بِ_حَيثُ': 'بِ:gen', + 'بِ_خُصُوص': 'بِخُصُوصِ:gen', # biḫuṣūṣi = with regard + 'بِ_خِلَاف': 'بِخِلَافِ:gen', # biḫilāfi = in addition to + 'بِ_دَاخِلَ': 'دَاخِلَ:gen', + 'بِ_دَعوَى': 'بِ:gen', + 'بِ_دَور': 'بِ:gen', # bidawri = with role, in turn? + 'بِ_دُون': 'دُونَ:gen', + 'بِ_دُونَ': 'دُونَ:gen', # bi dūni = without + 'بِ_دُونَ_أَن': 'دُونَ:gen', # bi dūni ʾan = without + 'بِ_رِعَايَة': 'بِ:gen', + 'بِ_رَغم': 'رَغمَ:gen', # despite + 'بِ_رَغم_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَن': 'بِ:gen', + 'بِ_رَغم_مِن_أَنَّ': 'رَغمَ:gen', # despite + 'بِ_رَغم_مِن_أَنَّ_هُوَ': 'بِ:gen', + 'بِ_رِفقَة': 'بِرِفقَةٍ:gen', # birifqatin = in company of + 'بِ_رِئَاسَة': 'بِ:gen', + 'بِ_سَبّ': 'بِ:gen', + 'بِ_سَبَب': 'بِسَبَبِ:gen', # bisababi = because of + 'بِ_شَأن': 'بِشَأنِ:gen', # bišaʾni = about, regarding (lit. 
with + matter) + 'بِ_شَرط_أَن': 'بِ:gen', + 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to + 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of + 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as + 'بِ_صُورَة': 'بِ:gen', + 'بِ_عَكس': 'بِ:gen', + 'بِ_عَلَى': 'بِ:gen', + 'بِ_عَن': 'بِ:gen', + 'بِ_عَين': 'بِ:gen', + 'بِ_غَضّ_نَظَر_عَن': 'بِغَضِّ_اَلنَّظَرِ_عَن:gen', # biġaḍḍi an-naẓari ʿan = regardless of + 'بِ_فَضل': 'بِفَضلِ:gen', # bifaḍli = thanks to + 'بِ_فِي': 'بِ:gen', + 'بِ_قَدر': 'بِ:gen', + 'بِ_قُرب_مِن': 'بِاَلقُربِ_مِن:gen', # bi-al-qurbi min = near (with proximity to) + 'بِ_قَصد': 'بِقَصدِ:gen', # biqaṣdi = with intention + 'بِ_كَ': 'بِ:gen', + 'بِ_لِ': 'بِ:gen', + 'بِ_لَا': 'بِ:gen', + 'بِ_مَا_أَنَّ': 'بِ:gen', + 'بِ_مَثَابَة': 'بِ:gen', + 'بِ_مِثلَ': 'مِثلَ', # miṯla = like + 'بِ_مُجَرَّد': 'بِ:gen', + 'بِ_مُسَاعَدَة': 'بِ:gen', + 'بِ_مُشَارَكَة': 'بِمُشَارَكَةِ:gen', # bimušārakati = with participation of + 'بِ_مُقَارَنَة_بِ': 'بِاَلمُقَارَنَةِ_بِ:gen', # bi-al-muqāranati bi = in comparison to + 'بِ_مُقتَضَى': 'بِمُقتَضَى:gen', # bimuqtaḍā = with requirement of + 'بِ_مِقدَار': 'بِ:gen', + 'بِ_مِن': 'بِ:gen', + 'بِ_مُنَاسَبَة': 'بِمُنَاسَبَةِ:gen', # bimunāsabati = on the occasion of + 'بِ_مُوجِب': 'بِمُوجِبِ:gen', # bimūǧibi = with motive + 'بِ_نَتِيجَة': 'بِ:gen', + 'بِ_نَحوَ': 'بِ:gen', # by about N + 'بِ_نِسبَة': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati (bin-nisbati) = in proportion/relation to + 'بِ_نِسبَة_إِلَى': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati ʾilā (bin-nisbati ʾilā) = in proportion/relation to + 'بِ_نِسبَة_لِ': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نِسبَة_لِ_مِن': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to + 'بِ_نَظَر_إِلَى': 'بِ:gen', + 'بِ_نِيَابَة_عَن': 'بِاَلنِّيَابَةِ_عَن:gen', # bi-an-niyābati ʿan = on behalf of + 'بِ_هَدَف': 'بِهَدَفِ:gen', # bihadafi = with goal + 'بِ_وَ_لِ': 'بِ:gen', + 
'بِ_وَاسِطَة': 'بِوَاسِطَةِ:gen', # biwāsiṭati = by means of + 'بِ_وَاقِع': 'بِ:gen', + 'بِ_وَسَط': 'بِوَسَطِ:gen', # biwasaṭi = in the middle of + 'بِ_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'بِ_وَصف': 'بِ:gen', + 'بازاء': 'بِ:gen', + 'بالتسخين': 'بِ:gen', + 'بَدَلًا_مِن': 'بَدَلًا_مِن:gen', # badalan min = instead of + 'بدون': 'دُونَ:gen', # without + 'بشان': 'بِشَأنِ:gen', + 'بَعدَ': 'بَعدَ:gen', # baʿda = after + 'بَعدَ_أَن': 'بَعدَ:gen', # baʿda ʾan = after + clause + 'بَعدَ_حَوَالَى': 'بَعدَ:gen', # baada hawala + 'بَعدَ_نَحوَ': 'بَعدَ:gen', # after about N + 'بَعدَمَا': 'بَعدَ:gen', # baʿdamā = after + 'بُعَيدَ': 'بُعَيدَ:gen', # buʿayda = shortly after + 'بَل': 'قَبلَ:gen', + 'بِنَاء_عَلَى': 'بناء_عَلَى:gen', + 'بناء_عَلَى': 'بناء_عَلَى:gen', # bnāʾ ʿalā = based on + 'بناء_لِ': 'لِ:gen', + 'بَيدَ': 'بِ:gen', + 'بَيدَ_أَنَّ': 'بِ:gen', + 'بَينَ': 'بَينَ:gen', # bayna = between + 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala + 'بينا': 'بَينَ:gen', # bayna = between + 'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between + 'بَينَمَا': 'بَينَ:gen', + 'بَينَمَا_لَم': 'بَينَ:gen', + 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing + 'تَحتَ': 'تَحتَ:gen', # tahta = under + 'ثَمَّ': 'بِ:gen', + 'ثُمَّ': 'بِ:gen', + 'جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'حَتَّى': 'حَتَّى:gen', # ḥattā = until + 'حَتَّى_أَنَّ': 'حَتَّى:gen', # before + 'حَتَّى_إِنَّ': 'حَتَّى:gen', # before + 'حَتَّى_بِ': 'حَتَّى:gen', # before + 'حَتَّى_لَو': 'لَو', # even if + 'حَتَّى_وَ_لَو': 'لَو', # even if + 'حَتَّى_وإن': 'إِنَّ', + 'حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَسَبَمَا': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'حَوَالَى': 'حَوَالَى', # ḥawālā = around, about + 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X + 'حَولَ': 'حَولَ:gen', # ḥawla = about + 'حولما_إِذَا': 'إِذَا', + 'حَولَ_مَا_إِذَا': 'إِذَا', + 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning + 'حَيثُ': 'حَيثُ', # remove morphological case; ḥayṯu = where 
(SCONJ, not ADV) + 'حِينَمَا': 'فِي_حِينِ', # during + 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during + 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind + 'دَاخِل': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دَاخِلَ': + 'دَاخِلَ:gen', # dāḫila = inside of + 'دُونَ': 'دُونَ:gen', # dūna = without + 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without + 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without + 'دونما': 'دُونَ:gen', + 'ذٰلِكَ_بَعدَمَا': 'بَعدَ:gen', + 'ذٰلِكَ_عِندَمَا': 'بِ:gen', + 'ذٰلِكَ_لِأَنَّ': 'لِأَنَّ', # because + 'ذٰلِكَ_لِكَي': 'لِكَي', # li-kay = in order to + 'ذٰلِكَ_نَظَر_لِ': 'بِ:gen', + 'رَغمَ': 'رَغمَ:gen', # raġma = despite + 'رَغمَ_أَنَّ': 'رَغمَ:gen', # raġma ʾanna = despite + clause + 'رَغمَ_أَنَّ_مِن': 'رَغمَ:gen', # raġma ʾanna min = despite + 'رَهنَ': 'رَهنَ:gen', # rahna = depending on + 'رَيثَمَا': 'رَهنَ:gen', # rahna = depending on + 'سِوَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_أَنَّ_هُوَ': 'سِوَى:gen', # siwā = except for + 'سِوَى_بِ': 'سِوَى:gen', # siwā = except for + 'سِوَى_عَلَى': 'سِوَى:gen', # siwā = except for + 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for + 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against + 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among + 'طَالَمَا': + 'طَالَمَا', # ṭālamā = as long as + 'طالَما': + 'طَالَمَا', # ṭālamā = as long as + 'طَالَمَا_أَنَّ': + 'طَالَمَا', # ṭālamā = as long as + 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout + 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during + 'عبر': 'عَبرَ:gen', + 'عَبرَ': 'عَبرَ:gen', # ʿabra = via + 'عَدَا': 'عَدَا:gen', # ʿadā = except for + 'عَقِبَ': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_أَن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَقِبَ_مِن': 'عَقِبَ:gen', # ʿaqiba = following + 'عَلَى': 'عَلَى:gen', # ʿalā = on + 'عَلَى_أبواب': 'عَلَى:gen', + 'عَلَى_إِثرَ': 'إِثرَ:gen', # ʿalā ʾiṯri = right after + 'عَلَى_أَثَر': 'عَلَى:gen', + 'عَلَى_اِختِلَاف': 'عَلَى:gen', + 'عَلَى_أَسَاس': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based 
on + 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on + 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that + 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless + 'عَلَى_الفور': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_إِلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَن_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_عَلَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_مِن_شَأن': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_أَنَّ_هُوَ_لَدَى': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بِ_فِي': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_بَينَ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حَدّ': + 'عَلَى:gen', # ʿalā = on + 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on + 'عَلَى_حَولَ': 'عَلَى:gen', + 'عَلَى_رَأس': 'عَلَى_رَأسِ:gen', # ʿalā raʾsi = on top of + 'عَلَى_رَغم': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغمَ_أَنَّ': 'رَغمَ:gen', # ʿalā raġma ʾanna = despite + clause + 'عَلَى_رَغم_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_رَغم_مِن_أَنَّ_هُوَ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite + 'عَلَى_طَرِيقَة': 'عَلَى_طَرِيقَةِ:gen', # ʿalā ṭarīqati = on the way + 'عَلَى_عَكس': 'عَلَى:gen', + 'عَلَى_غِرَار': 'عَلَى_غِرَارِ:gen', # ʿalā ġirāri = similar to + 'عَلَى_قَيد': 'عَلَى:gen', + 'عَلَى_لِسَان': 'عَلَى:gen', + 'عَلَى_مِثلَ': 'مِثلَ', # miṯla = like + 'عَلَى_مدى': 'عَلَى:gen', + 'عَلَى_مَدَى': 'عَلَى_مَدَى:gen', # ʿalā madā = on period + 'عَلَى_مَقرَبَة_مِن': 'عَلَى_مَقرَبَةٍ_مِن:gen', # ʿalā maqrabatin min = in the vicinity of + 'عَلَى_مِن': 'عَلَى:gen', + 'عَلَى_نَحوَ': 'عَلَى:gen', # to about N + 'عَلَى_يَد': 
'عَلَى:gen', + 'عَن': 'عَن:gen', # ʿan = about, from + 'عَن_أَن': 'عَن:gen', + 'عَن_أَنَّ': 'عَن:gen', + 'عَن_أَنَّ_وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'عَن_بِ': 'عَن:gen', + 'عَن_طَرِيق': 'عَن_طَرِيقِ:gen', # ʿan ṭarīqi = via + 'عَن_فِي_أَن': 'عَن:gen', + 'عَن_قُربَ': 'قُربَ:gen', # qurba = near + 'عَن_مِثلَ': 'مِثلَ', # miṯla = like + 'عَن_مِن': 'عَن:gen', + 'عِندَ': 'عِندَمَا', # ʿinda = when + 'عِندَمَا': 'عِندَمَا', # ʿindamā = when + 'غَيرَ': 'إِلَّا', + 'فَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_إِذَا': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَدَل_مِن_أَن': 'فَ', # fa = so (advcl or coordination) + 'فَ_بَينَ': 'فَ', # fa = so (advcl or coordination) + 'فَ_عَلَى': 'فَ', # fa = so (advcl or coordination) + 'فَ_فِي': 'فَ', # fa = so (advcl or coordination) + 'فَ_مِن': 'فَ', # fa = so (advcl or coordination) + 'فَورَ': 'فَورَ:gen', # fawra = as soon as + 'فَوقَ': 'فَوقَ:gen', # fawqa = above, over + 'فِي': 'فِي:gen', # fī = in + 'فِي_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards + 'فِي_أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during + 'فِي_إِطَار': 'فِي_إِطَار:gen', # fī ʾiṭār = in frame + 'فِي_اعقاب': 'فِي_أَعقَابِ:gen', + 'فِي_إِلَى': 'فِي:gen', + 'فِي_أَن': 'فِي:gen', + 'فِي_أَنَّ': 'فِي:gen', + 'فِي_أَنَّ_عَلَى': 'فِي:gen', + 'فِي_أَنَّ_لَدَى': 'فِي:gen', + 'فِي_أَنَّ_مِن': 'فِي:gen', + 'فِي_بِ': 'فِي:gen', + 'فِي_بِ_فِي': 'فِي:gen', + 'فِي_بَاطِن': 'فِي:gen', + 'فِي_بَعدَ': 'فِي:gen', + 'فِي_بَينَ': 'بَينَ:gen', + 'فِي_حَال': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case + 'فِي_حَدّ': 'فِي:gen', + 'فِي_حُضُور': 'فِي_حُضُورِ:gen', # fī ḥuḍūri = in presence of + 'فِي_حَقّ': 'فِي:gen', + 'فِي_حُكم': 'فِي:gen', + 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X + 'فِي_حِين': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِين_أَنَّ': + 'فِي_حِينِ', # fī ḥīni = while + 'فِي_حِينَ_أَنَّ_هُوَ': + 'فِي_حِينِ', # fī ḥīni = while + 
'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside + 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion + 'فِي_خِلَالَ': 'فِي:gen', + 'فِي_دَاخِل': + 'دَاخِلَ:gen', + 'فِي_دَاخِلَ': 'فِي:gen', + 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to + 'فِي_سِيَاق': 'فِي:gen', + 'فِي_شَأن': 'فِي_شَأنِ:gen', # fī šaʾni = in regard of + 'فِي_شَكل': 'فِي:gen', + 'فِي_صَفّ': 'فِي:gen', + 'فِي_صُورَة': 'فِي:gen', + 'فِي_ضَوء': 'فِي_ضَوءِ:gen', # fī ḍawʾi = in light of + 'فِي_ظِلّ': 'فِي_ظِلِّ:gen', # fī ẓilli = in light of + 'فِي_عُقب': 'فِي_أَعقَابِ:gen', # fī ʾaʿqābi = in the aftermath of + 'فِي_غَضن': 'فِي:gen', + 'فِي_غُضُون': 'فِي:gen', + 'فِي_مَا': 'فِي:gen', + 'فِي_مِثلَ': 'مِثلَ', # miṯla = like + 'فِي_مَجَال': 'فِي_مَجَالِ:gen', # fī maǧāli = in the area of + 'فِي_مستشفى': 'فِي:gen', + 'فِي_مَعَ': 'فِي:gen', + 'فِي_مُقَابِلَ': 'مُقَابِلَ:gen', + 'فِي_مَقدَم': 'فِي:gen', + 'فِي_مِن': 'فِي:gen', + 'فِي_مُنَاسَبَة': 'فِي_مُنَاسَبَةِ:gen', # fī munāsabati = on the occasion of + 'فِي_مُوَاجَهَة': 'فِي:gen', + 'فِي_نَحوَ': 'فِي:gen', # in about N + 'فِي_نِطَاق': 'فِي:gen', + 'فِي_وَجه': 'فِي:gen', + 'فِي_وَسط': 'وَسطَ:gen', + 'فِي_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'فِيمَا': 'فِيمَا', # fīmā = while + 'قُبَالَةَ': 'قُبَالَةَ:gen', # qubālata = in front of, facing + 'قَبلَ': 'قَبلَ:gen', # qabla = before + 'قَبلَ_أَن': 'قَبلَ:gen', # qabla = before + 'قَبلَ_حَوَالَى': 'قَبلَ:gen', # qabla hawala + 'قَبلَ_نَحوَ': 'قَبلَ:gen', # before about N + 'قُبَيلَ': 'قُبَيلَ:gen', # qubayla = before + 'قُربَ': 'قُربَ:gen', # qurba = near + 'قَيدَ': 'فِي:gen', + 'كَ': 'كَ:gen', # ka = in (temporal?) 
+ 'كَ_أَنَّ': 'كَ:gen', + 'كَ_لِ': 'كَ:gen', + 'كَ_وَ_وَ': 'كَ:gen', + 'كَأَنَّمَا': 'كَأَنَّمَا', # ka-ʾannamā = as if + 'كُلَّمَا': 'كُلَّمَا', # kullamā = whenever + 'كَمَا': 'كَمَا', # remove morphological case; kamā = as + 'كَي': 'لِكَي', # kay = in order to + 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 'لِ:gen', + 'لِ': 'لِ:gen', # li = to + 'لِ_أَجَلّ': 'لِ:gen', + 'لِ_إِلَى': 'لِ:gen', + 'لِ_أَمَامَ_وَ': 'لِ:gen', + 'لِ_أَن': 'لِ:gen', + 'لِ_بِ': 'لِ:gen', + 'لِ_جِهَة': 'لِ:gen', + 'لِ_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of + 'لِ_حَوَالَى': 'لِ:gen', # li hawala = for around X + 'لِ_خَارِجَ': 'لِخَارِجِ:gen', # liḫāriǧi = out + 'لِ_دُخُول': 'لِ:gen', + 'لِ_دَرَجَة_أَنَّ': 'لِ:gen', + 'لِ_سَبَب': 'لِ:gen', + 'لِ_صَالِح': 'لِصَالِحِ:gen', # liṣāliḥi = in interest of + 'لِ_عَلَى': 'لِ:gen', + 'لِ_عَن': 'لِ:gen', + 'لِ_عِندَ': 'لِ:gen', + 'لِ_فِي': 'لِ:gen', + 'لِ_فِي_بَينَ': 'لِ:gen', + 'لِ_كَون': 'لِكَونِ', # likawni = because + 'لِ_لِئَلّا': 'لِ:gen', + 'لِ_مِثلَ': 'مِثلَ', # miṯla = like + 'لِ_مَعَ': 'لِ:gen', + 'لِ_مِن': 'لِ:gen', + 'لِ_نَحوَ': 'لِ:gen', # to/for about N + 'لِ_وَ': 'لِ:gen', + 'لِ_وَ_فِي': 'لِ:gen', + 'لَا': 'إِلَّا', + 'لَا_سِيَّمَا_بَعدَ': 'بَعدَ:gen', + 'لَا_سِيَّمَا_وَ_أَنَّ': 'أَنَّ', + 'لَا_سِيَّمَا_وَ_أَنَّ_هُوَ': 'أَنَّ', + 'لِأَنَّ': 'لِأَنَّ', # remove morphological case; li-ʾanna = because + 'لدى': 'لَدَى:gen', + 'لَدَى': 'لَدَى:gen', # ladā = with, by, of, for + 'لِذَا': 'لِذَا', # liḏā = so, therefore + 'لِذَا_فَ': 'لِ:gen', + 'لِذٰلِكَ': 'لِذَا', # liḏā = so, therefore + 'لٰكِنَّ': 'مَعَ:gen', + 'لكن_إِذَا': 'إِذَا', + 'لكن_بِ': 'بِ:gen', + 'لٰكِن_بَعدَ': 'بَعدَ:gen', + 'لكن_دَاخِلَ': 'دَاخِلَ:gen', + 'لكن_لَدَى': 'لَدَى:gen', + 'لٰكِن_مَعَ': 'مَعَ:gen', + 'لِكَي': 'لِكَي', # li-kay = in order to + 'لَمَّا': 'كُلَّمَا', + 'لَمَّا_لِ': 'كُلَّمَا', + 'لَو': 'لَو', # law = if + 'لَو_أَنَّ': 'لَو', # if + 'لَو_مِن': 'لَو', # if + 'ما': 'مِمَّا', + 'مَا': 'مِمَّا', + 'ما_دَام': 'مِمَّا', + 'مادامت': 'مِمَّا', + 
'مَالَم': 'مَالَم', # mālam = unless + 'مَا_إِذَا': 'إِذَا', + 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like + 'مِثلَمَا': 'مِثلَ', # miṯla = like + 'مَعَ': 'مَعَ:gen', # maʿa = with + 'مَعَ_أَنَّ': 'مَعَ:gen', + 'مَعَ_بِ': 'مَعَ:gen', + 'مَعَ_فِي': 'مَعَ:gen', + 'مَعَ_مِن_بَينَ': 'بَينَ:gen', + 'مقابل': 'مُقَابِلَ:gen', + 'مُقَابِلَ': 'مُقَابِلَ:gen', # muqābila = in exchange for, opposite to, corresponding to + 'مُقَابِلَ_حَوَالَى': 'مُقَابِلَ:gen', # muqabila hawala + 'مُقَارَن_بِ': 'بِ:gen', + 'مِمَّا': 'مِمَّا', # mimmā = that, which + 'مِمَّا_لَدَى': 'مِمَّا', # mimmā = that, which + 'مِن': 'مِن:gen', # min = from + 'مِن_اجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of + 'مِن_أَجل_أَن': 'مِن:gen', + 'مِن_إِلَى': 'مِن:gen', + 'مِن_أَن': 'مِن:gen', + 'مِن_أَنَّ': 'مِن:gen', + 'مِن_بِ': 'مِن:gen', + 'مِن_بَعدَ': 'مِن:gen', + 'مِن_بَينَ': 'بَينَ:gen', + 'مِن_تَحتَ': 'مِن:gen', + 'مِن_ثَمَّ': 'مِن:gen', + 'مِن_ثُمَّ': 'مِن:gen', + 'مِن_جَانِب': 'إِلَى_جَانِبِ:gen', # min ǧānibi = beside + 'مِن_جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of + 'مِن_حَوَالَى': 'مِن:gen', # min hawala = from around X + 'مِن_حَولَ': 'مِن:gen', + 'مِن_حَيثُ': 'مِن:gen', + 'مِن_خَارِج': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خَارِجَ': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside + 'مِن_خِلَالَ': 'مِن_خِلَالِ:gen', # min ḫilāli = through, during + 'مِن_دَاخِلَ': 'مِن_دَاخِلِ:gen', # min dāḫili = from inside + 'مِن_دُون': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُونَ': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath + 'مِن_دُون_أَن': 'مِن_دُونِ:gen', + 'مِن_دُونَ_أَن': 'مِن_دُونِ:gen', # min dūni ʾan = without, beneath, underneath + clause + 'مِن_زَاوِيَة': 'مِن:gen', + 'مِن_شَأن': 'مِن_شَأنِ:gen', # min šaʾni = from matter + 'مِن_ضِمنَ': 'مِن_ضِمنِ:gen', # min ḍimni = from within = including + 'مِن_طَرَف': 'مِن:gen', + 'مِن_عَلَى': 'مِن:gen', + 'مِن_عِندَ': 
'مِن:gen', + 'مِن_غَير_أَن': 'مِن:gen', + 'مِن_فَوقَ': 'مِن_فَوقِ:gen', # min fawqi = from above + 'مِن_فِي': 'مِن:gen', + 'مِن_قَبلَ': 'مِن_قِبَلِ:gen', + 'مِن_قِبَل': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_قِبَل_بِ_فِي': 'مِن_قِبَلِ:gen', # min qibali = by + 'مِن_مِثلَ': 'مِثلَ', # miṯla = like + 'مِن_مِن': 'مِن:gen', + 'مِن_مِن_بَينَ': 'بَينَ:gen', + 'مِن_مَوقِع': 'مِن:gen', + 'مِن_نَاحِيَة': 'مِن:gen', + 'مِن_وَرَاءَ': 'مِن_وَرَاءِ:gen', # min warāʾi = from behind + 'مُنذُ': 'مُنذُ:gen', # munḏu = since + 'مُنذُ_أَن': 'مُنذُ:gen', + 'مُنذُ_نَحوَ': 'مُنذُ:gen', # since about N + 'مُنذُ_وَ_فِي': 'مُنذُ:gen', + 'مَهمَا': 'مَهمَا', # mahmā = regardless + 'نَاهِيك_بِ': 'بِ:gen', + 'نَتِيجَة_لِ': 'لِ:gen', + 'نَحوَ': 'نَحوَ', # naḥwa = about, approximately + 'نَحوَ_بِ': 'بِ:gen', # about by N + 'هذا_بالأضافة': 'بِ:gen', + 'وان': 'أَنَّ', + 'وإن': 'إِنَّ', + 'وبشان': 'بِشَأنِ:gen', + 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond + 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle + 'وِفقَ': 'وِفقَ:gen', # wifqa = according to + 'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to + 'ولو': 'إِذَا', # walaw = even if + 'ولو_أَنَّ': 'إِذَا' # walaw = even if + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). 
+ cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + assert(bdeprel[-1] != ':') + edeprel = bdeprel + if cdeprel: + assert(cdeprel[0] != ':') + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + + We cannot use the process_node() method because it ignores empty nodes. + """ + for node in tree.descendants_and_empty: + for edep in node.deps: + if edep['deprel'] == 'advcl:pred:إِذَن' or edep['deprel'] == 'advcl:pred:كدا' or edep['deprel'] == 'advcl:pred:لكن': + edep['deprel'] = 'advcl:pred' + continue + if edep['deprel'] == 'nmod:بِأَسْرِ:gen': + edep['deprel'] = 'nmod' + continue + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) + if m: + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Arabic clauses often start with وَ wa "and", which does not add + # much to the meaning but sometimes gets included in the enhanced + # case label. Remove it if there are more informative subsequent + # morphs. + cdeprel = re.sub(r'^وَ_', r'', cdeprel) + cdeprel = re.sub(r'^وَ:', r'', cdeprel) + cdeprel = re.sub(r'^وَ$', r'', cdeprel) + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. 
+ for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # Split preposition from morphological case (if any), normalize + # the preposition and add the fixed morphological case where + # applicable. + m = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel) + adposition = m.group(1) if m else cdeprel + if adposition in self.unambiguous: + cdeprel = self.unambiguous[adposition] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/basic2enhanced.py b/udapi/block/ud/basic2enhanced.py new file mode 100644 index 00000000..bc5c8b25 --- /dev/null +++ b/udapi/block/ud/basic2enhanced.py @@ -0,0 +1,23 @@ +"""Block ud.Basic2Enhanced for copying basic dependencies to enhanced where missing. + +UD treebanks are not required to have enhanced dependencies (https://universaldependencies.org/u/overview/enhanced-syntax.html). 
+ +However, if such annotation is present (in the DEPS column of CoNLL-U), +it must be present in all nodes and all nodes must be reachable from the root +in the enhanced-deps graph (as checked by the validator). +There may be use cases where enhanced deps are annotated only in some kinds of nodes (e.g. empty nodes) +and the rest of the nodes are expected to be the same as in the basic dependencies. +To make such a file valid, one can use this block. + +This block should not be used on a file with no enhanced dependencies: +It makes no sense to just duplicate the HEAD+DEPREL information also in the DEPS column. +""" +from udapi.core.block import Block + + +class Basic2Enhanced(Block): + """Make sure DEPS column is always filled.""" + + def process_tree(self, tree): + for node in tree.descendants_and_empty: + if node.raw_deps == "_": + node.raw_deps = f"{node.parent.ord}:{node.deprel}" diff --git a/udapi/block/ud/bg/removedotafterabbr.py b/udapi/block/ud/bg/removedotafterabbr.py index d1d94628..a132dad1 100644 --- a/udapi/block/ud/bg/removedotafterabbr.py +++ b/udapi/block/ud/bg/removedotafterabbr.py @@ -7,6 +7,7 @@ """ from udapi.core.block import Block + class RemoveDotAfterAbbr(Block): """Block for deleting extra PUNCT nodes after abbreviations. diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py new file mode 100644 index 00000000..49b79da1 --- /dev/null +++ b/udapi/block/ud/ca/addmwt.py @@ -0,0 +1,194 @@ +"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions. + +According to the UD guidelines, contractions such as "del" = "de el" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. 
+""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. + analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + return None + + def fix_personal_pronoun(self, node): + # There is a mess in lemmas and features of personal pronouns. 
+ if node.upos == 'PRON': + if re.match("^jo$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Nom|Number=Sing|Person=1|PronType=Prs' + if re.match("^(em|m'|-me|'m|me|m)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs' + if re.match("^mi$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs' + if re.match("^tu$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs' + if re.match("^(et|t'|-te|'t|te|t)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs' + if re.match("^ti$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). 
+ if re.match("^ell$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^ella$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(el|-lo|'l|lo)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(la|-la)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs' + if re.match("^(l')$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs' + if re.match("^(ho|-ho)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs' + if re.match("^(li|-li)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Dat|Number=Sing|Person=3|PronType=Prs' + if re.match("^(es|s'|-se|'s|se|s)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes' + if re.match("^si$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes' + # If nosaltres can be used after a preposition, we should not tag it as nominative. + if re.match("^nosaltres$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Plur|Person=1|PronType=Prs' + # Nós is the majestic first person singular. In accusative and dative, it is identical to first person plural. 
+ if re.match("^nós$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Number=Sing|Person=1|Polite=Form|PronType=Prs' + if re.match("^(ens|-nos|'ns|nos|ns)$", node.form, re.IGNORECASE): + node.lemma = 'jo' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs' + if re.match("^vosaltres$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|PronType=Prs' + # Vós is the formal second person singular. In accusative and dative, it is identical to second person plural. + # Vostè is even more formal than vós. In accusative and dative, it is identical to third person singular. + if re.match("^(vós|vostè)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Sing|Person=2|Polite=Form|PronType=Prs' + if re.match("^vostès$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Number=Plur|Person=2|Polite=Form|PronType=Prs' + if re.match("^(us|-vos|-us|vos)$", node.form, re.IGNORECASE): + node.lemma = 'tu' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs' + # Strong forms of third person pronouns can be used as subjects or after preposition. + # Do not mark them as nominative (because of the prepositions). + if re.match("^ells$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Masc|Number=Plur|Person=3|PronType=Prs' + if re.match("^elles$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # Els is masculine accusative, or dative in any gender. + if re.match("^(els|-los|'ls|los|ls)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs' + if re.match("^(les|-les)$", node.form, re.IGNORECASE): + node.lemma = 'ell' + node.feats = 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs' + # There are also "adverbial" pronominal clitics that can occur at direct object positions. 
+ if re.match("^(en|n'|'n|-ne|n|ne)$", node.form, re.IGNORECASE): + node.lemma = 'en' + node.feats = 'Case=Gen|Person=3|PronType=Prs' + if re.match("^(hi|-hi)$", node.form, re.IGNORECASE): + node.lemma = 'hi' + node.feats = 'Case=Loc|Person=3|PronType=Prs' + + def report_suspicious_lemmas(self, node): + # There are offset issues of splitted multi_word_expressions. + # Sometimes a word gets the lemma of the neighboring word. + if node.form.lower()[:1] != node.lemma.lower()[:1]: + # Exclude legitimate cases where the lemma starts with a different letter. + hit = True + if node.lemma == 'jo' and re.match("(em|ens|m'|me|mi|nos|nosaltres|'ns)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'tu' and re.match("(et|'t|us|vosaltres|vostè)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'el' and re.match("(la|l|l'|les)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ell' and re.match("(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'es' and re.match("(s|se)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'em' and re.match("('m|m|m')", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'en' and re.match("('n|n'|ne|-ne)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'anar' and re.match("(va|van|vàrem)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ser' and re.match("(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'estar' and re.match("(sigut)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'caure' and re.match("(queia|queies|quèiem|quèieu|queien)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ampli' and re.match("(àmplia|àmplies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'indi' and re.match("(índies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'obvi' and 
re.match("(òbvia)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ossi' and re.match("(òssies)", node.form, re.IGNORECASE): + hit = False + if node.lemma == 'ús' and re.match("(usos)", node.form, re.IGNORECASE): + hit = False + # Form = '2001/37/CE', lemma = 'CE' + # Form = 'nº5', lemma = '5' + # Form = 'kg.', lemma = 'quilogram' + # Form = 'un', lemma = '1' + if node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE) or re.match("^quil[oò]", node.lemma, re.IGNORECASE) or re.match("^[0-9]+$", node.lemma): + hit = False + if hit: + print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address())) diff --git a/udapi/block/ud/ca/elque.py b/udapi/block/ud/ca/elque.py new file mode 100644 index 00000000..6b3ad22b --- /dev/null +++ b/udapi/block/ud/ca/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que...'). +It is written for Catalan but a similar block should work for Spanish and other +Romance languages. +""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if node.lemma == 'que' and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. 
+ if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. + stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. 
+ attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. + """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py new file mode 100644 index 00000000..b36b2512 --- /dev/null +++ b/udapi/block/ud/complywithtext.py @@ -0,0 +1,342 @@ +r"""Block ComplyWithText for adapting the nodes to comply with the text. + +Implementation design details: +Usually, most of the inconsistencies between tree tokens and the raw text are simple to solve. +However, there may be also rare cases when it is not clear how to align the tokens +(nodes in the tree) with the raw text (stored in ``root.text``). +This block tries to solve the general case using several heuristics. 
+ +It starts with running a LCS-like algorithm (LCS = longest common subsequence) +``difflib.SequenceMatcher`` on the raw text and concatenation of tokens' forms, +i.e. on sequences of characters (as opposed to running LCS on sequences of tokens). + +To prevent mis-alignment problems, we keep the spaces present in the raw text +and we insert spaces into the concatenated forms (``tree_chars``) according to ``SpaceAfter=No``. +An example of a mis-alignment problem: +text "énfase na necesidade" with 4 nodes "énfase en a necesidade" +should be solved by adding multiword token "na" over the nodes "en" and "a". +However, running LCS (or difflib) over the character sequences +"énfaseenanecesidade" +"énfasenanecesidade" +may result in énfase -> énfas. + +Author: Martin Popel +""" +import difflib +import logging +import regex + +from udapi.core.block import Block +from udapi.core.mwt import MWT + + +class ComplyWithText(Block): + """Adapt the nodes to comply with the text.""" + + def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, + previous_form_label='CorrectForm', previous_text_label='OrigText', + added_label='Added', **kwargs): + """Args: + fix_text: After all heuristics are applied, the token forms may still not match the text. + Should we edit the text to match the token forms (as a last resort)? Default=True. + prefer_mwt - What to do if multiple subsequent nodes correspond to a text written + without spaces and non-word characters (punctuation)? + E.g. if "3pm doesn't" is annotated with four nodes "3 pm does n't". + We can use either SpaceAfter=No, or create a multi-word token (MWT). + Note that if there is space or punctuation, SpaceAfter=No will be used always + (e.g. "3 p.m." annotated with three nodes "3 p. m."). + If the character sequence does not match exactly, MWT will be used always + (e.g. "3pm doesn't" annotated with four nodes "3 p.m. 
does not"). + Thus this parameter influences only the "unclear" cases. + Default=True (i.e. prefer multi-word tokens over SpaceAfter=No). + allow_goeswith - If a node corresponds to multiple space-separated strings in text, + which are not allowed as tokens with space, we can either leave this diff + unresolved or create new nodes and join them with the `goeswith` deprel. + Default=True (i.e. add the goeswith nodes if applicable). + max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). + Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. + previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". 
When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". + """ + super().__init__(**kwargs) + self.fix_text = fix_text + self.prefer_mwt = prefer_mwt + self.allow_goeswith = allow_goeswith + self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label + + @staticmethod + def allow_space(form): + """Is space allowed within this token form?""" + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': + node.feats['Typo'] = 'Yes' + + def process_tree(self, root): + text = root.text + if text is None: + raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) + + # Normalize the stored text (e.g. double space or no-break space -> single space) + # and skip sentences which are already ok. + text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text + if text == root.compute_text(): + return + + tree_chars, char_nodes = _nodes_to_chars(root.token_descendants) + + # Align. difflib may not give LCS, but usually it is good enough. 
+ matcher = difflib.SequenceMatcher(None, tree_chars, text, autojunk=False) + diffs = list(matcher.get_opcodes()) + _log_diffs(diffs, tree_chars, text, 'matcher') + + diffs = self.unspace_diffs(diffs, tree_chars, text) + _log_diffs(diffs, tree_chars, text, 'unspace') + + diffs = self.merge_diffs(diffs, char_nodes) + _log_diffs(diffs, tree_chars, text, 'merge') + + # Solve diffs. + self.solve_diffs(diffs, tree_chars, char_nodes, text) + + # Fill SpaceAfter=No. + tmp_text = text + for node in root.token_descendants: + if tmp_text.startswith(node.form): + tmp_text = tmp_text[len(node.form):] + if not tmp_text or tmp_text[0].isspace(): + del node.misc['SpaceAfter'] + tmp_text = tmp_text.lstrip() + else: + node.misc['SpaceAfter'] = 'No' + else: + logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) + break + + # Edit root.text if needed. + if self.fix_text: + computed_text = root.compute_text() + if text != computed_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = computed_text + + def unspace_diffs(self, orig_diffs, tree_chars, text): + diffs = [] + for diff in orig_diffs: + edit, tree_lo, tree_hi, text_lo, text_hi = diff + if edit != 'insert': + if tree_chars[tree_lo] == ' ': + tree_lo += 1 + if tree_chars[tree_hi - 1] == ' ': + tree_hi -= 1 + if text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 + old = tree_chars[tree_lo:tree_hi] + new = text[text_lo:text_hi] + if old == '' and new == '': + continue + elif old == new: + edit = 'equal' + elif old == '': + edit = 'insert' + diffs.append((edit, tree_lo, tree_hi, text_lo, text_hi)) + return diffs + + def merge_diffs(self, orig_diffs, char_nodes): + """Make sure each diff starts on original token boundary. + + If not, merge the diff with the previous diff. + E.g. 
(equal, "5", "5"), (replace, "-6", "–7") + is changed into (replace, "5-6", "5–7") + """ + diffs = [] + for diff in orig_diffs: + edit, tree_lo, tree_hi, text_lo, text_hi = diff + if edit != 'insert' and char_nodes[tree_lo] is not None: + diffs.append(diff) + elif edit == 'equal': + while tree_lo < tree_hi and char_nodes[tree_lo] is None: + tree_lo += 1 + text_lo += 1 + diffs[-1] = ('replace', diffs[-1][1], tree_lo, diffs[-1][3], text_lo) + if tree_lo < tree_hi: + diffs.append(('equal', tree_lo, tree_hi, text_lo, text_hi)) + else: + if not diffs: + diffs = [diff] + elif diffs[-1][0] != 'equal': + diffs[-1] = ('replace', diffs[-1][1], tree_hi, diffs[-1][3], text_hi) + else: + p_tree_hi = diffs[-1][2] - 1 + p_text_hi = diffs[-1][4] - 1 + while char_nodes[p_tree_hi] is None: + p_tree_hi -= 1 + p_text_hi -= 1 + assert p_tree_hi >= diffs[-1][1] + assert p_text_hi >= diffs[-1][3] + diffs[-1] = ('equal', diffs[-1][1], p_tree_hi, diffs[-1][3], p_text_hi) + diffs.append(('replace', p_tree_hi, tree_hi, p_text_hi, text_hi)) + return diffs + + def solve_diffs(self, diffs, tree_chars, char_nodes, text): + for diff in diffs: + edit, tree_lo, tree_hi, text_lo, text_hi = diff + + if edit == 'equal': + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc[self.added_label] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', 
+ _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splitting and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) + + def solve_diff(self, nodes, form): + """Fix a given (minimal) tokens-vs-text inconsistency.""" + nodes_str = ' '.join([n.form for n in nodes]) # just for debugging + node = nodes[0] + + # First, solve the cases when the text contains a space. + if ' ' in form: + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc[self.added_label] = 1 + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + else: + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') + + # Second, solve the cases when multiple nodes match one form (without any spaces). + elif len(nodes) > 1: + # If the match is exact, we can choose between MWT and SpaceAfter solutions. 
+ if not self.prefer_mwt and ''.join([n.form for n in nodes]) == form: + pass # SpaceAfter=No will be added later on. + # If one of the nodes is already a MWT, we cannot have nested MWTs. + # TODO: enlarge the MWT instead of failing. + elif any(isinstance(n, MWT) for n in nodes): + logging.warning('Unable to solve partial-MWT diff:\n%s -> %s', nodes_str, form) + # MWT with too many words are suspicious. + elif len(nodes) > self.max_mwt_length: + logging.warning('Not creating too long (%d>%d) MWT:\n%s -> %s', + len(nodes), self.max_mwt_length, nodes_str, form) + # Otherwise, create a new MWT. + else: + node.root.create_multiword_token(nodes, form) + + # Third, solve the 1-1 cases. + else: + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc[self.added_label] = 1 + else: + self.store_previous_form(node) + node.form = form + + +def _nodes_to_chars(nodes): + chars, char_nodes = [], [] + for node in nodes: + form = node.form + if node.misc['SpaceAfter'] != 'No' and node != nodes[-1]: + form += ' ' + chars.extend(form) + char_nodes.append(node) + char_nodes.extend([None] * (len(form) - 1)) + return ''.join(chars), char_nodes + + +def _log_diffs(diffs, tree_chars, text, msg): + if logging.getLogger().isEnabledFor(logging.DEBUG): + logging.warning('=== After %s:', msg) + for diff in diffs: + logging.warning(_diff2str(diff, tree_chars, text)) + + +def _diff2str(diff, tree, text): + old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' + new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' + return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index 72d08ab8..d76c50b9 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -23,9 +23,10 @@ "csubjpass": 
"csubj:pass", "auxpass": "aux:pass", "name": "flat:name", - "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS + "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS } + class Convert1to2(Block): """Block for converting UD v1 to UD v2.""" @@ -45,7 +46,7 @@ def __init__(self, skip='', save_stats=True, **kwargs): self.skip = {k for k in skip.split(',')} self.save_stats = save_stats - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches """Apply all the changes on the current tree. This method is automatically called on each tree by Udapi. @@ -114,15 +115,21 @@ def change_upos_copula(node): if node.deprel == 'cop' and node.upos not in ("AUX", "PRON"): node.upos = "AUX" - @staticmethod - def change_deprel_simple(node): + def change_deprel_simple(self, node): """mwe→fixed, dobj→obj, *pass→*:pass, name→flat, foreign→flat+Foreign=Yes.""" - if node.deprel == 'foreign': + if node.udeprel == 'foreign': node.feats['Foreign'] = 'Yes' + udeprel, sdeprel = node.udeprel, node.sdeprel try: - node.deprel = DEPREL_CHANGE[node.deprel] + node.deprel = DEPREL_CHANGE[udeprel] except KeyError: - pass + return + if sdeprel: + if ':' in node.deprel: + self.log(node, 'deprel', 'deprel=%s:%s new_deprel=%s but %s is lost' % + (udeprel, sdeprel, node.deprel, sdeprel)) + else: + node.deprel += ':' + sdeprel def change_neg(self, node): """neg→advmod/det/ToDo + Polarity=Neg. @@ -139,7 +146,7 @@ def change_neg(self, node): if 'Neg' not in node.feats['PronType']: node.feats['Polarity'] = 'Neg' - if node.upos in ['ADV', 'PART']: + if node.upos in ['ADV', 'PART', 'AUX']: node.deprel = 'advmod' elif node.upos == 'DET': node.deprel = 'det' @@ -162,24 +169,30 @@ def is_nominal(node): """Returns 'no' (for predicates), 'yes' (sure nominals) or 'maybe'. 
Used in `change_nmod`.""" - if node.upos in ["VERB", "AUX", "ADJ", "ADV"]: + if node.upos in ["VERB", "AUX", "ADV"]: return 'no' + # check whether the node is a predicate + # (either has a nsubj/csubj dependendent or a copula dependent) + has_cop = any("subj" in child.deprel or child.deprel == 'cop' for child in node.children) + # Adjectives are very likely complements of copula verbs. + if node.upos == "ADJ": + return "no" if has_cop else "maybe" # Include NUM for examples such as "one of the guys" # and DET for examples such as "some/all of them" if node.upos in ["NOUN", "PRON", "PROPN", "NUM", "DET"]: - # check whether the node is a predicate - # (either has a nsubj/csubj dependendent or a copula dependent) - if any(["subj" in child.deprel or child.deprel == 'cop' for child in node.children]): - return 'maybe' - return 'yes' + return "maybe" if has_cop else "yes" return 'maybe' def change_nmod(self, node): """nmod→obl if parent is not nominal, but predicate.""" - if node.deprel == 'nmod': + if node.udeprel == 'nmod': parent_is_nominal = self.is_nominal(node.parent) if parent_is_nominal == 'no': - node.deprel = 'obl' + node.udeprel = 'obl' + elif node.deprel == 'nmod:tmod': + node.deprel = 'obl:tmod' + elif node.deprel == 'nmod:poss': + node.deprel = 'nmod:poss' elif parent_is_nominal == 'maybe': self.log(node, 'nmod', 'deprel=nmod, but parent is ambiguous nominal/predicate') @@ -269,13 +282,16 @@ def fix_remnants_in_tree(self, root): Remnant's parent is always the correlate (same-role) node. Usually, correlate's parent is the head of the whole ellipsis subtree, i.e. the first conjunct. However, sometimes remnants are deeper, e.g. - 'Over 300 Iraqis are reported dead and 500 wounded.' with edges: - nsubjpass(reported, Iraqis) - nummod(Iraqis, 300) - remnant(300, 500) + 'Over 300 Iraqis are reported dead and 500 wounded.' 
with edges:: + + nsubjpass(reported, Iraqis) + nummod(Iraqis, 300) + remnant(300, 500) + Let's expect all remnants in one tree are part of the same ellipsis structure. + TODO: theoretically, there may be more ellipsis structures with remnants in one tree, - but I have no idea how to distinguish them from the deeper-remnants cases. + but I have no idea how to distinguish them from the deeper-remnants cases. """ remnants = [n for n in root.descendants if n.deprel == 'remnant'] if not remnants: diff --git a/udapi/block/ud/cs/__init__.py b/udapi/block/ud/cs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py new file mode 100644 index 00000000..a690c95b --- /dev/null +++ b/udapi/block/ud/cs/addmwt.py @@ -0,0 +1,245 @@ +"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" +import udapi.block.ud.addmwt +import re +import logging + +# Define static rules for 'aby', 'kdyby' and similar forms. +MWTS = { + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 
'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. + 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, +} +for v in MWTS.values(): + v['upos'] = 'SCONJ AUX' + number = '-' + if 'Sing' in v['feats']: + number = 'S' + elif 'Plur' in v['feats']: + number = 'P' + person = '-' + if 'Person=1' in v['feats']: + person = '1' + elif 'Person=2' in v['feats']: + person = '2' + v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) + v['deprel'] = '* aux' + v['lemma'] = v['form'].split()[0] + ' být' + v['main'] = 0 + v['shape'] = 'siblings' + +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. +# nač -> na + co +for prep in 'na o za'.split(): + MWTS[prep + 'č'] = { + 'form': prep + ' co', + 'lemma': prep + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } +# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!) 
+# A few additional instances were found in older texts, too (e.g. 16th century). +# We must do it separately, as the preposition is vocalized. +MWTS['seč'] = { + 'form': 'se' + ' co', + 'lemma': 's' + ' co', + 'upos': 'ADP PRON', + 'xpos': 'RV--4---------- PQ--4----------', + 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', +} + +# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below). +MWTS['toliť'] = { + 'form': 'to li ť', + 'lemma': 'ten li ť', + 'upos': 'DET SCONJ PART', + 'xpos': '* J,------------- TT-------------', + 'feats': '* _ _', + 'deprel': '* mark discourse', + 'main': 0, + 'shape': 'siblings' +} + + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) + if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." 
% node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'jest': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jest', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---3P-AAI-2', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # dajžto = dajž + to + if subtokens[1] == 'to': + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ten', + 'upos': '* DET', + 'xpos': '* PDNS4----------', + 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason si that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. 
+ if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + return None + + def postprocess_mwt(self, mwt): + if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB': + mwt.words[1].parent = mwt.words[0].parent.parent diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py new file mode 100644 index 00000000..4e2be633 --- /dev/null +++ b/udapi/block/ud/cs/fixedeprels.py @@ -0,0 +1,685 @@ +"""Block to fix case-enhanced dependency relations in Czech.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. 
+ outermost = { + 'aby': [], + 'ač': [], + 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... ale možná ne když je doprovázeno předložkou + 'ať': [], + 'byť': [], + 'i_když': [], + 'jak': [], + 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole + 'jako': [], + 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], + 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], + 'protože': [], + 'takže': [], + 'třebaže': [], + 'že': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. + unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") + 'abi': 'aby', + 'aby_na': 'na:loc', + 'ačkoliv': 'ačkoli', + 'ať': 'ať', # remove morphological case + 'ať_forma': 'formou:gen', + 'ať_jako': 'jako', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', + 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', + 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', + 'ať_z_strana': 'ze_strany:gen', + 'až_do': 'do:gen', + 'až_o': 'o:acc', + 'během': 'během:gen', + 'bez': 'bez:gen', + 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! 
a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) + 'bez_zřetel_k': 'bez_zřetele_k:dat', + 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', + 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', + 'cesta': 'cestou:gen', + 'co_jako': 'jako', + 'coby': 'coby', # remove morphological case + 'daleko': 'nedaleko:gen', + 'daleko_od': 'od:gen', + 'dík': 'díky:dat', + 'díky': 'díky:dat', + 'dle': 'dle:gen', + 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', + 'do_k': 'k:dat', + 'do_oblast': 'do_oblasti:gen', + 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', + 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', + 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', + 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', + 'jak_aby': 'jak', + 'jak_ad': 'jak', + 'jakkoliv': 'jakkoli', + 'jako': 'jako', # remove morphological case + 'jako_kupříkladu': 'jako', + 'jakoby': 'jako', + 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', + 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', + 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', + 'k': 'k:dat', + 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', + 'kdykoliv': 'kdykoli', + 'kol': 'kolem:gen', + 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', + 'konec': 'koncem:gen', + 'krom': 'kromě:gen', + 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', + 'liž': 'li', + 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', + 'na_báze': 'na_bázi:gen', + 'na_čelo': 'na_čele:gen', + 'na_mimo': 'na:loc', # na kurtě i mimo něj + 'na_než': 'na:acc', # na víc než čtyři a půl kilometru + 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is 
same + 'na_podklad': 'na_podkladě:gen', + 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_strana': 'na_straně:gen', + 'na_účet': 'na_účet:gen', + 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier + 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', + 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', + 'na_základ': 'na_základě:gen', + 'na_základna': 'na_základně:gen', + 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... + 'namísto': 'namísto:gen', + 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', + 'narozdíl_od': 'na_rozdíl_od:gen', + 'následek': 'následkem:gen', + 'navzdory': 'navzdory:dat', + 'nedaleko': 'nedaleko:gen', + 'než': 'než', # remove morphological case + 'nežli': 'nežli', # remove morphological case + 'o_jako': 'jako', + 'o_o': 'o:acc', + 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče + 'ohledně': 'ohledně:gen', + 'okolo': 'okolo:gen', + 'oproti': 'oproti:dat', + 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', + 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', + 'po_vzor': 'po_vzoru:gen', + 'poblíž': 'poblíž:gen', + 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', + 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', + 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', + 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', + 'podle': 'podle:gen', + 'pomoc': 'pomocí:gen', + 'pomocí': 'pomocí:gen', + 'postup': 'postupem:gen', + 'pouze_v': 'v:loc', + 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', + 'prostřednictví': 'prostřednictvím:gen', + 'prostřednictvím': 'prostřednictvím:gen', + 'proti': 'proti:dat', + 'proto_aby': 'aby', + 'protože': 'protože', # remove morphological case + 'před_během': 'během:gen', # před a během utkání + 
'před_po': 'po:loc', # před a po vyloučení Schindlera + 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error + 'přestože': 'přestože', # remove morphological case + 'při': 'při:loc', + 'při_pro': 'při:loc', + 'při_příležitost': 'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X + 's_ohled_k': 's_ohledem_k:dat', + 's_ohled_na': 's_ohledem_na:acc', + 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', + 's_přihlédnutí_k': 's_přihlédnutím_k:dat', + 's_přihlédnutí_na': 's_přihlédnutím_na:acc', + 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', + 's_vyloučení': 's_vyloučením:gen', + 's_zřetel_k': 'se_zřetelem_k:dat', + 's_zřetel_na': 'se_zřetelem_na:acc', + 'severně_od': 'od:gen', + 'skrz': 'skrz:acc', + 'směr_do': 'směrem_do:gen', + 'směr_k': 'směrem_k:dat', + 'směr_na': 'směrem_na:acc', + 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', + 'společně_s': 'společně_s:ins', + 'spolu': 'spolu_s:ins', + 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', + 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', + 'takže': 'takže', # remove morphological case + 'takže_a': 'takže', + 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', + 'u': 'u:gen', + 'u_příležitost': 'u_příležitosti:gen', + 'uprostřed': 'uprostřed:gen', + 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) + 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', + 'v_čelo': 'v_čele:gen', + 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', + 'v_dohoda_s': 'v_dohodě_s:ins', + 'v_duch': 'v_duchu:gen', + 'v_důsledek': 'v_důsledku:gen', + 'v_forma': 've_formě:gen', + 'v_jméno': 've_jménu:gen', + 'v_k': 'k:dat', + 'v_kombinace_s': 'v_kombinaci_s:ins', + 'v_konfrontace_s': 'v_konfrontaci_s:ins', + 'v_kontext_s': 'v_kontextu_s:ins', + 'v_na': 
'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', + 'v_oblast': 'v_oblasti:gen', + 'v_oblast_s': 's:ins', + 'v_obor': 'v_oboru:gen', + 'v_otázka': 'v_otázce:gen', + 'v_podoba': 'v_podobě:gen', + 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', + 'v_proces': 'v_procesu:gen', + 'v_prospěch': 've_prospěch:gen', + 'v_protiklad_k': 'v_protikladu_k:dat', + 'v_průběh': 'v_průběhu:gen', + 'v_případ': 'v_případě:gen', + 'v_případ_že': 'v_případě_že', + 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', + 'v_rozpor_s': 'v_rozporu_s:ins', + 'v_řada': 'v_řadě:gen', + 'v_shoda_s': 've_shodě_s:ins', + 'v_služba': 've_službách:gen', + 'v_směr': 've_směru:gen', + 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc + 'v_smysl': 've_smyslu:gen', + 'v_součinnost_s': 'v_součinnosti_s:ins', + 'v_souhlas_s': 'v_souhlasu_s:ins', + 'v_soulad_s': 'v_souladu_s:ins', + 'v_souvislost_s': 'v_souvislosti_s:ins', + 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', + 'v_spojený_s': 've_spojení_s:ins', + 'v_spojitost_s': 've_spojitosti_s:ins', + 'v_spolupráce_s': 've_spolupráci_s:ins', + 'v_s_spolupráce': 've_spolupráci_s:ins', + 'v_srovnání_s': 've_srovnání_s:ins', + 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', + 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', + 'v_věc': 've_věci:gen', + 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', + 'v_zájem': 'v_zájmu:gen', + 'v_záležitost': 'v_záležitosti:gen', + 'v_závěr': 'v_závěru:gen', + 'v_závislost_na': 'v_závislosti_na:loc', + 'v_závislost_s': 'v_závislosti_s:ins', + 'v_znamení': 've_znamení:gen', + 'včetně': 'včetně:gen', + 'vedle': 'vedle:gen', + 'versus': 'versus:nom', + 'vina': 'vinou:gen', + 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', + 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', + 'vzhledem': 'vzhledem_k:dat', + 'vzhledem_k': 'vzhledem_k:dat', + 'z': 
'z:gen', + 'z_důvod': 'z_důvodu:gen', + 'z_hledisko': 'z_hlediska:gen', + 'z_oblast': 'z_oblasti:gen', + 'z_řada': 'z_řad:gen', + 'z_strana': 'ze_strany:gen', + 'z_nedostatek': 'z_nedostatku:gen', + 'z_titul': 'z_titulu:gen', + 'z_začátek': 'ze_začátku:gen', + 'za_pomoc': 'za_pomoci:gen', + 'za_účast': 'za_účasti:gen', + 'za_účel': 'za_účelem:gen', + 'začátek': 'začátkem:gen', + 'zásluha': 'zásluhou:gen', + 'zatím_co': 'zatímco', + 'závěr': 'závěrem:gen', + 'závisle_na': 'nezávisle_na:loc', + 'že': 'že', # remove morphological case + 'že_ať': 'ať', + 'že_jako': 'že', + 'že_jakoby': 'že', + 'že_za': 'za:gen' + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + @staticmethod + def compose_edeprel(bdeprel, cdeprel): + """ + Composes enhanced deprel from the basic part and optional case + enhancement. + + Parameters + ---------- + bdeprel : str + Basic deprel (can include subtype, e.g., 'acl:relcl'). + cdeprel : TYPE + Case enhancement (can be composed of adposition and morphological + case, e.g., 'k:dat'). It is optional and it can be None or empty + string if there is no case enhancement. + + Returns + ------- + Full enhanced deprel (str). + """ + edeprel = bdeprel + if cdeprel: + edeprel += ':'+cdeprel + return edeprel + + def process_tree(self, tree): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. 
+ + We cannot use the process_node() method because it ignores empty nodes. + """ + for node in tree.descendants_and_empty: + for edep in node.deps: + m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel']) + if m: + bdeprel = m.group(1) + cdeprel = m.group(2) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + if re.match(r'advcl', bdeprel): + # The following advcl should in fact be obl. + if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'do:gen' + elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'k:dat' + elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych' + bdeprel = 'obl' + cdeprel = 'místo:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + bdeprel = 'obl' + cdeprel = 'od:gen' + elif re.fullmatch(r'podle(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'podle:gen' + elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu. + bdeprel = 'obl' + cdeprel = 's:ins' + elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel): + bdeprel = 'obl' + cdeprel = 'v_duchu:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'obl' + cdeprel = 'v:loc' + # byl by pro, abychom... ###!!! Opravit i konverzi stromu. + elif re.fullmatch(r'pro(?::acc)?', cdeprel): + cdeprel = 'aby' + elif re.match(r'acl', bdeprel): + # The following acl should in fact be nmod. 
+ if re.fullmatch(r'k(?::dat)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'k:dat' + elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"' + bdeprel = 'nmod' + cdeprel = 'na_způsob:gen' + elif re.fullmatch(r'od(?::gen)?', cdeprel): + bdeprel = 'nmod' + cdeprel = 'od:gen' + elif re.fullmatch(r'v', cdeprel): + bdeprel = 'nmod' + cdeprel = 'v:loc' + else: # bdeprel is 'obl' or 'nmod' + # The following subordinators should be removed if they occur with nominals. + if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + cdeprel = '' + # Removing 'až' must be done early. The remainder may be 'počátek' + # and we will want to convert it to 'počátkem:gen'. + elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel): + cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel) + elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září + cdeprel = 'gen' + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.fullmatch(x+r'([_:].+)?', cdeprel) + if m and m.group(1) and not x+m.group(1) in exceptions: + cdeprel = x + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. 
+ m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel) + if m: + cdeprel = self.unambiguous[x] + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if re.match(r'(obl|nmod)', bdeprel): + m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(1)) + if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase): + cdeprel = adpcase + edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel) + continue + ###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well. + if re.match(r'^(acl|advcl):', edep['deprel']): + # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations). 
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel']) + edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating + edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel']) + if edep['deprel'] == 'acl:v' and node.form == 'patře': + edep['deprel'] = 'nmod:v:loc' + node.deprel = 'nmod' + node.lemma = 'patro' + node.upos = 'NOUN' + node.xpos = 'NNNS6-----A----' + node.feats['Aspect'] = '' + node.feats['Gender'] = 'Neut' + node.feats['Tense'] = '' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. + edep['deprel'] = 'nmod' + elif edep['deprel'] == 'obl:loc': + # Annotation error. The first occurrence in PDT dev: + # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...' + # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'. + # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'. 
+ edep['deprel'] = 'obl:v:loc' + elif edep['deprel'] == 'obl:arg:loc': + # Annotation error. The first occurrence in PDT dev: + edep['deprel'] = 'obl:arg:na:loc' + elif edep['deprel'] == 'nmod:loc': + # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa. + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc': + # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object? + # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now. + edep['deprel'] = 'obl' + elif edep['deprel'] == 'nmod:voc': + # 'v 8. čísle tiskoviny Ty rudá krávo' + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:co:nom': + # Annotation error: 'kompatibilní znamená tolik co slučitelný' + # 'co' should be relative pronoun rather than subordinating conjunction. + edep['deprel'] = 'acl:relcl' + node.deprel = 'acl:relcl' + elif re.match(r'^(obl(:arg)?):li$', edep['deprel']): + edep['deprel'] = 'advcl:li' + elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']): + edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']): + edep['deprel'] += ':acc' + elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']): + edep['deprel'] += ':gen' + elif re.match(r'^obl:místo_za:acc$', edep['deprel']): + # 'chytají krávu místo za rohy spíše za ocas' + # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution. 
+ for c in node.children: + if c.form == 'místo': + c.upos = 'ADV' + c.deprel = 'cc' + edep['deprel'] = 'obl:za:acc' + elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']): + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel']) + elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']): + edep['deprel'] = re.sub(r':gen$', '', edep['deprel']) + # The case is unknown. We need 'acc' or 'loc'. + # The locative is probably more frequent but it is not so likely with every noun. + # If there is an nummod:gov child, it must be accusative and not locative. + # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma): + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:arg:na_konec$', edep['deprel']): + # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku' + edep['deprel'] = 'obl:arg:na:acc' + elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']): + # Annotation error. 
+ if node.form == 's': + ohled = node.next_node + na = ohled.next_node + noun = na.next_node + self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc') + self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(na, node, 'fixed', 'fixed') + self.set_basic_and_enhanced(node, noun, 'case', 'case') + elif re.match(r'^nmod:pára:nom$', edep['deprel']): + # Annotation error: 'par excellence'. + edep['deprel'] = 'nmod' + for c in node.children: + if c.udeprel == 'case' and c.form.lower() == 'par': + c.lemma = 'par' + c.upos = 'ADP' + c.xpos = 'RR--X----------' + c.feats['Case'] = '' + c.feats['Gender'] = '' + c.feats['Number'] = '' + c.feats['Polarity'] = '' + c.feats['AdpType'] = 'Prep' + elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']): + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']): + if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']): + # Accusative would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']): + # Genitive would be possible but unlikely. + edep['deprel'] += ':ins' + elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci': + # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition. + # Find the content nominal. 
+ cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)] + vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v'] + if len(cnouns) > 0 and len(vs) > 0: + cnoun = cnouns[0] + v = vs[0] + self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins') + self.set_basic_and_enhanced(v, cnoun, 'case', 'case') + self.set_basic_and_enhanced(node, v, 'fixed', 'fixed') + elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']): + # ':nom' occurs in 'karneval v Rio de Janeiro' + edep['deprel'] = re.sub(r':nom$', '', edep['deprel']) + if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0: + edep['deprel'] += ':acc' + else: + edep['deprel'] += ':loc' + elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']): + # There is just one occurrence and it is an error: + # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...' + # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'. + edep['deprel'] = 'obl:s:ins' + elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']): + # Instrumental would be possible but unlikely. 
+ edep['deprel'] += ':acc' + else: + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly' + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel']) + edep['deprel'] = 
re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate! + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel']) + edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel']) + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. 
+ ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py new file mode 100644 index 00000000..7fcb0e12 --- /dev/null +++ b/udapi/block/ud/cs/fixmorpho.py @@ -0,0 +1,471 @@ +""" +A Czech-specific block to fix lemmas, UPOS and morphological features in UD. +It should increase consistency across the Czech treebanks. It focuses on +individual closed-class verbs (such as the auxiliary "být") or on entire classes +of words (e.g. whether or not nouns should have the Polarity feature). It was +created as part of the Hičkok project (while importing nineteenth-century Czech +data) but it should be applicable on any other Czech treebank. +""" +from udapi.core.block import Block +import logging +import re + +class FixMorpho(Block): + + def process_node(self, node): + # Do not touch words marked as Foreign or Typo. They may not behave the + # way we expect in Czech data. + if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes': + return + #---------------------------------------------------------------------- + # NOUNS, PROPER NOUNS, AND ADJECTIVES + #---------------------------------------------------------------------- + # Nouns do not have polarity but the Prague-style tagsets may mark it. + if node.upos in ['NOUN', 'PROPN']: + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + elif node.feats['Polarity'] == 'Neg': + logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).') + # For some nouns, there is disagreement in whether to tag and lemmatize + # them as proper nouns. We must be careful and not add too many to this + # rule, as many of them could be used as surnames and then they should + # be PROPN. 
+ if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()): + node.lemma = node.lemma.lower() + node.upos = 'NOUN' + # Lemmatization. + if node.upos == 'NOUN' and node.lemma == 'zem': + node.lemma = 'země' + if node.upos == 'ADJ': + # Adjectives should be lemmatized to lowercase even if they are part of + # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized + # to "malý". Exception: Possessive adjectives derived from personal + # names, e.g., "Karlův". + if node.feats['Poss'] != 'Yes': + node.lemma = node.lemma.lower() + # Short forms of adjectives are rare in Modern Czech and uninflected + # (they are used as predicates), so they lack the Case feature. But + # they were inflected for Case in the past, so it is better to add + # Case=Nom for consistency. + if node.feats['Variant'] == 'Short' and node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + #---------------------------------------------------------------------- + # PRONOUNS AND DETERMINERS + #---------------------------------------------------------------------- + # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form. + if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()): + node.feats['Variant'] = 'Short' + # Forms of "my" should be lemmatized as "já". + if node.upos == 'PRON' and node.lemma == 'my': + node.lemma = 'já' + # Forms of "vy" should be lemmatized as "ty". + if node.upos == 'PRON' and node.lemma == 'vy': + node.lemma = 'ty' + # Forms of "oni" should be lemmatized as "on" and cases that allow + # a preposition should have PrepCase. 
+ if node.upos == 'PRON' and node.lemma in ['on', 'oni']: + node.lemma = 'on' + if node.feats['Case'] not in ['Nom', 'Voc']: + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + elif re.match(r'[nň]', node.form.lower()): + node.feats['PrepCase'] = 'Pre' + # In 19th century data, the grammaticalized usages of "se", "si" are + # tagged as PART (rather than a reflexive PRON, which is the standard). + # Even if it already was tagged PRON, some features may have to be added. + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: + node.lemma = 'se' + node.upos = 'PRON' + node.feats['PronType'] = 'Prs' + node.feats['Reflex'] = 'Yes' + if node.form.lower() == 'se': + # Occasionally "se" can be genitive: "z prudkého do se dorážení". + if not node.feats['Case'] == 'Gen': + node.feats['Case'] = 'Acc' + else: + node.feats['Case'] = 'Dat' + node.feats['Variant'] = 'Short' + # As the genitive/accusative form of "on", "jeho" should have PrepCase. + if node.upos == 'PRON' and node.form.lower() == 'jeho': + node.feats['PrepCase'] = 'Npr' + # Possessive pronouns have Person, Gender[psor] and Number[psor]. + # Although it is questionable, plural possessors are lemmatized to singular + # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj". + # Some source corpora lack Person and [psor] features, others do not respect + # the lemmatization rule, so in the end we have to look at the forms; but + # there are potentially many variants, especially in old texts. 
+ if node.upos == 'DET' and node.feats['Poss'] == 'Yes': + if node.form.lower().startswith('m'): + # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('t'): + # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Sing' + elif node.form.lower().startswith('n'): + # náš našeho našemu našem naším naše naší naši našich našim našimi + node.lemma = 'můj' + node.feats['Person'] = '1' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower().startswith('v'): + # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi + node.lemma = 'tvůj' + node.feats['Person'] = '2' + node.feats['Number[psor]'] = 'Plur' + elif node.form.lower() == 'jeho': + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'jehož', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']): + node.feats['Gender[psor]'] = 'Masc,Neut' + elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jejíž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Sing' + node.feats['Gender[psor]'] = 'Fem' + elif re.fullmatch(r'jich|jejich', node.form.lower()): + node.lemma = 'jeho' + node.feats['Person'] = '3' + node.feats['Number[psor]'] = 'Plur' + elif re.fullmatch(r'jichž|jejichž', node.form.lower()): + node.lemma = 'jehož' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif 
re.fullmatch(r'jichžto|jejichžto', node.form.lower()): + node.lemma = 'jehožto' + node.feats['PronType'] = 'Rel' + node.feats['Number[psor]'] = 'Plur' + elif node.lemma == 'čí': + node.feats['Poss'] = 'Yes' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Reflexive possessive pronoun should not forget the Reflex=Yes feature. + if node.upos == 'DET' and node.lemma == 'svůj': + node.feats['Reflex'] = 'Yes' + # Demonstrative, interrogative, relative, negative, total and indefinite + # pronouns (or determiners, because some of them get the DET tag). + if node.upos in ['PRON', 'DET']: + # Relative pronoun "jenž" should be PRON, not DET + # (it inflects for Gender but it can never be used as congruent attribute). + if re.fullmatch(r'(jenž|jenžto)', node.lemma): + node.upos = 'PRON' + if node.form.lower().startswith('j'): + node.feats['PrepCase'] = 'Npr' + else: + node.feats['PrepCase'] = 'Pre' + # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above) + # (it inflects for Gender but it can never be used as congruent attribute). + elif node.form.lower() in ['ješto', 'ježto']: + node.lemma = 'jenžto' + node.upos = 'PRON' + node.feats['PrepCase'] = 'Npr' + # Relative pronoun "an" is PRON (not DET). + elif node.lemma == 'an': + node.upos = 'PRON' + node.feats['PronType'] = 'Rel' + # Pronoun "kdo" is PRON (not DET). + elif node.lemma == 'kdo': + node.lemma = 'kdo' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "kdož" is PRON (not DET). 
+ elif node.lemma == 'kdož': + node.lemma = 'kdož' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "někdo", "kdosi" is PRON (not DET). + elif re.fullmatch(r'(kdosi|někdo)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "nikdo" is PRON (not DET). + elif node.lemma == 'nikdo': + node.lemma = 'nikdo' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc. + # However, we do not annotate Number ("kdo" can be the subject of a plural verb). + node.feats['Gender'] = 'Masc' + node.feats['Animacy'] = 'Anim' + node.feats['Number'] = '' + # Pronoun "co" is PRON (not DET). + elif node.lemma == 'co': + node.lemma = 'co' + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "což" is PRON (not DET). + elif node.lemma in ['což', 'cože']: + node.upos = 'PRON' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Rel' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "něco" is PRON (not DET). 
+ elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma): + node.upos = 'PRON' + node.feats['PronType'] = 'Ind' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "nic" is PRON (not DET). + elif node.lemma == 'nic': + node.lemma = 'nic' + node.upos = 'PRON' + node.feats['PronType'] = 'Neg' + # We do not annotate Gender and Number, although it could be argued + # to be Gender=Neut|Number=Sing. + node.feats['Gender'] = '' + node.feats['Animacy'] = '' + node.feats['Number'] = '' + # Pronoun "týž" is DET and PronType=Dem. + elif re.fullmatch(r'(tentýž|týž)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + # Pronoun "každý" is DET and PronType=Tot. + elif node.lemma == 'každý': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot. + elif node.form.lower() == 'vše': + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif node.lemma == 'všechen': + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()): + node.lemma = 'všechen' + node.upos = 'DET' + node.feats['PronType'] = 'Tot' + # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp. + elif node.lemma in ['sám', 'samý']: + node.lemma = 'samý' + node.upos = 'DET' + node.feats['PronType'] = 'Emp' + node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else '' + #---------------------------------------------------------------------- + # PRONOMINAL NUMERALS AND ADVERBS + #---------------------------------------------------------------------- + # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot. 
+ if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba': + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['NumForm'] = 'Word' + node.feats['PronType'] = 'Tot' + # Pronominal cardinal numerals should be DET, not NUM. + if node.upos == 'NUM': + if re.fullmatch(r'(mnoho|málo|několik)', node.lemma): + node.upos = 'DET' + node.feats['PronType'] = 'Ind' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho? + elif re.fullmatch(r'(toliko?)', node.lemma): + node.lemma = 'tolik' + node.upos = 'DET' + node.feats['PronType'] = 'Dem' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kolik)', node.lemma): + node.upos = 'DET' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['NumForm'] = '' + node.feats['Polarity'] = '' + if node.upos in ['ADV', 'NUM']: + if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Ind' + elif re.fullmatch(r'(tolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + node.feats['PronType'] = 'Dem' + elif re.fullmatch(r'(kolikrát)', node.lemma): + node.upos = 'ADV' + node.feats['NumType'] = 'Mult' + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + # Pronominal adverbs have PronType but most of them do not have Degree + # and Polarity. 
+ if node.upos == 'ADV': + if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma): + node.feats['PronType'] = 'Dem' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma): + if node.feats['PronType'] == '': + node.feats['PronType'] = 'Int,Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(kdežto)', node.lemma): + node.feats['PronType'] = 'Rel' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma): + node.feats['PronType'] = 'Ind' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma): + node.feats['PronType'] = 'Neg' + node.feats['Degree'] = '' + node.feats['Polarity'] = '' + # Total pronominals can be negated ("nevždy"). Then they get Degree, too. + elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma): + node.feats['PronType'] = 'Tot' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # VERBS AND AUXILIARIES + #---------------------------------------------------------------------- + # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless + # of the fact that it can participate in purely existential constructions + # where it no longer acts as a copula. Czech tagsets typically do not + # distinguish AUX from VERB, which means that converted data may have to + # be fixed. 
+ if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']: + node.upos = 'AUX' + if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()): + node.upos = 'ADV' + node.lemma = 'lze' # not 'nelze' + node.feats['VerbForm'] = '' + node.feats['Voice'] = '' + node.feats['Aspect'] = '' + node.feats['Mood'] = '' + node.feats['Tense'] = '' + node.feats['Person'] = '' + node.feats['Number'] = '' + node.feats['Degree'] = 'Pos' + if node.upos in ['VERB', 'AUX']: + # Most non-passive verb forms have Voice=Act, and infinitives should + # have it, too. Passive infinitives are always periphrastic. + # (This is not done in the PDT tagset, but we should add it.) + if node.feats['VerbForm'] == 'Inf': + node.feats['Voice'] = 'Act' + # Same for imperatives. + elif node.feats['Mood'] == 'Imp': + node.feats['Voice'] = 'Act' + # Some verbs lack the Aspect feature although they are not biaspectual. + if node.feats['Aspect'] == '': + if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma): + node.feats['Aspect'] = 'Perf' + # We must look at word form to distinguish imperfective "stát" from perfective "stát se". + elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Imp' + elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()): + node.feats['Aspect'] = 'Perf' + # Present forms of perfective verbs normally have Tense=Pres despite + # meaning future. 
However, a few imperfective verbs have a separate + # future form (distinct from present form), which gets Tense=Fut + # despite inflecting similarly to present forms. + if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()): + node.feats['Tense'] = 'Fut' + # Passive participles (including the short forms) should be ADJ, not VERB. + # But they keep the verbal features of VerbForm, Voice, Aspect. + if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + # But now we need an adjectival lemma. + ###!!! Bohužel to občas zahodí normalizaci, kterou tam Martinův tým zavedl ručně, např. "rozhřita" mělo lemma "rozehřát", ale já teď místo "rozehřátý" vyrobím "rozhřitý". + ###!!! odepříno - odepříný místo odepřený + ###!!! dovolíno - dovolíný místo dovolený + ###!!! vyslyšána - vyslyšaný místo vyslyšený + ###!!! obmezený místo omezený, oslyšaný místo oslyšený + node.misc['LDeriv'] = node.lemma + node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower()) + node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán + if node.feats['Polarity'] == 'Neg': + node.lemma = re.sub(r'^ne', '', node.lemma) + if node.feats['Case'] == '': + node.feats['Case'] = 'Nom' + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + node.feats['Variant'] = 'Short' + #---------------------------------------------------------------------- + # ADVERBS + #---------------------------------------------------------------------- + # Words that indicate the speaker's attitude are tagged ADV in UD, + # although the Czech tagsets often treat them as particles. 
+ if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma): + node.upos = 'ADV' + node.feats['Degree'] = 'Pos' + node.feats['Polarity'] = 'Pos' + node.misc['CzechParticle'] = 'Yes' + # Adverb "brzo" should be lemmatized as "brzy". + if node.upos == 'ADV' and node.form.lower() == 'brzo': + node.lemma = 'brzy' + if node.upos == 'ADV' and node.form.lower() == 'teprv': + node.lemma = 'teprve' + # All non-pronominal adverbs (and also some pronominal ones) should + # have Degree and Polarity. At least for now we also exclude adverbial + # numerals, e.g. "jednou" – "nejednou". + if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '': + if node.feats['Degree'] == '': + node.feats['Degree'] = 'Pos' + if node.feats['Polarity'] == '': + node.feats['Polarity'] = 'Pos' + #---------------------------------------------------------------------- + # PREPOSITIONS + #---------------------------------------------------------------------- + # Preposition "u" may combine with Case=Loc|Acc in old texts, and then + # it functions as a vocalized counterpart of "v". Nevertheless, we always + # lemmatize it as "u" and thus AdpType is Prep, not Voc. + if node.upos == 'ADP' and node.form.lower() == 'u': + node.lemma = 'u' + node.feats['AdpType'] = 'Prep' + #---------------------------------------------------------------------- + # CONJUNCTIONS + #---------------------------------------------------------------------- + # As a conjunction (and not particle/adverb), "ani" is coordinating and + # not subordinating. 
+ if node.upos == 'SCONJ' and node.lemma == 'ani': + node.upos = 'CCONJ' + if node.upos == 'CCONJ' and node.lemma == 'nebť': + node.lemma = 'neboť' + #---------------------------------------------------------------------- + # PARTICLES (other than those already grabbed above) + #---------------------------------------------------------------------- + # "jako" should be SCONJ but 19th century data have it as PART. + if node.upos == 'PART': + if node.lemma == 'jako': + node.upos = 'SCONJ' + elif node.lemma == 'ti': + node.lemma = 'ť' diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py new file mode 100644 index 00000000..da9f5bda --- /dev/null +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -0,0 +1,979 @@ +""" +Block to identify missing or ill-valued features in Czech. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + # The convention used in PDT is not consistent. Adjectives are fully disambiguated + # (three genders, two animacies, three numbers, seven cases), even though some + # forms are shared among many feature combinations. On the other hand, pronouns + # and determiners omit some features in the context of certain values of other + # features (e.g., gender and animacy are not distinguished in plural if the case + # is genitive, dative, locative or instrumental). + # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like + # adjectives. + # Here we can trigger one of the two conventions. 
It should become a block parameter + # in the future. + pdt20 = False # True = like in PDT 2.0; False = like in ČNK + + def process_node(self, node): + # Czech constraints should not be applied to foreign words. + if node.feats['Foreign'] == 'Yes': + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['VerbForm'] == 'Vnoun': + # verbal nouns: bytí, dělání, ... + self.check_allowed_features(node, { + 'VerbForm': ['Vnoun'], + 'Gender': ['Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] + }) + elif node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 
'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 
'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord', 'Mult'], + 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho') + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + if node.feats['Gender'] == 'Masc': + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: # regular adjectives, including short forms + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, 
ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). + self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Variant': ['Short'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + # No gender in dual and plural: + # Plur ... jich, jim, je, nich, jimi + # Here we require PrepCase but disallow Variant. + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as subject + # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni"). + # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody) + # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only). 
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kdo to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc'], + 'Animacy': ['Anim'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, co to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. + # Update from the Hičkok project: 'ješto' is lemmatized to + # 'jenžto' (see below), meaning that this branch should not be + # needed for the new data. 
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] + }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + # Update from the Hičkok project: In Old Czech, both 'jenž' and + # 'jenžto' (or its variant 'ješto') can be used uninflected, + # accompanied by a resumptive pronoun which provides the inflection. + # In this case, the Hičkok data will not annotate Gender, Animacy, + # Number and Case of the relative pronoun. Therefore, we require + # the full set of features if any of them is present; otherwise, + # we only expect PronType and PrepCase. + if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. 
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. 
na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). + }) + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. + # Normally determiners do not change j->n after prepositions but we + # have an example in Old Czech (štěpové zlatí, na nichžto větviech...) + elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). + }) + # Feminine personal possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma): + if node.feats['Reflex'] == 'Yes': + 
self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". + # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc. + self.check_adjective_like(node, ['PronType', 'NumType'], { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Ord', 'Mult'] + }) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. 
+ if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). + self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + if node.feats['NumType'] == 'Sets': + # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery' + # Number should perhaps be only Plur because the counted noun will be Plur. + # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny', + # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy). 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Sets'], + 'PronType': ['Tot'], # for 'oboje' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). + elif node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + 
self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dvé|obé)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'obé' + 'NumForm': ['Word'], + 'Gender': ['Neut'], + 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". 
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES ################################################ + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). 
+ if node.feats['Mood'] == 'Cnd': + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist + 'Emph': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 
'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. 
+ if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. 
+ self.check_allowed_features(node, { + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] + }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + else: + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. 
+ self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." = "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## + else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) + self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. 
This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_features = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. + if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py new file mode 100644 index 00000000..e9367d46 --- /dev/null +++ b/udapi/block/ud/da/fixmultisubject.py @@ -0,0 +1,123 @@ +""" +Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates +that have more than one subject dependent. +""" +from udapi.core.block import Block +import re + +class FixMultiSubject(Block): + """ + Make sure that a predicate has at most one subject. Note that it can + only fix instances that follow certain pattern observed in the Danish + data. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)] + if len(subjects) > 1: + # Pattern 1: A node is is attached as xcomp to the current node, and + # one of the subjects is closer to that xcomp than to the current + # node. + xcompchildren = [x for x in node.children if x.udeprel == 'xcomp'] + # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and + # possibly not so many other mis-attached dependents. 
+ advclchildren = [x for x in node.children if x.udeprel == 'advcl'] + # Pattern 3: Instead of xcomp or advcl, there is a simple amod + # (under a verb!), in fact an adjective with a copula that should + # have been advcl. Alternatively, the nonverbal clause is headed + # by a noun, and the deprel is obl instead of amod. + amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)] + if len(subjects) == 2 and len(xcompchildren) > 0: + for xcompnode in xcompchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(xcompnode, x) for x in subjects] + # Is the first subject closer to xcomp than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to xcomp? + if dx[0] <= dn[0] and dn[1] <= dx[1]: + # The first subject should be re-attached to the xcomp node. + subjects[0].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. + xcompnode.misc['ToDo'] = 'check-xcomp' + break + # Is the second subject closer to xcomp than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to xcomp? + elif dx[1] <= dn[1] and dn[0] <= dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = xcompnode + # There are typically other dependents that should belong to the xcomp node. + for c in node.children: + if c != xcompnode and dist(xcompnode, c) < dist(node, c): + c.parent = xcompnode + # The xcompnode should probably be attached as something else + # than xcomp, perhaps even the direction of the relation should + # be reversed, but one would have to resolve this manually. 
+ xcompnode.misc['ToDo'] = 'check-xcomp' + break + elif len(subjects) == 2 and len(advclchildren) > 0: + for advclnode in advclchildren: + dn = [dist(node, x) for x in subjects] + dx = [dist(advclnode, x) for x in subjects] + # Is the first subject closer to advcl than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to advcl? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = advclnode + break + # Is the second subject closer to advcl than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to advcl? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. + subjects[1].parent = advclnode + break + elif len(subjects) == 2 and len(amodchildren) > 0: + for amodnode in amodchildren: + if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0: + dn = [dist(node, x) for x in subjects] + dx = [dist(amodnode, x) for x in subjects] + # Is the first subject closer to amod than it is to the current node? + # At the same time, is the second subject closer to the current node than it is to amod? + if dx[0] < dn[0] and dn[1] < dx[1]: + # The first subject should be re-attached to the advcl node. + subjects[0].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + # Is the second subject closer to amod than it is to the current node? + # At the same time, is the first subject closer to the current node than it is to amod? + elif dx[1] < dn[1] and dn[0] < dx[0]: + # The second subject should be re-attached to the xcomp node. 
+ subjects[1].parent = amodnode + amodnode.deprel = 'advcl' + # There are typically other dependents that should belong to the amod node. + for c in node.children: + if c != amodnode and dist(amodnode, c) < dist(node, c): + c.parent = amodnode + break + +def dist(x, y): + if x.ord < y.ord: + a = x + b = y + else: + a = y + b = x + d = b.ord - a.ord + # Count the commas between the two nodes. A comma should be seen as increasing + # the distance of the nodes, that is, decreasing the probability that they + # are in the same clause. + nc = 0 + for i in a.root.descendants: + if i.ord > a.ord and i.ord < b.ord: + if i.form == ',': + nc += 1 + d += nc * 10 + return d diff --git a/udapi/block/ud/de/__init__.py b/udapi/block/ud/de/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py new file mode 100644 index 00000000..18778a4a --- /dev/null +++ b/udapi/block/ud/de/addmwt.py @@ -0,0 +1,50 @@ +"""Block ud.de.AddMwt for heuristic detection of German contractions. + +According to the UD guidelines, contractions such as "am" = "an dem" +should be annotated using multi-word tokens. + +Notice that this should be used only for converting existing conllu files. +Ideally a tokenizer should have already split the MWTs. 
+""" +import udapi.block.ud.addmwt + +MWTS = { + 'am': {'form': 'an dem', }, + 'ans': {'form': 'an das', }, + 'aufs': {'form': 'auf das', }, + 'beim': {'form': 'bei dem', }, + 'durchs': {'form': 'durch das', }, + 'fürs': {'form': 'fürs das', }, + 'hinterm': {'form': 'hinter dem', }, + 'hinters': {'form': 'hinter das', }, + 'im': {'form': 'in dem', }, + 'ins': {'form': 'in das', }, + 'übers': {'form': 'über das', }, + 'ums': {'form': 'um das', }, + 'unterm': {'form': 'unter dem', }, + 'unters': {'form': 'unter das', }, + 'vom': {'form': 'von dem', }, + 'vorm': {'form': 'vor dem', }, + 'vors': {'form': 'vor das', }, + 'zum': {'form': 'zu dem', }, + 'zur': {'form': 'zu der', }, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'].split()[0] + ' der' + v['upos'] = 'ADP DET' + v['xpos'] = 'APPR ART' + v['deprel'] = 'case det' + v['feats'] = '_ *' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + return MWTS.get(node.form.lower(), None) diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py new file mode 100644 index 00000000..65d12681 --- /dev/null +++ b/udapi/block/ud/de/fixgsd.py @@ -0,0 +1,58 @@ +""" +Block to fix annotation of UD German-GSD. +""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def process_node(self, node): + """ + Normalizes tokenization, lemmatization and tagging of ordinal numerals + that are expressed using digits followed by a period. 
+ https://github.com/UniversalDependencies/UD_German-GSD/issues/24 + """ + # Ignore periods that terminate a sentence, although they could belong + # to an ordinal numeral at the same time. + if node.form == '.' and node.next_node: + # Ignore number+period combinations that have an intervening space. + if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after: + # Merge the number and the period into one token. + number = node.prev_node + period = node + # The period should not have any children but if it does, re-attach them to the number. + for c in period.children: + c.parent = number + # The period should be followed by a space but if it isn't, mark it at the number. + number.misc['SpaceAfter'] = 'No' if period.no_space_after else '' + number.form += '.' + number.lemma = number.form + number.upos = 'ADJ' + number.xpos = 'ADJA' + number.feats = '_' + number.feats['NumType'] = 'Ord' + if number.udeprel == 'nummod': + number.deprel = 'amod' + period.remove() + # Even if the digits and the period are already in one token, check their annotation. + if re.match(r'^\d+\.$', node.form): + node.lemma = node.form + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats = '_' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' + # Finally, make sure that ordinal numerals expressed verbosely are tagged properly. + # Unlike for digits, do not remove the features for Gender, Number, and Case. + # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'. + if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE): + # Skip 'erst' that is used as an adverb. 
+ if node.lemma != 'erst' or node.upos != 'ADV': + node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma) + node.upos = 'ADJ' + node.xpos = 'ADJA' + node.feats['NumType'] = 'Ord' + if node.udeprel == 'nummod': + node.deprel = 'amod' diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py new file mode 100644 index 00000000..a3792a96 --- /dev/null +++ b/udapi/block/ud/de/fixhdt.py @@ -0,0 +1,109 @@ +""" +Block to fix annotation of UD German-HDT. + +It was created independently of ud.de.AddMwt but it aims to do essentially the +same thing. Future work: make the two blocks converge. + +Currently known differences: +- This block covers a wider range of contractions. +- This block generates morphological features for the syntactic words. +- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT). +- This block overrides the default attachment when the original relation is root, conj, reparandum. +- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures. +""" +from udapi.core.block import Block +import logging +import re + +class FixHDT(Block): + + def process_node(self, node): + # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD. + # The following contractions have been observed: + # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur + if node.upos == 'ADP' and node.feats['PronType'] == 'Art': + if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE): + # We need two nodes instead of one. Create a node. + # The parent should not be the root but unfortunately it is not guaranteed. 
+ node2 = node.create_child() + node2.shift_after_node(node) + if not re.match(r"^(root|conj|reparandum)$", node.udeprel): + node2.parent = node.parent + node.deprel = 'case' + node2.deprel = 'det' + mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc) + node.misc['SpaceAfter'] = '' + # We want to respect the original letter case in the forms of the syntactic words. + # We can use the isupper() method to find out whether all letters are uppercase. + # However, detecting first-letter capitalization requires more work. + up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0 + up2 = 2 if up == 2 else 0 + if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'an') + node.lemma = 'an' + elif re.match(r"^aufs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'auf') + node.lemma = 'auf' + elif re.match(r"^beim$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'bei') + node.lemma = 'bei' + elif re.match(r"^durchs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'durch') + node.lemma = 'durch' + elif re.match(r"^fürs$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'für') + node.lemma = 'für' + elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'hinter') + node.lemma = 'hinter' + elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'in') + node.lemma = 'in' + elif re.match(r"^übers$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'über') + node.lemma = 'über' + elif re.match(r"^ums$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'um') + node.lemma = 'um' + elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'unter') + node.lemma = 'unter' + elif re.match(r"^vom$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'von') + node.lemma = 'von' + elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'vor') + 
node.lemma = 'vor' + elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE): + node.form = mimic_case(up, 'zu') + node.lemma = 'zu' + node.upos = 'ADP' + node.xpos = 'APPR' + node.feats = '_' + node.feats['AdpType'] = 'Prep' + # We must use search() because match() only checks at the beginning of the string. + if re.search("[m\.]$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'dem') + node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + elif re.search("s$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'das') + node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art' + node.feats['Case'] = 'Acc' + node2.lemma = 'der' + elif re.search("r$", mwt.form, re.IGNORECASE): + node2.form = mimic_case(up2, 'der') + node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art' + node.feats['Case'] = 'Dat' + node2.lemma = 'der' + node2.upos = 'DET' + node2.xpos = 'ART' + +def mimic_case(up, x): + if up >= 2: + return x.upper() + elif up == 1: + return x[:1].upper() + x[1:].lower() + else: + return x.lower() diff --git a/udapi/block/ud/el/__init__.py b/udapi/block/ud/el/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py new file mode 100644 index 00000000..ac753ed5 --- /dev/null +++ b/udapi/block/ud/el/addmwt.py @@ -0,0 +1,36 @@ +"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. + +Notice that this should be used only for converting existing conllu files. +Ideally a tokenizer should have already split the MWTs. +Also notice that this block does not deal with the relatively rare +``PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο")`` MWTs. 
+""" +import udapi.block.ud.addmwt + +MWTS = { + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'}, + 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Definite=Def|Gender=Neut|Number=Plur|PronType=Art'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Plur|PronType=Art'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Plur|PronType=Art'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Sing|PronType=Art'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Definite=Def|Gender=*|Number=Sing|PronType=Art'}, +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = 'σε ο' + v['upos'] = 'ADP DET' + v['xpos'] = 'AsPpSp AtDf' + v['deprel'] = 'case det' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + return MWTS.get(node.form.lower(), None) diff --git a/udapi/block/ud/en/__init__.py b/udapi/block/ud/en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/en/setspaceafter.py b/udapi/block/ud/en/setspaceafter.py new file mode 100644 index 00000000..1ebc3054 --- /dev/null +++ b/udapi/block/ud/en/setspaceafter.py @@ -0,0 +1,46 @@ +"""Block ud.en.SetSpaceAfter for heuristic setting of SpaceAfter=No in English. 
+
+Usage::
+
+    udapy -s ud.en.SetSpaceAfter < in.conllu > fixed.conllu
+
+Author: Martin Popel
+"""
+import udapi.block.ud.setspaceafter
+
+
+class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter):
+    """Block for heuristic setting of the SpaceAfter=No MISC attribute in English.
+
+    """
+
+    def process_tree(self, root):
+        # Walk all nodes except the last one; each rule inspects the current
+        # node and/or the following node and marks SpaceAfter=No where the
+        # two tokens must be glued together in the surface text.
+        nodes = root.descendants
+        for i, node in enumerate(nodes[:-1]):
+            next_form = nodes[i + 1].form
+
+            # Contractions like "don't" and possessive suffix 's should be annotated as MWT.
+            # However, older UD_English-EWT versions did not follow this rule and even v2.7
+            # contains some forgotten occurrences, so let's handle these as well.
+            if next_form in {"n't", "'s"}:
+                self.mark_no_space(node)
+
+            # Parsers may distinguish opening and closing single quotes by XPOS.
+            # Opening quote (xpos ``): no space after the quote itself;
+            # closing quote (xpos ''): no space after the preceding token.
+            elif node.form == "'" and node.xpos == "``":
+                self.mark_no_space(node)
+            elif next_form == "'" and nodes[i + 1].xpos == "''":
+                self.mark_no_space(node)
+
+
+            # hyphen-compounds
+            # ("and i" guards against a hyphen at sentence start, where
+            # nodes[i - 1] would wrap around to the last node).
+            # Glue only when both neighbours belong to the same compound,
+            # i.e. each is either the hyphen's parent or its sibling.
+            elif node.form == '-' and i:
+                if ((nodes[i - 1] is node.parent or nodes[i - 1].parent is node.parent) and
+                    (nodes[i + 1] is node.parent or nodes[i + 1].parent is node.parent)):
+                    self.mark_no_space(nodes[i - 1])
+                    self.mark_no_space(node)
+
+            # $200
+            elif node.form == '$' and nodes[i + 1].upos == 'NUM':
+                self.mark_no_space(node)
+
+        # Let the language-independent base class apply its generic rules
+        # (punctuation, brackets, etc.) on top of the English-specific ones.
+        super().process_tree(root)
diff --git a/udapi/block/ud/es/__init__.py b/udapi/block/ud/es/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py
new file mode 100644
index 00000000..92f80160
--- /dev/null
+++ b/udapi/block/ud/es/addmwt.py
+"""Block ud.es.AddMwt for heuristic detection of Spanish contractions.
+
+According to the UD guidelines, contractions such as "del" = "de el"
+should be annotated using multi-word tokens.
+
+Note that this block should be used only for converting legacy conllu files.
+Ideally a tokenizer should have already split the MWTs.
+""" +import re +import udapi.block.ud.addmwt + +MWTS = { + 'al': {'form': 'a el'}, + 'del': {'form': 'de el'}, +} + +LEMMA = { + 'se': 'él', + 'le': 'él', + 'la': 'él', + 'lo': 'él', + 'te': 'tú', + 'me': 'yo', +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = v['form'] + v['upos'] = 'ADP DET' + v['deprel'] = '* det' + v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def __init__(self, verbpron=False, **kwargs): + super().__init__(**kwargs) + self.verbpron = verbpron + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + + if analysis is not None: + # Modify the default attachment of the new syntactic words in special situations. + if re.match(r'^(root|conj|reparandum)$', node.udeprel): + # Copy the dictionary so that we do not modify the original and do not affect subsequent usages. 
+ analysis = analysis.copy() + analysis['shape'] = 'subtree' + return analysis + + if not self.verbpron or node.upos not in {'VERB', 'AUX'}: + return None + + form = node.form.lower() + + if re.search('(me|la|le|lo|se|te)$', form): + verbform = node.feats['VerbForm'] + # TODO there are contractions even with VerbForm=Fin + if verbform == 'Fin' or form == 'pese': + return None + del node.feats['VerbForm'] + pron = form[-2:] + return { + 'form': form[:-2] + ' ' + pron, + 'lemma': '* ' + LEMMA[pron], + 'upos': '* PRON', + 'feats': 'VerbForm=%s *' % verbform, + 'deprel': '* iobj', + 'main': 0, + 'shape': 'subtree', + } + + if re.search('l[oe]s$', form): + verbform = node.feats['VerbForm'] + if verbform == 'Fin': + return None + del node.feats['VerbForm'] + pron = form[-3:] + return { + 'form': form[:-3] + ' ' + pron, + 'lemma': '* él', + 'upos': '* PRON', + 'feats': 'VerbForm=%s *' % verbform, + 'deprel': '* iobj', + 'main': 0, + 'shape': 'subtree', + } + + # TODO: multiple suffixes, e.g. compratelo = compra + te + lo + return None + + # Sometimes "del" has a shape which is neither "siblings" nor "subtree". + # E.g. in "a partir del NOUN" + # "del" = "de el", but + # "de" is attached to "a" (as fixed), while "el" is attached to the NOUN. + def postprocess_mwt(self, mwt): + if mwt.form.lower() in {'al', 'del'} and mwt.words[1].parent.precedes(mwt.words[1]): + head = mwt.words[1].next_node + while head.upos not in {'NOUN', 'PROPN'}: + if head.parent.precedes(head) or head.is_root(): + head = mwt.words[1].next_node + break + head = head.parent + mwt.words[1].parent = head diff --git a/udapi/block/ud/es/elque.py b/udapi/block/ud/es/elque.py new file mode 100644 index 00000000..4d14b98d --- /dev/null +++ b/udapi/block/ud/es/elque.py @@ -0,0 +1,116 @@ +""" +This block searches for relative clauses modifying a determiner ('el que, el cual...'). +It is written for Spanish but a similar block should work for other Romance +languages. 
+""" +from udapi.core.block import Block +import logging +import re + +class ElQue(Block): + + def __init__(self, fix=False, **kwargs): + """ + Default: Print the annotation patterns but do not fix anything. + fix=1: Do not print the patterns but fix them. + """ + super().__init__(**kwargs) + self.fix = fix + + def process_node(self, node): + # We take 'que' as the central node of the construction. + if re.match(r'^(que|cual)$', node.lemma) and node.upos == 'PRON' and node.parent.ord > node.ord: + # We will refer to the parent of 'que' as a verb, although it can be + # a non-verbal predicate, too. + que = node + verb = node.parent + # Check the lemma of the determiner. The form may vary for gender and number. + if que.prev_node and que.prev_node.lemma == 'el': + el = que.prev_node + adp = None + if el.prev_node and el.prev_node.upos == 'ADP': + adp = el.prev_node + if adp.udeprel == 'fixed': + adp = adp.parent + if self.fix: + self.fix_pattern(adp, el, que, verb) + else: + self.print_pattern(adp, el, que, verb) + + def print_pattern(self, adp, el, que, verb): + stanford = [] + if adp: + if adp.parent == el: + parentstr = 'el' + elif adp.parent == que: + parentstr = 'que' + elif adp.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(adp.deprel + '(' + parentstr + ', ADP)') + if el.parent == adp: + parentstr = 'ADP' + elif el.parent == que: + parentstr = 'que' + elif el.parent == verb: + parentstr = 'VERB' + else: + parentstr = 'OTHER' + stanford.append(el.deprel + '(' + parentstr + ', el)') + # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now. 
+ stanford.append(que.deprel + '(VERB, que)') + if verb.parent == adp: + parentstr = 'ADP' + elif verb.parent == el: + parentstr = 'el' + else: + parentstr = 'OTHER' + stanford.append(verb.deprel + '(' + parentstr + ', VERB)') + print('; '.join(stanford)) + + def fix_pattern(self, adp, el, que, verb): + if adp: + if adp.parent == que or adp.parent == verb: + attach(adp, el, 'case') + if el.parent == que: + ###!!! Just a temporary change. In the end it will be attached elsewhere. + attach(el, verb) + el.parent = verb + if len(el.deps) == 1: + el.deps[0]['parent'] = verb + if verb.parent != adp and verb.parent != el and verb.parent != que: + eldeprel = None + if re.match(r'^[nc]subj$', verb.udeprel): + eldeprel = 'nsubj' + elif re.match(r'^ccomp$', verb.udeprel): + eldeprel = 'obj' + elif re.match(r'^advcl$', verb.udeprel): + eldeprel = 'obl' + elif re.match(r'^acl$', verb.udeprel): + eldeprel = 'nmod' + elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel): + eldeprel = verb.deprel + if eldeprel: + attach(el, verb.parent, eldeprel) + attach(verb, el, 'acl:relcl') + # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.), + # re-attach it to 'el'. + for c in verb.children: + if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel): + attach(c, el) + +def attach(node, parent, deprel=None): + """ + Attach a node to a new parent with a new deprel in the basic tree. In + addition, if there are enhanced dependencies and there is just one incoming + enhanced relation (this is the case in AnCora), this relation will be + modified accordingly. 
+ """ + node.parent = parent + if deprel: + node.deprel = deprel + if len(node.deps) == 1: + node.deps[0]['parent'] = parent + if deprel: + node.deps[0]['deprel'] = deprel diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py new file mode 100644 index 00000000..7dea8e0d --- /dev/null +++ b/udapi/block/ud/es/fixexclamation.py @@ -0,0 +1,47 @@ +"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixExclamation(Block): + + def process_node(self, node): + """ + In Spanish AnCora, there are things like '¡Hola!' as one token. + The punctuation should be separated. One may question whether this + should include names of companies (Yahoo!) or products (la revista + Hello!) but it should, as company and product names often have + multiple tokens (even multiple full words, not just punctuation) + and these are also separated in UD. + """ + if re.search(r'^[¡!]\w', node.form): + # Separate the punctuation and attach it to the rest. + punct = node.create_child() + punct.shift_before_node(node) + punct.form = node.form[:1] + node.form = node.form[1:] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' + if re.search(r'\w[¡!]$', node.form): + # Separate the punctuation and attach it to the rest. 
+ punct = node.create_child() + punct.shift_after_node(node) + punct.form = node.form[-1:] + node.form = node.form[:-1] + punct.lemma = punct.form + punct.upos = 'PUNCT' + punct.xpos = 'faa' if punct.form == '¡' else 'fat' + punct.feats['PunctType'] = 'Excl' + punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin' + punct.misc['SpaceAfter'] = node.misc['SpaceAfter'] + node.misc['SpaceAfter'] = 'No' + punct.deprel = 'punct' + # Mark the position for manual check. + node.misc['Mark'] = 'PunctSep' diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py new file mode 100644 index 00000000..62fa0f4d --- /dev/null +++ b/udapi/block/ud/es/fixtenerque.py @@ -0,0 +1,47 @@ +"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora.""" +from udapi.core.block import Block +import logging +import re + +class FixTenerQue(Block): + + def process_node(self, node): + """ + Some Spanish treebanks treat the verb 'tener' in constructions such as + 'tener que comer' as auxiliary. This is wrong and the validator will + flag it as an error. This block fixes such annotations. + + EDIT: 'ir a comer' is processed the same way. + """ + if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX': + node.upos = 'VERB' + # In rare cases the auxiliary may have been promoted due to ellipsis. + # Most of the time however, it is attached as 'aux' to the main verb. + if node.udeprel == 'aux': + mainverb = node.parent + self.reattach(node, mainverb.parent, mainverb.deprel) + self.reattach(mainverb, node, 'xcomp') + # Some children of the former main verb should be reattached to 'tener'. + # Others (especially a direct object) should stay with the former main verb. + for c in mainverb.children: + if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel): + self.reattach(c, node, c.deprel) + # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'. 
+ for c in node.children: + if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord: + self.reattach(c, mainverb, 'mark') + + def reattach(self, node, parent, deprel): + """ + Changes the incoming dependency relation to a node. Makes sure that the + same change is done in the basic tree and in the enhanced graph. + """ + if node.deps: + # If the enhanced graph contains the current basic relation, remove it. + orig_n_deps = len(node.deps) + node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel] + # Add the new basic relation to the enhanced graph only if the original one was there. + if len(node.deps) < orig_n_deps: + node.deps.append({'parent': parent, 'deprel': deprel}) + node.parent = parent + node.deprel = deprel diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py new file mode 100644 index 00000000..643ecd7c --- /dev/null +++ b/udapi/block/ud/es/fixverbfeats.py @@ -0,0 +1,38 @@ +"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD.""" +from udapi.core.block import Block +import logging +import re + +class FixVerbFeats(Block): + + def process_node(self, node): + """ + The features assigned to verbs in Spanish PUD are often wrong, although + the annotation was (reportedly) done manually. For example, infinitives + are tagged with VerbForm=Fin instead of VerbForm=Inf. + """ + if re.match(r'^(VERB|AUX)$', node.upos): + if re.search(r'[aei]r$', node.form, re.IGNORECASE): + # The infinitive has no features other than VerbForm. + node.feats = {} + node.feats['VerbForm'] = 'Inf' + node.lemma = node.form.lower() + elif re.search(r'ndo$', node.form, re.IGNORECASE): + if node.form.lower() != 'entiendo': + # The gerund has no features other than VerbForm. + # The lemma is not always straightforward but we have fixed it manually. 
+ node.feats = {} + node.feats['VerbForm'] = 'Ger' + elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE): + # The (past) participle has always Gender and Number. + # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma). + # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?) + gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem') + number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing') + node.feats = {} + node.feats['VerbForm'] = 'Part' + node.feats['Tense'] = 'Past' + node.feats['Gender'] = gender + node.feats['Number'] = number + if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE): + node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower()) diff --git a/udapi/block/ud/exgoogle2ud.py b/udapi/block/ud/exgoogle2ud.py new file mode 100644 index 00000000..f63fad74 --- /dev/null +++ b/udapi/block/ud/exgoogle2ud.py @@ -0,0 +1,97 @@ +"""Block ud.ExGoogle2ud converts data which were originally annotated in Google style +then converted with an older version of ud.Google2ud to UDv2, +then manually edited and we don't want to lose these edits, +so we cannot simply rerun the newer version of ud.Google2ud on the original Google data.
+""" +from udapi.block.ud.fixchain import FixChain +from udapi.block.ud.fixpunct import FixPunct +from udapi.block.ud.fixrightheaded import FixRightheaded +from udapi.block.ud.complywithtext import ComplyWithText +from udapi.block.ud.es.addmwt import AddMwt as es_AddMwt +from udapi.block.ud.joinasmwt import JoinAsMwt +from udapi.core.block import Block + + +class ExGoogle2ud(Block): + """Convert former Google Universal Dependency Treebank into UD style.""" + + def __init__(self, lang='unk', **kwargs): + super().__init__(**kwargs) + self.lang = lang + + self._fixpunct_block = None if self.lang == 'it' else FixPunct() + self._fixrigheaded_block = FixRightheaded() + self._fixchain_block = FixChain() + self._comply_block = None + if lang == 'ja': + self._comply_block = ComplyWithText() + + self._addmwt_block = None + self._joinasmwt_block = None + if lang == 'es': + self._addmwt_block = es_AddMwt() + self._joinasmwt_block = JoinAsMwt() + + def process_tree(self, root): + for node in root.descendants: + self.fix_node(node) + + for block in ( + self._addmwt_block, + self._joinasmwt_block, + self._comply_block, + self._fixrigheaded_block, # deprel=fixed,flat,... should be always head-initial + self._fixchain_block, # and form a flat structure, not a chain. + self._fixpunct_block): # commas should depend on the subord unit. + if block: + block.process_tree(root) + + def fix_node(self, node): + """Various fixed taken from ud.Google2ud.""" + + if node.xpos == 'SYM': # These are almost always tagged as upos=X which is wrong. + node.upos = 'SYM' + if node.deprel in {'punct', 'p'}: + if node.form in "_-.؟”'": + node.upos = 'PUNCT' + else: + node.deprel = 'dep' # This is another way how to say deprel=todo. 
+ + if self.lang != 'es' and node.udeprel == 'nmod' and node.deprel != 'nmod': + parent_is_nominal = self.is_nominal(node.parent) + if parent_is_nominal == 'no': + node.deprel = 'obl' + ':' + node.sdeprel + elif node.deprel == 'nmod:tmod': + node.deprel = 'obl:tmod' + + if node.deprel == 'obl:gmod' and self.lang == 'ar': + node.deprel = 'obl' + node.feats['Case'] = 'Gen' + + if node.upos == 'CCONJ' and node.deprel == 'mark': + node.upos = 'SCONJ' + + if self.lang == 'es': + if node.deprel == 'compound': + # most of the uppercase compounds are upos=PROPN, but not all, e.g. Hack Forums + if node.form[0].isupper(): + node.deprel = 'flat:name' + else: + node.deprel = 'nmod' + + @staticmethod + def is_nominal(node): + """Returns 'no' (for predicates), 'yes' (sure nominals) or 'maybe'. + + Used in `change_nmod`.""" + if node.upos in ["VERB", "AUX", "ADJ", "ADV"]: + return 'no' + # Include NUM for examples such as "one of the guys" + # and DET for examples such as "some/all of them" + if node.upos in ["NOUN", "PRON", "PROPN", "NUM", "DET"]: + # check whether the node is a predicate + # (either has a nsubj/csubj dependendent or a copula dependent) + if any(["subj" in child.deprel or child.deprel == 'cop' for child in node.children]): + return 'maybe' + return 'yes' + return 'maybe' diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py new file mode 100644 index 00000000..a2e4439c --- /dev/null +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -0,0 +1,103 @@ +""" +Block ud.FixAdvmodByUpos will change the dependency relation from advmod to something else +if the UPOS is not ADV. +""" +from udapi.core.block import Block + + +class FixAdvmodByUpos(Block): + """ + Make sure advmod is not used with UPOS it should not be used with. 
+ """ + + def process_node(self, node): + if node.udeprel == 'advmod': + if node.upos in ['NOUN', 'PROPN', 'PRON', 'DET', 'NUM']: + node.deprel = 'obl' + elif node.upos == 'VERB': + node.deprel = 'advcl' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos in ['ADP', 'SCONJ']: + if node.parent.upos == 'VERB': + node.deprel = 'mark' + else: + node.deprel = 'case' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + else: + node.deprel = 'dep' + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. + elif node.udeprel == 'expl': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.udeprel == 'mark': + if node.upos in ['PRON', 'DET']: + node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.upos == 'NOUN': + node.deprel = 'dep' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'NUM': + node.deprel = 'nummod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif 
node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' + elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 'dislocated', 'expl', 'nmod']: + node.deprel = 'nmod' diff --git a/udapi/block/ud/fixchain.py b/udapi/block/ud/fixchain.py new file mode 100644 index 00000000..b3a586f6 --- /dev/null +++ b/udapi/block/ud/fixchain.py @@ -0,0 +1,18 @@ +"""Block ud.FixChain for making sure deprel=fixed|flat|goeswith|list does not form a chain.""" +from udapi.core.block import Block + + +class FixChain(Block): + """Make sure deprel=fixed etc. does not form a chain, but a flat structure.""" + + def __init__(self, deprels='fixed,flat,goeswith,list', **kwargs): + """Args: + deprels: comma-separated list of deprels to be fixed. Default = fixed,flat,goeswith,list. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel and node.parent.udeprel == deprel: + node.parent = node.parent.parent diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py new file mode 100644 index 00000000..90596e35 --- /dev/null +++ b/udapi/block/ud/fixcompoundname.py @@ -0,0 +1,46 @@ +""" +Block ud.FixCompoundName finds compound relations between PROPN nodes and converts +them to flat:name. This is not necessarily correct in all situations. The difference +between compound and flat is that compound allows to distinguish head and modifier.
+Multiword person names (given name and surname, or various other patterns) typically +should be analyzed as flat but there are treebanks that incorrectly use compound +for person names. This block can be used to fix them. +""" +from udapi.core.block import Block +import regex as re +import logging + + +class FixCompoundName(Block): + """ + Converts a compound relation between two PROPN nodes into a flat relation. + Compounds of a PROPN and a non-PROPN will be left alone, although they are + suspicious, too. + """ + + def process_node(self, node): + if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + origparent = node.parent + grandparent = origparent.parent + outdeprel = origparent.deprel + # See if there are other PROPN compound siblings. + # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.) + namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)] + # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. + # This is wrong but it is also different from personal names we are targeting here. + # Hence, we will skip "names" that contain numbers. + if any(re.search(r"\d", x.form) for x in namewords): + #logging.info(str([x.misc['Translit'] for x in namewords])) + ###!!! We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. + if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. 
+ head = namewords[0] + rest = namewords[1:] + if head != origparent: + head.parent = grandparent + head.deprel = outdeprel + for n in rest: + n.parent = head + n.deprel = 'flat:name' diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py new file mode 100644 index 00000000..9b4ce191 --- /dev/null +++ b/udapi/block/ud/fixleaf.py @@ -0,0 +1,42 @@ +""" +Block ud.FixLeaf checks that function word dependents are leaves. +Certain known exceptions are observed (e.g., fixed expressions). +""" +from udapi.core.block import Block +import logging +import re + +class FixLeaf(Block): + """ + Make sure that function words are leaves unless one of the known exceptions + applies. + """ + + def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs): + """ + Args: + deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel: + # Every function dependent can have a fixed child. + # We will also allow conj, cc, punct, goeswith, reparandum. + allowed = ['fixed', 'punct', 'goeswith', 'reparandum'] + if deprel != 'cc': + allowed += ['conj', 'cc'] + children = [c for c in node.children if not (c.udeprel in allowed)] + # Re-attach the remaining children to an acceptable ancestor. + ancestor = node.parent + while ancestor.udeprel in self.deprels: + ancestor = ancestor.parent + for c in children: + c.parent = ancestor + # If there are enhanced dependencies, check whether we want to redirect them too. + if c.deps: + for edep in c.deps: + if edep['parent'] == node: + edep['parent'] = ancestor diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py new file mode 100644 index 00000000..485b85f0 --- /dev/null +++ b/udapi/block/ud/fixmultiobjects.py @@ -0,0 +1,47 @@ +""" +Block ud.FixMultiObjects will ensure that no node has more than one (direct) object child. 
+""" +from udapi.core.block import Block + + +class FixMultiObjects(Block): + """ + Make sure there is at most one object. + """ + + def process_node(self, node): + objects = [x for x in node.children if x.udeprel == 'obj'] + if len(objects) > 1: + subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']] + # Some heuristics that could work in AnCora: + # If all objects are after the verb, keep the one that is closest to the verb. + if objects[0].ord > node.ord: + objects = objects[1:] + for o in objects: + o.deprel = 'obl:arg' + o.deps[0]['deprel'] = 'obl:arg' + elif objects[-1].ord < node.ord: + objects = objects[:-1] + for o in objects: + o.deprel = 'dislocated' + o.deps[0]['deprel'] = 'dislocated' + # ho experimenta tot + elif objects[-1].lemma in ['tot', 'todo']: + objects[-1].parent = objects[0] + objects[-1].deprel = 'nmod' + objects[-1].deps[0]['parent'] = objects[0] + objects[-1].deps[0]['deprel'] = 'nmod' + # X se llama Y + elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']: + objects[-1].deprel = 'xcomp' + objects[-1].deps[0]['deprel'] = 'xcomp' + elif len(subjects) == 0: + objects[0].deprel = 'nsubj' + objects[0].deps[0]['deprel'] = 'nsubj' + else: + objects[0].deprel = 'dislocated' + objects[0].deps[0]['deprel'] = 'dislocated' + # For the moment, we take the dummiest approach possible: The first object survives and all others are forced to a different deprel. + #objects = objects[1:] + #for o in objects: + # o.deprel = 'iobj' diff --git a/udapi/block/ud/fixmultisubjects.py b/udapi/block/ud/fixmultisubjects.py new file mode 100644 index 00000000..f8aeca06 --- /dev/null +++ b/udapi/block/ud/fixmultisubjects.py @@ -0,0 +1,23 @@ +""" +Block ud.FixMultiSubjects will ensure that no node has more than one subject child (except those +marked as :outer). 
+""" +import re +from udapi.core.block import Block + + +class FixMultiSubjects(Block): + """ + Make sure there is at most one subject that is not marked as :outer. + """ + + def process_node(self, node): + subjects = [x for x in node.children if re.match(r"^[nc]subj(:|$)", x.deprel) and not re.search(r":outer$", x.deprel)] + # For the moment, we take the dummiest approach possible: The first subject survives and all others are forced to a different deprel. + if len(subjects) > 1: + subjects = subjects[1:] + for s in subjects: + if re.match(r"^n", s.deprel): + s.deprel = 'obl' + else: + s.deprel = 'advcl' diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py new file mode 100644 index 00000000..a2b7b875 --- /dev/null +++ b/udapi/block/ud/fixmwtspace.py @@ -0,0 +1,22 @@ +""" +Block ud.FixMwtSpace looks for multiword tokens whose form contains a space, +which should be avoided. If found, the block checks whether it can remove +the multiword token seamlessly, that is, whether the syntactic words correspond +to the space-delimited parts of the multiword token. If possible, the MWT +line will be removed. 
+""" +from udapi.core.block import Block +import re + + +class FixMwtSpace(Block): + """Try to remove multiword tokens with spaces.""" + + def process_node(self, node): + if node.multiword_token: + mwt = node.multiword_token + if re.search(r' ', mwt.form): + if node == mwt.words[0]: + wordforms = [x.form for x in mwt.words] + if ' '.join(wordforms) == mwt.form: + mwt.remove() diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py new file mode 100644 index 00000000..f4d9a1ec --- /dev/null +++ b/udapi/block/ud/fixpseudocop.py @@ -0,0 +1,45 @@ +"""Block to fix annotation of verbs that are currently treated as copulas + but they should be treated as normal verbs (with secondary predication) + instead.""" +from udapi.core.block import Block +import re + +class FixPseudoCop(Block): + + def __init__(self, lemmas, noncopaux=False, **kwargs): + """Create the ud.FixPseudoCop block instance. + + Args: + lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed + noncopaux: do the same for non-copula auxiliaries with the given lemma + """ + super().__init__(**kwargs) + self.lemmas = lemmas.split(',') + self.noncopaux = noncopaux + + def process_node(self, node): + pseudocop = self.lemmas + if node.lemma in pseudocop: + # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set). + if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. + # As a copula, the word was tagged AUX. Now it should be VERB. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. 
It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux". + elif self.noncopaux and node.upos == 'AUX': + node.upos = 'VERB' diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py new file mode 100644 index 00000000..854a24a8 --- /dev/null +++ b/udapi/block/ud/fixpunct.py @@ -0,0 +1,302 @@ +"""Block ud.FixPunct for making sure punctuation is attached projectively. + +Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct, +and is always attached projectively, usually to the head of a neighboring subtree +to its left or right (see https://universaldependencies.org/u/dep/punct.html). +Punctuation normally does not have children. If it does, we will fix it first. + +This block tries to re-attach punctuation projectively and according to the guidelines. +It should help in cases where punctuation is attached randomly, always to the root +or always to the neighboring word. However, there are limits to what it can do; +for example it cannot always recognize whether a comma is introduced to separate +the block to its left or to its right. Hence if the punctuation before running +this block is almost good, the block may actually do more harm than good. + +Since the punctuation should not have children, we should not create a non-projectivity +if we check the root edges going to the right. +""" +from udapi.core.block import Block +# pylint: disable=no-self-use + +# TODO We need to know the language, there are many other quotation styles, +# e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”. 
+# Danish uses the French quotes, but switched: »X«. +PAIRED_PUNCT = { + '(': ')', + '[': ']', + '{': '}', + '"': '"', # ASCII double quotes + "'": "'", # ASCII single quotes + '“': '”', # quotation marks used in English, ... + '„': '“', # Czech, German, Russian, ... + '«': '»', # French, Russian, Spanish, ... + '‹': '›', # dtto + '《': '》', # Korean, Chinese + '「': '」', # Chinese, Japanese + '『': '』', # ditto + '¿': '?', # Spanish paired question marks + '¡': '!', # Spanish paired exclamation marks + } + +FINAL_PUNCT = '.?!' + + +class FixPunct(Block): + """Make sure punctuation nodes are attached projectively.""" + + def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwargs): + """Create the ud.FixPunct block instance. + + Args: + check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT. + The default is false, which means that fixed punctuation is detected only + based on the form with the exception of single & double quote character, + which is frequently ambiguous*, so UPOS=PUNCT is checked always. + *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol. + copy_to_enhanced: for all upos=PUNCT, let the enhanced dependencies + be the same as the basic dependencies. + """ + super().__init__(**kwargs) + self._punct_type = None + self.check_paired_punct_upos = check_paired_punct_upos + self.copy_to_enhanced = copy_to_enhanced + + def _is_punct(self, node): + if node.upos == 'PUNCT': + return True + if self.check_paired_punct_upos: + return False + if node.form in "'\"": + return False + if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values(): + return True + return False + + def process_tree(self, root): + # First, make sure no PUNCT has children. + # This may introduce multiple subroots, which will be fixed later on + # (preventing to temporarily create multiple subroots here would prevent fixing some errors).
+ for node in root.descendants: + while self._is_punct(node.parent): + node.parent = node.parent.parent + + # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type. + # This should be done before handling the subordinate punctuation, + # in order to prevent non-projectivities e.g. in dot-before-closing-quote style sentences: + # I call him "Bob." + # Here both quotes and the sentence-final dot should be attached to "Bob". + # (As you can see on the previous line, I don't like this American typographic rule.) + self._punct_type = [None] * (1 + len(root.descendants)) + for node in root.descendants: + if self._punct_type[node.ord] != 'closing': + closing_punct = PAIRED_PUNCT.get(node.form) + if closing_punct is not None: + self._fix_paired_punct(root, node, closing_punct) + + # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type). + for node in root.descendants: + if node.upos == 'PUNCT' and not self._punct_type[node.ord]: + self._fix_subord_punct(node) + + # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot". + # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator. + # So lets prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children). + if len(root.children) > 1: + selected_subroot = next((n for n in root.children if n.udeprel == 'root'), root.children[0]) + for a_subroot in root.children: + if a_subroot != selected_subroot: + a_subroot.parent = selected_subroot + + # Check if the subroot is still marked with deprel=root. + # This may not hold if the original subroot was a paired punctuation, which was rehanged. 
+ if root.children[0].udeprel != 'root': + root.children[0].udeprel = 'root' + if self.copy_to_enhanced: + root.children[0].deps = [{'parent': root, 'deprel': 'root'}] + for another_node in root.children[0].descendants: + if another_node.udeprel == 'root': + another_node.udeprel = 'punct' + + # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well. + if self.copy_to_enhanced: + for node in root.descendants: + if node.upos == 'PUNCT': + node.deps = [{'parent': node.parent, 'deprel': node.deprel}] + + def _fix_subord_punct(self, node): + # Dot used as the ordinal-number marker (in some languages) or abbreviation marker. + # TODO: detect these cases somehow + # Numbers can be detected with `node.parent.form.isdigit()`, + # but abbreviations are more tricky because the Abbr=Yes feature is not always used. + if node.form == '.' and node.parent == node.prev_node: + return + + # Even non-paired punctuation like commas and dashes may work as paired. + # Detect such cases and try to preserve, but only if projective. + p_desc = node.parent.descendants(add_self=1) + if node in (p_desc[0], p_desc[-1]) and len(p_desc) == p_desc[-1].ord - p_desc[0].ord + 1: + if (p_desc[0].upos == 'PUNCT' and p_desc[-1].upos == 'PUNCT' + and p_desc[0].parent == node.parent and p_desc[-1].parent == node.parent): + return + + # Initialize the candidates (left and right) with the nearest nodes excluding punctuation. + # Final punctuation should not be attached to any following, so exclude r_cand there. 
+ l_cand, r_cand = node.prev_node, node.next_node + if node.form in FINAL_PUNCT: + r_cand = None + while l_cand.ord > 0 and l_cand.upos == 'PUNCT': + if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node: + l_cand = None + break + l_cand = l_cand.prev_node + while r_cand is not None and r_cand.upos == 'PUNCT': + if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node: + r_cand = None + break + r_cand = r_cand.next_node + + # Climb up from the candidates, until we would reach the root or "cross" the punctuation. + # If the candidates' descendants span across the punctuation, we also stop + # because climbing higher would cause a non-projectivity (the punct would be the gap). + l_path, r_path = [l_cand], [r_cand] + if l_cand is None or l_cand.is_root(): + l_cand, l_path = None, [] + else: + while (not l_cand.parent.is_root() and l_cand.parent < node + and not node < l_cand.descendants(add_self=1)[-1]): + l_cand = l_cand.parent + l_path.append(l_cand) + if r_cand is not None: + while (not r_cand.parent.is_root() and node < r_cand.parent + and not r_cand.descendants(add_self=1)[0] < node): + r_cand = r_cand.parent + r_path.append(r_cand) + + # Filter out candidates which would lead to non-projectivities, i.e. bugs + # punct-nonproj and punct-nonproj-gap as checked by the UD validator and ud.MarkBugs. + orig_parent = node.parent + l_path = [n for n in l_path if n and self._will_be_projective(node, n)] + r_path = [n for n in r_path if n and self._will_be_projective(node, n)] + l_cand = l_path[-1] if l_path else None + r_cand = r_path[-1] if r_path else None + node.parent = orig_parent + + # Now select between l_cand and r_cand -- which will be the new parent? + # The lower one. Note that if neither is descendant of the other and neither is None + # (which can happen in rare non-projective cases), we arbitrarily prefer l_cand, + # but if the original parent is either on l_path or r_path, we keep it as acceptable. 
+ if l_cand is not None and l_cand.is_descendant_of(r_cand): + cand, path = l_cand, l_path + elif r_cand is not None and r_cand.is_descendant_of(l_cand): + cand, path = r_cand, r_path + elif l_cand is not None: + cand, path = l_cand, l_path + r_path + elif r_cand is not None: + cand, path = r_cand, l_path + r_path + else: + return + + # The guidelines say: + # Within the relevant unit, a punctuation mark is attached + # at the highest possible node that preserves projectivity. + # However, sometimes it is difficult to detect the unit (and its head). + # E.g. in "Der Mann, den Sie gestern kennengelernt haben, kam wieder." + # the second comma should depend on "kennengelernt", not on "Mann" + # because the unit is just the relative clause. + # We try to be conservative and keep the parent, unless we are sure it is wrong. + if node.parent not in path: + node.parent = cand + node.deprel = 'punct' + + def _will_be_projective(self, node, cand): + node.parent = cand + return not node.is_nonprojective() and not self._causes_gap(node) + + def _causes_gap(self, node): + return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap() + + def _fix_paired_punct(self, root, opening_node, closing_punct): + if (self.check_paired_punct_upos + or opening_node.form in "'\"") and opening_node.upos != 'PUNCT': + return + nested_level = 0 + for node in root.descendants[opening_node.ord:]: + if node.form == closing_punct: + if nested_level > 0: + nested_level -= 1 + else: + self._fix_pair(root, opening_node, node) + return + elif node.form == opening_node.form: + nested_level += 1 + + def _fix_pair(self, root, opening_node, closing_node): + # Ideally, paired punctuation symbols should be attached to the single + # head of the subtree inside. Provided the inside segment is a single + # subtree. 
+ heads = [] + punct_heads = [] + for node in root.descendants: + if node == opening_node or node == closing_node: + continue + # If this is a node inside of the pair, is its parent outside? + if node > opening_node and node < closing_node: + if node.parent < opening_node or node.parent > closing_node: + if node.upos == 'PUNCT': + punct_heads.append(node) + else: + heads.append(node) + # Not only the punctuation symbols must not be attached non-projectively, + # they also must not cause non-projectivity of other relations. This could + # happen if an outside node is attached to an inside node. To account for + # this, mark the inside parent as a head, too. + elif node.parent > opening_node and node.parent < closing_node: + if node.parent.upos == 'PUNCT': + punct_heads.append(node.parent) + else: + heads.append(node.parent) + + # Punctuation should not have children, but if there is no other head candidate, + # let's break this rule. + if len(heads) == 0: + heads = punct_heads + # If there are no nodes between the opening and closing mark (), + # let's treat the marks as any other (non-pair) punctuation. + if len(heads) == 0: + return + else: + # Ideally, there should be only a single head. + # If not, we could try e.g. to choose the "widests-span head": + # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0] + # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0] + # which often leads to selecting the same head for the opening and closing punctuation + # ignoring single words inside the paired punct which are non-projectively attached outside. + # However, this means that the paired punctuation will be attached non-projectively, + # which is forbidden by the UD guidelines. + # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities. 
+ # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of + # inside heads and inside parents of outside nodes). + heads.sort(key=lambda x: x.ord) + opening_node.parent = heads[0] + closing_node.parent = heads[-1] + + self._punct_type[opening_node.ord] = 'opening' + self._punct_type[closing_node.ord] = 'closing' + + # In rare cases, non-projective gaps may remain. Let's dirty fix these! + # E.g. in "the (lack of) reproducibility", the closing parenthesis + # should be attached to "of" rather than to "lack" + # -- breaking the paired-marks-have-same-parent rule + # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py). + if self._causes_gap(opening_node): + opening_node.parent = opening_node.next_node + while (opening_node.parent.ord < closing_node.ord - 1 + and (opening_node.parent.upos == 'PUNCT' or opening_node.is_nonprojective() + or self._causes_gap(opening_node))): + opening_node.parent = opening_node.parent.next_node + if self._causes_gap(closing_node): + closing_node.parent = closing_node.prev_node + while (closing_node.parent.ord > opening_node.ord + 1 + and (closing_node.parent.upos == 'PUNCT' or closing_node.is_nonprojective() + or self._causes_gap(closing_node))): + closing_node.parent = closing_node.parent.prev_node diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py new file mode 100644 index 00000000..07ef3eb3 --- /dev/null +++ b/udapi/block/ud/fixpunctchild.py @@ -0,0 +1,10 @@ +"""Block ud.FixPunctChild for making sure punctuation nodes have no children.""" +from udapi.core.block import Block + + +class FixPunctChild(Block): + """Make sure punct nodes have no children by rehanging the children upwards.""" + + def process_node(self, node): + while node.parent.deprel == 'punct': + node.parent = node.parent.parent diff --git a/udapi/block/ud/fixrightheaded.py b/udapi/block/ud/fixrightheaded.py new file mode 100644 index 00000000..045278dd --- /dev/null +++ 
b/udapi/block/ud/fixrightheaded.py @@ -0,0 +1,33 @@ +"""Block ud.FixRightheaded for making sure flat,fixed,appos,goeswith,list is head initial. + +Note that deprel=conj should also be left-headed, +but it is not included in this fix-block by default +because coordinations are more difficult to convert +and one should use a specialized block instead. +""" +from udapi.core.block import Block + + +class FixRightheaded(Block): + """Make sure deprel=flat,fixed,... form a head-initial (i.e. left-headed) structure.""" + + def __init__(self, deprels='flat,fixed,appos,goeswith,list', **kwargs): + """Args: + deprels: comma-separated list of deprels to be fixed. + Default = flat,fixed,appos,goeswith,list. + """ + super().__init__(**kwargs) + self.deprels = deprels.split(',') + + def process_node(self, node): + for deprel in self.deprels: + if node.udeprel == deprel and node.precedes(node.parent): + orig_parent = node.parent + node.parent = orig_parent.parent + if deprel != 'conj': + for child in orig_parent.children: + child.parent = node + orig_parent.parent = node + head_deprel = orig_parent.deprel + orig_parent.deprel = node.deprel + node.deprel = head_deprel diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b --- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. +""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. 
Change their deprel to 'parataxis'. + - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/fr/__init__.py b/udapi/block/ud/fr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/fr/addmwt.py b/udapi/block/ud/fr/addmwt.py new file mode 100644 index 00000000..948a927a --- /dev/null +++ b/udapi/block/ud/fr/addmwt.py @@ -0,0 +1,82 @@ +"""Block ud.fr.AddMwt for heuristic detection of French contractions. + +According to the UD guidelines, contractions such as "des" = "de les" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. 
+""" +import udapi.block.ud.addmwt + +MWTS = { + 'au': {'form': 'à le', 'lemma': 'à le'}, + 'aux': {'form': 'à les', 'lemma': 'à le'}, + 'des': {'form': 'de les', 'lemma': 'de le'}, + 'du': {'form': 'de le', 'lemma': 'de le'}, + + 'auquel': {'form': 'à lequel', 'upos': 'ADP PRON', 'lemma': 'à lequel'}, + 'auxquels': {'form': 'à lesquels', 'upos': 'ADP PRON', 'lemma': 'à lequel'}, + 'auxquelles': {'form': 'à lesquelles', 'upos': 'ADP PRON', 'lemma': 'à lequel'}, + 'desquels': {'form': 'de lesquels', 'upos': 'ADP PRON', 'lemma': 'de lequel'}, + 'desquelles': {'form': 'de lesquelles', 'upos': 'ADP PRON', 'lemma': 'de lequel'}, + 'duquel': {'form': 'de lequel', 'upos': 'ADP PRON', 'lemma': 'de lequel'}, +} +# TODO https://fr.wiktionary.org/wiki/des#Vocabulaire_apparent.C3.A9_par_le_sens_2 +# lists more contractions, e.g. "dudit", "audit" + +# shared values for all entries in MWTS +for v in MWTS.values(): + if not v.get('upos'): + v['upos'] = 'ADP DET' + if not v.get('shape'): + v['shape'] = 'subtree' + if not v.get('deprel'): + v['deprel'] = 'case det' if v['upos'] == 'ADP DET' else 'case *' + if not v.get('main'): + v['main'] = 1 if v['upos'] == 'ADP PRON' else 0 + v['feats'] = '_ *' + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + + # "du" can be + # - "du + le" (tagged ADP) + # - the partitive article "du" (tagged DET) + # - past participle of devoir (correctly dû, tagged VERB) + # Only the ADP case should be split. + # Similarly with "des" -> "de les". + if node.upos != 'ADP': + return None + + return MWTS.get(node.form.lower(), None) + + # "du" has a shape which is neither "siblings" nor "subtree" + # E.g. in "À partir du XXIe siècle" + # "du" = "de le", but + # "de" is attached to "À", while "le" is attached to "siècle". 
+ def postprocess_mwt(self, mwt): + if mwt.form.lower() in {'du', 'des', 'au', 'aux'}: + if mwt.words[0].descendants[-1] != mwt.words[1]: + pass + elif mwt.words[0].precedes(mwt.words[0].parent): + mwt.words[1].parent = mwt.words[0].parent + else: + head = mwt.words[1].next_node + while head.upos not in {'NOUN', 'PROPN'} and not head.is_root(): + if head.parent.precedes(head): + head = mwt.words[1].next_node + break + head = head.parent + if head.is_root(): + head = mwt.words[1].next_node + mwt.words[1].parent = head + + if mwt.words[1].parent == mwt.words[0] and mwt.words[0].descendants[-1].deprel == 'fixed': + mwt.words[1].deprel = 'fixed' + if (mwt.words[0].parent.precedes(mwt.words[0]) + and mwt.words[0].prev_node.udeprel in {'case', 'fixed'}): + mwt.words[0].deprel = 'fixed' diff --git a/udapi/block/ud/ga/to2.py b/udapi/block/ud/ga/to2.py index 4d8506e1..dbf093a9 100644 --- a/udapi/block/ud/ga/to2.py +++ b/udapi/block/ud/ga/to2.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class To2(Block): """Block for fixing the remaining cases (after ud.Convert1to2) in UD_Irish.""" diff --git a/udapi/block/ud/gl/__init__.py b/udapi/block/ud/gl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py new file mode 100644 index 00000000..81a17c64 --- /dev/null +++ b/udapi/block/ud/gl/to2.py @@ -0,0 +1,60 @@ +"""Block ud.gl.To2 UD_Galician-specific conversion of UDv1 to UDv2 + +Author: Martin Popel +""" +from udapi.core.block import Block + +ADP_HEAD_PREFERENCES = { + 'NOUN': 10, + 'PRON': 9, + 'ADJ': 8, + 'VERB': 8, + 'PUNCT': -10, +} + + +class To2(Block): + """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician.""" + + def process_node(self, node): + + # UD_Galician v1.4 uses incorrectly deprel=cop not for the copula verb, + # but for its complement (typically ADJ) and also copula is the head. 
+ if node.deprel == 'cop': + copula = node.parent + # In UDv2 discussions it has been decided that only a limited set of verbs + # can be annotated as copula. For Spanish, "estar" was questionable, but accepted. + # I guess in Galician it is the same. The rest (considerar, resultar, quedar,...) + # should not be annotated as copulas. Luckily, in UD_Galician v1.4 they are + # governing the clause, so no change of topology is needed, just deprel=xcomp. + if copula.lemma in ('ser', 'estar'): + node.parent = copula.parent + for cop_child in copula.children: + cop_child.parent = node + copula.parent = node + node.deprel = copula.deprel + copula.deprel = 'cop' + else: + node.deprel = 'xcomp' + + # Prepositions should depend on the noun, not vice versa. + # This is easy to fix, but unfortunatelly, there are many nodes with deprel=case + # which are not actually prepostions or case markes, but standard NOUNs, VERBs etc. + # These are left as ToDo. + if node.deprel == 'case' and node.children: + if node.upos not in ('ADP', 'CONJ', 'PART'): + node.misc['ToDo'] = 'case-upos' + else: + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] + + # Punctuation should have no children. + if node.deprel == 'punct' and node.children and node.upos == 'PUNCT': + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] diff --git a/udapi/block/ud/goeswithfromtext.py b/udapi/block/ud/goeswithfromtext.py index 64e1d99f..fe419fa2 100644 --- a/udapi/block/ud/goeswithfromtext.py +++ b/udapi/block/ud/goeswithfromtext.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class GoeswithFromText(Block): """Block for splitting nodes and attaching via goeswith according to the the sentence text. 
@@ -96,6 +97,6 @@ def process_tree(self, root): else: last_node.misc['SpaceAfter'] = 'No' else: - assert False # we have checked the whole sentence already + assert False # we have checked the whole sentence already if text: logging.warning('Extra text "%s" in tree %s', text, root) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py new file mode 100644 index 00000000..3ba20c5c --- /dev/null +++ b/udapi/block/ud/google2ud.py @@ -0,0 +1,528 @@ +"""Block ud.Google2ud for converting Google Universal Dependency Treebank into UD. + +Usage: +udapy -s ud.Google2ud < google.conllu > ud2.conllu +""" +import re +from udapi.block.ud.convert1to2 import Convert1to2 +from udapi.block.ud.complywithtext import ComplyWithText +from udapi.block.ud.fixchain import FixChain +from udapi.block.ud.fixrightheaded import FixRightheaded +from udapi.block.ud.fixpunct import FixPunct +from udapi.block.ud.de.addmwt import AddMwt as de_AddMwt +from udapi.block.ud.es.addmwt import AddMwt as es_AddMwt +from udapi.block.ud.fr.addmwt import AddMwt as fr_AddMwt +from udapi.block.ud.pt.addmwt import AddMwt as pt_AddMwt +from udapi.block.ud.joinasmwt import JoinAsMwt + +DEPREL_CHANGE = { + "ROOT": "root", + "prep": "case", + "ncomp": "case:loc", # only in Chinese; Herman proposes case:loc + "p": "punct", + "poss": "nmod:poss", + "ps": "case", + "num": "nummod", + "number": "nummod", # TODO ? + "tmod": "nmod:tmod", + "vmod": "acl", + "rcmod": "acl:relcl", + "npadvmod": "advmod", + "preconj": "cc:preconj", + "predet": "det:predet", + "gobj": "obj", + "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 + "pronl": "obj", # TODO: or expl? 
UD_French seems to use a mix of both + "oblcomp": "obl", + "mes": "clf", # TODO: structural transformation needed + "mwn": "compound:n", # nominal multi-word + "mwa": "compound:a", # adjectival multi-word + "mwv": "compound:v", # verbal multi-word + "asp": "aux", # aspectual particle + "rcmodrel": "mark:relcl", + "auxcaus": "aux", # redundant with Voice=Cau + "topic": "dep", + "possessive": "case", + "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words + "agent": "obl:agent", + # TODO: "ref" - in basic dependencies it should be rehanged and relabelled + "conjv": "compound:conjv", + "advphmod": "advmod", + "clas": "clf", + "narg": "nmod:arg", # Turkish only +} + +FEATS_CHANGE = { + "proper=false": "", + "Proper=false": "", + "case=prep": "", + "case=unsp_c": "", + "gender=unsp_g": "", + "gender_antecedent=unsp_g": "", + "voice=unsp_v": "", + "number=unsp_n": "", + "number_antecedent=unsp_n": "", + "tense=unsp_t": "", + "mood=unsp_m": "", + "animacy=unsp_r": "", + "aspect=unsp_a": "", + "case=rel": "", # redundant with rcmodrel (mark:relcl) + "reciprocity=non-rcp": "", + "reciprocity=rcp": "PronType=Rcp", + "aspect=imperf": "Aspect=Imp", + "form=long": "Variant=Long", + "form=short": "Variant=Short", + "person=reflex": "Reflex=Yes", + "case=reflex": "Reflex=Yes", + "case=dir": "Case=Nom", + "gender=pl_tantum": "Number=Ptan", + "gender_antecedent=fem_a": "Gender[psor]=Fem", + "gender_antecedent=masc_a": "Gender[psor]=Masc", + "gender_antecedent=neut_a": "Gender[psor]=Neut", + "number_antecedent=sing_a": "Number[psor]=Sing", + "number_antecedent=plur_a": "Number[psor]=Plur", + "person_antecedent=1_a": "Person[psor]=1", + "person_antecedent=2_a": "Person[psor]=2", + "person_antecedent=3_a": "Person[psor]=3", + "definiteness=def": "Definite=Def", + "definiteness=indef": "Definite=Ind", + "mood=sub1": "Mood=Sub|Tense=Pres", # de + "mood=sub2": "Mood=Sub|Tense=Past", # de + "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) + 
"tense=cnd": "Mood=Cnd", + "degree=sup_a": "Degree=Abs", + "degree=sup_r": "Degree=Sup", + "case=obl": "Case=Acc", + "tense=impf": "Tense=Imp", + "animacy=rat": "Animacy=Hum", + "animacy=irrat": "Animacy=Nhum", + "honorific=hon": "Polite=Form", + "mood=psm": "Tense=Fut", # TODO ? + "form=fin": "VerbForm=Fin", + "form=ger": "VerbForm=Ger", + "formality=fml": "Polite=Form", + "Evidentiality=Nfh": "Evident=Nfh", + "Evidentiality=Fh": "Evident=Fh", +} + +FR_DAYS_MONTHS = ('lundi mardi mercredi jeudi vendredi samedi dimanche ' + 'janvier février mars avril mai juin juillet août ' + 'septembre octobre novembre décembre'.split()) + + +class Google2ud(Convert1to2): + """Convert Google Universal Dependency Treebank into UD style.""" + + def __init__(self, lang='unk', non_mwt_langs='ar en ja ko zh', **kwargs): + """Create the Google2ud block instance. + + See ``Convert1to2`` for all the args. + """ + super().__init__(**kwargs) + self.lang = lang + + self._addmwt_block = None + if lang == 'de': + self._addmwt_block = de_AddMwt() + elif lang == 'es': + self._addmwt_block = es_AddMwt() + elif lang == 'fr': + self._addmwt_block = fr_AddMwt() + elif lang == 'pt': + self._addmwt_block = pt_AddMwt() + self._joinasmwt_block = JoinAsMwt() if lang in {'es', 'tr'} else None + + self._fixrigheaded_block = None + if lang in {'ar', 'de', 'en', 'fr', 'hi', 'ru', 'th', 'zh'}: + self._fixrigheaded_block = FixRightheaded() + elif lang == 'tr': + self._fixrigheaded_block = FixRightheaded(deprels='conj,flat,fixed,appos,goeswith,list') + + # Normalize the attachment of punctuation for all languages. + self._fixpunct_block = FixPunct() + + self._fixchain_block = None + if lang in {'pt', 'ru'}: + self._fixchain_block = FixChain() + + # UD_English v2.0 still uses "do n't" with SpaceAfter=No, + # instead of annotating it as a multiword token. + # In several other languages it is also common + # that syntactic words are not separated with a space without being an MWT. 
+ self._comply_block = ComplyWithText(prefer_mwt=bool(lang not in non_mwt_langs.split())) + + def process_tree(self, root): + comment_lines = root.comment.split("\n") + root.sent_id = comment_lines[0].strip().replace(' ', '-') + root.text = comment_lines[1].strip() + # The third line of comments contains the English translation. + root.comment = '' if self.lang == "en" or len(comment_lines) < 3 else comment_lines[2] + + # ud.ComplyWithText is the very first step because it may change the tokenization + # and also it fills SpaceAfter=No, which is used in further steps. + if self._comply_block: + self._comply_block.process_tree(root) + + # `deprel=goeswith` must be fixed now because it also changes the number of nodes. + # Unlike UDv2, Google style uses `goeswith` mostly to fix "wrong" tokenization, + # e.g. "e-mail" written correctly without spaces, but tokenized into three words. + # Moreover, the hyphen is not always marked with `goeswith`. + if self.lang in {'de', 'fr', 'it', 'pt', 'ru', 'tr'}: + for node in root.descendants: + if node.form == '-' and node.no_space_after and node.prev_node.no_space_after: + if 'goeswith' in (node.prev_node.deprel, node.next_node.deprel): + node.deprel = 'goeswith' + if self.lang == 'fr': + node.deprel = 'goeswith' + node.parent = node.next_node + for node in root.descendants: + self.fix_goeswith(node) + + # Google Turkish annotation of coordination is very different from both UDv1 and UDv2. + # Also some of the deprel=ig nodes should be merged with their parents. 
+ if self.lang == 'tr': + for node in root.descendants: + conjs = [n for n in node.children if n.deprel == 'conj'] + if conjs: + conjs[0].parent = node.parent + conjs[0].deprel = node.deprel + node.deprel = 'conj' + for nonfirst_conj in conjs[1:] + [node]: + nonfirst_conj.parent = conjs[0] + for node in root.descendants: + if node.deprel == 'ig' and re.match('leş|laş', node.parent.form.lower()): + self._merge_with(node.parent, node) + + # Multi-word prepositions must be solved before fix_deprel() fixes pobj+pcomp. + for node in root.descendants: + self.fix_multiword_prep(node) + + # Fixing feats, upos and deprel in separate steps (the order is important). + for node in root.descendants: + self.fix_feats(node) + for node in root.descendants: + self.fix_upos(node) + for node in root.descendants: + self.fix_deprel(node) + + # This needs to be executed after all other deprels are converted + for node in root.descendants: + if node.deprel in ('acomp', 'attr'): # TODO not sure about attr + copula = node.parent + node.parent = copula.parent + node.deprel = copula.deprel + copula.parent = node + copula.deprel = 'cop' + for child in copula.children: + child.parent = node + + # call ud.Convert1to2 + super().process_tree(root) + + for block in ( + self._addmwt_block, # e.g. "im" -> "in dem" in de. Must follow Convert1to2. + self._joinasmwt_block, # no pair of alphabetical words with SpaceAfter=No + self._fixrigheaded_block, # deprel=fixed,flat,... should be always head-initial + self._fixchain_block, # and form a flat structure, not a chain. + self._fixpunct_block): # commas should depend on the subord unit. 
+ if block: + block.process_tree(root) + + if self.lang == 'tr': + root.children[0].deprel = 'root' + for node in root.descendants: + if node.deprel in {'obl:poss', 'obl:arg'}: + node.udeprel = 'nmod' + + def fix_goeswith(self, node): + """Solve deprel=goeswith which is almost always wrong in the Google annotation.""" + if node.deprel != 'goeswith': + return + + # It has been decided German should use "compound" and keep e.g. "E-mail" as three words. + # The only two cases we want to merge are: + # * dots marking ordinal numbers (21. Oktober) should be merged with the number + # keeping the upos of the number (Google has the dot as parent, don't ask me why). + # There are still bugs in the output ("Oktober" depends on "21.") which I give up. + # * apostrophes in foreign words "don't" or "Smith'" (the original English was "Smith's"). + if self.lang == 'de': + if (node.precedes(node.parent) and node.misc['SpaceAfter'] == 'No' + and node.next_node.form in ".'"): + node.next_node.upos = node.upos + self._merge_with(node.next_node, node) + elif (node.parent.precedes(node) and node.prev_node.misc['SpaceAfter'] == 'No' + and node.prev_node.form in ".'"): + node.prev_node.upos = node.upos + self._merge_with(node.prev_node, node) + else: + node.deprel = 'compound' + + # Other languages use goeswith for marking Google-tokenization errors. + # In Portuguese, there are in addition cases like "Primeira Dama". + elif self.lang in {'fr', 'it', 'pt', 'ru', 'tr'}: + if node.precedes(node.parent) and node.misc['SpaceAfter'] == 'No': + self._merge_with(node.next_node, node) + elif node.parent.precedes(node) and node.prev_node.misc['SpaceAfter'] == 'No': + self._merge_with(node.prev_node, node) + elif self.lang in {'pt'}: + node.deprel = 'compound' + + @staticmethod + def _merge_with(node, delete_node): + """Concat forms, merge feats, remove `delete_node`, and keep SpaceAfter of the right node. + + Should be called only on neighboring nodes. 
+ """ + if node.precedes(delete_node): + node.form += delete_node.form + node.misc['SpaceAfter'] = delete_node.misc['SpaceAfter'] + else: + node.form = delete_node.form + node.form + if node.parent == delete_node: + node.parent = delete_node.parent + for child in delete_node.children: + child.parent = node + delete_node.feats.update(node.feats) + node.feats = delete_node.feats + # node.misc['Merge'] = 1 + delete_node.remove() + + def fix_multiword_prep(self, node): + """Solve pobj/pcomp depending on pobj/pcomp. + + Only some of these cases are multi-word prepositions (which should get deprel=fixed). + """ + if node.deprel in ('pobj', 'pcomp') and node.parent.deprel in ('pobj', 'pcomp'): + lo_prep = node.parent + hi_prep = node.parent.parent + if hi_prep.deprel != 'prep': + return + # E.g. in "from A to B", the Google style attaches "to"/pcomp under "from"/prep. + # Let's use this heuristics: if the prepositions are not next to each other, + # they should be siblings (as in "from A to B"). + if abs(lo_prep.ord - hi_prep.ord) != 1: + lo_prep.parent = hi_prep.parent + lo_prep.deprel = 'prep' + # Some languages (e.g. pt) in UDv2 do not use multi-word prepositions at all. + elif self.lang in {'pt'}: + node.parent = hi_prep + lo_prep.parent = node + lo_prep.deprel = 'case' + elif self.lang == 'es' and lo_prep.form in {'entre', 'en', 'a'}: + node.parent = hi_prep + lo_prep.parent = node + lo_prep.deprel = 'case' + elif self.lang == 'es' and lo_prep.form == 'para': + node.parent, node.deprel = hi_prep.parent, 'obj' + lo_prep.deprel, hi_prep.deprel = 'mark', 'mark' + lo_prep.parent, hi_prep.parent = node, node + # Otherwise, they are probably multi-word prepositions, e.g. "according to", + # but they can also be sibling prepositions, e.g. "out of". + # The Google style does not distinguish those and I don't see any heuristics, + # so let's mark these cases as ToDo. 
+ else: + first_prep, second_prep = hi_prep, lo_prep + if lo_prep.precedes(hi_prep): + first_prep, second_prep = lo_prep, hi_prep + first_prep.parent = hi_prep.parent + second_prep.parent = first_prep + for prep_child in second_prep.children: + prep_child.parent = first_prep + second_prep.deprel = 'fixed' + if self.lang == 'es' and lo_prep.form == 'par': + pass + else: + self.log(second_prep, 'unsure-multi-prep', 'deprel=fixed, but may be siblings') + + @staticmethod + def fix_feats(node): + """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE.""" + orig_feats = dict(node.feats) + node.feats = None + for name, value in sorted(orig_feats.items()): + name = name.split('/')[1] + if name == 'inflection_type': + node.misc['InflectionType'] = value.capitalize() + continue + if "antecedent" in name and node.upos == 'PRON': + node.feats["PronType"] = "Prs" + new = FEATS_CHANGE.get(name + '=' + value) + if new is not None: + if new != '': + for new_pair in new.split('|'): + new_name, new_value = new_pair.split('=') + node.feats[new_name] = new_value + elif name[0].isupper(): + node.feats[name] = value + else: + node.feats[name.capitalize()] = value.capitalize() + + # Don't loose info about proper names which will not have upos=PROPN. + if node.feats['Proper'] == 'True': + if node.xpos not in {'NNP', 'NNPS'}: + node.misc['Proper'] = 'True' + del node.feats['Proper'] + + def fix_upos(self, node): + """PRT→PART, .→PUNCT, NOUN+Proper→PROPN, VERB+neg→AUX etc.""" + if node.xpos == 'SYM': # These are almost always tagged as upos=X which is wrong. + node.upos = 'SYM' + if node.deprel in {'punct', 'p'}: + if node.form in "_-.؟”'": + node.upos = 'PUNCT' + else: + node.deprel = 'dep' # This is another way how to say deprel=todo. 
+ elif node.upos == '.': + node.upos = 'PUNCT' + elif node.upos == 'PRT': + node.upos = 'PART' + elif node.upos == 'NOUN': + if node.xpos in {'NNP', 'NNPS'}: + node.upos = 'PROPN' + + # Japanese uses negators with deprel=neg, which should be changed to advmod in Convert1to2. + if node.upos == "VERB" and node.deprel == "neg": + node.upos = "AUX" + + # Indonesian uses prefixes (me, di, ber, ke,...) and suffixes (an, kan, i,...), + # which are written without spaces with the main word/stem (according to the raw text). + # These could be treated as syntactic words and annotated using multi-word tokens. + # However, there is no annotation about their dependency relations (just suff, pref) + # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style. + # Chinese AFFIXes are more tricky to convert. + # It seems these words are quite often tagged as PART in UD_Chinese. + if node.upos == 'AFFIX': + if node.deprel == 'suff': + node.prev_node.form += node.form + node.remove(children='rehang') + elif node.deprel == 'pref': + node.next_node.form = node.form + node.next_node.form + node.remove(children='rehang') + else: + self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) + node.upos = 'PART' + + if node.upos == 'PUNCT' and node.form in ('$', '£'): + node.upos = 'SYM' + + if node.upos == "NUM" and node.deprel == "det" and not node.form.isnumeric(): + node.upos = "DET" + + if self.lang == 'de' and node.upos == 'CONJ' and node.form.lower() == 'zu': + node.deprel = 'mark' + node.upos = 'PART' + node.xpos = 'RP' + if node.parent.deprel == 'aux': + node.parent = node.parent.parent + + if node.upos == 'CONJ' and node.deprel == 'mark': + node.upos = 'SCONJ' + + if self.lang == 'fr': + if node.upos == 'PROPN' and node.form.lower() in FR_DAYS_MONTHS: + node.upos = 'NOUN' + if node.form == 'États-Unis': + node.upos = 'PROPN' + + def fix_deprel(self, node): + """Convert Google dependency relations to UD deprels. + + Change topology where needed. 
+ """ + try: + node.deprel = DEPREL_CHANGE[node.deprel] + except KeyError: + pass + + if node.deprel in ('nn', 'compound'): + if node.upos == 'PROPN' and node.parent.upos == 'PROPN': + node.deprel = 'flat:name' + else: + node.deprel = 'compound' + elif node.deprel in ('pobj', 'pcomp'): + if node.parent.deprel in ('case', 'prep', 'conj'): + preposition = node.parent + node.parent = preposition.parent + preposition.parent = node + + # ud.Convert1to2 will change 'nmod' to 'obl' if needed + if preposition.deprel == 'conj': + node.deprel = 'conj' + preposition.deprel = 'case' + elif node.deprel == 'pobj': + node.deprel = 'nmod' + else: + node.deprel = 'xcomp' # TODO check if pcomp -> xcomp is correct + + # Prepositions should not have any children (except for deprel=fixed/mwe), see + # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words. + # Unfortunatelly, there are many annotation errors and it is almost always better + # to rehang the extra children (at least to prevent spurious non-projectivities). + # In case of PUNCTuation it is surely correct. + # Otherwise, let's mark it as ToDo. 
+ for extra_prep_child in preposition.children: + if extra_prep_child.udeprel in ('fixed', 'mwe'): + continue + extra_prep_child.parent = node + if extra_prep_child.upos != 'PUNCT': + self.log(extra_prep_child, 'ex-adp-child', 'was an extra adposition child') + else: + self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case') + node.deprel = 'obj' + elif node.deprel == 'infmod': + node.deprel = 'xcomp' + node.feats['VerbForm'] = 'Inf' + elif node.deprel == 'partmod': + node.deprel = 'ccomp' + node.feats['VerbForm'] = 'Part' + elif node.deprel == 'suff': + node.misc['OrigDeprel'] = 'suff' + node.deprel = 'dep' + elif node.deprel == 'gmod': + node.deprel = 'nmod' if node.feats['Case'] == 'Gen' else 'nmod:gmod' + elif node.deprel == 'cc': + if node.upos == 'PUNCT' and node.form == ',': + node.deprel = 'punct' + elif node.deprel == 'parataxis': + if node.children: + cc_node = node.descendants[0].prev_node + if cc_node.udeprel == 'cc' and cc_node.parent == node.parent: + node.deprel = 'conj' + elif node.deprel == 'dislocated': + if self.lang == 'fr': + nsubj = next((n for n in node.parent.children if n.udeprel == 'nsubj'), None) + if nsubj is not None: + node.deprel = 'nsubj' + nsubj.deprel = 'expl' if nsubj.upos == 'PRON' else 'dislocated' + elif node.deprel == 'appos': + if self.lang == 'fr' and node.parent.form in {'M.', 'Mme', 'Dr'}: + node.deprel = 'flat:name' + elif node.deprel == 'prt': + if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no', 'th'}: + node.deprel = 'compound:prt' + elif self.lang == 'tr': + node.deprel = 'advmod:emph' + else: + node.deprel = 'dep:prt' + elif node.deprel == 'redup': + node.deprel = 'compound:plur' if self.lang == 'id' else 'compound:redup' + elif node.deprel == 'ig': + if node.parent.form == 'ki' and node.parent.deprel not in {'prep', 'pobj'}: + ki = node.parent + node.deprel = ki.deprel + ki.upos = 'ADP' + ki.deprel = 'case' + node.parent = ki.parent + ki.parent = node + elif node.upos == 'AUX' or node.form == 'ler': # 
dır, dir, ydi, dı, ydı, tu, değil,... + node.deprel = 'cop' + elif node.parent.upos == 'AUX': # yaşlıyken, gençken,... + copula = node.parent + node.parent = copula.parent + copula.parent = node + node.deprel = copula.deprel + copula.deprel = 'cop' + elif node.upos == 'PUNCT': + node.deprel = 'punct' + else: + node.deprel = 'dep:ig' diff --git a/udapi/block/ud/he/fixneg.py b/udapi/block/ud/he/fixneg.py index 5062854c..15325990 100644 --- a/udapi/block/ud/he/fixneg.py +++ b/udapi/block/ud/he/fixneg.py @@ -6,6 +6,7 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Hebrew.""" diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py new file mode 100644 index 00000000..004ab4af --- /dev/null +++ b/udapi/block/ud/hi/fixaux.py @@ -0,0 +1,170 @@ +""" +Block to fix annotation of verbs that are currently treated as auxiliaries +but they should be treated as normal verbs instead. +""" +from udapi.core.block import Block +import logging +import re + +class FixAux(Block): + + def process_node(self, node): + self.fix_lemma(node) + # The following verbs appear in verb-verb compounds as the semantically + # less salient element: le (to take), de (to give), ḍāla / phenka (to throw), + # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring), + # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk), + # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop), + # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch), + # gujara (to pass), ghera (to surround), baca (to escape). + # There are also jā (to go) and paṛa (to fall) but we do not list them here + # because they can also act as genuine auxiliaries. 
+ hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच'] + urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ'] + recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$' + # Control and raising verbs. + # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary. + # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary. + # दिखाना دکھانا (dikhānā) “to show” + # बनना بننا (bananā) “to become” + hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा'] + urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن'] + rephase = r'^(' + '|'.join(hiphase + urphase) + r')$' + if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + node.deprel = 'compound' + # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not. + # It is not even a verb (it does not have a verbal paradigm); it is more + # like an adjective morphologically, and like a noun syntactically. It means + # “the one who does the action of the content verb infinitive.” + # Some occurrences in the original annotation are case or mark, so we do not + # check AUX/aux here. + elif node.lemma == 'वाला' or node.lemma == 'والا': + node.upos = 'ADJ' + node.feats['AdpType'] = '' + node.feats['VerbForm'] = '' + node.feats['Aspect'] = '' + node.deprel = 'compound' + elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux': + secpred = node.parent + grandparent = secpred.parent + node.parent = grandparent + node.deprel = secpred.deprel + secpred.parent = node + secpred.deprel = "xcomp" + ###!!! We should also take care of DEPS if they exist. 
+ # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX. + node.upos = "VERB" + # Examine the children of the original parent. + # Those that modify the clause should be re-attached to me. + # Those that modify the word (noun, adjective) should stay there. + for c in secpred.children: + # obl is borderline. It could modify an adjective rather than a clause. + # obj and iobj should not occur in copular clauses but it sometimes + # occurs with pseudocopulas: "I declare him handsome." + if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel): + c.parent = node + + def fix_lemma(self, node): + """ + Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. If they + are tagged AUX, it means that either the validator fails to recognize a + correct auxiliary, or we fail here to recognize a spurious auxiliary that + must be fixed. + """ + if node.upos == 'AUX': + # آنے is the oblique infinitive form of “to come” + if node.lemma == 'آنہ': + node.lemma = 'آ' + # بنانا बनाना “make, create, produce, cause to be/become” + # (I don't know why in some instances بنا was used as lemma for کر “to do”.) + if node.form == 'کر' and node.lemma == 'بنا': + node.lemma = 'کر' + # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?) 
+ if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے': + node.lemma = 'چاہئے' + if node.form == 'چاہئیں': + node.lemma = 'چاہئے' + node.feats['Number'] = 'Plur' + # چاہے seems to be a wrong lemma of چاہیں_گے “would like” + if node.lemma == 'چاہے': + node.lemma = 'چاہ' + # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished” + if node.lemma == 'चुका': + node.lemma = 'चुक' + if node.lemma == 'چکا': + node.lemma = 'چک' + # दिया دیا is a perfective participle of देना دینا (denā) “to give” + if node.lemma == 'दिया': + node.lemma = 'दे' + if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت': + node.lemma = 'دے' + # دکھائیں (dikhānā) “to show” + if node.form == 'دکھائیں': + node.lemma = 'دکھا' + # گا, گی, گے denote the future tense. They are written as separate + # words in Urdu (while they are just suffixes in Hindi). However, + # when written as a separate auxiliary, all these forms should share + # the same lemma. + if node.lemma == 'گی' or node.lemma == 'گے': + node.lemma = 'گا' + # گیا is a perfective participle of जाना جانا‎ (jānā) “to go” + # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا‎. + if node.lemma == 'जाना' or node.lemma == 'जान': + node.lemma = 'जा' + if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات': + node.lemma = 'جا' + # Wrongly lemmatized present forms of “to be”. + # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form. 
+ if node.lemma == 'हों' or node.lemma == 'है.': + node.lemma = 'है' + if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے': + node.lemma = 'ہے' + # लिया لیا is a perfective participle of लेना لینا (lenā) “to take” + # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form. + if node.lemma == 'लिया': + node.lemma = 'ले' + if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے': + node.lemma = 'لے' + # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear” + if node.lemma == 'लगा': + node.lemma = 'लग' + if node.lemma == 'لگا': + node.lemma = 'لگ' + # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach” + if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच': + node.lemma = 'पहुंच' + # پڑے is a perfective participle of پڑنا (paṛnā) “to fall” + if node.lemma == 'پڑے': + node.lemma = 'پڑ' + # پھرے is a perfective participle of پھرنا (pharnā) “to return” + if node.lemma == 'پھرے': + node.lemma = 'پھر' + # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay” + if node.lemma == 'रहा' or node.lemma == 'रहूं': + node.lemma = 'रह' + if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے': + node.lemma = 'رہ' + # sakna to be able to + if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت': + node.lemma = 'سک' + # Wrongly lemmatized past forms of “to be”. + if node.lemma == 'थी': + node.lemma = 'था' + if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں': + node.lemma = 'تھا' + # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up” + if node.lemma == 'उठा': + node.lemma = 'उठ' + if node.lemma == 'اٹھا': + node.lemma = 'اٹھ' + # The compound part vālā is not an auxiliary. We handle it in process_node() + # but it must be lemmatized properly. 
+ if node.lemma == 'والی': + node.lemma = 'والا' + # The postposition ke after a verbal stem is not an auxiliary. + # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases” + if node.lemma == 'کا' and node.form == 'کے': + node.upos = 'ADP' + node.deprel = 'mark' diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py new file mode 100644 index 00000000..a8d50748 --- /dev/null +++ b/udapi/block/ud/id/addmwt.py @@ -0,0 +1,219 @@ +""" +Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with +MorphInd whose output is stored in MISC attribute MorphInd). +""" +import udapi.block.ud.addmwt +import logging +import re + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku

_PS1|kamu

_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB': + splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE) + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON VERB' + if re.search(r'^ku ', splitform.lower()): + lemma = re.sub(r'^ku ', 'aku ', splitform.lower()) + feats = 'Number=Sing|Person=1|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS1 VSA' + else: + lemma = re.sub(r'^kau ', 'kamu ', splitform.lower()) + feats = 'Number=Sing|Person=2|PronType=Prs *' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split())<2: + xpos = 'PS2 VSA' + deprel = 'nsubj *' + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia

_PS3|aku

_PS1|kamu

_PS2)\$$', node.misc['MorphInd']): + if node.upos == 'VERB': + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # For transitive verbs with the meN- prefix, -nya is an object clitic. + # For passive verbs with the di- prefix, -nya refers to a passive agent. + # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization. + # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive). + menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False + diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False + nominalization = not menverb and not diverb + # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + if nominalization: + lemma = splitform.lower() + upos = 'VERB DET' + feats = '* Definite=Def|PronType=Art' + deprel = '* det' + else: + upos = 'VERB PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + # The agent of the passive verb is coded like a direct object of an active verb, + # so we might want to use obj:agent rather than obl:agent. However, full nominals + # as passive agents can be optionally accompanied by the preposition _oleh_ "by", + # which is an argument in favor of saying that they are oblique. 
So we currently + # mark all passive agents as obliques, although it is disputable in Austronesian + # languages (unlike Indo-European passives). + deprel = '* obl:agent' if diverb else '* obj' + xpos = re.sub(r'\+', ' ', node.xpos) + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'(NOUN|PROPN|X)', node.upos): + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. + # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = '* PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE): + # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3. 
+ # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3. + node.feats['Number[psor]'] = '' + node.feats['Person[psor]'] = '' + upos = 'PRON PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs' + xpos = 'NSD PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs' + xpos = 'NSD PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs' + xpos = 'NSD PS2' + deprel = '* nmod:poss' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE): + # nominalized adjective + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'ADJ DET' + feats = '* Definite=Def|PronType=Art' + if re.match(r' ', node.xpos): + xpos = re.sub(r'\+', ' ', node.xpos) + else: + xpos = 'ASP PS3' + deprel = '* det' + # 'main': 0 ... 
this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE): + # semua = all (DET) + # semuanya = nominalization of semua, i.e., 'everything' (PRON) + # banyak = many, much (DET) + # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON) + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'DET DET' + feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE): + # satu = one (NUM) + # satunya = nominalization of satu, meaning 'the only one' + splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = 'NUM DET' + feats = 'NumType=Card Definite=Def|PronType=Art' + xpos = re.sub(r'\+', ' ', node.xpos) + deprel = '* det' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE): + # Fused preposition and pronoun. + # Most of them are recognized as R--+PS3 by MorphInd. 
However, some are different: + # bersamanya = 'with him' = VSA+PS3 + # dibawahnya = 'under it' = VSP+PS3 + # didalamnya = 'inside it' = VSP+PS3 + # sekitarnya = 'around it' = D--+PS3 + # However: + # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3) + splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE) + upos = 'ADP PRON' + if re.search(r' nya$', splitform.lower()): + lemma = re.sub(r' nya$', ' dia', splitform.lower()) + feats = '* Number=Sing|Person=3|PronType=Prs' + xpos = 'R-- PS3' + elif re.search(r' ku$', splitform.lower()): + lemma = re.sub(r' ku$', ' aku', splitform.lower()) + feats = '* Number=Sing|Person=1|PronType=Prs' + xpos = 'R-- PS1' + else: + lemma = re.sub(r' mu$', ' kamu', splitform.lower()) + feats = '* Number=Sing|Person=2|PronType=Prs' + xpos = 'R-- PS2' + if node.udeprel == 'case': + if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos): + deprel = 'nmod' + else: + deprel = 'obl' + else: + deprel = '*' + deprel = 'case '+deprel + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel} + else: + # Do not warn about instances that are known exceptions. + # akibatnya = as a result (SCONJ); akibat = result + # bukannya = instead (PART); bukan = no, not + # layaknya = like (ADP); layak = worthy + # sebaiknya = should (AUX) + # sesampainya = once in / arriving at (ADP) + # tidaknya = whether or not (PART); tidak = no, not + # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'. 
+ if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE): + logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos)) + return None + elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']): + splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE) + lemma = splitform.lower() + upos = '* PART' + feats = '* _' + xpos = re.sub(r'\+', ' ', node.xpos) + if len(xpos.split()) < 2: + xpos = xpos + ' T--' + deprel = '* advmod:emph' + # 'main': 0 ... this is the default value (the first node will be the head and inherit children) + return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel} + return None + + def postprocess_mwt(self, mwt): + """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs.""" + match = re.match(r'^\^(.*)\+(aku

_PS1|kamu

_PS2|dia

_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd']) + if not match: + match = re.match(r'^\^(aku

_PS1|kamu

_PS2)\+(.*)\$$', mwt.misc['MorphInd']) + if match: + mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$' + mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$' diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py new file mode 100644 index 00000000..4ea23d06 --- /dev/null +++ b/udapi/block/ud/id/fixgsd.py @@ -0,0 +1,447 @@ +"""Block to fix annotation of UD Indonesian-GSD.""" +from udapi.core.block import Block +import logging +import re + +class FixGSD(Block): + + def fix_upos_based_on_morphind(self, node): + """ + Example from data: ("kesamaan"), the correct UPOS is NOUN, as + suggested by MorphInd. + Based on my observation so far, if there is a different UPOS between + the original GSD and MorphInd, it's better to trust MorphInd + I found so many incorrect UPOS in GSD, especially when NOUNs become + VERBs and VERBs become NOUNs. + I suggest adding Voice=Pass when the script decides ke-xxx-an as VERB. + """ + if node.upos == 'VERB' and node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE): + node.upos = 'NOUN' + if node.udeprel == 'acl': + node.deprel = 'nmod' + elif node.udeprel == 'advcl': + node.deprel = 'obl' + + def fix_semua(self, node): + """ + Indonesian "semua" means "everything, all". + Originally it was DET, PRON, or ADV. + Ika: I usually only labeled "semua" as DET only if it's followed by a + NOUN/PROPN. If it's followed by DET (including '-nya' as DET) or it's + not followed by any NOUN/DET, I labeled them as PRON. + """ + if node.form.lower() == 'semua': + if re.match(r'^(NOUN|PROPN)$', node.parent.upos) and node.parent.ord > node.ord: + node.upos = 'DET' + if node.udeprel == 'nmod' or node.udeprel == 'advmod': + node.deprel = 'det' + else: + node.upos = 'PRON' + if node.udeprel == 'det' or node.udeprel == 'advmod': + node.deprel = 'nmod' + node.feats['PronType'] = 'Tot' + + def fix_ordinal_numerals(self, node): + """ + Ordinal numerals should be ADJ NumType=Ord in UD. 
They have many different + UPOS tags in Indonesian GSD. This method harmonizes them. + pertama = first + kedua = second + ketiga = third + keempat = fourth + kelima = fifth + keenam = sixth + ketujuh = seventh + kedelapan = eighth + kesembilan = ninth + ke-48 = 48th + + However! The ke- forms (i.e., not 'pertama') can also function as total + versions of cardinal numbers ('both', 'all three' etc.). If the numeral + precedes the noun, it is a total cardinal; if it follows the noun, it is + an ordinal. An exception is when the modified noun is 'kali' = 'time'. + Then the numeral is ordinal regardless where it occurs, and together + with 'kali' it functions as an adverbial ordinal ('for the second time'). + """ + # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos) + if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE): + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE): + if node.parent.ord < node.ord or node.parent.lemma == 'kali': + node.upos = 'ADJ' + node.feats['NumType'] = 'Ord' + if re.match(r'^(det|nummod|nmod)$', node.udeprel): + node.deprel = 'amod' + else: + node.upos = 'NUM' + node.feats['NumType'] = 'Card' + node.feats['PronType'] = 'Tot' + if re.match(r'^(det|amod|nmod)$', node.udeprel): + node.deprel = 'nummod' + + def rejoin_ordinal_numerals(self, node): + """ + If an ordinal numeral is spelled using digits ('ke-18'), it is often + tokenized as multiple tokens, which is wrong. Fix it. 
+ """ + if node.form.lower() == 'ke': + dash = None + number = None + if node.next_node: + if node.next_node.form == '-': + dash = node.next_node + if dash.next_node and re.match(r'^\d+$', dash.next_node.form): + number = dash.next_node + node.form = node.form + dash.form + number.form + node.lemma = node.lemma + dash.lemma + number.lemma + elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node): + number = node.next_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = node.form + '-' + number.form + node.form = node.form + number.form + node.lemma = node.lemma + '-' + number.lemma + if number: + # Let us pretend that these forms are always ordinal numerals. + # Situations where they act as total cardinals will be disambiguated + # in a subsequent call to fix_ordinal_numerals(). + node.upos = 'ADJ' + node.xpos = 'CO-' + node.feats['NumType'] = 'Ord' + node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'amod' + # Adjust SpaceAfter. + node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def rejoin_decades(self, node): + """ + In Indonesian, the equivalent of English "1990s" is written as "1990-an". + In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it. 
+ """ + if node.form.lower() == 'an': + dash = None + number = None + if node.prev_node: + if node.prev_node.form == '-': + dash = node.prev_node + if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form): + number = dash.prev_node + node.form = number.form + dash.form + node.form + node.lemma = number.lemma + dash.lemma + node.lemma + elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node): + number = node.prev_node + node.feats['Typo'] = 'Yes' + node.misc['CorrectForm'] = number.form + '-' + node.form + node.form = number.form + node.form + node.lemma = number.lemma + '-' + node.lemma + if number: + # The combined token is no longer a numeral. It cannot quantify an entity. + # Instead, it is itself something like a noun (or perhaps proper noun). + node.upos = 'NOUN' + node.xpos = 'NSD' + node.feats['NumType'] = '' + # In some cases, "-an" is labeled as foreign for no obvious reason. + node.feats['Foreign'] = '' + node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$' + # Find the parent node. Assume that the dash, if present, was not the head. + if node.parent == number: + node.parent = number.parent + node.deprel = number.deprel + if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel): + node.deprel = 'nmod' + # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex. + #node.misc['SpaceAfter'] = 'No' if number.no_space_after else '' + # Remove the separate node of the dash and the number. + if dash: + if len(dash.children) > 0: + for c in dash.children: + c.parent = node + dash.remove() + if len(number.children) > 0: + for c in number.children: + c.parent = node + number.remove() + # There may have been spaces around the dash, which are now gone. Recompute the sentence text. + node.root.text = node.root.compute_text() + + def merge_reduplication(self, node): + """ + Reduplication is a common morphological device in Indonesian. 
Reduplicated + nouns signal plural but some reduplications also encode emphasis, modification + of meaning etc. In the previous annotation of GSD, reduplication was mostly + analyzed as three tokens, e.g., for plurals, the second copy would be attached + to the first one as compound:plur, and the hyphen would be attached to the + second copy as punct. We want to analyze reduplication as a single token. + Fix it. + """ + # We assume that the previous token is a hyphen and the token before it is the parent. + first = node.parent + root = node.root + # Example of identical reduplication: negara-negara = countries + # Example of reduplication with -an: kopi-kopian = various coffee trees + # Example of reduplication with vowel substitution: bolak-balik = alternating + # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized) + # Example of reduplication with se-: sehari-hari = daily (hari = day) + # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word. + if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())): + hyph = node.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # This is specific to the reduplicated plurals. The rest will be done for any reduplications. + # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN. + ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen. + ###!!! Some other reduplications have slight modifications on one or the other side. 
+ if node.upos == 'NOUN' and first.form.lower() == node.form.lower(): + first.feats['Number'] = 'Plur' + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()): + first.lemma = first.lemma + '-' + node.lemma + first.upos = node.upos + first.xpos = node.xpos + first.feats = node.feats + first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd']) + # Neither the hyphen nor the current node should have children. + # If they do, re-attach the children to the first node. + for c in hyph.children: + c.parent = first + for c in node.children: + c.parent = first + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = node.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + first.form = first.form + '-' + node.form + hyph.remove() + node.remove() + first.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([first, second], form=first.form + second.form, misc=mwtmisc) + else: + first.form = first.form + '-' + node.form + if node.no_space_after: + first.misc['SpaceAfter'] = 'No' + else: + first.misc['SpaceAfter'] = '' + hyph.remove() + node.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. + # If it did not, then we have a mismatch with the sentence text, which we must fix. 
+ # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it. + elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()): + prefix = node + stem = first # here it is not the first part at all + hyph = stem.prev_node + if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form): + # For the non-/sub-/anti- prefix we want to take the morphology from the second word. + stem.lemma = prefix.lemma + '-' + stem.lemma + stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd']) + # Neither the hyphen nor the prefix should have children. + # If they do, re-attach the children to the stem. + for c in hyph.children: + c.parent = stem + for c in prefix.children: + c.parent = stem + # Merge the three nodes. + # It is possible that the last token of the original annotation + # is included in a multi-word token. Then we must extend the + # multi-word token to the whole reduplication! Example: + # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-' + # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'. + mwt = stem.multiword_token + if mwt: + # We assume that the MWT has only two words. We are not prepared for other possibilities. + if len(mwt.words) > 2: + logging.critical('MWT of only two words is expected') + mwtmisc = mwt.misc.copy() + second = mwt.words[1] + mwt.remove() + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + stem.misc['SpaceAfter'] = '' + mwt = root.create_multiword_token([stem, second], form=stem.form + second.form, misc=mwtmisc) + else: + stem.form = prefix.form + '-' + stem.form + prefix.remove() + hyph.remove() + # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen. 
+ # If it did not, then we have a mismatch with the sentence text, which we must fix. + # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-'). + root.text = root.compute_text() + + def fix_plural_propn(self, node): + """ + It is unlikely that a proper noun will have a plural form in Indonesian. + All examples observed in GSD should actually be tagged as common nouns. + """ + if node.upos == 'PROPN' and node.feats['Number'] == 'Plur': + node.upos = 'NOUN' + node.lemma = node.lemma.lower() + if node.upos == 'PROPN': + node.feats['Number'] = '' + + def fix_satu_satunya(self, node): + """ + 'satu' = 'one' (NUM) + 'satu-satunya' = 'the only' + """ + root = node.root + if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu': + satu0 = node.parent.parent + satu1 = node.parent + nya = node + dash = None + if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-': + dash = satu1.prev_node + satu0.misc['SpaceAfter'] = 'No' + dash.misc['SpaceAfter'] = 'No' + root.text = root.compute_text() + satu1.deprel = 'compound:redup' + nya.parent = satu0 + # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian. + if node.form == 'nya' and node.parent.form.lower() == 'satu': + satu0 = node.parent + nya = node + if satu0.next_node.form == '-': + dash = satu0.next_node + if dash.next_node.form.lower() == 'satu': + satu1 = dash.next_node + if satu1.ord == node.ord-1: + # Merge satu0 + dash + satu1 into one node. + satu0.form = satu0.form + dash.form + satu1.form + dash.remove() + satu1.remove() + # There should be a multi-word token comprising satu1 + nya. 
+ mwt = nya.multiword_token + if mwt: + mwtmisc = mwt.misc.copy() + mwt.remove() + mwt = root.create_multiword_token([satu0, nya], form=satu0.form + nya.form, misc=mwtmisc) + satu0.misc['SpaceAfter'] = '' + root.text = root.compute_text() + if node.multiword_token and node.no_space_after: + node.misc['SpaceAfter'] = '' + + def lemmatize_from_morphind(self, node): + # The MISC column contains the output of MorphInd for the current word. + # The analysis has been interpreted wrongly for some verbs, so we need + # to re-interpret it and extract the correct lemma. + morphind = node.misc['MorphInd'] + if node.upos == 'VERB': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r"_V[SP][AP]$", "", morphind) + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r"\+", morphind) + # Expected suffixes are -kan, -i, -an, or no suffix at all. + # There is also the circumfix ke-...-an which seems to be nominalized adjective: + # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama"; + # but I am not sure what is the reason that these are tagged VERB. + if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]): + del morphemes[-1] + # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, or no prefix at all. + # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+". + while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. 
+ lemma = re.sub(r"<[a-z]+>(_.*)?$", "", lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + elif node.upos == 'NOUN': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefixes are peN-, per-, ke-, ber-. + # Expected suffix is -an. + if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]): + del morphemes[-1] + if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. + if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + elif node.upos == 'ADJ': + if morphind: + # Remove the start and end tags from morphind. + morphind = re.sub(r"^\^", "", morphind) + morphind = re.sub(r"\$$", "", morphind) + # Remove the final XPOS tag from morphind. + morphind = re.sub(r'_ASS$', '', morphind) + # Do not proceed if there is an unexpected final XPOS tag. + if not re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind): + # Split morphind to prefix, stem, and suffix. + morphemes = re.split(r'\+', morphind) + # Expected prefix is ter-. + if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]): + del morphemes[0] + # Check that we are left with just one morpheme. 
+ if len(morphemes) != 1: + logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats)) + else: + lemma = morphemes[0] + # Remove the stem POS category. + lemma = re.sub(r'<[a-z]+>', '', lemma) + node.lemma = lemma + else: + logging.warning("No MorphInd analysis found for form '%s'" % (node.form)) + + def process_node(self, node): + self.fix_plural_propn(node) + self.fix_upos_based_on_morphind(node) + self.fix_semua(node) + self.rejoin_ordinal_numerals(node) + self.fix_ordinal_numerals(node) + self.rejoin_decades(node) + self.merge_reduplication(node) + self.fix_satu_satunya(node) + self.lemmatize_from_morphind(node) diff --git a/udapi/block/ud/joinasmwt.py b/udapi/block/ud/joinasmwt.py new file mode 100644 index 00000000..be93bd3c --- /dev/null +++ b/udapi/block/ud/joinasmwt.py @@ -0,0 +1,51 @@ +"""Block ud.JoinAsMwt for creating multi-word tokens + +if multiple neighboring words are not separated by a space +and the boundaries between the word forms are alphabetical. +""" +from udapi.core.block import Block + + +class JoinAsMwt(Block): + """Create MWTs if words are not separated by a space..""" + + def __init__(self, revert_orig_form=True, **kwargs): + """Args: + revert_orig_form: if any node of the newly created MWT has `misc['OrigForm']`, + it is used as the FORM (and deleted from MISC). Useful after `ud.ComplyWithText`. + Default=True. 
+ """ + super().__init__(**kwargs) + self.revert_orig_form = revert_orig_form + + def process_node(self, node): + if node.multiword_token: + return + mwt_nodes = [node] + while (node.next_node and not node.next_node.multiword_token + and self.should_join(node, node.next_node)): + node = node.next_node + mwt_nodes.append(node) + if len(mwt_nodes) > 1: + self.create_mwt(mwt_nodes) + + def should_join(self, node, next_node): + return node.no_space_after and node.form[-1].isalpha() and next_node.form[0].isalpha() + + def create_mwt(self, mwt_nodes): + mwt_form = ''.join([n.form for n in mwt_nodes]) + mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form) + if mwt_nodes[0].misc['SpaceAfter'] == 'No': + mwt.misc['SpaceAfter'] = 'No' + for mwt_node in mwt_nodes: + del mwt_node.misc['SpaceAfter'] + if self.revert_orig_form: + for mwt_node in mwt_nodes: + if mwt_node.misc['OrigForm']: + mwt_node.form = mwt_node.misc['OrigForm'] + del mwt_node.misc['OrigForm'] + self.postprocess_mwt(mwt) + + # a helper method to be overridden + def postprocess_mwt(self, mwt): + pass diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. (The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) 
+ Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. + """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. 
+ """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. + for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). + prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. 
+ node.remove() diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py new file mode 100644 index 00000000..044ff178 --- /dev/null +++ b/udapi/block/ud/kk/fixspuriousaux.py @@ -0,0 +1,27 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Kazakh.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + if node.upos == 'AUX' and node.udeprel == 'aux': + # баста = start + if re.match(r'^(баста|кет)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/la/addmwt.py b/udapi/block/ud/la/addmwt.py new file mode 100644 index 00000000..27831151 --- /dev/null +++ b/udapi/block/ud/la/addmwt.py @@ -0,0 +1,41 @@ +""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. 
""" +import udapi.block.ud.addmwt + +MWTS = { + 'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, + 'secum': {'lemma': 'sui cum', 'form': 'se cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, # can be singular or plural + 'nonne': {'lemma': 'non ne', 'form': 'non ne', 'upos': 'PART PART', 'feats': 'Polarity=Neg Clitic=Yes|PartType=Int', 'deprel': 'advmod:neg discourse', 'shape': 'sibling'} +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + # v['xpos'] = '' # treebank-specific + if 'shape' not in v: + v['shape'] = 'subtree' + v['main'] = 0 + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + if node.form.endswith('.') and len(node.form) > 1 and node.form != '...': + # currently under discussion + return {'form': node.form[:-1] + ' .', + 'lemma': '* .', + 'upos': '* PUNCT', + 'xpos': '_ _', + 'feats': '* _', + 'deprel': '* 
punct', + 'main': 0, + 'shape': 'subtree'} + diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..a7b506e8 --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,338 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. + + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. By default, a more conservative set of features and + values is expected. + """ + super().__init__(**kwargs) + self.flavio = flavio + + def process_node(self, node): + rf = [] + af = {} + # PROIEL-specific: greek words without features + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: + pass + # NOUNS ################################################################ + elif node.upos == 'NOUN': + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'VerbForm': ['Part', 'Vnoun']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns + rf = ['Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] + af = { + 'NumType': ['Dist', 'Mult', 'Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Person'] = ['3'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 3rd person must have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: + af['PronType'].append('Ind') + elif 
node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + if node.lemma in ['qui', 'quicumque', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: + af['PronType'].append('Int') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] + } + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = ['Yes'] + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). 
+ if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: + af['PronType'].append('Con') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + af['PronType'].append('Art') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card', 'Ord'], + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} + # Arabic digits and Roman numerals do not have inflection features. + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + rf = ['VerbForm', 'Aspect'] + af = { + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], + 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], + 'Polarity': ['Neg'], + 'Typo': ['Yes'] + } + if node.feats['VerbForm'] not in ['Part', 'Conv']: + rf.append('Tense') + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): + rf.append('Voice') + af['Voice'] = ['Act', 'Pass'] + if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + elif node.feats['VerbForm'] 
== 'Part': + rf.extend(['Gender', 'Number', 'Case']) + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs', 'Cmp'] + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = ['Prosp'] + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum + 'Polarity': ['Neg'] + } + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin', 'Part'] + af['Degree'].append('Dim') + self.check_allowed_features(node, af) + # PARTICLES ############################################################ + 
elif node.upos == 'PART': + af = { + 'PartType': ['Int', 'Emp'], + 'Polarity': ['Neg'] + } + if self.flavio: + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg'], + 'Compound': ['Yes']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } + if self.flavio: + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py new file mode 100644 index 00000000..a234256f --- /dev/null +++ b/udapi/block/ud/lemmatize.py @@ -0,0 +1,42 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def process_node(self, node): + """ + Some treebanks lack lemmas for some or all words. Occasionally we may be + able to guess that the lemma is identical to the word form. This block + will then fill out the lemma. + + For some parts of speech, we can only say that the form is the lemma if + we have morphological features that will confirm it is the right form. 
+ """ + if (node.lemma == '' or node.lemma == '_') and node.form != '_' and node.feats['Typo'] != 'Yes': + # Many closed classes do not inflect and have the same lemma as the form (just lowercased). + if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos): + node.lemma = node.form.lower() + # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV + # ADV: use positive affirmative + elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # VERB and AUX: use the infinitive + elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN) + # Note: This rule is wrong in German, where no nouns should be lowercased. + elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form + # ADJ: use masculine singular nominative positive affirmative + elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']): + node.lemma = node.form.lower() + # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma) + elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', 
node.feats['Case']): + node.lemma = node.form.lower() + # NUM: use masculine nominative (number, if present at all, is lexical) + elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']): + node.lemma = node.form.lower() diff --git a/udapi/block/ud/lt/fixedeprels.py b/udapi/block/ud/lt/fixedeprels.py new file mode 100644 index 00000000..9b1cb98d --- /dev/null +++ b/udapi/block/ud/lt/fixedeprels.py @@ -0,0 +1,144 @@ +"""Block to fix case-enhanced dependency relations in Lithuanian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'jako' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'kaip': [], + 'lyg': [], + 'negu': [], + 'nei': [], + 'nes': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
+ unambiguous = { + 'apie': 'apie:acc', # about (topic) + 'dėl': 'dėl:gen', # because of + 'iki': 'iki:gen', # until + 'iš': 'iš:gen', # from, out of + 'į': 'į:acc', # to, into, in + 'jei': 'jei', # remove morphological case # if + 'jeigu': 'jeigu', # remove morphological case # if + 'jog': 'jog', # remove morphological case # because + 'kadangi': 'kadangi', # remove morphological case # since, because + 'kai': 'kai', # remove morphological case # when + 'kaip': 'kaip', # remove morphological case # as, than + 'lyg': 'lyg', # remove morphological case # like + 'negu': 'negu', # remove morphological case # than + 'nei': 'nei', # remove morphological case # more than + 'nes': 'nes', # remove morphological case # because + 'nors': 'nors', # remove morphological case # though, although, when, if + 'nuo': 'nuo:gen', # from + 'pagal': 'pagal:acc', # according to, under, by + 'pagal_dėl': 'pagal:acc', + 'per': 'per:acc', # through, over (přes) + 'prie': 'prie:gen', # to, at, near, under + 'prieš': 'prieš:acc', # against + 'su': 'su:ins', # with + 'tarp': 'tarp:gen', # between + 'tarsi': 'tarsi', # remove morphological case # as if + 'virš': 'virš:gen' # above + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Czech basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. 
+ """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # Issues caused by errors in the original annotation must be fixed early. + # Especially if acl|advcl occurs with a preposition that unambiguously + # receives a morphological case in the subsequent steps, and then gets + # flagged as solved. + edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'jako_v' becomes just 'jako'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. Exclude 'nom' and 'voc', which cannot + # be correct. 
+ m = re.match(r'^(obl(?::arg)?|nmod):(po|už)(?::(?:nom|voc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase and not re.search(r':(nom|voc)$', adpcase): + edep['deprel'] = m.group(1)+':'+adpcase + continue + # The remaining instance of 'po' should be ':acc'. + elif m.group(2) == 'po': + edep['deprel'] = m.group(1)+':po:acc' + continue + # The remaining 'už' are ':acc' (they are second conjuncts + # in coordinated oblique modifiers). + elif m.group(2) == 'už': + edep['deprel'] = m.group(1)+':už:acc' + continue + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 37fd94bd..ee58084a 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -8,6 +8,13 @@ Usage: udapy -s ud.MarkBugs < in.conllu > marked.conllu 2> log.txt +Some tests may be customized for individual languages if the language code is +available as the zone id. The zone id can be provided in the sentence id after +the slash (e.g., "sent_id = s125/en" for English), or as a parameter of the +reader: + +udapy -s read.Conllu zone=en ud.MarkBugs < in.conllu > marked.conllu 2> log.txt + Errors are both logged to stderr and marked within the nodes' MISC field, e.g. 
`node.misc['Bug'] = 'aux-chain'`, so the output conllu file can be searched for "Bug=" occurences. @@ -28,25 +35,40 @@ 'VERB': 'VerbForm', } + class MarkBugs(Block): """Block for checking suspicious/wrong constructions in UD v2.""" - def __init__(self, save_stats=True, skip=None, **kwargs): + def __init__(self, save_stats=True, tests=None, skip=None, max_cop_lemmas=2, **kwargs): """Create the MarkBugs block object. Args: save_stats: store the bug statistics overview into `document.misc["bugs"]`? - skip: a regex. If `re.search(skip, short_msg)` the node is not reported. + tests: a regex of tests to include. + If `not re.search(tests, short_msg)` the node is not reported. + You can use e.g. `tests=aux-chain|cop-upos` to apply only those two tests. + Default = None (or empty string or '.*') which all tests. + skip: a regex of tests to exclude. + If `re.search(skip, short_msg)` the node is not reported. You can use e.g. `skip=no-(VerbForm|NumType|PronType)`. + This has higher priority than the `tests` regex. Default = None (or empty string) which means no skipping. + max_cop_lemmas: how many different lemmas are allowed to have deprel=cop. + Default = 2, so all except for the two most frequent lemmas are reported as bugs. 
""" super().__init__(**kwargs) self.save_stats = save_stats self.stats = collections.Counter() + self.tests_re = re.compile(tests) if (tests is not None and tests != '') else None self.skip_re = re.compile(skip) if (skip is not None and skip != '') else None + self.max_cop_lemmas = max_cop_lemmas + self.cop_count = collections.Counter() + self.cop_nodes = collections.defaultdict(list) def log(self, node, short_msg, long_msg): """Log node.address() + long_msg and add ToDo=short_msg to node.misc.""" + if self.tests_re is not None and not self.tests_re.search(short_msg): + return if self.skip_re is not None and self.skip_re.search(short_msg): return logging.debug('node %s %s: %s', node.address(), short_msg, long_msg) @@ -57,35 +79,47 @@ def log(self, node, short_msg, long_msg): node.misc['Bug'] = short_msg self.stats[short_msg] += 1 - # pylint: disable=too-many-branches + # pylint: disable=too-many-branches, too-many-statements def process_node(self, node): - form, deprel, upos, feats = node.form, node.deprel, node.upos, node.feats + form, udeprel, upos, feats = node.form, node.udeprel, node.upos, node.feats parent = node.parent - for dep in ('aux', 'fixed', 'appos', 'goeswith'): - if deprel == dep and parent.deprel == dep: + for dep in ('aux', 'fixed', 'goeswith', 'list'): + if udeprel == dep and parent.udeprel == dep: self.log(node, dep + '-chain', dep + ' dependencies should not form a chain.') - for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith'): - if deprel == dep and node.precedes(parent): + # 'appos-chain' is more difficult to test because nested appositions are allowed. + # The commented-out code below prevents just some of the false alarms + # (those where changing the nested appos into flat would result in non-projectivity). + # Unfortunatelly, there are still too many false alarms, so let's skip this test completely. + # It seems that multiple appositions as siblings are much less common than nested. 
+ # if deprel == 'appos' and parent.deprel == 'appos': + # if not node.precedes(parent.children[-1]): + # self.log(node, 'appos-chain', 'appos should not form a chain except when nested.') + + for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith', 'list'): + if udeprel == dep and node.precedes(parent): self.log(node, dep + '-rightheaded', dep + ' relations should be left-headed, not right.') - if deprel == 'cop' and upos not in ('AUX', 'PRON'): + if udeprel == 'cop' and upos not in ('AUX', 'PRON'): self.log(node, 'cop-upos', 'deprel=cop upos!=AUX|PRON (but %s)' % upos) - if deprel == 'mark' and upos == 'PRON': + if udeprel == 'mark' and upos == 'PRON': self.log(node, 'mark-upos', 'deprel=mark upos=PRON') - if deprel == 'det' and upos not in ('DET', 'PRON'): + if udeprel == 'det' and upos not in ('DET', 'PRON'): self.log(node, 'det-upos', 'deprel=det upos!=DET|PRON (but %s)' % upos) - if deprel == 'punct' and upos != 'PUNCT': + if udeprel == 'punct' and upos != 'PUNCT': self.log(node, 'punct-upos', 'deprel=punct upos!=PUNCT (but %s)' % upos) for i_upos, i_feat in REQUIRED_FEATURE_FOR_UPOS.items(): - if upos == i_upos and not node.feats[i_feat]: - self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) + if upos == i_upos and not feats[i_feat]: + # Some languages do not distinguish finite and non-finite forms of verbs. + # The VerbForm feature is not obligatory in those languages. 
+ if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}: + self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': if upos not in ('VERB', 'AUX'): @@ -93,22 +127,22 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - - subject_children = [n for n in node.children if 'subj' in n.deprel] + subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: - self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child') - - object_children = [n for n in node.children if n.deprel in ('obj', 'ccomp')] + self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') + + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. + # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. + object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') - # In addition to http://universaldependencies.org/svalidation.html - if parent.deprel == 'punct': - self.log(node, 'punct-child', 'parent.deprel=punct') - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. 
# So far, I have just excluded "det" from the forbidded parent.deprel set @@ -119,15 +153,15 @@ def process_node(self, node): # It seems the documentation does not allow any other deprel than advmod, # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). - if parent.deprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if deprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) # goeswith should be left-headed, but this is already checked, so let's skip right-headed. - if deprel == 'goeswith' and parent.precedes(node): + if udeprel == 'goeswith' and parent.precedes(node): span = node.root.descendants(add_self=1)[parent.ord:node.ord] - intruder = next((n for n in span[1:] if n.deprel != "goeswith"), None) + intruder = next((n for n in span[1:] if n.udeprel != "goeswith"), None) if intruder is not None: self.log(intruder, 'goeswith-gap', "deprel!=goeswith but lies within goeswith span") else: @@ -138,10 +172,30 @@ def process_node(self, node): if upos == 'SYM' and form.isalpha(): self.log(node, 'sym-alpha', "upos=SYM but all form chars are alphabetical: " + form) - if upos == 'PUNCT' and any(char.isalpha() for char in form): + if upos == 'PUNCT' and any(char.isalpha() for char in form): self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form) + if upos == 'PUNCT' and udeprel not in ('punct', 'fixed', 'goeswith', 'root'): + self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)' + % udeprel) + + if upos == 'PUNCT' and node.is_nonprojective(): + self.log(node, 'punct-nonproj', 'upos=PUNCT and edge is non-projective') + if upos == 
'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): + self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') + + if udeprel == 'cop': + lemma = node.lemma if node.lemma != '_' else form + self.cop_nodes[lemma].append(node) + self.cop_count[lemma] += 1 + def after_process_document(self, document): + for lemma, _count in self.cop_count.most_common()[self.max_cop_lemmas:]: + for node in self.cop_nodes[lemma]: + self.log(node, 'cop-many-lemmas', 'deprel=cop but lemma=%s not in top-%d' + % (lemma, self.max_cop_lemmas)) + self.cop_count.clear() + self.cop_nodes.clear() total = 0 message = 'ud.MarkBugs Error Overview:' for bug, count in sorted(self.stats.items(), key=lambda pair: (pair[1], pair[0])): diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..26c5624d --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,73 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. + +Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. 
+ # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. + """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..13c8434c --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,279 @@ +""" +Block to identify missing or ill-valued features in Malayalam. 
Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. 
+ if node.feats['Foreign'] == 'Yes': + pass + # NOUNS AND PROPER NOUNS ############################################### + elif re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'NumType': ['Ord'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + } + if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': + rf = ['PronType'] + else: # not reflexive + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] + if node.feats['Number'] == 'Sing': + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not 
distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) + #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) + self.check_allowed_features(node, { + 'PronType': ['Art'], + 'Definite': ['Ind'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['PronType']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. 
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Frac'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + # Unlike other forms, the imperative distinguishes politeness. 
+ # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Polite']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Polite': ['Infm', 'Form'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Pot', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. 
+ #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Foreign': ['Yes'], + 'Typo': ['Yes'] + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub', 'Cnd'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Abbr': ['Yes'], + 'Typo': ['Yes']}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'], + 'Abbr': ['Yes'], + 'Typo': ['Yes'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {'Abbr': ['Yes'], 'Typo': ['Yes']}) diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py new file mode 100644 index 00000000..bd63ee7d --- /dev/null +++ b/udapi/block/ud/mr/addformsinmwt.py @@ -0,0 +1,94 @@ +""" +Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms. +Based on the form of the surface token and on the information provided in +the lemmas and UPOS, tries to reconstruct the forms of individual words. 
+""" +from udapi.core.block import Block +import re +import logging + + +class AddFormsInMwt(Block): + """Guess forms of syntactic worms within a multiword token.""" + + def process_node(self, node): + if node.form == '_' and node.multiword_token: + mwt = node.multiword_token + # Many multiword tokens consist of NOUN + ADP. Beware: The adposition + # may have a form different from its lemma. It happens with possessive + # postpositions चा, चे, which distinguish the gender and number of + # the possessed entity. + if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos): + # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'. + if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची': + mwt.words[1].lemma = 'चा' + # चा (cā) ... Masc Sing + # ची (cī) ... Fem Sing, Neut Plur + # चे (ce) ... Neut Sing, Masc Plur + # च्या (cyā) ... Fem Plur + # चं (caṁ) ... ? + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form) + # The resulting form is different with personal pronouns. + # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā) + # तुझी (tujhī), तुझे (tujhe) + # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā) + # त्याचं (tyācaṁ) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + else: + node.form = 'च' + m2.group(2) + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif mwt.words[1].lemma == 'वरती': + m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = m.group(2) + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." 
% (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + else: # not the possessive 'चा' + m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + else: + node.form = node.lemma + else: + logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma)) + elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos): + # Compound postpositions where the middle word is the possessive 'चा'. + # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'. + if re.match(r'^(चा|च्या)$', mwt.words[1].lemma): + m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form) + m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form) + if m: + if node == mwt.words[0]: + node.form = m.group(1) + elif node == mwt.words[1]: + node.form = m.group(2) + node.lemma = 'चा' + else: + node.form = m.group(3) + elif m2: + if node == mwt.words[0]: + node.form = m2.group(1) + elif node == mwt.words[1]: + node.form = 'च' + m2.group(2) + node.lemma = 'चा' + else: + node.form = m2.group(3) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[2].lemma)) + else: + logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." 
% (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[2].lemma)) + else: + logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words]))) diff --git a/udapi/block/ud/printfixed.py b/udapi/block/ud/printfixed.py new file mode 100644 index 00000000..313943bb --- /dev/null +++ b/udapi/block/ud/printfixed.py @@ -0,0 +1,104 @@ +""" +Block PrintFixed prints occurrences of fixed multiword expressions in UD. It +can be run twice in a row, first collecting known fixed expressions and then +also reporting other occurrences of these expressions where they are not +annotated as fixed. + +Usage: +udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt +udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class PrintFixed(Block): + """ + Print fixed multiword expressions. + """ + + def __init__(self, only_forms=False, known_expressions=None, **kwargs): + """ + Create the PrintFixed block. + + Parameters: + only_forms=1: print the word forms but not tags and other info; + This can be used to create the list of known forms that we want to + identify even if they are not annotated as fixed. 
+ known_expressions: the name of the text file with the expressions + """ + super().__init__(**kwargs) + self.only_forms = only_forms + self.known_expressions = {} + self.first_words = {} + self.max_length = 2 + if known_expressions: + fh = open(known_expressions, 'r', encoding='utf-8') + n = 0 + for expression in fh.readlines(): + expression = expression.replace('\n', '') + if expression in self.known_expressions: + self.known_expressions[expression] += 1 + else: + self.known_expressions[expression] = 1 + logging.info("Read known fixed expression '%s'" % expression) + n += 1 + words = expression.split(' ') + first_word = words[0] + self.first_words[first_word] = 1 + length = len(words) + if length > self.max_length: + self.max_length = length + logging.info('Read %d known fixed expressions.' % n) + + def process_node(self, node): + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + if len(fixed_children) > 0: + # Fixed children are always to the right of the parent. But there + # may be other nodes in between that are not fixed children (for + # example, there may be punctuation that is attached to one of the + # fixed nodes). + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + while n != fixed_children[-1]: + n = n.next_node + if n.parent == node and n.udeprel == 'fixed': + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + else: + list_of_forms.append('X') + list_of_tags.append('X') + forms = ' '.join(list_of_forms) + tags = ' '.join(list_of_tags) + if self.only_forms: + print(forms) + else: + print("%s / %s / %s" % (forms, tags, node.deprel)) + else: + # If this is not the first word of a fixed expression, check whether + # something that looks like a known fixed expression starts here. + # Note that it is also possible that a known expression starts here + # but only a subset is actually marked as such; we currently do not + # account for this. 
+ if node.form.lower() in self.first_words: + n = node + list_of_forms = [node.form.lower()] + list_of_tags = [node.upos] + for i in range(self.max_length - 1): + n = n.next_node + if not n: + break + ###!!! At present we cannot identify known expressions with gaps ('X'). + list_of_forms.append(n.form.lower()) + list_of_tags.append(n.upos) + forms = ' '.join(list_of_forms) + if forms in self.known_expressions: + if self.only_forms: + print(forms) + else: + tags = ' '.join(list_of_tags) + print("%s / %s / NOT FIXED" % (forms, tags)) + break diff --git a/udapi/block/ud/pt/__init__.py b/udapi/block/ud/pt/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/pt/addhyphenmwt.py b/udapi/block/ud/pt/addhyphenmwt.py new file mode 100644 index 00000000..9492b1a2 --- /dev/null +++ b/udapi/block/ud/pt/addhyphenmwt.py @@ -0,0 +1,37 @@ +"""Block ud.pt.AddHyphenMwt for transforming hyphen compounds into multiword tokens in Portuguese-GSD. + +See https://github.com/UniversalDependencies/UD_Portuguese-GSD/issues/39 +""" +from udapi.core.block import Block + +class AddHyphenMwt(Block): + + def _ok(self, token): + # The hyphen in "al-Assad" perhaps should be kept as a separate word. 
+ return token.form.isalnum() and token.form.lower() != 'al' + + def process_tree(self, root): + tokens, i = root.token_descendants, 1 + while i+1 < len(tokens): + start_i = i-1 + if tokens[i].form == "-" and self._ok(tokens[i-1]) and self._ok(tokens[i+1]): + while i+3 < len(tokens) and tokens[i+2].form == "-" and self._ok(tokens[i+3]): + i += 2 + compound, words = tokens[start_i:i+2], [] + for token in compound: + words += token.words + heads = [w for w in words if w.parent not in words] + cuckolds = [w for w in words if w not in heads and any(c not in words for c in w.children)] + if len(heads) > 1: + for h in heads: + h.misc["ToDo"] = 'NonCatenaCompound' + elif cuckolds: + for c in cuckolds: + c.misc["ToDo"] = 'HasChildrenOutsideCompound' + else: + compound_form = "".join(t.form for t in compound) + for hyphen in compound[1::2]: + hyphen.remove() + root.create_multiword_token([w for w in words if w.form != '-'], compound_form) + root.text = None + i += 1 diff --git a/udapi/block/ud/pt/addmwt.py b/udapi/block/ud/pt/addmwt.py new file mode 100644 index 00000000..daa605b2 --- /dev/null +++ b/udapi/block/ud/pt/addmwt.py @@ -0,0 +1,148 @@ +"""Block ud.pt.AddMwt for heuristic detection of Portuguese contractions. + +According to the UD guidelines, contractions such as "dele" = "de ele" +should be annotated using multi-word tokens. + +Note that this block should be used only for converting legacy conllu files. +Ideally a tokenizer should have already split the MWTs. 
+""" +import udapi.block.ud.addmwt + +MWTS = { + 'à': {'form': 'a a', 'lemma': 'a o'}, + 'às': {'form': 'a as', 'lemma': 'a o'}, + 'ao': {'form': 'a o', 'lemma': 'a o'}, + 'aos': {'form': 'a os', 'lemma': 'a o'}, + 'da': {'form': 'de a', 'lemma': 'de o'}, + 'das': {'form': 'de as', 'lemma': 'de o'}, + 'dessa': {'form': 'de essa', 'lemma': 'de esse'}, + 'dessas': {'form': 'de essas', 'lemma': 'de esse'}, + 'desse': {'form': 'de esse', 'lemma': 'de esse'}, + 'desses': {'form': 'de esses', 'lemma': 'de esse'}, + 'desta': {'form': 'de esta', 'lemma': 'de este'}, + 'destas': {'form': 'de estas', 'lemma': 'de este'}, + 'deste': {'form': 'de este', 'lemma': 'de este'}, + 'destes': {'form': 'de estes', 'lemma': 'de este'}, + 'disso': {'form': 'de isso', 'lemma': 'de este'}, + 'disto': {'form': 'de isto', 'lemma': 'de este'}, + 'do': {'form': 'de o', 'lemma': 'de o'}, # 'upos': 'ADP PRON', 'deprel': 'case *'' + 'dos': {'form': 'de os', 'lemma': 'de o'}, + 'dum': {'form': 'de um', 'lemma': 'de um'}, + 'duma': {'form': 'de uma', 'lemma': 'de um'}, + 'dumas': {'form': 'de umas', 'lemma': 'de um'}, + 'duns': {'form': 'de uns', 'lemma': 'de um'}, + 'na': {'form': 'em a', 'lemma': 'em o'}, + 'nas': {'form': 'em as', 'lemma': 'em o'}, # ADP PRON + 'nesses': {'form': 'em esses', 'lemma': 'em esse'}, + 'nesta': {'form': 'em esta', 'lemma': 'em este'}, + 'neste': {'form': 'em este', 'lemma': 'em este'}, + 'nisso': {'form': 'em isso', 'lemma': 'em este'}, + 'nisto': {'form': 'em isto', 'lemma': 'em este', + 'upos': 'ADP PRON', 'main': 1, 'shape': 'subtree'}, + 'no': {'form': 'em o', 'lemma': 'em o'}, # PRON cases are excluded below + 'nos': {'form': 'em os', 'lemma': 'em o'}, # PRON cases are excluded below + 'num': {'form': 'em um', 'lemma': 'em um'}, + 'numa': {'form': 'em uma', 'lemma': 'em um'}, + 'numas': {'form': 'em umas', 'lemma': 'em um'}, + 'nuns': {'form': 'em uns', 'lemma': 'em um'}, + 'pela': {'form': 'por a', 'lemma': 'por o'}, + 'pelas': {'form': 'por as', 'lemma': 'por 
o'}, + 'pelos': {'form': 'por os', 'lemma': 'por o'}, + 'pelo': {'form': 'por o', 'lemma': 'por o'}, + # TODO daí = de aí = ADP ADV = case advmod +} + +# shared values for all entries in MWTS +for v in MWTS.values(): + if not v.get('upos'): + v['upos'] = 'ADP DET' + if not v.get('deprel'): + v['deprel'] = 'case det' + v['feats'] = '_ *' + # The following are the default values + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + +for pronoun in 'ela ele eles elas'.split(): + MWTS['d' + pronoun] = { + 'form': 'de ' + pronoun, + 'lemma': 'de ' + pronoun, + 'upos': 'ADP PRON', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + + # "no" can be either a contraction of "em o", or a pronoun + if node.form.lower() in ('no', 'nos') and node.upos == 'PRON': + return + + analysis = MWTS.get(node.form.lower(), None) + + # If the input is e.g.: + # 1 na _ ADP _ _ deprel_x ? + # 2 verdade _ NOUN _ _ fixed 1 + # The expected output is: + # 1-2 na _ _ _ _ _ _ + # 1 em _ ADP _ _ deprel_x ? 
+ # 2 a _ DET _ _ fixed 1 + # 3 verdade _ NOUN _ _ fixed 1 + if analysis and analysis['deprel'] == 'case det' and node.udeprel != 'case': + copy = dict(analysis) + copy['deprel'] = '* det' + copy['shape'] = 'subtree' + first_child = next((c for c in node.children if node.precedes(c)), None) + if first_child is not None and first_child.udeprel == 'fixed': + copy['deprel'] = '* fixed' + return copy + if analysis is not None: + return analysis + + if node.form.lower().endswith('-se') and node.upos == 'VERB': + return { + 'form': node.form.lower()[:-3] + ' se', + 'lemma': '* se', + 'upos': '* PRON', + 'feats': '* _', + 'deprel': '* nsubj', # or '* expl' + 'main': 0, + 'shape': 'subtree', + } + elif node.form.lower().endswith('-lo') and node.upos == 'VERB': + return { + 'form': node.form.lower()[:-3] + ' lo', + 'lemma': '* ele', + 'upos': '* PRON', + 'feats': '* _', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + elif node.form.lower().endswith('-los') and node.upos == 'VERB': + return { + 'form': node.form.lower()[:-4] + ' los', + 'lemma': '* eles', + 'upos': '* PRON', + 'feats': '* _', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + elif node.form.lower().endswith('-o') and node.upos == 'VERB': + return { + 'form': node.form.lower()[:-2] + ' o', + 'lemma': '* ele', + 'upos': '* PRON', + 'feats': '* _', + 'deprel': '* obj', + 'main': 0, + 'shape': 'subtree', + } + return None diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py new file mode 100644 index 00000000..99c37b4d --- /dev/null +++ b/udapi/block/ud/removemwt.py @@ -0,0 +1,38 @@ +"""Block ud.RemoveMwt for removing multi-word tokens.""" +from udapi.core.block import Block + + +class RemoveMwt(Block): + """Substitute MWTs with one word representing the whole MWT.""" + + def process_tree(self, root): + for mwt in root.multiword_tokens: + words = mwt.words + words[0].form = mwt.form + words[0].misc = mwt.misc + words[0].upos = self.guess_upos(words) + words[0].feats = 
self.guess_feats(words) + words[0].deprel = self.guess_deprel(words) + mwt.remove() + for word in words[1:]: + word.remove(children='rehang') + + @staticmethod + def guess_upos(words): + """UPOS of the whole MWT""" + return words[0].upos + + @staticmethod + def guess_deprel(words): + """DEPREL of the whole MWT""" + return words[0].deprel + # Alternatively, we could define deprel subtypes + # return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) + + @staticmethod + def guess_feats(words): + """FEATS of the whole MWT""" + feats = words[0].feats + for word in words[1:]: + feats.update(word.feats) + return feats diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/ro/fixneg.py b/udapi/block/ud/ro/fixneg.py index a22131b2..68888aa6 100644 --- a/udapi/block/ud/ro/fixneg.py +++ b/udapi/block/ud/ro/fixneg.py @@ -6,13 +6,14 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Romanian.""" def process_node(self, node): if node.deprel == "neg": if node.upos == "PRON" and node.form == "ne": - node.feats = 'Polarity=Neg' # delete other features + node.feats = 'Polarity=Neg' # delete other features elif node.upos != "ADJ": logging.warning("Strange node %s with 
deprel=neg", node) node.upos = "ADV" diff --git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py index 80bfda8f..6c4b27e3 100644 --- a/udapi/block/ud/ro/setspaceafter.py +++ b/udapi/block/ud/ro/setspaceafter.py @@ -1,7 +1,8 @@ """Block ud.ro.SetSpaceAfter for heuristic setting of SpaceAfter=No in Romanian. -Usage: -udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu +Usage:: + + udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu Author: Martin Popel """ @@ -9,17 +10,21 @@ import udapi.block.ud.setspaceafter + class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian. Romanian uses many contractions, e.g. - raw | meaning | tokenized | lemmatized - -------|---------|-----------|----------- - n-ar | nu ar | n- ar | nu avea - să-i | să îi | să -i | să el - într-o | în o | într- o | întru un - nu-i | nu îi | nu -i | nu el - nu-i | nu e | nu -i | nu fi + + ======= ======= ========= ========== + raw meaning tokenized lemmatized + ======= ======= ========= ========== + n-ar nu ar n- ar nu avea + să-i să îi să -i să el + într-o în o într- o întru un + nu-i nu îi nu -i nu el + nu-i nu e nu -i nu fi + ======= ======= ========= ========== Detokenization is quite simple: no space after word-final hyphen and before word-initial hyphen. There are just two exceptions, I have found: @@ -33,7 +38,7 @@ def process_tree(self, root): # Mark contractions like -i, -și, -l, -urilor, but not negative numbers like -12,3. # Store SpaceAfter=No to the previous node. 
- next_form = nodes[i+1].form + next_form = nodes[i + 1].form if re.match('-.*[^0-9,.]', next_form): self.mark_no_space(node) diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py new file mode 100644 index 00000000..6fa73460 --- /dev/null +++ b/udapi/block/ud/ru/fixedeprels.py @@ -0,0 +1,279 @@ +"""Block to fix case-enhanced dependency relations in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixEdeprels(Block): + + # Sometimes there are multiple layers of case marking and only the outermost + # layer should be reflected in the relation. For example, the semblative 'как' + # is used with the same case (preposition + morphology) as the nominal that + # is being compared ('как_в:loc' etc.) We do not want to multiply the relations + # by all the inner cases. + # The list in the value contains exceptions that should be left intact. + outermost = { + 'более_чем': [], + 'будто': [], + 'ведь': [], + 'ежели': [], + 'если': [], + 'как': ['как_только'], + 'когда': [], + 'кроме_как': [], + 'менее_чем': [], + 'минус': [], + 'нежели': [], + 'плюс': [], + 'пока': [], + 'поскольку': [], + 'потому_что': [], + 'пусть': [], + 'равно_как': [], + 'раз': [], + 'словно': [], + 'так_что': [], + 'хоть': [], + 'хотя': [], + 'чем': [], + 'что': [], + 'чтобы': [], + 'яко': [] + } + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
+ unambiguous = { + 'versus': 'версус:nom', + 'loc': 'в:loc', + 'в_вид': 'в_виде:gen', + 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом' + 'в_для': 'в:acc', + 'в_качество': 'в_качестве:gen', + 'в_отношение': 'в_отношении:gen', + 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level + 'в_связь_с': 'в_связи_с:ins', + 'в_случай_если': 'в_случае_если', + 'в_случай_когда': 'в_случае_когда', + 'в_соответствие_с': 'в_соответствии_с:ins', + 'в_течение': 'в_течение:gen', + 'в_то_быть': 'в:loc', + 'в_тот_время_как': 'в_то_время_как', + 'в_угода': 'в_угоду:dat', + 'в_ход': 'в_ходе:gen', + 'вблизи': 'вблизи:gen', + 'взамен': 'взамен:gen', + 'вместо': 'вместо:gen', + 'во_глава': 'во_главе_с:ins', + 'во_глава_с': 'во_главе_с:ins', + 'во_избежание': 'во_избежание:gen', + 'возле': 'возле:gen', + 'вокруг': 'вокруг:gen', + 'вплоть_до': 'вплоть_до:gen', + 'вроде': 'вроде:gen', + 'выше': 'выше:gen', + 'для': 'для:gen', + 'для_в': 'для:gen', + 'до_то_как': 'до:gen', # до того, как ... 
+ 'за_исключение': 'за_исключением:gen', + 'из_более_чем': 'из:gen', + 'к': 'к:dat', + 'ко': 'ко:dat', + 'коли_скоро': 'коль_скоро', + 'кроме': 'кроме:gen', + 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым' + 'на_вперед': 'на:acc', + 'над': 'над:ins', # at least I have not encountered any genuine example of accusative + 'насчет': 'насчет:gen', + 'несмотря_на': 'несмотря_на:acc', + 'ниже': 'ниже:gen', + 'около': 'около:gen', + 'от_до': 'от:gen', + 'от_от': 'от:gen', + 'от_с': 'от:gen', + 'относительно': 'относительно:gen', + 'перед': 'перед:ins', + 'по_мера': 'по_мере:gen', + 'по_мера_то_как': 'по_мере_того_как', + 'по_отношение_ко?': 'по_отношению_к:dat', + 'по_повод': 'по_поводу:gen', + 'по_сравнение_с': 'по_сравнению_с:ins', + 'помимо': 'помимо:gen', + 'порядка': 'порядка:gen', + 'после': 'после:gen', + 'посредством_как': 'посредством:gen', + 'при': 'при:loc', + 'при_помощь': 'при_помощи:gen', + 'при_условие_что': 'при_условии_что', + 'про': 'про:acc', + 'против': 'против:gen', + 'с_более_чем': 'с:gen', + 'с_во_глава': 'с:ins', + 'с_на': 'с:par', + 'с_помощь': 'с_помощью:gen', + 'с_тем': 'с:ins', + 'с_тот_пора_как': 'с_тех_пор_как', + 'с_что': 'с:ins', + 'свыше': 'свыше:gen', + 'со_сторона': 'со_стороны:gen', + 'согласно': 'согласно:dat', + 'спустя': 'спустя:acc', + 'среди': 'среди:gen', + 'среди_в': 'среди:gen', + 'так_чтобы': 'чтобы', + 'тем_между': 'между:ins', + 'у': 'у:gen', + 'у_без': 'у:gen', + 'через': 'через:acc', + 'чтоб': 'чтобы' + } + + def copy_case_from_adposition(self, node, adposition): + """ + In some treebanks, adpositions have the Case feature and it denotes the + valency case that the preposition's nominal must be in. + """ + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. 
+ prepchildren = [x for x in node.children if x.lemma == adposition] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + return adposition+':'+prepchildren[0].feats['Case'].lower() + else: + return None + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Russian basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. + """ + for edep in node.deps: + # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers. + edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel']) + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause. + edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel']) + # Some markers should be discarded only if they occur as clause markers (acl, advcl). + edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel']) + # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl). + edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel']) + edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel']) + # If the case marker starts with 'столько', remove this part. + # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else. + # Similarly, 'то' occurs in 'то...то' and should be removed. 
+ edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel']) + # If one of the following expressions occurs followed by another preposition + # or by morphological case, remove the additional case marking. For example, + # 'словно_у' becomes just 'словно'. + for x in self.outermost: + exceptions = self.outermost[x] + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) + if m and m.group(2) and not x+m.group(2) in exceptions: + edep['deprel'] = m.group(1)+':'+x + solved = True + break + if solved: + continue + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + if solved: + continue + # The following prepositions have more than one morphological case + # available. + m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or partitive are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + # Both "на" and "в" also occur with genitive. However, this + # is only because there are numerals in the phrase ("в 9 случаев из 10") + # and the whole phrase should not be analyzed as genitive. + m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or locative are possible. Pick locative. 
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + continue + # Unlike in Czech, 'над' seems to allow only instrumental and not accusative. + m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Accusative or instrumental are possible. Pick accusative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':acc' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick genitive. + edep['deprel'] = m.group(1)+':'+m.group(2)+':gen' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Dative, accusative or locative are possible. Pick dative. + edep['deprel'] = m.group(1)+':'+m.group(2)+':dat' + continue + m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel']) + if m: + adpcase = self.copy_case_from_adposition(node, m.group(2)) + if adpcase: + edep['deprel'] = m.group(1)+':'+adpcase + else: + # Genitive or instrumental are possible. Pick instrumental. + edep['deprel'] = m.group(1)+':'+m.group(2)+':ins' + continue + if re.match(r'^(nmod|obl):', edep['deprel']): + if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc': + # This is a same-case noun-noun modifier, which just happens to be in the locative. + # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has + # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant. 
+ edep['deprel'] = 'nmod' + elif edep['deprel'] == 'nmod:loc': + edep['deprel'] = 'nmod:nom' + elif edep['deprel'] == 'nmod:voc': + edep['deprel'] = 'nmod:nom' + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/ru/fixremnant.py b/udapi/block/ud/ru/fixremnant.py index d94b0e5c..b41431db 100644 --- a/udapi/block/ud/ru/fixremnant.py +++ b/udapi/block/ud/ru/fixremnant.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class FixRemnant(Block): """ad-hoc fixing the remaining cases (after ud.Convert1to2) of deprel=remnant in UD_Russian.""" diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py new file mode 100644 index 00000000..1b603e96 --- /dev/null +++ b/udapi/block/ud/ru/fixtoest.py @@ -0,0 +1,35 @@ +"""Block to fix annotation of то есть in Russian.""" +from udapi.core.block import Block +import logging +import re + +class FixToEst(Block): + + def process_node(self, node): + """ + In the converted data from Kira, the fixed expression "то есть" ("that is") + is treated as a subordinator and attached as "mark", which later makes it + part of complex enhanced relation labels. I believe that this analysis is + wrong and that it will be better to label these expressions as "cc". 
+ """ + if node.udeprel == 'mark' and node.lemma == 'то': + if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0: + self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc') + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py index 00193770..04c9fffb 100644 --- a/udapi/block/ud/setspaceafter.py +++ b/udapi/block/ud/setspaceafter.py @@ -9,13 +9,15 @@ from udapi.core.block import Block + class SetSpaceAfter(Block): """Block for heuristic setting of the SpaceAfter=No MISC attribute.""" - def __init__(self, not_after='¡¿([{„', not_before='.,;:!?}])', fix_text=True, **kwargs): + def __init__(self, not_after='¡ ¿ ( [ { „ /', not_before='. , ; : ! ? } ] ) / ?? ??? !! !!! ... …', + fix_text=True, extra_not_after='', extra_not_before='', **kwargs): super().__init__(**kwargs) - self.not_after = not_after - self.not_before = not_before + self.not_after = (not_after + ' ' + extra_not_after).split(' ') + self.not_before = (not_before + ' ' + extra_not_before).split(' ') self.fix_text = fix_text self.changed = False @@ -25,7 +27,7 @@ def process_tree(self, root): self.changed = False # Undirected double quotes are ambiguous. 
- # If there is an even number of quotes in a sentence, supposed they are not nested + # If there is an even number of quotes in a sentence, suppose they are not nested # and treat odd-indexed ones as opening and even-indexed ones as closing. # Otherwise (odd number, e.g. when quoting multiple sentences), don't remove any space. matching_quotes = not bool(count_of_form['"'] % 2) @@ -35,22 +37,25 @@ def process_tree(self, root): # Some languages use directed „quotes“ and some “quotes”, # so the symbol “ (U+201C) is ambiguous and we heuristically check for presence of „. if count_of_form['„']: - not_before += '“' + not_before += ['“'] else: - not_after += '“' + not_after += ['“'] for i, node in enumerate(nodes[:-1]): - next_form = nodes[i+1].form + next_form = nodes[i + 1].form if node.form in self.not_after or next_form in not_before: self.mark_no_space(node) - if matching_quotes and node.form == '"': - if odd_indexed_quote: + if node.form == '"': + if matching_quotes: + if odd_indexed_quote: + self.mark_no_space(node) + elif i: + self.mark_no_space(nodes[i - 1]) + odd_indexed_quote = not odd_indexed_quote + elif i==0: self.mark_no_space(node) - elif i: - self.mark_no_space(nodes[i-1]) - odd_indexed_quote = not odd_indexed_quote - if matching_quotes and nodes[-1].form == '"': + if nodes[-1].form == '"': self.mark_no_space(nodes[-2]) if self.fix_text and self.changed: diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index 0c4d8d9d..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -9,13 +9,19 @@ from udapi.core.block import Block + class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" def process_tree(self, root): + # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid. 
+ for empty_node in root.empty_nodes: + del empty_node.misc['SpaceAfter'] + text = root.text - computed = root.compute_text() - if text == computed: + if text is None: + raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) + if text == root.compute_text(): return for node in root.token_descendants: diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py new file mode 100644 index 00000000..487cca06 --- /dev/null +++ b/udapi/block/ud/settranslation.py @@ -0,0 +1,59 @@ +""" +Block SetTranslation for setting of sentence-level translation (the attribute +text_en for English translation) from a separate text file (one sentence per +line). For example, one can export the original sentences using write.SentencesHtml, +then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain +text editor, save them as translations.txt and import them using this block. + +Usage: +udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class SetTranslation(Block): + """ + Set text_en to the next available translation. + """ + + def __init__(self, file, overwrite=False, **kwargs): + """ + Create the SetTranslation block. 
+ + Parameters: + file: the name of the text file with the translations (one sentence per line) + overwrite=1: set the translation even if the sentence already has one + (default: do not overwrite existing translations) + """ + super().__init__(**kwargs) + self.file = file + fh = open(self.file, 'r', encoding='utf-8') + self.trlines = fh.readlines() + self.nlines = len(self.trlines) + self.iline = 0 + self.overwrite = overwrite + + def process_tree(self, tree): + if self.iline < self.nlines: + translation = self.trlines[self.iline] + self.iline += 1 + comments = [] + if tree.comment: + comments = tree.comment.split('\n') + i_tr = -1 + for i in range(len(comments)): + # The initial '#' character has been stripped. + if re.match(r'\s*text_en\s*=', comments[i]): + i_tr = i + break + if i_tr >= 0: + if self.overwrite: + comments[i_tr] = ' text_en = ' + translation + else: + comments.append(' text_en = ' + translation) + tree.comment = '\n'.join(comments) + elif self.iline == self.nlines: + logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines) diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py new file mode 100644 index 00000000..7de53881 --- /dev/null +++ b/udapi/block/ud/sk/fixedeprels.py @@ -0,0 +1,138 @@ +"""Block to fix case-enhanced dependency relations in Slovak.""" +from udapi.core.block import Block +import re + +class FixEdeprels(Block): + + # Secondary prepositions sometimes have the lemma of the original part of + # speech. We want the grammaticalized form instead. List even those that + # will have the same lexical form, as we also want to check the morphological + # case. And include all other prepositions that have unambiguous morphological + # case, even if they are not secondary. 
+ unambiguous = { + 'a_hoci': 'hoci', + 'ako': 'ako', # remove morphological case + 'ako_na': 'ako', + 'ako_z': 'ako', + 'akoby_z': 'z:gen', + 'akže': 'ak', + 'ani_keby': 'keby', + 'ani_keď': 'keď', + 'až_keď': 'keď', + 'do': 'do:gen', + 'k': 'k:dat', + 'kto': 'kým', ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator. + 'mimo': 'mimo:gen', + 'na_rozdiel_od': 'na_rozdiel_od:gen', + 'na_základ': 'na_základe:gen', + 'od': 'od:gen', + 'pod_vplyv': 'pod_vplyvom:gen', + 'pomoc': 'pomocou:gen', + 'pre': 'pre:acc', + 'prostredníctvom': 'prostredníctvom:gen', + 'prv_ako': 'ako', + 's': 's:ins', + 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl) + 's_dôraz_na': 's_dôrazom_na:acc', + 's_ohľad_na': 's_ohľadom_na:acc', + 's_pomoc': 's_pomocou:gen', + 'smer_k': 'smerom_k:dat', + 'spoločne_s': 'spoločne_s:ins', + 'spolu_s': 'spolu_s:ins', + 'v_dôsledok': 'v_dôsledku:gen', + 'v_meno': 'v_mene:gen', + 'v_oblasť': 'v_oblasti:gen', + 'v_porovnanie_s': 'v_porovnaní_s:ins', + 'v_porovnaniu_s': 'v_porovnaní_s:ins', + 'v_priebeh': 'v_priebehu:gen', + 'v_prípad': 'v_prípade:gen', + 'v_prospech': 'v_prospech:gen', + 'v_rámec': 'v_rámci:gen', + 'v_spolupráca_s': 'v_spolupráci_s:ins', + 'v_súlad_s': 'v_súlade_s:ins', + 'v_súvislosť_s': 'v_súvislosti_s:ins', + 'v_ústrety': 'v_ústrety:dat', + 'v_vzťah_k': 'vo_vzťahu_k:dat', + 'v_závislosť_na': 'v_závislosti_na:loc', + 'vzhľad_na': 'vzhľadom_na:acc', + 'z': 'z:gen', + 'z_hľadisko': 'z_hľadiska:gen', + 'začiatkom': 'začiatkom:gen' + } + + def process_node(self, node): + """ + Occasionally the edeprels automatically derived from the Slovak basic + trees do not match the whitelist. For example, the noun is an + abbreviation and its morphological case is unknown. 
+ """ + for edep in node.deps: + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel']) + if m: + bdeprel = m.group(1) + solved = False + for x in self.unambiguous: + # All secondary prepositions have only one fixed morphological case + # they appear with, so we can replace whatever case we encounter with the correct one. + m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+self.unambiguous[x] + solved = True + break + # The following prepositions have more than one morphological case + # available. Thanks to the Case feature on prepositions, we can + # identify the correct one. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel']) + if m: + # The following is only partial solution. We will not see + # some children because they may be shared children of coordination. + prepchildren = [x for x in node.children if x.lemma == m.group(2)] + if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '': + edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower() + solved = True + # If we failed to identify the case of the preposition in the + # preceding steps, pick a default. It applies mostly to 'o' + # with wrongly split time values. + if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':o:acc' + solved = True + m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1)+':'+m.group(2)+':loc' + solved = True + # Some cases do not occur with nominal modifiers without preposition. + # If we see them, chances are that it is the same-case modifier, + # and the same case just happens to be the one we see. For vocatives, + # it is also possible that they have been confused with nominatives. 
+ if not solved: + m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel']) + if m: + edep['deprel'] = m.group(1) + solved = True + # Annotation and conversion errors. + if not solved: + # Povedal som jej „na zdorovie“. + if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie': + self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp') + solved = True + + def set_basic_and_enhanced(self, node, parent, deprel, edeprel): + ''' + Modifies the incoming relation of a node both in the basic tree and in + the enhanced graph. If the node does not yet depend in the enhanced + graph on the current basic parent, the new relation will be added without + removing any old one. If the node already depends multiple times on the + current basic parent in the enhanced graph, all such enhanced relations + will be removed before adding the new one. + ''' + old_parent = node.parent + node.parent = parent + node.deprel = deprel + node.deps = [x for x in node.deps if x['parent'] != old_parent] + new_edep = {} + new_edep['parent'] = parent + new_edep['deprel'] = edeprel + node.deps.append(new_edep) diff --git a/udapi/block/ud/splittoken.py b/udapi/block/ud/splittoken.py new file mode 100644 index 00000000..16c60a38 --- /dev/null +++ b/udapi/block/ud/splittoken.py @@ -0,0 +1,107 @@ +""" +Block ud.SplitToken will split a given token into multiple tokens. +""" +from udapi.core.block import Block +import re +import logging + + +class SplitToken(Block): + """ + Split a token into two or more. A MISC attribute is used to mark the tokens + that should be split. (The attribute may have been set by an annotator or + by a previous block that tests the specific conditions under which splitting + is desired.) Multiword tokens are currently not supported: The node to be + split cannot belong to a MWT. Note that the result will not be a MWT either + (use the block ud.AddMwt if that is desired). 
There will be simply a new + attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes + (indicating that this was an error in the source text). + """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. + It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. 
+ if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node 
+ last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 25caeb3b..44575e0c 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -8,6 +8,7 @@ import logging from udapi.core.block import Block + class SplitUnderscoreTokens(Block): """Block for spliting tokens with underscores and attaching the new nodes using deprel=flat. @@ -22,7 +23,7 @@ class SplitUnderscoreTokens(Block): Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4. """ - def __init__(self, deprel=None, default_deprel='flat', **kwargs): + def __init__(self, deprel=None, default_deprel='flat', lemma='split', **kwargs): """Create the SplitUnderscoreTokens block instance. Args: @@ -30,14 +31,21 @@ def __init__(self, deprel=None, default_deprel='flat', **kwargs): Most common values are: flat, fixed, compound. Default=None. default_deprel: Which deprel to use for the newly created nodes if the heuristics in `deprel_for()` method fail. Default=flat. + lemma: What to do with the lemmas? + - 'split' (the default) means to split them on underscores as well + (and warn in case of a different number of underscores than in the form). 
+ - 'form' means to copy the forms to the lemmas """ super().__init__(**kwargs) self.deprel = deprel self.default_deprel = default_deprel + self.lemma = lemma def process_node(self, node): if node.form != '_' and '_' in node.form: forms = node.form.split('_') + if self.lemma == 'form': + node.lemma = node.form lemmas = node.lemma.split('_') if len(forms) != len(lemmas): logging.warning("Different number of underscores in %s and %s, skipping.", diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py new file mode 100644 index 00000000..952644f8 --- /dev/null +++ b/udapi/block/ud/ug/fixspuriousaux.py @@ -0,0 +1,46 @@ +"""Block to convert spurious auxiliaries to lexical verbs in Uyghur.""" +from udapi.core.block import Block +import logging +import re + +class FixSpuriousAux(Block): + + def process_node(self, node): + """ + Some verbs that are called auxiliary by the traditional grammar, should + be analyzed in UD as VERB + non-finite xcomp. + """ + # Sometimes there is a double error: it should not be auxiliary, it is + # attached as aux but it is not tagged AUX. So we only look at the deprel. + if node.udeprel == 'aux': + # بەر/بار = give (used with actions done for the benefit of somebody) + # چىق = go out + # چىقىش = come out + # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur) + # باق = do ever? + # ئۆت = pass + # كۆرۈش = see + # باشلى = start + # يەت = be enough + # قايت = return + # چۈش = fall down + # قىل = do + # چاپ = jump + # قورق = fear + # كەلتۈر = cause + # كىر = enter + # _ ... 
some putative auxiliaries do not even have a lemma + if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma): + node.upos = 'VERB' + # The auxiliary inherits the incoming relation of its original parent. + lexverb = node.parent + node.parent = lexverb.parent + node.deprel = lexverb.deprel + # The auxiliary also inherits some but not all children of the lexical verb. + for c in lexverb.children: + if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel): + c.parent = node + # The lexical verb becomes an xcomp of the auxiliary. + lexverb.parent = node + lexverb.deprel = 'xcomp' diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py new file mode 100644 index 00000000..87279dc1 --- /dev/null +++ b/udapi/block/ud/yue/lemmatize.py @@ -0,0 +1,43 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + '𡃁仔': '笭仔', + '仲': '重', + '企': '徛', + '係咪': '係', + '出嚟': '出唻', + '可': '可以', + '啦': '喇', + '㗎喇': '㗎嘑', + '喇': '嘑', + '嚟': '唻', + '就嚟': '就唻', + '死𡃁妹': '死笭妹', + '老豆': '老頭', + '蚊': '緡', + '蛋撻': '蛋澾', + '返嚟': '返唻', + '過嚟人': '過唻人', + '過嚟': '過唻' + } + + def process_node(self, node): + """ + Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + + For Cantonese, lemmatization includes normalization of some characters. + These are the few cases where lemma differs from the surface form. 
+ """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py new file mode 100644 index 00000000..abacf29f --- /dev/null +++ b/udapi/block/ud/zh/lemmatize.py @@ -0,0 +1,81 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + def __init__(self, rewrite='empty', **kwargs): + """ + Create the ud.zh.Lemmatize block instance. + + Args: + rewrite=empty: set the lemma if it was empty so far; do not touch the rest + rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest + rewrite=all: set the lemma regardless of what it was previously + """ + super().__init__(**kwargs) + if not re.match(r'^(empty|form|all)$', rewrite): + raise ValueError("Unexpected value of parameter 'rewrite'") + self.rewrite = rewrite + + # dictionary: form --> lemma + lemma = { + # The plural suffix -men. + '我們': '我', # trad + '我们': '我', # simp + '他們': '他', # trad + '他们': '他', # simp + '它們': '它', # trad + '它们': '它', # simp + '牠們': '牠', # trad + '她們': '她', # trad + '她们': '她', # simp + '人們': '人', # trad + '人们': '人' # simp + } + + def process_node(self, node): + """ + Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + """ + if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + # Lemmatize negated verbs to their affirmative forms. 
+ # 不是 bùshì = not be + # 沒有 没有 méiyǒu = not exist + # 沒能 没能 méinéng = cannot + # 未能 wèinéng = cannot + # Lemmatize question verbs to their base forms. + # 要不要 yàobùyào = do (you) want? + # 有没有 yǒuméiyǒu = do (you) have? + # Verbs that are derived from the copula and tagged as the copula need + # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). + # 亦為 亦为 yìwèi = také + # 則為 则为 zéwèi = potom + # 更為 更为 gèngwèi = více + # 認為 认为 rènwéi = myslet, věřit + # 以為 以为 yǐwéi = myslet, věřit + # 以爲 以为 yǐwéi = myslet, věřit + if re.match(r'^(AUX|VERB)$', node.upos): + m1 = re.match(r'^([不没沒未])(.+)$', node.form) + m2 = re.match(r'^(.+)([不没沒未])\1$', node.form) + m3 = re.search(r'([是爲為为])', node.form) + if m1: + node.lemma = m1.group(2) + node.feats['Polarity'] = 'Neg' + elif m2: + node.lemma = m2.group(1) + node.feats['Mood'] = 'Int' + elif m3: + node.lemma = m3.group(1) + if node.lemma == '爲': + node.lemma = '為' + elif node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form diff --git a/udapi/block/udpipe/__init__.py b/udapi/block/udpipe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py new file mode 100644 index 00000000..9d053cb7 --- /dev/null +++ b/udapi/block/udpipe/base.py @@ -0,0 +1,270 @@ +"""Block udpipe.Base for tagging and parsing using UDPipe.""" +from udapi.core.block import Block +from udapi.tool.udpipeonline import UDPipeOnline +from udapi.core.bundle import Bundle + +# Import UDPipe only if available (requires ufal.udpipe) +try: + from udapi.tool.udpipe import UDPipe + UDPIPE_AVAILABLE = True +except ImportError: + UDPIPE_AVAILABLE = False + +KNOWN_MODELS = { + 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe', + 'grc': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_perseus': 
'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe', + 'grc_proiel': 'models/udpipe/2.4/ancient_greek-proiel-ud-2.4-190531.udpipe', + 'ar': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe', + 'ar_padt': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe', + 'hy': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe', + 'hy_armtdp': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe', + 'eu': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe', + 'eu_bdt': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe', + 'be': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe', + 'be_hse': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe', + 'bg': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe', + 'bg_btb': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe', + 'ca': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe', + 'ca_ancora': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe', + 'zh': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe', + 'zh_gsd': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe', + 'lzh': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe', + 'lzh_kyoto': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe', + 'cop': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe', + 'cop_scriptotium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe', + 'hr': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe', + 'hr_set': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe', + 'cs': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe', + 'cs_pdt': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe', + 'cs_cac': 'models/udpipe/2.4/czech-cac-ud-2.4-190531.udpipe', + 'cs_cltt': 'models/udpipe/2.4/czech-cltt-ud-2.4-190531.udpipe', + 'cs_fictree': 'models/udpipe/2.4/czech-fictree-ud-2.4-190531.udpipe', + 'da': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe', + 'da_ddt': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe', + 'nl': 
'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe', + 'nl_alpino': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe', + 'nl_lassysmall': 'models/udpipe/2.4/dutch-lassysmall-ud-2.4-190531.udpipe', + 'en': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe', + 'en_ewt': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe', + 'en_gum': 'models/udpipe/2.4/english-gum-ud-2.4-190531.udpipe', + 'en_lines': 'models/udpipe/2.4/english-lines-ud-2.4-190531.udpipe', + 'en_partut': 'models/udpipe/2.4/english-partut-ud-2.4-190531.udpipe', + 'et_edt': 'models/udpipe/2.4/estonian-edt-ud-2.4-190531.udpipe', + 'et_ewt': 'models/udpipe/2.4/estonian-ewt-ud-2.4-190531.udpipe', + 'fi': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe', + 'fi_tdt': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe', + 'fi_ftb': 'models/udpipe/2.4/finnish-ftb-ud-2.4-190531.udpipe', + 'fr_gsd': 'models/udpipe/2.4/french-gsd-ud-2.4-190531.udpipe', + 'fr_partut': 'models/udpipe/2.4/french-partut-ud-2.4-190531.udpipe', + 'fr_sequoia': 'models/udpipe/2.4/french-sequoia-ud-2.4-190531.udpipe', + 'fr_spoken': 'models/udpipe/2.4/french-spoken-ud-2.4-190531.udpipe', + 'gl_ctg': 'models/udpipe/2.4/galician-ctg-ud-2.4-190531.udpipe', + 'gl_treegal': 'models/udpipe/2.4/galician-treegal-ud-2.4-190531.udpipe', + 'de': 'models/udpipe/2.4/german-gsd-ud-2.4-190531.udpipe', + 'got': 'models/udpipe/2.4/gothic-proiel-ud-2.4-190531.udpipe', + 'el': 'models/udpipe/2.4/greek-gdt-ud-2.4-190531.udpipe', + 'he': 'models/udpipe/2.4/hebrew-htb-ud-2.4-190531.udpipe', + 'hi': 'models/udpipe/2.4/hindi-hdtb-ud-2.4-190531.udpipe', + 'hu': 'models/udpipe/2.4/hungarian-szeged-ud-2.4-190531.udpipe', + 'id': 'models/udpipe/2.4/indonesian-gsd-ud-2.4-190531.udpipe', + 'ga': 'models/udpipe/2.4/irish-idt-ud-2.4-190531.udpipe', + 'it_isdt': 'models/udpipe/2.4/italian-isdt-ud-2.4-190531.udpipe', + 'it_partut': 'models/udpipe/2.4/italian-partut-ud-2.4-190531.udpipe', + 'it_postwita': 
'models/udpipe/2.4/italian-postwita-ud-2.4-190531.udpipe', + 'it_vit': 'models/udpipe/2.4/italian-vit-ud-2.4-190531.udpipe', + 'ja': 'models/udpipe/2.4/japanese-gsd-ud-2.4-190531.udpipe', + 'ko_gsd': 'models/udpipe/2.4/korean-gsd-ud-2.4-190531.udpipe', + 'ko_kaist': 'models/udpipe/2.4/korean-kaist-ud-2.4-190531.udpipe', + 'la_ittb': 'models/udpipe/2.4/latin-ittb-ud-2.4-190531.udpipe', + 'la_perseus': 'models/udpipe/2.4/latin-perseus-ud-2.4-190531.udpipe', + 'la_proiel': 'models/udpipe/2.4/latin-proiel-ud-2.4-190531.udpipe', + 'lv': 'models/udpipe/2.4/latvian-lvtb-ud-2.4-190531.udpipe', + 'lt_alksnis': 'models/udpipe/2.4/lithuanian-alksnis-ud-2.4-190531.udpipe', + 'lt_hse': 'models/udpipe/2.4/lithuanian-hse-ud-2.4-190531.udpipe', + 'mt': 'models/udpipe/2.4/maltese-mudt-ud-2.4-190531.udpipe', + 'mr': 'models/udpipe/2.4/marathi-ufal-ud-2.4-190531.udpipe', + 'sme': 'models/udpipe/2.4/north_sami-giella-ud-2.4-190531.udpipe', + 'no_bokmaal': 'models/udpipe/2.4/norwegian-bokmaal-ud-2.4-190531.udpipe', + 'no_nynorsklia': 'models/udpipe/2.4/norwegian-nynorsklia-ud-2.4-190531.udpipe', + 'no_nynorsk': 'models/udpipe/2.4/norwegian-nynorsk-ud-2.4-190531.udpipe', + 'cu': 'models/udpipe/2.4/old_church_slavonic-proiel-ud-2.4-190531.udpipe', + 'fro': 'models/udpipe/2.4/old_french-srcmf-ud-2.4-190531.udpipe', + 'orv': 'models/udpipe/2.4/old_russian-torot-ud-2.4-190531.udpipe', + 'fa': 'models/udpipe/2.4/persian-seraji-ud-2.4-190531.udpipe', + 'pl_lfg': 'models/udpipe/2.4/polish-lfg-ud-2.4-190531.udpipe', + 'pl_pdb': 'models/udpipe/2.4/polish-pdb-ud-2.4-190531.udpipe', + 'pt_bosque': 'models/udpipe/2.4/portuguese-bosque-ud-2.4-190531.udpipe', + 'pt_gsd': 'models/udpipe/2.4/portuguese-gsd-ud-2.4-190531.udpipe', + 'ro_nonstandard': 'models/udpipe/2.4/romanian-nonstandard-ud-2.4-190531.udpipe', + 'ro_rrt': 'models/udpipe/2.4/romanian-rrt-ud-2.4-190531.udpipe', + 'ru_gsd': 'models/udpipe/2.4/russian-gsd-ud-2.4-190531.udpipe', + 'ru_syntagrus': 
'models/udpipe/2.4/russian-syntagrus-ud-2.4-190531.udpipe', + 'ru_taiga': 'models/udpipe/2.4/russian-taiga-ud-2.4-190531.udpipe', + 'sr': 'models/udpipe/2.4/serbian-set-ud-2.4-190531.udpipe', + 'sk': 'models/udpipe/2.4/slovak-snk-ud-2.4-190531.udpipe', + 'sl_ssj': 'models/udpipe/2.4/slovenian-ssj-ud-2.4-190531.udpipe', + 'sl_sst': 'models/udpipe/2.4/slovenian-sst-ud-2.4-190531.udpipe', + 'es_ancora': 'models/udpipe/2.4/spanish-ancora-ud-2.4-190531.udpipe', + 'es_gsd': 'models/udpipe/2.4/spanish-gsd-ud-2.4-190531.udpipe', + 'sv_lines': 'models/udpipe/2.4/swedish-lines-ud-2.4-190531.udpipe', + 'sv_talbanken': 'models/udpipe/2.4/swedish-talbanken-ud-2.4-190531.udpipe', + 'ta': 'models/udpipe/2.4/tamil-ttb-ud-2.4-190531.udpipe', + 'te': 'models/udpipe/2.4/telugu-mtg-ud-2.4-190531.udpipe', + 'tr': 'models/udpipe/2.4/turkish-imst-ud-2.4-190531.udpipe', + 'uk': 'models/udpipe/2.4/ukrainian-iu-ud-2.4-190531.udpipe', + 'ur': 'models/udpipe/2.4/urdu-udtb-ud-2.4-190531.udpipe', + 'ug': 'models/udpipe/2.4/uyghur-udt-ud-2.4-190531.udpipe', + 'vi': 'models/udpipe/2.4/vietnamese-vtb-ud-2.4-190531.udpipe', + 'wo': 'models/udpipe/2.4/wolof-wtb-ud-2.4-190531.udpipe', +} + + +class Base(Block): + """Base class for all UDPipe blocks.""" + + # pylint: disable=too-many-arguments + def __init__(self, model=None, model_alias=None, online=False, + tokenize=True, tag=True, parse=True, resegment=False, + ranges=False, delete_nodes=False, **kwargs): + super().__init__(**kwargs) + self.model, self.model_alias, self.online = model, model_alias, online + self._tool = None + self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment + self.ranges, self.delete_nodes = ranges, delete_nodes + + @property + def tool(self): + """Return the tool (UDPipe in this case), created lazily.""" + if self._tool: + return self._tool + if not self.model: + if not self.model_alias: + raise ValueError('model (path/to/model) or model_alias (e.g. 
en) must be set!') + if self.online: + self.model = self.model_alias + else: + self.model = KNOWN_MODELS[self.model_alias] + if self.online: + self._tool = UDPipeOnline(model=self.model) + else: + if not UDPIPE_AVAILABLE: + raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1") + self._tool = UDPipe(model=self.model) + return self._tool + + def process_document(self, doc): + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return + old_bundles = doc.bundles + new_bundles = [] + for bundle in old_bundles: + for tree in bundle: + new_bundles.append(bundle) + if self._should_process_tree(tree): + if self.delete_nodes: + for subroot in tree.children: + subroot.remove() + if tok: + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, + tag=tag, parse=par, ranges=ranges) + if self.resegment and len(new_trees) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + for i, new_tree in enumerate(new_trees[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") + new_tree.zone = tree.zone + new_bundle.add_tree(new_tree) + new_bundles.append(new_bundle) + elif not tok and not reseg and (tag or par): + self.tool.tag_parse_tree(tree, tag=tag, parse=par) + elif not tok and reseg and not tag and not par: + sentences = self.tool.segment_text(tree.text) + if len(sentences) > 1: + orig_bundle_id = bundle.bundle_id + bundle.bundle_id = orig_bundle_id + '-1' + tree.text = sentences[0] + for i, sentence in enumerate(sentences[1:], 2): + new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}") + new_tree = new_bundle.create_tree(zone=tree.zone) + new_tree.text = sentence + new_bundles.append(new_bundle) + else: + raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}") + doc.bundles = 
new_bundles + +''' +Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD + +=head1 SYNOPSIS + + # from the command line + echo John loves Mary | udapi.pl Read::Sentences UDPipe::Base model_alias=en Write::TextModeTrees + + # in scenario + UDPipe::Base model=/home/me/english-ud-1.2-160523.udpipe + UDPipe::Base model_alias=en + UDPipe::EN # shortcut for the above + UDPipe::EN tokenize=1 tag=1 parse=0 + +=head1 DESCRIPTION + +This block loads L (a wrapper for the UDPipe C++ tool) with +the given C for analysis into the Universal Dependencies (UD) style. +UDPipe can do tokenization, tagging (plus lemmatization and universal features) +and parsing (with deprel labels) and users of this block can select which of the +substasks should be done using parameters C, C and C. +The default is to do all three. + +=head1 TODO + +UDPipe can do also sentence segmentation, but L does not supported it yet. + +Similarly with multi-word tokens. + +=head1 PARAMETERS + +=head2 C + +Path to the model file within Udapi share +(or relative path starting with "./" or absolute path starting with "/"). +This parameter is required if C is not supplied. + +=head2 C + +The C parameter can be omitted if this parameter is supplied. +Currently available model aliases are: + +B. + +They correspond to paths where the language code in the alias is substituted +with the respective language name, e.g. B expands to +C. + +=head1 tokenize + +Do tokenization, i.e. create new nodes with attributes +C

, C (if SpaceAfter=No) and C. +The sentence string is taken from the root's attribute C. + +=head1 tag + +Fill node attributes: C, C, C and C. +On the input, just the attribute C is expected. + +=head1 parse + +Fill node attributes: C and rehang the nodes to their parent. +On the input, attributes C, C, C and C are expected. + +=head1 SEE ALSO + +L + +L +''' diff --git a/udapi/block/udpipe/cs.py b/udapi/block/udpipe/cs.py new file mode 100644 index 00000000..743efcb7 --- /dev/null +++ b/udapi/block/udpipe/cs.py @@ -0,0 +1,10 @@ +"""Block udpipe.Cs for tagging and parsing Czech.""" +from udapi.block.udpipe.base import Base + + +class Cs(Base): + """Tag and parse Czech.""" + + def __init__(self, **kwargs): + """Create the udpipe.Cs block object.""" + super().__init__(model_alias='cs', **kwargs) diff --git a/udapi/block/udpipe/en.py b/udapi/block/udpipe/en.py new file mode 100644 index 00000000..7cb74a25 --- /dev/null +++ b/udapi/block/udpipe/en.py @@ -0,0 +1,10 @@ +"""Block udpipe.En for tagging and parsing English.""" +from udapi.block.udpipe.base import Base + + +class En(Base): + """Tag and parse English.""" + + def __init__(self, **kwargs): + """Create the udpipe.En block object.""" + super().__init__(model_alias='en', **kwargs) diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index c5fa04f2..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -5,10 +5,12 @@ from udapi.core.block import Block -pp = pprint.pprint # pylint: disable=invalid-name +pp = pprint.pprint # pylint: disable=invalid-name # We need exec in this block and the variables this etc. are not unused but provided for the exec # pylint: disable=exec-used,unused-variable + + class Eval(Block): r"""Special block for evaluating code given by parameters. 
@@ -27,7 +29,8 @@ class Eval(Block): # pylint: disable=too-many-arguments,too-many-instance-attributes def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, - expand_code=True, **kwargs): + coref_mention=None, coref_entity=None, empty_nodes=False, + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -35,10 +38,14 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle self.after_bundle = after_bundle + self.coref_mention = coref_mention + self.coref_entity = coref_entity + self.empty_nodes = empty_nodes self.expand_code = expand_code self.count = collections.Counter() @@ -64,11 +71,21 @@ def process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node or self.mwt: for bundle in doc.bundles: - #TODO if self._should_process_bundle(bundle): + # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) + if self.coref_entity or self.coref_mention: + for entity in doc.coref_entities: + if self.coref_entity: + this = entity + exec(self.expand_eval_code(self.coref_entity)) + if self.coref_mention: + for mention in entity.mentions: + this = mention + exec(self.expand_eval_code(self.coref_mention)) + def process_bundle(self, bundle): # Extract variables, so they can be used in eval code document = doc = bundle.document @@ -80,11 +97,11 @@ def process_bundle(self, bundle): if self.bundle: exec(self.expand_eval_code(self.bundle)) - if self.tree or self.node: + if self.tree or self.node or self.mwt: trees = bundle.trees for tree 
in trees: - #TODO if not self._should_process_tree(tree): continue - self.process_tree(tree) + if self._should_process_tree(tree): + self.process_tree(tree) if self.after_bundle: exec(self.expand_eval_code(self.after_bundle)) @@ -100,10 +117,16 @@ def process_tree(self, tree): exec(self.expand_eval_code(self.tree)) if self.node: - for node in tree.descendants(): + nodes = tree.descendants_and_empty if self.empty_nodes else tree.descendants + for node in nodes: this = node exec(self.expand_eval_code(self.node)) + if self.mwt: + for mwt in tree.multiword_tokens: + this = mwt + exec(self.expand_eval_code(self.mwt)) + def process_start(self): if self.start: exec(self.expand_eval_code(self.start)) diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index 88bedb76..811973ec 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -1,10 +1,12 @@ """Filter is a special block for keeping/deleting subtrees specified by parameters.""" -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block # We need eval in this block # pylint: disable=eval-used + + class Filter(Block): """Special block for keeping/deleting subtrees specified by parameters. @@ -15,7 +17,7 @@ class Filter(Block): # keep only trees which contain ToDo|Bug nodes udapy -s util.Filter keep_tree_if_node='re.match("ToDo|Bug", str(node.misc))' < in > filtered - # keep only non-projective trees, annotate non-projective edges with Mark=nofeats and show. + # keep only non-projective trees, annotate non-projective edges with Mark=nonproj and show. 
udapy -T util.Filter keep_tree_if_node='node.is_nonprojective()' mark=nonproj < in | less -R # delete trees which contain deprel=remnant @@ -25,11 +27,10 @@ class Filter(Block): udapy -s util.Filter delete_subtree='node.deprel == "remnant"' < in > filtered """ - - def __init__(self, # pylint: disable=too-many-arguments + def __init__(self, # pylint: disable=too-many-arguments delete_tree=None, delete_tree_if_node=None, delete_subtree=None, keep_tree=None, keep_tree_if_node=None, keep_subtree=None, - mark=None, **kwargs): + keep_node=None, mark=None, **kwargs): """Create the Filter block object. Args: @@ -55,6 +56,10 @@ def __init__(self, # pylint: disable=too-many-arguments If no node in the tree was marked (i.e. only the root without any children remained), the whole tree will be deleted. + `keep_node`: Python expression to be evaluated for each node and if False, + the node will be deleted and its children rehanged to its parent. + Multiple nodes can be deleted (or kept) this way. + `mark`: a string or None. This makes sense only with `keep_tree_if_node`, where the matched nodes are marked with `Mark=` in `node.misc`, so they will be highlighted if printed with `write.TextModeTrees`. Default=None. 
@@ -70,9 +75,10 @@ def __init__(self, # pylint: disable=too-many-arguments self.keep_tree = keep_tree self.keep_tree_if_node = keep_tree_if_node self.keep_subtree = keep_subtree + self.keep_node = keep_node self.mark = mark - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches root = tree if self.delete_tree is not None: @@ -117,8 +123,17 @@ def process_tree(self, tree): # pylint: disable=too-many-branches kept_subtrees.append(node) if not kept_subtrees: tree.remove() + return else: for node in kept_subtrees: node.parent = root for orig_subroot in [n for n in root.children if n not in kept_subtrees]: orig_subroot.remove() + + if self.keep_node is not None: + nodes_to_delete = [node for node in tree.descendants if not eval(self.keep_node)] + if nodes_to_delete == tree.descendants: + tree.remove() + return + for node in nodes_to_delete: + node.remove(children='rehang') diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py new file mode 100644 index 00000000..e1ea838c --- /dev/null +++ b/udapi/block/util/findbug.py @@ -0,0 +1,71 @@ +"""Block util.FindBug for debugging. + +Usage: +If block xy.Z fails with a Python exception, +insert "util.FindBug block=" into the scenario, +e.g. to debug ``second.Block``, use + + udapy first.Block util.FindBug block=second.Block > bug.conllu + +This will create the file bug.conllu with the bundle, which caused the bug. + +The second.Block can have any parameters, e.g. 
+ udapy first.Block util.FindBug block=second.Block param1=value1 param2=value2 > bug.conllu +""" +import copy +import logging + +from udapi.core.basewriter import BaseWriter +from udapi.block.write.conllu import Conllu +from udapi.core.run import _parse_block_name + + +class FindBug(BaseWriter): + """Debug another block by finding a minimal testcase conllu file.""" + + def __init__(self, block, first_error_only=True, + files='-', filehandle=None, docname_as_file=False, encoding='utf-8', + newline='\n', overwrite=False, + **kwargs): + """Args: block, first_error_only. + All other parameters (which are not parameters of BaseWriter) + will be passed to the block being inspected. + """ + super().__init__(files, filehandle, docname_as_file, encoding, newline, overwrite) + self.block = block + self.first_error_only = first_error_only + self._kwargs = kwargs + + def process_document(self, document): + sub_path, class_name = _parse_block_name(self.block) + module = "udapi.block." + sub_path + "." 
+ class_name.lower() + try: + command = "from " + module + " import " + class_name + " as B" + logging.debug("Trying to run command: %s", command) + exec(command) # pylint: disable=exec-used + except Exception: + logging.warning("Error when trying import the block %s", self.block) + raise + + command = "B(**self._kwargs)" + logging.debug("Trying to evaluate this: %s", command) + new_block = eval(command) # pylint: disable=eval-used + + doc_copy = copy.deepcopy(document) + writer = Conllu(files=self.orig_files) + + for bundle_no, bundle in enumerate(doc_copy.bundles, 1): + logging.debug('Block %s processing bundle #%d (id=%s)', + self.block, bundle_no, bundle.bundle_id) + try: + new_block.process_bundle(bundle) + except Exception as exc: # pylint: disable=broad-except + logging.warning('util.FindBug found a problem in bundle %d in block %s: %r', + bundle_no, self.block, exc) + logging.warning('Printing a minimal example to %s', self.orig_files) + + for tree in document.bundles[bundle_no - 1].trees: + writer.process_tree(tree) + + if self.first_error_only: + raise diff --git a/udapi/block/util/joinsentence.py b/udapi/block/util/joinsentence.py new file mode 100644 index 00000000..578f3865 --- /dev/null +++ b/udapi/block/util/joinsentence.py @@ -0,0 +1,77 @@ +""" +Block util.JoinSentence will join a given sentence with the preceding one. +""" +import logging +from udapi.core.block import Block + +class JoinSentence(Block): + """ + Joins a sentence with the preceding one. There are two ways how to indicate + the sentences that this block should process. + + Method 1: Parameter sent_id provides the id of the sentence that should be + merged with the preceding one. At most one sentence pair from the input will + be merged, even if there are multiple sentences with the given id. + + Method 2: A MISC attribute can be specified that, if found, will trigger + joining of the current sentence to the previous one. 
With this approach, + multiple sentence pairs can be merged during one run. + """ + + def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be appended to the previous one + misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + if misc_name: + if sent_id: + logging.fatal('Cannot combine misc_value with sent_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + self.sent_id = sent_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + previous_tree = None + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to join all zones but we do not try to do it at present. + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + # The MISC attribute we are looking for should logically occur + # on the first node of the sentence but we can take it from any node. 
+ join_commands = [n for n in root.descendants if n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if join_commands: + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove from the node the MISC attribute that triggered the sentence split. + for n in join_commands: + n.misc[self.misc_name] = '' + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + else: + previous_tree = root + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + root = bundle.get_tree() + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + # We have found our sentence. No need to process the rest of the document. + break diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py new file mode 100644 index 00000000..bcb4f894 --- /dev/null +++ b/udapi/block/util/mark.py @@ -0,0 +1,59 @@ +"""util.Mark is a special block for marking nodes specified by parameters.""" +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +# We need eval in this block +# pylint: disable=eval-used + + +class Mark(Block): + """Mark nodes specified by parameters. + + Example usage from command line:: + # see non-projective trees with non-projective edges highlighted + udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R + """ + + def __init__(self, node, mark=1, mark_attr="Mark", add=True, print_stats=False, empty=False, **kwargs): + """Create the Mark block object. 
+ + Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be marked. + + `mark`: the node will be marked with `Mark=` in `node.misc`. Default=1. + + `mark_attr`: use this MISC attribute name instead of "Mark". + + `add`: should we keep existing Mark|ToDo|Bug? Default=True. + + `print_stats`: print the total number of marked nodes to stdout at process_end + + `empty`: apply the code also on empty nodes + """ + super().__init__(**kwargs) + self.mark = mark + self.mark_attr = mark_attr + self.node = node + self.add = add + self.print_stats = print_stats + self._marked = 0 + self.empty = empty + + def process_node(self, node): + if eval(self.node): + node.misc[self.mark_attr] = self.mark + self._marked += 1 + elif not self.add: + del node.misc[self.mark_attr] + del node.misc['ToDo'] + del node.misc['Bug'] + + def process_empty_node(self, empty_node): + if self.empty: + self.process_node(empty_node) + + def process_end(self): + if self.print_stats: + print(f'util.Mark marked {self._marked} nodes') diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py new file mode 100644 index 00000000..e102ca9c --- /dev/null +++ b/udapi/block/util/markdiff.py @@ -0,0 +1,117 @@ +"""util.MarkDiff is a special block for marking differences between parallel trees.""" +import collections +import difflib +import pprint +from udapi.core.block import Block + + +class MarkDiff(Block): + """Mark differences between parallel trees.""" + + def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', + mark=1, mark_attr='Mark', add=False, print_stats=0, ignore_parent=False, + align=False, align_attr='Align', **kwargs): + """Create the Mark block object. + Params: + gold_zone: Which of the zones should be treated as gold? + (The changes are interpreted as from a "pred"=predicted zone into the gold zone.) + attributes: Which node attributes should be considered when searching for diffs? + The tree topology, i.e. 
node parent is always considered. + mark: What value should be used in `node.misc['Mark']` of the differing nodes? + mark_attr: use this MISC attribute name instead of "Mark". + Use mark_attr=0 to prevent marking diffs in MISC. + add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, + so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. + print_stats: How many lines of statistics should be printed? -1 means all. + ignore_parent: ignore differences in dependency parents + align: store word alignment, possible values are False (no alignment stored, the default) + "from-pred", i.e. pred_node.misc["Align"] = aligned_gold_node.ord, + "from-gold", i.e. gold_node.misc["Align"] = aligned_pred_node.ord and + "both", i.e. both from-pred and from-gold. + If only forms should be considered for inducing the word alignment, + you should use "util.MarkDiff attributes='form' ignore_parent=1 align=1". + Only one-to-one alignment is supported. + align_attr: use this MISC attribute name instead of "Align". + """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attrs = attributes.split(',') + self.mark = mark + self.mark_attr = mark_attr + self.add = add + self.print_stats = print_stats + self.ignore_parent = ignore_parent + self.align = align + self.align_attr = align_attr + self.stats = collections.Counter() + if not mark_attr and not align and not print_stats: + raise ValueError('mark_attr=0 does not make sense without align or print_stats') + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + if not self.add: + for node in tree.descendants + gold_tree.descendants: + del node.misc[self.mark_attr] + del node.misc['ToDo'] + del node.misc['Bug'] + + pred_nodes, gold_nodes = tree.descendants, gold_tree.descendants + # Make sure both pred and gold trees are marked, even if one has just deleted nodes. 
+ if len(pred_nodes) != len(gold_nodes) and self.mark_attr: + tree.add_comment(f'{self.mark_attr} = {self.mark}') + gold_tree.add_comment(f'{self.mark_attr} = {self.mark}') + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes] + matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False) + diffs = list(matcher.get_opcodes()) + + alignment = {-1: -1} + for diff in diffs: + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit in {'equal', 'replace'}: + for i in range(pred_lo, pred_hi): + alignment[i] = i - pred_lo + gold_lo + if self.align in ("both", "from-pred"): + pred_nodes[i].misc[self.align_attr] = i - pred_lo + gold_lo + 1 + if self.align in ("both", "from-gold"): + gold_nodes[i - pred_lo + gold_lo].misc[self.align_attr] = i + 1 + + for diff in diffs: + edit, pred_lo, pred_hi, gold_lo, gold_hi = diff + if edit == 'equal': + for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + self.stats['ONLY-PARENT-CHANGED'] += 1 + if self.mark_attr: + p_node.misc[self.mark_attr] = self.mark + g_node.misc[self.mark_attr] = self.mark + else: + if self.mark_attr: + for node in pred_nodes[pred_lo:pred_hi] + gold_nodes[gold_lo:gold_hi]: + node.misc[self.mark_attr] = self.mark + if self.print_stats: + if edit == 'replace': + # first n nodes are treated as aligned, the rest is treated as ADDED/DELETED + n = min(pred_hi - pred_lo, gold_hi - gold_lo) + for p_node, g_node in zip(pred_nodes[pred_lo:pred_lo + n], gold_nodes[gold_lo:gold_lo + n]): + for attr in self.attrs: + p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) + if p_value != g_value: + self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + 
self.stats['PARENT-CHANGED'] += 1 + pred_lo, gold_lo = pred_lo + n, gold_lo + n + for node in gold_nodes[gold_lo:gold_hi]: + self.stats['ADD-WORD'] += 1 + self.stats['ADD-LEMMA: ' + node.lemma] += 1 + for node in pred_nodes[pred_lo:pred_hi]: + self.stats['DELETE-WORD'] += 1 + self.stats['DELETE-LEMMA: ' + node.lemma] += 1 + + def process_end(self): + if self.print_stats: + how_many = None if self.print_stats in (-1, '-1') else self.print_stats + for edit, count in self.stats.most_common(how_many): + print(f'{count:4} {edit}') diff --git a/udapi/block/util/markmwtbugsatnodes.py b/udapi/block/util/markmwtbugsatnodes.py new file mode 100644 index 00000000..ebc2ef4e --- /dev/null +++ b/udapi/block/util/markmwtbugsatnodes.py @@ -0,0 +1,25 @@ +"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes. + Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.""" + +from udapi.core.block import Block + +class MarkMwtBugsAtNodes(Block): + """ + If a node belongs to a multiword token and the MWT has Bug in MISC, copy + the Bug to the node so that filtering trees with bugs works. + The same bug note will be copied to all nodes in the MWT. + """ + + ###!!! Do we want to do the same thing also with ToDo attributes? 
+ def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def process_node(self, node): + if node.multiword_token: + if node.multiword_token.misc['Bug']: + self.bug(node, node.multiword_token.misc['Bug']) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..4cce4ab8 --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,97 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block +from pathlib import Path + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", + sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
+ `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. + `start_sent_id`: the first sent_id number + `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. + `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. + `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. + `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + self.sent_id = sent_id + self.empty_node_ord = empty_node_ord + self.next_sent_id = start_sent_id + self.sent_id_prefix = sent_id_prefix + self.sent_id_from_filename = sent_id_from_filename + self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc + self.newdoc_from_filename = newdoc_from_filename + if sent_id_reset_at_newdoc and not sent_id_from_filename: + raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") + if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: + self.sent_id = True + + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + is_newdoc = any(tree.newdoc for tree in bundle.trees) + if self.newdoc_from_filename and is_newdoc: + tree = next(tree for tree in bundle.trees if tree.newdoc) + tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem + if self.sent_id: + if self.sent_id_reset_at_newdoc and is_newdoc: + self.next_sent_id = 1 + prefix = self.sent_id_prefix + if self.sent_id_from_filename: + prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix + bundle.bundle_id = prefix + str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + if self.empty_node_ord: + node_ord, empty_ord = 0, 0 + for node in tree.descendants_and_empty: + if node.is_empty(): + empty_ord += 1 + old_empty_ord, new_empty_ord = str(node.ord), f"{node_ord}.{empty_ord}" + if old_empty_ord != new_empty_ord: + # Make sure all nodes in this sentence have deserialized enhanced deps. + for n in tree.descendants_and_empty: + n.deps + node.ord = new_empty_ord + else: + empty_ord = 0 + node_ord = node.ord + for node in tree.descendants: + self.process_node(node) + + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None diff --git a/udapi/block/util/resegmentgold.py b/udapi/block/util/resegmentgold.py new file mode 100644 index 00000000..383510b6 --- /dev/null +++ b/udapi/block/util/resegmentgold.py @@ -0,0 +1,146 @@ +"""util.ResegmentGold is a block for sentence alignment and re-segmentation of two zones.""" +import logging +import unicodedata +from udapi.core.block import Block +from udapi.core.mwt import MWT +from udapi.core.root import Root + +FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'} + +class ResegmentGold(Block): + """Sentence-align two zones (gold and pred) and resegment the pred zone. 
+ + The two zones must contain the same sequence of characters. + """ + + def __init__(self, gold_zone='gold', **kwargs): + """Args: + gold_zone: which zone contains the gold segmentation + """ + super().__init__(**kwargs) + self.gold_zone = gold_zone + + @staticmethod + def _strip_spaces(string): + return ''.join(filter(lambda c: unicodedata.category(c) != "Zs", string)) + + def process_document(self, document): + if not document.bundles: + return + pred_trees = self.extract_pred_trees(document) + was_subroot = set() + for pred_tree in pred_trees: + for n in pred_tree.children: + was_subroot.add(n) + + for bundle_no, bundle in enumerate(document.bundles): + g_tree = bundle.trees[0] + p_tree = pred_trees.pop() + g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants)) + p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants)) + g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars)) + p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars)) + if g_chars == p_chars: + bundle.add_tree(p_tree) + continue + + # Make sure that p_tree contains enough nodes. + moved_roots = [] + while len(p_chars) < len(g_chars): + if not pred_trees: + raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars)) + new_p_tree = pred_trees.pop() + p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants)) + moved_roots.extend(new_p_tree.children) + p_tree.steal_nodes(new_p_tree.descendants) + self.choose_root(p_tree, was_subroot, g_tree) + + if not p_chars.startswith(g_chars): + raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s' + % (g_tree.sent_id, p_chars, g_chars)) + if g_chars == p_chars: + bundle.add_tree(p_tree) + continue + + # Now p_tree contains more nodes than it should. 
+ p_chars = '' + tokens = p_tree.token_descendants + for index, token in enumerate(tokens): + p_chars += self._strip_spaces(token.form) + if len(p_chars) > len(g_chars): + logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id) + # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word + # of the sentence, resulting in "uklidnila.Komentář" in the raw text. + # It is not obvious how to fix this "properly", i.e. without increasing + # or decreasing the resulting LAS. The current solution is quite hacky. + if index + 1 == len(tokens): + next_p_tree = Root(zone=p_tree.zone) + pred_trees.append(next_p_tree) + next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):], + misc='Rehanged=Yes') + bundle.add_tree(p_tree) + break + else: + next_tok = tokens[index + 1] + next_tok.form = p_chars[len(g_chars):] + next_tok.form + p_chars = g_chars + if len(p_chars) == len(g_chars): + next_p_tree = Root(zone=p_tree.zone) + words = [] + for token in tokens[index + 1:]: + if isinstance(token, MWT): + words.extend(token.words) + else: + words.append(token) + for word in words: + if word in was_subroot: + del word.misc['Rehanged'] + if word.parent is not p_tree and word.parent not in words: + if word.udeprel in FUNCTIONAL: + word.parent.misc['FuncChildMissing'] = 'Yes' + for child in word.children: + if child not in words and child.udeprel in FUNCTIONAL: + word.misc['FuncChildMissing'] = 'Yes' + next_p_tree.steal_nodes(words) + self.choose_root(p_tree, was_subroot, g_tree) + self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0]) + pred_trees.append(next_p_tree) + bundle.add_tree(p_tree) + break + + def extract_pred_trees(self, document): + """Delete all trees with zone!=gold_zone from the document and return them.""" + pred_trees = [] + for bundle in reversed(document.bundles): + zones = [t.zone for t in bundle.trees] + if len(zones) > 2 or (len(zones) == 2 and self.gold_zone not in zones): + raise 
ValueError('Expected two zones including gold_zone=%s, but found: %s' + % (self.gold_zone, zones)) + for tree in bundle.trees: + if tree.zone != self.gold_zone: + pred_trees.append(tree) + tree.remove() + for bundle in document.bundles: + if not bundle.trees: + bundle.remove() + if not document.bundles: + raise ValueError('No bundles with gold_zone=' + self.gold_zone) + return pred_trees + + @staticmethod + def choose_root(p_tree, was_subroot, g_tree): + """Prevent multiple roots, which are forbidden in CoNLL-U.""" + possible_subroots = [n for n in p_tree.children if n in was_subroot] + if possible_subroots: + the_subroot = possible_subroots[0] + g_subroot = g_tree.children[0] + possible_subroots = sorted([n for n in possible_subroots if n.form == g_subroot.form], + key=lambda n: abs(n.ord - g_subroot.ord)) + the_subroot = possible_subroots[0] if possible_subroots else the_subroot + else: + the_subroot = p_tree.children[0] + the_subroot.misc['Rehanged'] = 'Yes' + for subroot in p_tree.children: + if subroot is not the_subroot: + subroot.parent = the_subroot + subroot.misc['Rehanged'] = 'Yes' diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py new file mode 100644 index 00000000..9a895b88 --- /dev/null +++ b/udapi/block/util/see.py @@ -0,0 +1,124 @@ +"""Block util.See prints statistics about the nodes matching a given condition. 
+ +Example usage from the command line:: + +udapy util.See node='node.is_nonprojective()' n=3 \ + stats=dir,children,c_upos,p_lemma,deprel,feats_split < in.conllu + +Example output:: + +node.is_nonprojective() +matches 245 out of 35766 nodes (0.7%) in 174 out of 1478 trees (11.8%) +=== dir (2 values) === + right 193 78% delta=+37% + left 52 21% delta=-33% +=== children (9 values) === + 0 64 26% delta=-38% + 2 58 23% delta=+14% + 3 38 15% delta= +7% +=== c_upos (15 values) === + NOUN 118 23% delta= +4% + DET 61 12% delta= -3% + PROPN 47 9% delta= +1% +=== p_lemma (187 values) === + il 5 2% delta= +1% + fonction 4 1% delta= +1% + écrire 4 1% delta= +1% +=== deprel (22 values) === + appos 41 16% delta=+15% + conj 41 16% delta=+13% + punct 36 14% delta= +4% +=== feats_split (20 values) === + Number=Sing 114 21% delta= +2% + Gender=Masc 81 15% delta= +3% + _ 76 14% delta= -6% + +In addition to absolute counts for each value, the percentage within matching nodes is printed +and a delta relative to percentage within all nodes. +This helps to highlight what is special about the matching nodes. +""" +from collections import Counter +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +STATS = 'dir,edge,depth,children,siblings,p_upos,p_lemma,c_upos,form,lemma,upos,deprel,feats_split' + +# We need eval in this block +# pylint: disable=eval-used + + +class See(Block): + """Print statistics about the nodes specified by the parameter `node`.""" + + def __init__(self, node, n=5, stats=STATS, empty=False, **kwargs): + """Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be considered "matching". + `n`: Top n values will be printed for each statistic. + `stats`: a list of comma-separated statistics to be printed. 
+ A statistic can be an attribute (`form`, `lemma`) or a pseudo-attribute + (`depth` = depth of a node in dependency tree, + `children` = number of children nodes, + `p_lemma` = lemma of a parent node, etc). + See `udapi.core.Node.get_attrs` for a full list of statistics. + `empty`: apply the code also on empty nodes + """ + super().__init__(**kwargs) + self.node = node + self.n_limit = n + self.stats = stats.split(',') + self.match = dict() + self.every = dict() + for stat in self.stats: + self.match[stat] = Counter() + self.every[stat] = Counter() + self.overall = Counter() + self.empty = empty + + def process_tree(self, root): + self.overall['trees'] += 1 + tree_match = False + nodes = root.descendants_and_empty if self.empty else root.descendants + for node in nodes: + matching = self.process_node(node) + self.overall['nodes'] += 1 + if matching: + self.overall['matching_nodes'] += 1 + if not tree_match: + self.overall['matching_trees'] += 1 + tree_match = True + + def process_node(self, node): + matching = eval(self.node) + for stat in self.stats: + for value in node.get_attrs([stat], undefs=''): + self.every[stat][value] += 1 + self.every[stat]['T O T A L'] += 1 + if matching: + self.match[stat][value] += 1 + self.match[stat]['T O T A L'] += 1 + return matching + + def process_end(self): + print(self.node) + print("matches %d out of %d nodes (%.1f%%) in %d out of %d trees (%.1f%%)" + % (self.overall['matching_nodes'], + self.overall['nodes'], + self.overall['matching_nodes'] * 100 / self.overall['nodes'], + self.overall['matching_trees'], + self.overall['trees'], + self.overall['matching_trees'] * 100 / self.overall['trees'])) + for stat in self.stats: + vals = len(self.match[stat].keys()) - 1 + print("=== %s (%d value%s) ===" % (stat, vals, 's' if vals > 1 else '')) + match_total = self.match[stat]['T O T A L'] or 1 + every_total = self.every[stat]['T O T A L'] or 1 + for value, match_count in self.match[stat].most_common(self.n_limit + 1): + if value == 'T 
O T A L': + continue + every_count = self.every[stat][value] + match_perc = 100 * match_count / match_total + every_perc = 100 * every_count / every_total + print("%15s %5d %3d%% delta=%+3d%%" + % (value, match_count, match_perc, match_perc - every_perc)) diff --git a/udapi/block/util/split.py b/udapi/block/util/split.py new file mode 100644 index 00000000..6eb2f650 --- /dev/null +++ b/udapi/block/util/split.py @@ -0,0 +1,39 @@ +"""util.Split is a special block for splitting documents.""" +import math +from udapi.core.basereader import BaseReader + +# pylint: disable=abstract-method +# read_tree() does not need to be installed here + + +class Split(BaseReader): + """Split Udapi document (with sentence-aligned trees in bundles) into several parts.""" + + def __init__(self, parts=None, bundles_per_doc=None, **kwargs): + """Args: + parts: into how many parts should the document be split + bundles_per_doc: number of bundles per the newly created part + """ + super().__init__(**kwargs) + if parts is None and bundles_per_doc is None: + raise ValueError('parts or bundles_per_doc must be specified') + if parts is not None and bundles_per_doc is not None: + raise ValueError('Cannot specify both parts and bundles_per_doc') + self.parts = parts + self.bundles_per_doc = bundles_per_doc + self.buffer = None + + @staticmethod + def is_multizone_reader(): + return False + + def process_document(self, document): + if not self.buffer: + self.buffer = document.bundles + document.bundles = [] + if self.bundles_per_doc is None: + self.bundles_per_doc = math.ceil(len(self.buffer) / self.parts) + self.buffer.extend(document.bundles) + document.bundles = self.buffer[:self.bundles_per_doc] + self.buffer = self.buffer[self.bundles_per_doc:] + self.finished = not self.buffer diff --git a/udapi/block/util/splitsentence.py b/udapi/block/util/splitsentence.py new file mode 100644 index 00000000..b6ca57d8 --- /dev/null +++ b/udapi/block/util/splitsentence.py @@ -0,0 +1,134 @@ +""" +Block 
util.SplitSentence will split a given sentence at a given token. +""" +import logging +from udapi.core.block import Block +from udapi.core.root import Root + +class SplitSentence(Block): + """ + If the sent_id of the current sentence matches the parameter, splits the + sentence into two. The first token of the second sentence is also given as + a parameter. + + Alternatively, a MISC attribute can be specified that triggers sentence + splitting at the given token. With this approach, multiple sentence splits + can be performed during one run. + """ + + def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be split (new ids will have A and B appended) + word_id: which word should be the first word of the second sentence (tokens and words will be renumbered) + misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split + MISC attributes that have triggered sentence split will be removed from their node. + """ + super().__init__(**kwargs) + if misc_name: + if sent_id or word_id: + logging.fatal('Cannot combine misc_value with sent_id or word_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + if not word_id: + logging.fatal('Missing parameter word_id') + self.sent_id = sent_id + self.word_id = word_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to split all zones but we do not try to do it at present. 
+ # (The zones may be translations to other languages and it is not likely that we would + # want to split each translation at the same position.) + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if split_points: + # Create as many new bundles as there are split points. + n_new = len(split_points) + current_bid = bundle.bundle_id + idletter = 'B' # a letter will be added to bundle ids to distinguish them + for i in range(n_new): + new_bundle = document.create_bundle() + new_bundle.bundle_id = current_bid + idletter + new_root = Root(zone='') + new_bundle.add_tree(new_root) + # Identify nodes to move to the new bundle. + first_node_id = split_points[i].ord + if i < n_new - 1: + next_first_node_id = split_points[i+1].ord + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id] + else: + nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id] + new_root.steal_nodes(nodes_to_move) + self.make_zeros_roots(new_root) + new_root.text = new_root.compute_text() + # The new bundle was created at the end of the document. + # Move it to the position right after the current bundle. + document.bundles.pop() + document.bundles.insert(bundle_no + i + 1, new_bundle) + idletter = chr(ord(idletter) + 1) + # Remove from the node the MISC attribute that triggered the sentence split. + split_points[i].misc[self.misc_name] = '' + # Update the id of the current bundle, fix its zero-dependents and recompute sentence text. 
+ bundle.bundle_id += 'A' + self.make_zeros_roots(root) + root.text = root.compute_text() + # Update the bundle numbers of the new bundles and all bundles after them. + updated_no = bundle_no + 1 + for b in document.bundles[(bundle_no+1):]: + b.number = updated_no + updated_no += 1 + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + root = bundle.get_tree() + nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id] + if len(nodes_to_move) == 0: + logging.fatal('No nodes to move to the new sentence; word_id may be out of range') + # Create a new bundle at the end of the current document. + new_bundle = document.create_bundle() + # Move the new bundle to the position right after the current bundle. + new_bundle_no = bundle_no + 1 + document.bundles.pop() + document.bundles.insert(new_bundle_no, new_bundle) + updated_no = new_bundle_no + for b in document.bundles[new_bundle_no:]: + b.number = updated_no + updated_no += 1 + new_bundle.bundle_id = bundle.bundle_id + 'B' + bundle.bundle_id += 'A' + new_root = Root(zone='') + new_bundle.add_tree(new_root) + new_root.steal_nodes(nodes_to_move) + # The steal_nodes() method does not make sure that all nodes newly attached + # to the artificial root have the 'root' relation. Fix it. + self.make_zeros_roots(root) + self.make_zeros_roots(new_root) + # Update the sentence text attributes of the new sentences. + root.text = root.compute_text() + new_root.text = new_root.compute_text() + # We have found our sentence. No need to process the rest of the document. + break + + def make_zeros_roots(self, root): + """ + The steal_nodes() method does not make sure that all nodes newly attached + to the artificial root have the 'root' relation. Fix it. + """ + n_root = 0 + for n in root.descendants: + if n.parent.is_root(): + n.deprel = 'root' + n_root += 1 + if n_root > 1: + logging.warning('More than one 0:root relation in newly segmented sentence %s.' 
% root.bundle.bundle_id) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index df9db3f4..9920d0b6 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -1,13 +1,20 @@ """Wc is a special block for printing statistics (word count etc).""" from udapi.core.block import Block + class Wc(Block): """Special block for printing statistics (word count etc).""" - def __init__(self, **kwargs): - """Create the Wc block object.""" + def __init__(self, tsv=False, **kwargs): + """Create the Wc block object. + + Params: + tsv: print just tab-separated-values (trees, words, tokens, MWTs, empty nodes) + """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 + self.tsv = tsv def process_tree(self, tree): self.trees += 1 @@ -16,10 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc or tree == tree.document[0].trees[0]: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): - print('%8d trees\n%8d words' % (self.trees, self.words)) - if self.mwts: - print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) - if self.empty: - print('%8d empty nodes' % self.empty) + if self.tsv: + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) + else: + print('%8d trees\n%8d words' % (self.trees, self.words)) + if self.mwts: + print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) + if self.empty: + print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 8c65f0fd..ad647477 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -1,7 +1,7 
@@ """Conllu class is a a writer of files in the CoNLL-U format.""" +import json from udapi.core.basewriter import BaseWriter - class Conllu(BaseWriter): """A writer of files in the CoNLL-U format.""" @@ -11,65 +11,144 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.print_text = print_text self.print_empty_trees = print_empty_trees - # A list of Conllu columns. - self.node_attributes = ["ord", "form", "lemma", "upos", "xpos", - "feats", "parent", "deprel", "raw_deps", "misc"] - - def process_tree(self, tree): # pylint: disable=too-many-branches - nodes = tree.descendants + def process_tree(self, tree): # pylint: disable=too-many-branches + empty_nodes = tree.empty_nodes + if empty_nodes: + nodes = sorted(tree._descendants + empty_nodes) + else: + nodes = tree._descendants # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0 # we need to skip the whole tree (including possible comments). if not nodes and not self.print_empty_trees: return + # If tree.comment contains placeholders $NEWDOC,...$TEXT, replace them with the actual + # value of the attribute and make note on which line (i_*) they were present. 
+ comment_lines = tree.comment.splitlines() + i_newdoc, i_newpar, i_sent_id, i_text, i_global_entity = -1, -1, -1, -1, -1 + for i, c_line in enumerate(comment_lines): + if c_line == '$SENT_ID': + i_sent_id = i + comment_lines[i] = ' sent_id = ' + tree.sent_id if self.print_sent_id else None + elif c_line == '$TEXT': + i_text = i + if self.print_text: + if tree.text is None: + comment_lines[i] = ' text = ' + tree.compute_text() + else: + comment_lines[i] = ' text = ' + tree.text.replace('\n', '').replace('\r', '').rstrip() + elif c_line == '$NEWDOC': + i_newdoc = i + if self.print_sent_id and tree.newdoc: + comment_lines[i] = ' newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else '') + else: + comment_lines[i] = None + elif c_line == '$NEWPAR': + i_newpar = i + if self.print_sent_id and tree.newpar: + comment_lines[i] = ' newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '') + else: + comment_lines[i] = None + elif c_line == '$GLOBAL.ENTITY': + i_global_entity = i + ge = tree.document.meta.get('global.Entity') + if ge: + comment_lines[i] = ' global.Entity = ' + ge + else: + comment_lines[i] = None + + # Now print the special comments: global.columns, newdoc, newpar, sent_id and text. + # If these comments were already present in tree.comment (as marked with the placeholders), + # keep them at their original position and print also all comment lines preceding them. + # It they were missing, try to print them at the correct position. 
+ printed_i = -1 + if comment_lines and comment_lines[0].startswith(' global.columns'): + printed_i += 1 + print('#' + comment_lines[printed_i]) if self.print_sent_id: if tree.newdoc: - value = ' id = ' + tree.newdoc if tree.newdoc is not True else '' - print('# newdoc' + value) + if i_newdoc == -1: + print('# newdoc' + (' id = ' + tree.newdoc if tree.newdoc is not True else '')) + else: + while printed_i < i_newdoc: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + ge = tree.document.meta.get('global.Entity') + if ge: + if i_global_entity == -1: + print('# global.Entity = ' + ge) + else: + while printed_i < i_global_entity: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) if tree.newpar: - value = ' id = ' + tree.newpar if tree.newpar is not True else '' - print('# newpar' + value) - print('# sent_id = ' + tree.address()) + if i_newpar == -1: + print('# newpar' + (' id = ' + tree.newpar if tree.newpar is not True else '')) + else: + while printed_i < i_newpar: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + if i_sent_id == -1: + print('# sent_id = ' + tree.sent_id) + else: + while printed_i < i_sent_id: + printed_i += 1 + if comment_lines[printed_i]: + print('#' + comment_lines[printed_i]) + if self.print_text and i_text == -1: + print('# text = ' + (tree.compute_text() if tree.text is None else tree.text.replace('\n', '').replace('\r', '').rstrip())) - if self.print_text: - print("# text = " + tree.get_sentence()) + for c_line in comment_lines[printed_i + 1:]: + if c_line: + print('#' + c_line) - comment = tree.comment - if comment: - comment = comment.rstrip() - print('#' + comment.replace('\n', '\n#')) + # Special-purpose json_* comments should always be at the end of the comment block. 
+ if tree.json: + for key, value in sorted(tree.json.items()): + print(f"# json_{key} = {json.dumps(value, ensure_ascii=False, sort_keys=True)}") last_mwt_id = 0 - empty_nodes = list(tree.empty_nodes) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 for node in nodes: - mwt = node.multiword_token - if mwt and node.ord > last_mwt_id: - last_mwt_id = mwt.words[-1].ord - print('\t'.join([mwt.ord_range(), - mwt.form if mwt.form is not None else '_', - '_\t_\t_\t_\t_\t_\t_', str(mwt.misc)])) - values = [getattr(node, attr_name) for attr_name in self.node_attributes] - values = ['_' if v is None else str(v) for v in values] - try: - values[6] = str(node.parent.ord) - except AttributeError: - values[6] = '0' - print('\t'.join(values)) - if node.ord == next_empty_ord: - empty = empty_nodes.pop(0) - values = [str(getattr(empty, a)) for a in self.node_attributes] - values[6] = '_' - values[7] = '_' - print('\t'.join(values)) - next_empty_ord = int(float(empty_nodes[0].ord)) if empty_nodes else -1 + mwt = node._mwt + if mwt and node._ord > last_mwt_id: + print('\t'.join((mwt.ord_range, + '_' if mwt.form is None else mwt.form, + '_\t_\t_', + '_' if mwt._feats is None else str(mwt.feats), + '_\t_\t_', + '_' if mwt._misc is None else str(mwt.misc)))) + last_mwt_id = mwt.words[-1]._ord - # Empty sentences are not allowed in CoNLL-U, + if node._parent is None: + head = '_' # Empty nodes + else: + try: + head = str(node._parent._ord) + except AttributeError: + head = '0' + + print('\t'.join('_' if v is None else v for v in + (str(node._ord), node.form, node.lemma, node.upos, node.xpos, + '_' if node._feats is None else str(node.feats), head, node.deprel, + node.raw_deps, '_' if node._misc is None else str(node.misc)))) + + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we will print an artificial node, so we can print the comments. 
- if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) print("") + + def before_process_document(self, document): + """Print doc_json_* headers.""" + super().before_process_document(document) + if document.json: + for key, value in sorted(document.json.items()): + print("# doc_json_%s = %s" + % (key, json.dumps(value, ensure_ascii=False, sort_keys=True))) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..1d5d4716 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,478 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference. + +When using lazy loading of documents (infinite scrolling), +modern browsers don't allow JavaScript to load files from a local file system +("Access to XMLHttpRequest at 'file://.../doc2.html' from origin 'null' has been +blocked by CORS policy: Cross origin requests are only supported for protocol schemes: +http, data, chrome, chrome-extension, https.") + +The recommended solution is to start a local web server, e.g. using + python -m http.server +and browse http://0.0.0.0:8000/my.html. 
+ +Non-recommended solution is to run + google-chrome --new-window --user-data-dir=/tmp/chrome-proxy --allow-file-access-from-files my.html +""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter +import udapi.block.write.html +import gzip +import sys +import os +import re + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +HTYPES = 'PROPN NOUN PRON VERB DET OTHER'.split() + +HEADER = ''' + +Udapi CorefUD viewer + + +''' + +CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} +.close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} +i.empty {color: gray; border: 3px outset gray; padding: 1px;} 
+.sentence .singleton {border-style: dotted;} +.crossing:before {content: "!"; display: block; background: #ffd500;} +.active {border: 1px solid red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} +.sent_id {display: none; background: #ddd; border-radius: 3px;} +''' + +SCRIPT_BASE = ''' +function add_mention_listeners(mentions){ + mentions.click(function(e) { + let was_selected = $(this).hasClass("selected"); + $(".m").removeClass("selected"); + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + e.stopPropagation(); + }); + mentions.hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} + ); +} +add_mention_listeners($(".m")); + +window.onhashchange = function() { + $(".m").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + +async function load_doc(doc_num) { + loading_now = true; + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ + if (! load_fail_reported) { + load_fail_reported = true; + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." 
+ + " Run a web server 'python -m http.server'\\n" + + "error = " + error); + } + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + $('.eid').toggle($('#show-eid')[0].checked); + $('.etype').toggle($('#show-etype')[0].checked); + $('.sent_id').toggle($('#show-sent_id')[0].checked); + $('.showtree').toggle($('#show-trees')[0].checked); + $('.m').toggleClass('nocolor', ! $('#show-color')[0].checked); + $('.m').toggleClass('nobox', ! $('#show-boxes')[0].checked); + $('.norm').toggle($('#show-norm')[0].checked); + $('.head').toggleClass('nobold', ! $('#show-heads')[0].checked); + $('.empty').toggle($('#show-empty')[0].checked); + $('.sentence').toggleClass('display-inline', ! $('#show-breaks')[0].checked); + $('.par').toggle($('#show-pars')[0].checked); + $('h1').toggle($('#show-docs')[0].checked); + $('.m').toggleClass('htype',$('#htype')[0].checked) + loading_now = false; +} + +var docs_loaded = 1; +var load_fail_reported = false; +var loading_now = false; +add_show_tree_button = function(index, el){ // to be redefined later if show_trees=True + $(el).prepend('🆔' + el.dataset.id + ''); +} +function load_more() { + if (!loading_now && $(window).scrollTop() >= $(document).height() - $(window).height() - 42 && docs_loaded < all_docs) { + docs_loaded += 1; + load_doc(docs_loaded); + } +} +$(window).scroll(load_more); +const resizeObserver = new ResizeObserver(entries =>load_more()); +resizeObserver.observe(document.body); +''' + +SCRIPT_SHOWTREE = ''' +function show_tree_in_tdiv(tdiv, doc_number, index){ + tdiv.treexView([docs_json[doc_number][index]]); + $("\n' + ) + + # The first ud_doc will be printed to the main html file. 
+ self.process_ud_doc(ud_docs[0], 1) + print('') # id=main + + # Other ud_docs will be printed into separate files (so they can be loaded lazily) + orig_stdout = sys.stdout + try: + for i, ud_doc in enumerate(ud_docs[1:], 2): + sys.stdout = gzip.open(f"{self.docs_dir}/doc{i}.html.gz", 'wt') + self.process_ud_doc(ud_doc, i) + sys.stdout.close() + finally: + sys.stdout = orig_stdout + + print(f'') + print('') + + def _start_subspan(self, subspan, crossing=False): + m = subspan.mention + e = m.entity + classes = f'{_dom_esc(e.eid)} {self._mention_ids[m]} {e.etype or "other"} m' + title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' + classes += f" {m.head.upos if m.head.upos in HTYPES else 'OTHER'}" + title += f'\nhead-upos={m.head.upos}' + if self.colors: + classes += f' {self._entity_colors[e]}' + if all(w.is_empty() for w in subspan.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + if crossing: + classes += ' crossing' + title += '\ncrossing' + if m.other: + title += f'\n{m.other}' + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{_dom_esc(e.eid)}" ' + # The title should be always rendered left-to-right (e.g. "head=X", not "X=head"), + # so for RTL languages, we need to use explicit dir="ltr" and insert a nested span with dir="rtl". + if self.rtl: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + else: + print(f'' + f'{_dom_esc(subspan.subspan_eid)}' + f' {e.etype}', end='') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + subspans = [] + for mention in mentions: + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) + + if tree.newdoc: + print(f'

{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: + print('
') + opened, prev_node_mention = [], True + rtl = ' dir="rtl"' if self.rtl else "" + print(f'

') + for node in nodes_and_empty: + if not prev_node_mention and subspans and subspans[-1].words[0] == node: + print('', end='') + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + self._start_subspan(subspan) + opened.append(subspan) + + if not opened and prev_node_mention: + print('', end='') + prev_node_mention = True if opened else False + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + if self.rtl: + print('', end='') + else: + print('', end='') + opened.pop() + + # Two mentions are crossing iff their spans have non-zero intersection, + # but neither is a subset of the other, e.g. (e1 ... (e2 ... e1) ... e2). + # Let's visualize this (simplified) as + # ......... + # i.e. let's split mention e2 into two subspans which are next to each other. + # Unfortunatelly, we cannot mark now both crossing mentions using html class "crossing" + # (opening tags are already printed), so we'll mark only the second part of the second mention. + endings = [x for x in opened if x.words[-1] == node] + if endings: + new_opened, brokens, found_crossing = [], [], False + for subspan in opened: + if subspan.words[-1] == node: + found_crossing = True + elif found_crossing: + brokens.append(subspan) + else: + new_opened.append(subspan) + opened = new_opened + print('' * (len(endings) + len(brokens)), end='') + for broken in brokens: + self._start_subspan(broken, True) + opened.append(subspan) + + if not node.no_space_after: + print(' ', end='') + + if not prev_node_mention: + print('', end='') + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + + +# id needs to be a valid DOM querySelector +# so it cannot contain [#./:] and maybe more, +# so let's substitute all [^\w\d-] to be on the safe side. +# DOM IDs cannot start with a digit, so prepend e.g. "n" if needed. +def _dom_esc(string): + if string[0].isdecimal(): + string = 'n' + string + return re.sub(r'[^\w\d-]', '_', string) + +def _id(node): + if node is None: + return 'null' + return _dom_esc(node.address()) + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 2fd76bf7..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -5,17 +5,19 @@ class Html(BaseWriter): """A writer for HTML+JavaScript+SVG visualization of dependency trees. - Usage: - # from the command line - udapy write.Html < file.conllu > file.html - firefox file.html - - # for offline use, we need to download first three JavaScript libraries - wget https://code.jquery.com/jquery-2.1.4.min.js - wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js - wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js - udapy write.Html path_to_js=. < file.conllu > file.html - firefox file.html + .. code-block:: bash + + # from the command line + udapy write.Html < file.conllu > file.html + firefox file.html + + For offline use, we need to download first three JavaScript libraries:: + + wget https://code.jquery.com/jquery-2.1.4.min.js + wget https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4r/FileSaver.min.js + wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js + udapy write.Html path_to_js=. 
< file.conllu > file.html + firefox file.html This writer produces an html file with drawings of the dependency trees in the document (there are buttons for selecting which bundle will be shown). @@ -32,7 +34,7 @@ class Html(BaseWriter): This block is based on `Treex::View `_ but takes a different approach. `Treex::View` depends on (older version of) - `Valence` (Perl interface to `Electron `_) + `Valence` (Perl interface to `Electron `_) and comes with a script `view-treex`, which takes a treex file, converts it to json behind the scenes (which is quite slow) and displays the json in a Valence window. @@ -63,7 +65,7 @@ def __init__(self, path_to_js='web', **kwargs): def process_document(self, doc): if self.path_to_js == 'web': jquery = 'https://code.jquery.com/jquery-2.1.4.min.js' - fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js' + fsaver = 'https://cdn.rawgit.com/eligrey/FileSaver.js/1.3.4/FileSaver.min.js' js_t_v = 'https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js' else: jquery = self.path_to_js + '/jquery-2.1.4.min.js' @@ -71,22 +73,38 @@ def process_document(self, doc): js_t_v = self.path_to_js + '/js-treex-view.js' print('') - print('Udapi viewer') # TODO doc.loaded_from + print('Udapi viewer') # TODO doc.loaded_from for js_file in (jquery, fsaver, js_t_v): print('' % js_file) print('\n') print('
') + + def print_doc_json(self, doc): + print('[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -99,24 +117,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + print(']') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable @@ -132,7 +142,7 @@ def print_node(node): multiline_feats = feats.replace('|', r'\n') print(',{{"id":{id_node},"parent":{id_parent},"order":{order},{firstson_str}{rbrother_str}' '"data":{{"ord":{order},"form":"{form}","lemma":"{lemma}","upos":"{upos}",' - '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps + '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps '"misc":"{misc}","id":"{address}"}},' '"labels":["{form}","#{{#bb0000}}{upos}","#{{#0000bb}}{deprel}"],' 
'"hint":"lemma={lemma}\\n{multiline_feats}"}}'.format(**locals())) @@ -149,6 +159,7 @@ def _id(node): return 'null' return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + def _esc(string): if string is None: string = '' diff --git a/udapi/block/write/oldcorefud.py b/udapi/block/write/oldcorefud.py new file mode 100644 index 00000000..49f9beb0 --- /dev/null +++ b/udapi/block/write/oldcorefud.py @@ -0,0 +1,58 @@ +"""Writer for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation.""" +import re +import logging +import udapi.block.write.conllu + +class OldCorefUD(udapi.block.write.conllu.Conllu): + + def process_document(self, doc): + if not doc.coref_entities: + logging.warning("Using write.OldCorefUD on a document without any coreference annotation") + + # Delete both new-style (GUM-style) and old-style (CorefUD 0.1) coreference annotations from MISC. + attrs = "Entity Split Bridge ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split() + for node in doc.nodes_and_empty: + for key in list(node.misc): + if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): + del node.misc[key] + del doc.meta['global.Entity'] + + # doc._eid_to_entity is a dict, which is insertion ordered in Python 3.7+. + # The insertion order is sorted according to CorefEntity.__lt__ (see few lines above). + # However, new entities could be added meanwhile or some entities edited, + # so we need to sort the entities again before storing to MISC. + # We also need to mare sure entity.mentions are sorted in each entity + # because the ordering of entities is defined by the first mention in each entity. + # Ordering of mentions within a entity can be changed when e.g. changing the span + # of a given mention or reordering words within a sentence and in such events + # Udapi currently does not automatically update the ordering of entities. 
+ for entity in doc.coref_entities: + entity._mentions.sort() + for entity in sorted(doc.coref_entities): + for mention in entity.mentions: + head = mention.head + if head.misc["ClusterId"]: + for a in attrs: + if head.misc[a]: + head.misc[a + "[1]"] = head.misc[a] + del head.misc[a] + index_str = "[2]" + else: + index, index_str = 1, "[1]" + while(head.misc["ClusterId" + index_str]): + index += 1 + index_str = f"[{index}]" + if index == 1: + index_str = "" + head.misc["ClusterId" + index_str] = entity.eid + head.misc["MentionSpan" + index_str] = mention.span + head.misc["ClusterType" + index_str] = entity.etype + if mention._bridging: + head.misc["Bridging" + index_str] = ','.join(f'{l.target.eid}:{l.relation}' for l in sorted(mention.bridging)) + if entity.split_ante: + serialized = ','.join((c.eid for c in sorted(entity.split_ante))) + head.misc["SplitAnte" + index_str] = serialized + if mention.other: + head.misc["MentionMisc" + index_str] = str(mention.other).replace('%2D', '-') + + super().process_document(doc) diff --git a/udapi/block/write/sdparse.py b/udapi/block/write/sdparse.py index 209938b5..13487738 100644 --- a/udapi/block/write/sdparse.py +++ b/udapi/block/write/sdparse.py @@ -8,33 +8,35 @@ class Sdparse(BaseWriter): """A writer of files in the Stanford dependencies format, suitable for Brat visualization. Usage: - udapy write.Sdparse print_upos=0 < in.conllu + ``udapy write.Sdparse print_upos=0 < in.conllu`` Example output:: - ~~~ sdparse - Corriere Sport da pagina 23 a pagina 26 - name(Corriere, Sport) - case(pagina-4, da) - nmod(Corriere, pagina-4) - nummod(pagina-4, 23) - case(pagina-7, a) - nmod(Corriere, pagina-7) - nummod(pagina-7, 26) - ~~~ + ~~~ sdparse + Corriere Sport da pagina 23 a pagina 26 + name(Corriere, Sport) + case(pagina-4, da) + nmod(Corriere, pagina-4) + nummod(pagina-4, 23) + case(pagina-7, a) + nmod(Corriere, pagina-7) + nummod(pagina-7, 26) + ~~~ To visualize it, use embedded Brat, e.g. 
go to - http://universaldependencies.org/visualization.html#editing + http://universaldependencies.org/visualization.html#editing. Click the edit button and paste the output of this writer excluding the `~~~` marks. Notes: - Original Stanford dependencies format (http://nlp.stanford.edu/software/dependencies_manual.pdf) + The original `Stanford dependencies format + `_ allows explicit specification of the root dependency, e.g. `root(ROOT-0, makes-8)`. However, this is not allowed by Brat, so this writer does not print it. UD v2.0 allows tokens with spaces, but I am not aware of any Brat support. Alternatives: + * `write.Conllu` Brat recently supports also the CoNLL-U input * `write.TextModeTrees` may be more readable/useful in some usecases * `write.Html` dtto, press "Save as SVG" button, convert to pdf diff --git a/udapi/block/write/sentences.py b/udapi/block/write/sentences.py index 60eb6bec..70553d7d 100644 --- a/udapi/block/write/sentences.py +++ b/udapi/block/write/sentences.py @@ -3,13 +3,14 @@ class Sentences(BaseWriter): - """A writer of plain-text sentences (one per line). + """A writer of plain-text sentences (one sentence per line). Usage: udapy write.Sentences if_missing=empty < my.conllu > my.txt + udapy write.Sentences newdoc=1 newpar=1 < my.conllu > my.txt """ - def __init__(self, if_missing='detokenize', **kwargs): + def __init__(self, if_missing='detokenize', newdoc=None, newpar=None, **kwargs): """Create the Sentences writer block. Parameters: @@ -18,9 +19,21 @@ def __init__(self, if_missing='detokenize', **kwargs): * `empty`: print an empty line * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` * `fatal`: raise an exception + newdoc: What to do if `root.newdoc` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) + newpar: What to do if `root.newpar` is not None? 
(default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) """ super().__init__(**kwargs) self.if_missing = if_missing + self.newdoc = newdoc + self.newpar = newpar def process_tree(self, tree): + if self.newdoc and tree.newdoc and tree.bundle.number > 1: + print() + if self.newpar and tree.newpar and tree.bundle.number > 1: + print() print(tree.get_sentence(self.if_missing)) diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. + + Parameters: + if_missing: What to do if `root.text` is `None`? (default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. + * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n
    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 3c9f7308..a8a7ab3d 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,8 +1,10 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys import colorama +import collections from termcolor import colored from udapi.core.basewriter import BaseWriter @@ -12,103 +14,174 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... # I don't see how to fix this while not making the code less readable or more difficult to use. # pylint: disable=R0902,R0912,R0913,R0914 + + class TextModeTrees(BaseWriter): - """An ASCII pretty printer of dependency trees. + r"""An ASCII pretty printer of dependency trees. + + .. code-block:: bash - SYNOPSIS - # from command line (visualize CoNLL-U files) - udapy write.TextModeTrees color=1 < file.conllu | less -R + # from the command line (visualize CoNLL-U files) + udapy write.TextModeTrees color=1 < file.conllu | less -R - # is scenario (examples of other parameters) - write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 - write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 + In scenario (examples of other parameters):: + + write.TextModeTrees indent=2 print_sent_id=0 print_sentence=1 layout=align + write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 - DESCRIPTION This block prints dependency trees in plain-text format. 
- For example the following CoNLL-U file (with tabs instead of spaces) - - 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ - 2 saw see VERB VBD Tense=Past 0 root _ _ - 3 a a DET DT Definite=Ind 4 det _ _ - 4 dog dog NOUN NN Number=Sing 2 dobj _ _ - 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No - 6 , , PUNCT , _ 2 punct _ _ - 7 which which DET WDT PronType=Rel 10 nsubj _ _ - 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ - 9 a a DET DT Definite=Ind 10 det _ _ - 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No - 11 . . PUNCT . _ 2 punct _ _ - - will be printed (with the default parameters) as - ─┮ - │ ╭─╼ I PRON nsubj - ╰─┾ saw VERB root - │ ╭─╼ a DET det - ├────────────────────────┾ dog NOUN dobj - ├─╼ today NOUN nmod:tmod │ - ├─╼ , PUNCT punct │ - │ │ ╭─╼ which DET nsubj - │ │ ├─╼ was VERB cop - │ │ ├─╼ a DET det - │ ╰─┶ boxer NOUN acl:relcl - ╰─╼ . PUNCT punct - - Some non-projective trees cannot be printed witout crossing edges. - TextModeTrees uses a special "bridge" symbol ─╪─ to mark this: - ─┮ - │ ╭─╼ 1 - ├─╪───┮ 2 - ╰─┶ 3 │ - ╰─╼ 4 - - By default parameter `color=auto`, so if the output is printed to the console + For example the following CoNLL-U file (with tabs instead of spaces):: + + 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ + 2 saw see VERB VBD Tense=Past 0 root _ _ + 3 a a DET DT Definite=Ind 4 det _ _ + 4 dog dog NOUN NN Number=Sing 2 dobj _ _ + 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No + 6 , , PUNCT , _ 2 punct _ _ + 7 which which DET WDT PronType=Rel 10 nsubj _ _ + 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ + 9 a a DET DT Definite=Ind 10 det _ _ + 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No + 11 . . PUNCT . 
_ 2 punct _ _ + + will be printed (with the default parameters plus hints=0) as:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ├────────────────────────┾ dog NOUN dobj + ├─╼ today NOUN nmod:tmod │ + ├─╼ , PUNCT punct │ + │ │ ╭─╼ which DET nsubj + │ │ ├─╼ was VERB cop + │ │ ├─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + With ``layout=compact``, the output will be (note the nodes "today" and ","):: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + With ``layout=align-words``, the output will be:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + And finally with ``layout=align``:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ┡───┾ dog NOUN dobj + ┡─╼ │ today NOUN nmod:tmod + ┡─╼ │ , PUNCT punct + │ │ ╭─╼ which DET nsubj + │ │ ┢─╼ was VERB cop + │ │ ┢─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct + + Some non-projective trees cannot be printed without crossing edges. + TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: + + ─┮ + │ ╭─╼ 1 + ├─╪───┮ 2 + ╰─┶ 3 │ + ╰─╼ 4 + + With ``color=auto`` (which is the default), if the output is printed to the console (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted - (by reveresing the background and foreground colors). + (by reversing the background and foreground colors). 
This block's method `process_tree` can be called on any node (not only root), - which is useful for printing subtrees using `node.print_subtree()`, + which is useful for printing subtrees using ``node.draw()``, which is internally implemented using this block. + For use in LaTeX, you can insert the output of this block (without colors) + into ``\begin{verbatim}...\end{verbatim}``, but you need to compile with pdflatex (xelatex not supported) + and you must add the following code into the preamble:: + + \usepackage{pmboxdraw} + \DeclareUnicodeCharacter{256D}{\textSFi} %╭ + \DeclareUnicodeCharacter{2570}{\textSFii} %╰ + SEE ALSO - `write.TextModeTreesHtml` + :py:class:`.TextModeTreesHtml` """ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color='auto', attributes='form,upos,deprel', - print_undef_as='', print_doc_meta=True, print_comments=False, - mark='ToDo|Bug|Mark', marked_only=False, **kwargs): + print_undef_as='_', print_doc_meta=True, print_comments=False, print_empty=True, + print_mwt=False, mark='(ToDo|ToDoOrigText|Bug|Mark)', marked_only=False, hints=True, + layout='classic', **kwargs): """Create new TextModeTrees block object. Args: - print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? - print_sentence: Print plain-text detokenized sentence on a line above each tree? - add_empty_line: Print an empty line after each tree? - indent: Number of characters to indent node depth in the tree for better readability. - minimize_cross: Minimize crossings of edges in non-projective trees? - Trees without crossings are subjectively more readable, but usually - in practice also "deeper", that is with higher maximal line length. - color: Print the node attribute with ANSI terminal colors? - Default = 'auto' which means that color output only if the output filehandle - is interactive (console). 
Each attribute is assigned a color (the mapping is - tested on black background terminals and can be changed only in source code). - If you plan to pipe the output (e.g. to "less -R") and you want the colors, - you need to set explicitly color=1, see the example in Synopsis. - attributes: A comma-separated list of node attributes which should be printed. Possible - values are ord, form, lemma, upos, xpos, feats, deprel, deps, misc. - print_undef_as: What should be printed instead of undefined attribute values (if any)? - print_doc_meta: Print `document.meta` metadata before each document? - print_comments: Print comments (other than sent_id and text)? - mark: a regex. If `re.search(mark, str(node.misc))` the node is highlighted. - If `print_comments and re.search(mark, root.comment)` the comment is highlighted. - Empty string means no highlighting. Default = 'ToDo|Bug|Mark'. - marked_only: print only trees containing one or more marked nodes/comments. Default=False. + print_sent_id: Print ID of the tree (its root, aka "sent_id") above each tree? + print_text: Print plain-text detokenized sentence on a line above each tree? + add_empty_line: Print an empty line after each tree? + indent: Number of characters to indent node depth in the tree for better readability. + minimize_cross: Minimize crossings of edges in non-projective trees? + Trees without crossings are subjectively more readable, but usually + in practice also "deeper", that is with higher maximal line length. + color: Print the node attribute with ANSI terminal colors? + Default = 'auto' which means that color output only if the output filehandle + is interactive (console). Each attribute is assigned a color (the mapping is + tested on black background terminals and can be changed only in source code). + If you plan to pipe the output (e.g. to "less -R") and you want the colors, + you need to set explicitly color=1, see the example in Synopsis. 
+ attributes: A comma-separated list of node attributes which should be printed. Possible + values are ``ord``, ``form``, ``lemma``, ``upos``, ``xpos``, ``feats``, ``deprel``, ``deps``, ``misc``. + print_undef_as: What should be printed instead of undefined attribute values (if any)? + print_doc_meta: Print ``document.meta`` metadata before each document? + print_comments: Print comments (other than ``sent_id`` and ``text``)? + print_empty: Print empty nodes? Default=True + print_mwt: Print multi-word tokens? Default=False + mark: A regex pattern. If ``re.search(mark + '=', str(node.misc))`` matches, the node is highlighted. + If ``print_comments`` and ``re.search(r'^ %s = ' % mark, root.comment, re.M)`` matches, + the comment is highlighted. Empty string means no highlighting. + Default = ``'(ToDo|ToDoOrigText|Bug|Mark)'``. + marked_only: Print only trees containing one or more marked nodes/comments. Default ``False``. + hints: Use thick-marked segments (┡ and ┢) to distinguish whether a given node precedes + or follows its parent. Default ``True``. If ``False``, plain ├ is used in both cases. 
+ layout: Tree layout style: + + - ``'classic'`` (default): shows word attributes immediately next to each node + - ``'compact'``: never print edges after (right to) words even in non-projectivities + - ``'align-words'``: like ``'compact'`` but all first attributes (forms by default) are aligned + - ``'align'``: like ``'align-words'`` but all attributes are aligned in columns """ super().__init__(**kwargs) self.print_sent_id = print_sent_id @@ -120,29 +193,38 @@ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, ind self.print_undef_as = print_undef_as self.print_doc_meta = print_doc_meta self.print_comments = print_comments + self.print_empty = print_empty + self.print_mwt = print_mwt self.mark = mark self.marked_only = marked_only + self.layout = layout # _draw[is_bottommost][is_topmost] line = '─' * indent self._horiz = line + '╼' self._draw = [[line + '┾', line + '┮'], [line + '┶', self._horiz]] - # _space[is_bottommost][is_topmost] + # _space[precedes_parent][is_topmost_or_bottommost] # _vert[is_crossing] space = ' ' * indent - self._space = [[space + '├', space + '╭'], [space + '╰']] + if hints: + self._space = [[space + '┡', space + '╰'], [space + '┢', space + '╭']] + else: + self._space = [[space + '├', space + '╰'], [space + '├', space + '╭']] self._vert = [space + '│', line + '╪'] self.attrs = attributes.split(',') - self.mark_re = re.compile(mark, re.S) if (mark is not None and mark != '') else None + self.mark_re, self.comment_mark_re = None, None + if mark is not None and mark != '': + self.mark_re = re.compile(mark + '=') + self.comment_mark_re = re.compile(r'^ %s = ' % mark, re.M) self._index_of = [] - self._gaps = [] + self._gaps = collections.Counter() self.lines = [] self.lengths = [] # We want to be able to call process_tree not only on root node, - # so this block can be called from node.print_subtree(**kwargs) + # so this block can be called from node.print_draw(**kwargs) # on any node and print its subtree. 
Thus, we cannot assume that # allnodes[idx].ord == idx. Instead of node.ord, we'll use index_of[node.ord], # which is its index within the printed subtree. @@ -157,74 +239,104 @@ def _compute_gaps(self, node): self._gaps[node.ord] = rmost - lmost - descs return lmost, rmost, descs + 1 - def should_print_tree(self, root): + def should_print_tree(self, root, allnodes): """Should this tree be printed?""" if not self.marked_only: return True - if any(self.is_marked(n) for n in root.descendants(add_self=1)): + if any(self.is_marked(n) for n in allnodes): return True if not self.print_comments or root.comment is None or self.mark_re is None: return False - return self.mark_re.search(root.comment) + return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" - allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root): + if self.print_empty: + if root.is_root() and not self.print_mwt: + allnodes = [root] + root.descendants_and_empty + else: + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + empty = [e for e in root._root.empty_nodes if e > allnodes[0] and e < allnodes[-1]] + allnodes.extend(empty) + allnodes.sort() + else: + allnodes = root.descendants(add_self=1, add_mwt=self.print_mwt) + if not force_print and not self.should_print_tree(root, allnodes): return - self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} + self._index_of = {allnodes[i].ord_range if allnodes[i].is_mwt() else allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) self.lengths = [0] * len(allnodes) # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = [0,] * (1 + len(root.root.descendants)) self._compute_gaps(root) # Precompute lines for printing - stack = [root,] + stack = [root, ] while stack: node = stack.pop() children = 
node.children(add_self=1) min_idx, max_idx = self._index_of[children[0].ord], self._index_of[children[-1].ord] - max_length = max([self.lengths[i] for i in range(min_idx, max_idx+1)]) - for idx in range(min_idx, max_idx+1): + max_length = max([self.lengths[i] for i in range(min_idx, max_idx + 1)]) + for idx in range(min_idx, max_idx + 1): idx_node = allnodes[idx] - filler = '─' if self._ends(idx, '─╭╰├╪') else ' ' + filler = '─' if self._ends(idx, '─╭╰╪┡┢') else ' ' self._add(idx, filler * (max_length - self.lengths[idx])) topmost = idx == min_idx botmost = idx == max_idx if idx_node is node: self._add(idx, self._draw[botmost][topmost]) - self.add_node(idx, node) + if self.layout == 'classic': + self.add_node(idx, node) else: - if idx_node.parent is not node: - self._add(idx, self._vert[self._ends(idx, '─╭╰├╪')]) + if idx_node.is_mwt() or idx_node.parent is not node: + self._add(idx, self._vert[self._ends(idx, '─╭╰╪┡┢')]) else: - self._add(idx, self._space[botmost][topmost]) + precedes_parent = idx < self._index_of[node.ord] + self._add(idx, self._space[precedes_parent][topmost or botmost]) if idx_node.is_leaf(): self._add(idx, self._horiz) - self.add_node(idx, idx_node) + if self.layout == 'classic': + self.add_node(idx, idx_node) else: stack.append(idx_node) # sorting the stack to minimize crossings of edges if self.minimize_cross: - stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) + stack.sort(key=lambda x: -self._gaps[x.ord]) + + if self.layout == 'classic': + for idx, node in enumerate(allnodes): + if node.is_empty() or node.is_mwt(): + self.add_node(idx, node) + else: + columns_attrs = [[a] for a in self.attrs] if self.layout == 'align' else [self.attrs] + for col_attrs in columns_attrs: + self.attrs = col_attrs + max_length = max(self.lengths) + for idx, node in enumerate(allnodes): + if self.layout.startswith('align'): + self._add(idx, ' ' * (max_length - self.lengths[idx])) + self.add_node(idx, node) + self.attrs = [a for sublist in columns_attrs 
for a in sublist] # Print headers (if required) and the tree itself + self.print_headers(root) + for line in self.lines: + print(line) + + if self.add_empty_line: + print('') + + def print_headers(self, root): + """Print sent_id, text and other comments related to the tree.""" if self.print_sent_id: print('# sent_id = ' + root.address()) if self.print_text: print("# text = " + (root.get_sentence() if root.is_root() else root.compute_text())) if self.print_comments and root.comment: print('#' + self.colorize_comment(root.comment.rstrip().replace('\n', '\n#'))) - for line in self.lines: - print(line) - - if self.add_empty_line: - print('') def _ends(self, idx, chars): return bool(self.lines[idx] and self.lines[idx][-1] in chars) @@ -238,11 +350,16 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. 
+ os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text @@ -250,14 +367,18 @@ def _add(self, idx, text): def add_node(self, idx, node): """Render a node with its attributes.""" - if not node.is_root(): + if node.is_mwt() or not node.is_root(): values = node.get_attrs(self.attrs, undefs=self.print_undef_as) self.lengths[idx] += 1 + len(' '.join(values)) + marked = self.is_marked(node) if self.color: - marked = self.is_marked(node) for i, attr in enumerate(self.attrs): values[i] = self.colorize_attr(attr, values[i], marked) - self.lines[idx] += ' ' + ' '.join(values) + if not self.color and marked: + self.lines[idx] += ' **' + ' '.join(values) + '**' + self.lengths[idx] += 4 + else: + self.lines[idx] += ' ' + ' '.join(values) def is_marked(self, node): """Should a given node be highlighted?""" diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index ecb0efb8..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -1,5 +1,5 @@ """An ASCII pretty printer of colored dependency trees in HTML.""" -from html import escape # pylint: disable=no-name-in-module +from html import escape # pylint: disable=no-name-in-module from udapi.block.write.textmodetrees import TextModeTrees @@ -15,6 +15,7 @@ mark {box-shadow: 0px 0px 0px 1px red; font-weight: bold;} ''' + class TextModeTreesHtml(TextModeTrees): """An ASCII pretty printer of colored dependency trees in HTML. @@ -25,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. 
""" - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -34,14 +35,20 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? + zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). + whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, # we need to skip this, but call BaseWriter's method which redirects stdout. - super(TextModeTrees, self).before_process_document(document) #pylint: disable=bad-super-call + # pylint: disable=bad-super-call + super(TextModeTrees, self).before_process_document(document) print('\n\n\n') print('' + self.title + '') print('