From 1ef23f857b8f8c9b7080b9c33d2dde56a14abf1f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 16:58:39 +0100 Subject: [PATCH 001/670] Reworked feature checking so that a similar block can be written for another language. --- udapi/block/ud/cs/markfeatsbugs.py | 37 ++------------- udapi/block/ud/markfeatsbugs.py | 75 ++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 32 deletions(-) create mode 100644 udapi/block/ud/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 11ecd6d9..3fb8d058 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -2,12 +2,15 @@ Block to identify missing or ill-valued features in Czech. Any bugs that it finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc """ -from udapi.core.block import Block +import udapi.block.ud.markfeatsbugs import logging import re -class MarkFeatsBugs(Block): +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): # The convention used in PDT is not consistent. Adjectives are fully disambiguated # (three genders, two animacies, three numbers, seven cases), even though some @@ -21,36 +24,6 @@ class MarkFeatsBugs(Block): # in the future. 
pdt20 = False # True = like in PDT 2.0; False = like in ČNK - def bug(self, node, bugstring): - bugs = [] - if node.misc['Bug']: - bugs = node.misc['Bug'].split('+') - if not bugstring in bugs: - bugs.append(bugstring) - node.misc['Bug'] = '+'.join(bugs) - - def check_allowed_features(self, node, allowed): - """ - We need a dictionary indexed by feature names that are allowed; for each - feature name, there is a list of allowed values. - """ - # Check for features that are not allowed but the node has them. - # For features that are allowed, check that their values are allowed. - for f in node.feats: - if f in allowed: - if not node.feats[f] in allowed[f]: - self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') - else: - self.bug(node, 'Feat' + f + 'NotAllowed') - - def check_required_features(self, node, required): - """ - We need a list of names of features whose values must not be empty. - """ - for f in required: - if not f in node.feats: - self.bug(node, 'Feat' + f + 'Missing') - def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py new file mode 100644 index 00000000..b24dcecb --- /dev/null +++ b/udapi/block/ud/markfeatsbugs.py @@ -0,0 +1,75 @@ +""" +Block to identify missing or ill-valued features in a treebank. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. This is a base block that only +implements service methods. A language-specific block must be derived from this +one and define the actual rules valid in that language. 
+ +Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +""" +from udapi.core.block import Block +import logging +import re + +class MarkFeatsBugs(Block): + + def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def check_allowed_features(self, node, allowed): + """ + We need a dictionary indexed by feature names that are allowed; for each + feature name, there is a list of allowed values. + """ + # Check for features that are not allowed but the node has them. + # For features that are allowed, check that their values are allowed. + for f in node.feats: + if f in allowed: + if not node.feats[f] in allowed[f]: + self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed') + else: + self.bug(node, 'Feat' + f + 'NotAllowed') + + def check_required_features(self, node, required): + """ + We need a list of names of features whose values must not be empty. + """ + for f in required: + if not f in node.feats: + self.bug(node, 'Feat' + f + 'Missing') + + def process_node(self, node): + """ + This is a generic block, do nothing here. 
In a language-specific block + based on this one, rules similar to the examples below can be specified: + + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + #... + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + """ + return From 5a836db5852a97a69b972646493024894a7d3ca4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:37:16 +0100 Subject: [PATCH 002/670] Added Latin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 608 +++++++++++++++++++++++++++++ 2 files changed, 609 insertions(+), 1 deletion(-) create mode 100644 udapi/block/ud/la/markfeatsbugs.py diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 3fb8d058..ef203033 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. 
Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py new file mode 100644 index 00000000..8741eabb --- /dev/null +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -0,0 +1,608 @@ +""" +Block to identify missing or ill-valued features in Latin. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. + +Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # NOUNS ################################################################ + if node.upos == 'NOUN': + self.check_required_features(node, ['Gender', 'Number', 'Case']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 
'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + # PROPER NOUNS ######################################################### + elif node.upos == 'PROPN': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Animacy']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + else: + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + if node.feats['Poss'] == 'Yes': # possessive adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Poss': ['Yes'], + 'Gender[psor]': ['Masc', 'Fem'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'Foreign': ['Yes']}) + elif 
node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Ord'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Foreign': ['Yes']}) + elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Act'], + 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 
'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + self.check_allowed_features(node, { + 'VerbForm': ['Part'], + 'Aspect': ['Imp', 'Perf'], + 'Voice': ['Pass'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Foreign': ['Yes']}) + else: # regular adjectives + if 
node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + else: + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony + self.check_adjective_like(node, ['PronType', 'Person'], { + 'PronType': ['Prs'], + 'Person': ['3'] + }) + else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně + # Mostly only two gender groups and no animacy: + # Masc,Neut ... jeho, jemu, jej, něm, jím + # Fem ... jí, ji, ní + # Neut ... je + # No gender in dual and plural: + # Plur ... 
jich, jim, je, nich, jimi + self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { + 'PronType': ['Prs'], + 'Person': ['3'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'] + }) + elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + # There is no Number. Někdo and nikdo behave like singular; + # kdo is by default singular as well but it also occurs as a subject + # of plural verbs. + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Gender': ['Masc'], + 'Animacy': ['Anim'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + # Although these pronouns behave by default as neuter singular, + # no Gender and Number is annotated. However, quite unusually, + # there is Animacy=Inan without Gender. + ###!!! This should probably be fixed in all Czech treebanks and + ###!!! in Interset. The pronoun should get Gender=Neut and no + ###!!! animacy. For now, let's at least make animacy an optional + ###!!! feature (I see that we already do not fill it in the Old + ###!!! Czech data). + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'Animacy': ['Inan'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + elif node.lemma == 'ješto': + # Unlike 'jenžto', this relative pronoun does not inflect, it + # always occurs in a nominative position, but the context can + # be any gender and number. 
+ self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Case': ['Nom'] + }) + elif re.match(r'^(jenž|jenžto)$', node.lemma): + # The relative pronouns 'jenž', 'jenžto' inflect for gender; + # while we normally take this as a sign of DET (instead of PRON), + # these can never act as real DET because they never modify a + # nominal. + # Similarly to the personal pronoun 'on', animacy is only + # annotated for masculine nominative plural, non-nominative + # forms are merged for masculine and neuter (jehož, jemuž), and + # non-singular gender is only annotated in nominative (while + # these cases are common for all genders: jichž, jimž, jimiž). + # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even + # in the nominative, although there is no prepositional counter- + # part (but similarly the locative has no prepositionless form). + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + # What remains is the relative pronoun 'an'. It behaves similarly + # to 'jenž' but it does not have the PrepCase feature and it + # only occurs in the nominative. 
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani + self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Plur'], + 'Case': ['Nom'] + }) + else: # not Masc Plur: an, ana, ano, any + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. + # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. + if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc,Neut'] + }) + elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. + # Congruent gender is annotated only in singular. Masculine and + # neuter are merged even in nominative. Feminine singular does + # not distinguish case in PDT but we need it in Old Czech at + # least for 'jejiej'. 
+ if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs', 'Rel'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + if node.feats['Reflex'] == 'Yes': + self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Reflex': ['Yes'] + }) + else: + self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2'], + 'Number[psor]': ['Sing', 'Plur'] + }) + else: + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. + if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + ###!!! Somehow the NumValue feature from PDT via Interset is useless. 
+ # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. + # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. + # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. + # 'pět' and more have Number=Plur, Case: pět, pěti. + if node.lemma == 'jeden': + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + elif re.match(r'^(dva|oba)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) + if self.pdt20: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'NumValue': ['1,2,3'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + # VERBS AND AUXILIARIES 
################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + # There is no voice. For some reason, PDT does not annotate that + # the infinitive form is active (while a passive infinitive is + # a combination of the infinitive with a passive participle). + self.check_required_features(node, ['Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + # Voice is optional. For some reason it is not annotated with + # imperatives (although passive imperatives are a combination + # of the active imperative and a passive participle). It is + # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. + if node.feats['Mood'] == 'Cnd': + self.check_required_features(node, ['Mood', 'Person']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person + }) + elif node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) + 'Number': ['Sing', 'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 
'Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist + }) + elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # converb + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. 
+ self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_required_features(node, ['AdpType', 'Case']) + self.check_allowed_features(node, { + 'AdpType': ['Prep', 'Voc'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) + + def check_adjective_like(self, node, r0, a0): + """ + Long form of adjectives, pronouns and determiners mostly share declension + paradigms and thus the sets of features that are expected. Whether the + actual feature sets are the same depends on the tagging convention (PDT + vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are + not; in ČNK, both adjectives and pronouns (incl. determiners) are fully + disambiguated. This method defines the core inflectional features while + any extras (such as PronType for pronouns) have to be provided by the + caller in parameters r0 (list) and a0 (dict). + """ + required_features = [] + allowed_featurs = {} + full_set = node.upos == 'ADJ' or not self.pdt20 + if full_set: + # Even in the full set, animacy is only distinguished for the + # masculine gender. 
+ if node.feats['Gender'] == 'Masc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Fem', 'Neut'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + else: + # Gender is annotated in all cases in singular (ten, ta, to) + # but only in nominative, accusative, and vocative in plural + # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished + # in plural if gender is distinguished and it is masculine; in + # singular it is distinguished only in accusative (toho, ten). + # Other cases in plural are gender-less (těch, těm, těmi). + # Note that this is not consistent with adjectives, where we + # disambiguate gender in all cases in plural. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': + required_features = ['Gender', 'Animacy', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing'], + 'Case': ['Acc'] + } + else: + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + } + elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): + required_features = ['Gender', 'Number', 'Case'] + allowed_features = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Acc', 'Voc'] + } + else: + required_features = ['Number', 'Case'] + allowed_features = { + 'Number': ['Dual', 'Plur'], + 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] + } + required_features = r0 + required_features + a0.update(allowed_features) + allowed_features = a0 + self.check_required_features(node, required_features) + self.check_allowed_features(node, allowed_features) From 0f167c2a64adcb98a61740c348e3b7579502005d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:47:54 +0100 Subject: [PATCH 003/670] Removed Czech-specific rules from Latin block. For a start, the Latin rules check NOUNs and PROPNs only. --- udapi/block/ud/la/markfeatsbugs.py | 598 +---------------------------- 1 file changed, 11 insertions(+), 587 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 8741eabb..4cf6c1b3 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -16,593 +16,17 @@ def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': self.check_required_features(node, ['Gender', 'Number', 'Case']) - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - else: - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 
'Loc', 'Ins'], - 'Foreign': ['Yes']}) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Foreign': ['Yes']}) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Animacy']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) - else: - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) - # ADJECTIVES ########################################################### - elif node.upos == 'ADJ': - if node.feats['Poss'] == 'Yes': # possessive adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'Poss': ['Yes'], - 'Gender[psor]': ['Masc', 'Fem'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 
'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names - 'Foreign': ['Yes']}) - elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Ord'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Ord'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Foreign': ['Yes']}) - elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives - self.check_required_features(node, ['VerbForm', 'Voice']) - if node.feats['Voice'] == 'Act': # active participles have tense, passives don't - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Act'], - 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 
'Voice': ['Act'], - 'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'boudoucí' - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) - self.check_allowed_features(node, { - 'VerbForm': ['Part'], - 'Aspect': ['Imp', 'Perf'], - 'Voice': ['Pass'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': 
['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: # regular adjectives - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) - # PRONOUNS ############################################################# - elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) - if node.feats['PronType'] == 'Prs': - if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], - 'Variant': ['Short'] - }) - else: # not reflexive - if node.feats['Person'] == '3': # on, ona, ono, oni, ony - if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony - self.check_adjective_like(node, ['PronType', 'Person'], { - 'PronType': ['Prs'], - 'Person': ['3'] - }) - else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně - # Mostly only two gender groups and no animacy: - # Masc,Neut ... jeho, jemu, jej, něm, jím - # Fem ... jí, ji, ní - # Neut ... je - # No gender in dual and plural: - # Plur ... 
jich, jim, je, nich, jimi - self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], { - 'PronType': ['Prs'], - 'Person': ['3'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: # 1st and 2nd person do not have gender: já, ty - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Variant': ['Short'] - }) - elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo - # There is no Number. Někdo and nikdo behave like singular; - # kdo is by default singular as well but it also occurs as a subject - # of plural verbs. - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Gender': ['Masc'], - 'Animacy': ['Anim'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - elif re.match(r'^(co|což|něco|nicož)$', node.lemma): - # Although these pronouns behave by default as neuter singular, - # no Gender and Number is annotated. However, quite unusually, - # there is Animacy=Inan without Gender. - ###!!! This should probably be fixed in all Czech treebanks and - ###!!! in Interset. The pronoun should get Gender=Neut and no - ###!!! animacy. For now, let's at least make animacy an optional - ###!!! feature (I see that we already do not fill it in the Old - ###!!! Czech data). - self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], - 'Animacy': ['Inan'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - elif node.lemma == 'ješto': - # Unlike 'jenžto', this relative pronoun does not inflect, it - # always occurs in a nominative position, but the context can - # be any gender and number. 
- self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Case': ['Nom'] - }) - elif re.match(r'^(jenž|jenžto)$', node.lemma): - # The relative pronouns 'jenž', 'jenžto' inflect for gender; - # while we normally take this as a sign of DET (instead of PRON), - # these can never act as real DET because they never modify a - # nominal. - # Similarly to the personal pronoun 'on', animacy is only - # annotated for masculine nominative plural, non-nominative - # forms are merged for masculine and neuter (jehož, jemuž), and - # non-singular gender is only annotated in nominative (while - # these cases are common for all genders: jichž, jimž, jimiž). - # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even - # in the nominative, although there is no prepositional counter- - # part (but similarly the locative has no prepositionless form). - self.check_adjective_like(node, ['PronType', 'PrepCase'], { - 'PronType': ['Rel'], - 'PrepCase': ['Npr', 'Pre'] - }) - else: - # What remains is the relative pronoun 'an'. It behaves similarly - # to 'jenž' but it does not have the PrepCase feature and it - # only occurs in the nominative. 
- if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani - self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Plur'], - 'Case': ['Nom'] - }) - else: # not Masc Plur: an, ana, ano, any - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Rel'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom'] - }) - # DETERMINERS ########################################################## - elif node.upos == 'DET': - # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. - # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. - if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut'] - }) - elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): - # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - # Congruent gender is annotated only in singular. Masculine and - # neuter are merged even in nominative. Feminine singular does - # not distinguish case in PDT but we need it in Old Czech at - # least for 'jejiej'. 
- if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], - 'Poss': ['Yes'], - 'Person': ['3'], - 'Number[psor]': ['Sing'], - 'Gender[psor]': ['Fem'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - if node.feats['Reflex'] == 'Yes': - self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Reflex': ['Yes'] - }) - else: - self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2'], - 'Number[psor]': ['Sing', 'Plur'] - }) - else: - self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp']}) - # NUMERALS ############################################################# - elif node.upos == 'NUM': - self.check_required_features(node, ['NumType', 'NumForm']) - # Arabic digits and Roman numerals do not have inflection features. - if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] - }) - else: - ###!!! Somehow the NumValue feature from PDT via Interset is useless. 
- # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi. - # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. - # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. - # 'pět' and more have Number=Plur, Case: pět, pěti. - if node.lemma == 'jeden': - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - elif re.match(r'^(dva|oba)$', node.lemma): - self.check_required_features(node, ['NumType', 'NumForm', 'NumValue', 'Gender', 'Number', 'Case']) - if self.pdt20: - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'], - 'NumValue': ['1,2,3'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - }) - # VERBS AND AUXILIARIES 
################################################ - elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': - # There is no voice. For some reason, PDT does not annotate that - # the infinitive form is active (while a passive infinitive is - # a combination of the infinitive with a passive participle). - self.check_required_features(node, ['Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Inf'], - 'Polarity': ['Pos', 'Neg'] - }) - elif node.feats['VerbForm'] == 'Fin': - # Voice is optional. For some reason it is not annotated with - # imperatives (although passive imperatives are a combination - # of the active imperative and a passive participle). It is - # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. - if node.feats['Mood'] == 'Cnd': - self.check_required_features(node, ['Mood', 'Person']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Cnd'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person - }) - elif node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) - 'Number': ['Sing', 'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # indicative - self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Voice': ['Act'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 
'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short', 'Long'] # distinguishes sigmatic (Long) and asigmatic (Short) aorist - }) - elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Polarity': ['Pos', 'Neg'] - }) - else: - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Fem', 'Neut'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # converb - self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Conv'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy - 'Polarity': ['Pos', 'Neg'] - }) - # ADVERBS ############################################################## - elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] - }) - elif node.feats['Degree'] != '': - # Adverbs that are compared can also be negated. 
- self.check_required_features(node, ['Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] - }) - else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) - # ADPOSITIONS ########################################################## - elif node.upos == 'ADP': - self.check_required_features(node, ['AdpType', 'Case']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'AdpType': ['Prep', 'Voc'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] - }) - # THE REST: NO FEATURES ################################################ - else: - self.check_allowed_features(node, {}) - - def check_adjective_like(self, node, r0, a0): - """ - Long form of adjectives, pronouns and determiners mostly share declension - paradigms and thus the sets of features that are expected. Whether the - actual feature sets are the same depends on the tagging convention (PDT - vs. ČNK): in PDT, adjectives are fully disambiguated while pronouns are - not; in ČNK, both adjectives and pronouns (incl. determiners) are fully - disambiguated. This method defines the core inflectional features while - any extras (such as PronType for pronouns) have to be provided by the - caller in parameters r0 (list) and a0 (dict). - """ - required_features = [] - allowed_featurs = {} - full_set = node.upos == 'ADJ' or not self.pdt20 - if full_set: - # Even in the full set, animacy is only distinguished for the - # masculine gender. 
- if node.feats['Gender'] == 'Masc': - required_features = ['Gender', 'Animacy', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - else: - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - else: - # Gender is annotated in all cases in singular (ten, ta, to) - # but only in nominative, accusative, and vocative in plural - # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished - # in plural if gender is distinguished and it is masculine; in - # singular it is distinguished only in accusative (toho, ten). - # Other cases in plural are gender-less (těch, těm, těmi). - # Note that this is not consistent with adjectives, where we - # disambiguate gender in all cases in plural. - if node.feats['Number'] == 'Sing': - if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc': - required_features = ['Gender', 'Animacy', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing'], - 'Case': ['Acc'] - } - else: - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 
'vaše' in singular - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] - } - elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']): - required_features = ['Gender', 'Number', 'Case'] - allowed_features = { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Dual', 'Plur'], - 'Case': ['Nom', 'Acc', 'Voc'] - } - else: - required_features = ['Number', 'Case'] - allowed_features = { - 'Number': ['Dual', 'Plur'], - 'Case': ['Gen', 'Dat', 'Loc', 'Ins'] - } - required_features = r0 + required_features - a0.update(allowed_features) - allowed_features = a0 - self.check_required_features(node, required_features) - self.check_allowed_features(node, allowed_features) + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'NameType': ['Giv', 'Sur', 'Geo'], + 'Foreign': ['Yes']}) From 606515a088cc9779b3fef46795a0c4a6bb1f6613 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 17:51:05 +0100 Subject: [PATCH 004/670] Usage layout=compact. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 2 +- udapi/block/ud/markfeatsbugs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index ef203033..309e7ac8 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -3,7 +3,7 @@ finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. 
-Usage: cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 4cf6c1b3..8aea567f 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -3,7 +3,7 @@ finds will be saved in the MISC column as a Bug attribute, which can be later used in filters and highlighted in text output. -Usage: cat *.conllu | udapy -HAM ud.la.MarkFeatsBugs > bugs.html +Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py index b24dcecb..1bb8188b 100644 --- a/udapi/block/ud/markfeatsbugs.py +++ b/udapi/block/ud/markfeatsbugs.py @@ -5,7 +5,7 @@ implements service methods. A language-specific block must be derived from this one and define the actual rules valid in that language. -Usage (Czech example): cat *.conllu | udapy -HAM ud.cs.MarkFeatsBugs > bugs.html +Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html """ from udapi.core.block import Block import logging From 204da3bbb4bfa59c085c4c05a6bc8be2e134e27d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 21:05:39 +0100 Subject: [PATCH 005/670] More rules for Latin features (cloned from Czech). 
--- udapi/block/ud/la/markfeatsbugs.py | 148 +++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 8aea567f..96c7b682 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -30,3 +30,151 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'NameType': ['Giv', 'Sur', 'Geo'], 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree']) + self.check_allowed_features(node, { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # on, ona, ono, oni, ony + self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + else: # 1st and 2nd person do not have gender: já, ty + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] 
+ }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2', '3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + else: + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. 
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + }) + # VERBS AND AUXILIARIES ################################################ + elif re.match(r'^(VERB|AUX)$', node.upos): + self.check_required_features(node, ['Aspect', 'VerbForm']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Voice': ['Act'], + 'Person': ['1', '2', '3'], + 'Number': ['Sing', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # verbal noun + 
self.check_required_features(node, ['Tense', 'Number', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Sing', 'Plur'], + 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From 98db11584577be72a5748c8c81cb4030348270c0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 22:32:50 +0100 Subject: [PATCH 006/670] Added feature rules for Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 191 +++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 udapi/block/ud/ml/markfeatsbugs.py diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py new file mode 100644 index 00000000..a46580d1 --- /dev/null +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -0,0 +1,191 @@ +""" +Block to identify missing or ill-valued features in Malayalam. Any bugs that it +finds will be saved in the MISC column as a Bug attribute, which can be later +used in filters and highlighted in text output. 
+ +Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html +Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +""" +import udapi.block.ud.markfeatsbugs +import logging +import re + +class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + + def process_node(self, node): + # NOUNS AND PROPER NOUNS ############################################### + if re.match(r'^(NOUN|PROPN)$', node.upos): + self.check_required_features(node, ['Animacy', 'Number', 'Case']) + self.check_allowed_features(node, { + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Foreign': ['Yes']}) + # ADJECTIVES ########################################################### + elif node.upos == 'ADJ': + self.check_allowed_features(node, { + 'Foreign': ['Yes']}) + # PRONOUNS ############################################################# + elif node.upos == 'PRON': + self.check_required_features(node, ['PronType']) + if node.feats['PronType'] == 'Prs': + if node.feats['Reflex'] == 'Yes': + self.check_required_features(node, ['PronType', 'Reflex', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Reflex': ['Yes'], + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # not reflexive + if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # plural pronouns do not distinguish gender + 
self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['3'], + 'Deixis': ['Prox', 'Remt'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī + self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # DETERMINERS ########################################################## + elif node.upos == 'DET': + if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['1', '2', '3'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + else: + self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # NUMERALS ############################################################# + elif node.upos == 'NUM': + self.check_required_features(node, ['NumType', 'NumForm']) + # Arabic digits and Roman numerals do not have inflection features. 
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Digit', 'Roman'] + }) + else: + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'], + 'Number': ['Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) + # VERBS ################################################################ + elif node.upos == 'VERB': + self.check_required_features(node, ['VerbForm', 'Voice']) + if node.feats['VerbForm'] == 'Inf': + self.check_allowed_features(node, { + 'VerbForm': ['Inf'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: + self.check_required_features(node, ['Mood', 'Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Nec'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + elif node.feats['VerbForm'] == 'Part': + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Part'], + 'Tense': ['Past'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'] + }) + else: # verbal noun + self.check_required_features(node, ['Tense', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 
'Cau'], + }) + # AUXILIARIES ########################################################## + elif node.upos == 'AUX': + self.check_required_features(node, ['VerbForm']) + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'] + }) + # ADVERBS ############################################################## + elif node.upos == 'ADV': + if node.feats['PronType'] != '': + # Pronominal adverbs are neither compared nor negated. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) + # PARTICLES ############################################################ + elif node.upos == 'PART': + self.check_allowed_features(node, { + 'Polarity': ['Neg'] + }) + # THE REST: NO FEATURES ################################################ + else: + self.check_allowed_features(node, {}) From 84965b94e618f1f2b5fb2cdc3a46fca4dc897c5e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 24 Nov 2022 23:44:42 +0100 Subject: [PATCH 007/670] Non-personal pronouns in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index a46580d1..fc25eccb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -65,6 +65,12 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) + else: # not personal + self.check_required_features(node, ['PronType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + }) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' From cce7db13d41deeba166ff7a766ae58c6a4fb3db0 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 11:41:50 +0100 Subject: [PATCH 008/670] Malayalam determiners have fewer features than pronouns. 
--- udapi/block/ud/ml/markfeatsbugs.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index fc25eccb..41d4cf09 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -73,23 +73,16 @@ def process_node(self, node): }) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) + if node.feats['PronType'] == 'Art': + self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Art'], + 'Definite': ['Ind'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) + self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From 8a9435f4b3a510dc1b2f6f34c98ee9f5e9e80b5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 25 Nov 2022 15:09:34 +0100 Subject: [PATCH 009/670] Added Chinese lemmatization. 
--- udapi/block/ud/zh/lemmatize.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 udapi/block/ud/zh/lemmatize.py diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py new file mode 100644 index 00000000..7db798a0 --- /dev/null +++ b/udapi/block/ud/zh/lemmatize.py @@ -0,0 +1,34 @@ +"""Block to add missing lemmas in cases where it seems obvious what the lemma should be.""" +from udapi.core.block import Block +import logging +import re + +class Lemmatize(Block): + + # dictionary: form --> lemma + lemma = { + # The plural suffix -men. + '我們': '我', # trad + '我们': '我', # simp + '他們': '他', # trad + '他们': '他', # simp + '它們': '它', # trad + '它们': '它', # simp + '牠們': '牠', # trad + '她們': '她', # trad + '她们': '她', # simp + '人們': '人', # trad + '人们': '人' # simp + } + + def process_node(self, node): + """ + Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization + of Sino-Tibetan languages is pretty straightforward most of the time, + as the lemma typically equals to the actual word form. + """ + if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From 72f045ef84ea000f403d210301d33d1acf3f7018 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 12:46:54 +0100 Subject: [PATCH 010/670] Enable rewriting of lemmas in Chinese. --- udapi/block/ud/zh/lemmatize.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 7db798a0..2b7a2dc5 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -5,6 +5,20 @@ class Lemmatize(Block): + def __init__(self, rewrite='empty', **kwargs): + """ + Create the ud.zh.Lemmatize block instance. 
+ + Args: + rewrite=empty: set the lemma if it was empty so far; do not touch the rest + rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest + rewrite=all: set the lemma regardless of what it was previously + """ + super().__init__(**kwargs) + if not re.match(r'^(empty|form|all)$', rewrite): + raise ValueError("Unexpected value of parameter 'rewrite'") + self.rewrite = rewrite + # dictionary: form --> lemma lemma = { # The plural suffix -men. @@ -27,8 +41,11 @@ def process_node(self, node): of Sino-Tibetan languages is pretty straightforward most of the time, as the lemma typically equals to the actual word form. """ - if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes': - if node.form in self.lemma: - node.lemma = self.lemma[node.form] - else: - node.lemma = node.form + if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): + return + if node.form in self.lemma: + node.lemma = self.lemma[node.form] + else: + node.lemma = node.form From b4dd844870291532089ce518bb0ad4d1f562d92a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 14:59:31 +0100 Subject: [PATCH 011/670] Use lemmatization to make copulas acceptable. 
--- udapi/block/ud/zh/lemmatize.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 2b7a2dc5..9b4c7cba 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -45,7 +45,13 @@ def process_node(self, node): return elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return - if node.form in self.lemma: + # Verbs that are derived from the copula and tagged as the copula need + # to have the lemma of the copula (是 shì). + if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos): + node.lemma = '是' + if node.form == '不是': + node.feats['Polarity'] = 'Neg' + elif node.form in self.lemma: node.lemma = self.lemma[node.form] else: node.lemma = node.form From 3bea246947ce88825cc15f690e0de744b85c37ee Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 26 Nov 2022 23:40:44 +0100 Subject: [PATCH 012/670] Another Chinese copula. --- udapi/block/ud/zh/lemmatize.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 9b4c7cba..298f3501 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -46,9 +46,18 @@ def process_node(self, node): elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return # Verbs that are derived from the copula and tagged as the copula need - # to have the lemma of the copula (是 shì). - if re.search(r'是', node.form) and re.match(r'^(AUX|VERB)$', node.upos): - node.lemma = '是' + # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
+ # 亦為 亦为 Yì wèi také + # 則為 则为 Zé wèi potom + # 更為 更为 Gèng wèi více + # 認為 认为 Rènwéi myslet, věřit + # 以為 以为 Yǐwéi myslet, věřit + # 以爲 以为 Yǐwéi myslet, věřit + m = re.search(r'([是爲為为])', node.form) + if m and re.match(r'^(AUX|VERB)$', node.upos): + node.lemma = m.group(1) + if node.lemma == '爲': + node.lemma = '為' if node.form == '不是': node.feats['Polarity'] = 'Neg' elif node.form in self.lemma: From d9af327a10bc816334d6e0514f636206dfb44c9f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 27 Nov 2022 01:44:04 +0100 Subject: [PATCH 013/670] readers' parameter merge=1 so e.g. `udapy read.Conllu files=a.connlu,b.conllu merge=1` merges the two files into one document and should be equivalent to `cat a.conllu b.conllu | udapy read.Conllu from=-`. --- udapi/core/basereader.py | 82 +++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 53a1129c..a3b334da 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,7 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -28,6 +28,7 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id + self.merge = merge # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -111,43 +112,48 @@ def try_fast_load(self, document): if filehandle is None: self.finished = True return True - try: - trees = self.read_trees() - except NotImplementedError: - return False - document.meta['loaded_from'] = self.filename - document.meta['global.Entity'] = self._global_entity - if trees and trees[0].newdoc and trees[0].newdoc is not True: - document.meta["docname"] = trees[0].newdoc - - bundle, last_bundle_id = None, '' - for root in trees: - add_to_the_last_bundle = False - - if self.ignore_sent_id: - root._sent_id = None - elif root._sent_id is not None: - parts = root._sent_id.split('/', 1) - bundle_id = parts[0] - if len(parts) == 2: - root.zone = parts[1] - add_to_the_last_bundle = bundle_id == last_bundle_id - last_bundle_id = bundle_id - if self.zone != 'keep': - root.zone = self.zone - - # assign new/next bundle to `bundle` if needed - if not bundle or not add_to_the_last_bundle: - bundle = document.create_bundle() - if last_bundle_id != '': - bundle.bundle_id = last_bundle_id - - bundle.add_tree(root) - - self.next_filehandle() - if self.filehandle is None: - self.finished = True + while True: + try: + trees = self.read_trees() + except NotImplementedError: + return False + + document.meta['loaded_from'] = self.filename + document.meta['global.Entity'] = self._global_entity + if trees and trees[0].newdoc and trees[0].newdoc is not True: + document.meta["docname"] = trees[0].newdoc + + bundle, last_bundle_id = None, '' + for root in trees: + add_to_the_last_bundle = False + + if self.ignore_sent_id: + root._sent_id = None + elif root._sent_id is not None: + parts = root._sent_id.split('/', 1) + bundle_id = parts[0] + if len(parts) == 2: + root.zone = parts[1] + add_to_the_last_bundle = bundle_id == last_bundle_id + last_bundle_id = bundle_id + if self.zone != 'keep': + root.zone = self.zone + + # assign new/next bundle to `bundle` if needed + if not 
bundle or not add_to_the_last_bundle: + bundle = document.create_bundle() + if last_bundle_id != '': + bundle.bundle_id = last_bundle_id + + bundle.add_tree(root) + + self.next_filehandle() + if self.filehandle is None: + self.finished = True + return True + if not self.merge: + return True return True # pylint: disable=too-many-branches,too-many-statements @@ -190,7 +196,7 @@ def process_document(self, document): while True: root = self.filtered_read_tree() if root is None: - if trees_loaded == 0 and self.files.has_next_file(): + if (trees_loaded == 0 or self.merge) and self.files.has_next_file(): filehandle = self.next_filehandle() continue self.finished = not self.files.has_next_file() From e148621de92ea26550634d4972b6e0093660a103 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:11:40 +0100 Subject: [PATCH 014/670] Lemmatization of negated verbs in Chinese. --- udapi/block/ud/zh/lemmatize.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 298f3501..75d62716 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -45,6 +45,9 @@ def process_node(self, node): return elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'): return + # Lemmatize negated verbs to their affirmative forms. + # 不是 bùshì = not be + # 没有 méiyǒu = not exist # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). 
# 亦為 亦为 Yì wèi také @@ -53,13 +56,16 @@ def process_node(self, node): # 認為 认为 Rènwéi myslet, věřit # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit - m = re.search(r'([是爲為为])', node.form) - if m and re.match(r'^(AUX|VERB)$', node.upos): - node.lemma = m.group(1) - if node.lemma == '爲': - node.lemma = '為' - if node.form == '不是': + if re.match(r'^(AUX|VERB)$', node.upos): + m1 = re.match(r'^(不|没)(.+)$', node.form) + m2 = re.search(r'([是爲為为])', node.form) + if m1: + node.lemma = m1.group(1) node.feats['Polarity'] = 'Neg' + elif m2: + node.lemma = m2.group(1) + if node.lemma == '爲': + node.lemma = '為' elif node.form in self.lemma: node.lemma = self.lemma[node.form] else: From 40f224a9e9554d9573d9059fb7aea16ea20a731a Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:26:13 +0100 Subject: [PATCH 015/670] Oops! Wrong part! --- udapi/block/ud/zh/lemmatize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 75d62716..7658d9b4 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -60,7 +60,7 @@ def process_node(self, node): m1 = re.match(r'^(不|没)(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: - node.lemma = m1.group(1) + node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) From 0e0d53905e40848c0e7a11e4d87aa3715a93ee33 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:35:42 +0100 Subject: [PATCH 016/670] =?UTF-8?q?Another=20negation=20pattern:=20?= =?UTF-8?q?=E6=9C=AA.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 7658d9b4..9c492800 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, 
node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). # 亦為 亦为 Yì wèi také @@ -57,7 +58,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^(不|没)(.+)$', node.form) + m1 = re.match(r'^([不没未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 74445e4722de55a7e9714642def6fd09a1d5e2ae Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 17:50:50 +0100 Subject: [PATCH 017/670] Another negation pattern. --- udapi/block/ud/zh/lemmatize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 9c492800..436c3587 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -48,6 +48,7 @@ def process_node(self, node): # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be # 没有 méiyǒu = not exist + # 沒能 méinéng = cannot # 未能 wèinéng = cannot # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). @@ -58,7 +59,7 @@ def process_node(self, node): # 以為 以为 Yǐwéi myslet, věřit # 以爲 以为 Yǐwéi myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): - m1 = re.match(r'^([不没未])(.+)$', node.form) + m1 = re.match(r'^([不没沒未])(.+)$', node.form) m2 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) From 13088ed765f05f6f684595863a81a783e8ceafbb Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 27 Nov 2022 18:18:26 +0100 Subject: [PATCH 018/670] Lemmatization of interrogative verbs in Chinese. 
--- udapi/block/ud/zh/lemmatize.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py index 436c3587..abacf29f 100644 --- a/udapi/block/ud/zh/lemmatize.py +++ b/udapi/block/ud/zh/lemmatize.py @@ -47,25 +47,32 @@ def process_node(self, node): return # Lemmatize negated verbs to their affirmative forms. # 不是 bùshì = not be - # 没有 méiyǒu = not exist - # 沒能 méinéng = cannot + # 沒有 没有 méiyǒu = not exist + # 沒能 没能 méinéng = cannot # 未能 wèinéng = cannot + # Lemmatize question verbs to their base forms. + # 要不要 yàobùyào = do (you) want? + # 有没有 yǒuméiyǒu = do (you) have? # Verbs that are derived from the copula and tagged as the copula need # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi). - # 亦為 亦为 Yì wèi také - # 則為 则为 Zé wèi potom - # 更為 更为 Gèng wèi více - # 認為 认为 Rènwéi myslet, věřit - # 以為 以为 Yǐwéi myslet, věřit - # 以爲 以为 Yǐwéi myslet, věřit + # 亦為 亦为 yìwèi = také + # 則為 则为 zéwèi = potom + # 更為 更为 gèngwèi = více + # 認為 认为 rènwéi = myslet, věřit + # 以為 以为 yǐwéi = myslet, věřit + # 以爲 以为 yǐwéi = myslet, věřit if re.match(r'^(AUX|VERB)$', node.upos): m1 = re.match(r'^([不没沒未])(.+)$', node.form) - m2 = re.search(r'([是爲為为])', node.form) + m2 = re.match(r'^(.+)([不没沒未])\1$', node.form) + m3 = re.search(r'([是爲為为])', node.form) if m1: node.lemma = m1.group(2) node.feats['Polarity'] = 'Neg' elif m2: node.lemma = m2.group(1) + node.feats['Mood'] = 'Int' + elif m3: + node.lemma = m3.group(1) if node.lemma == '爲': node.lemma = '為' elif node.form in self.lemma: From 64f5bc7427efd3c32a84229e2c2c901b545118b4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 1 Dec 2022 14:26:38 +0100 Subject: [PATCH 019/670] print also number of documents and paragraphs if any, based on newdoc and newpar annotations --- udapi/block/util/wc.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index 
137c95e9..e8ea2676 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -13,6 +13,7 @@ def __init__(self, tsv=False, **kwargs): """ super().__init__(**kwargs) self.trees, self.words, self.mwts, self.tokens, self.empty = 0, 0, 0, 0, 0 + self.docs, self.paragraphs = 0, 0 self.tsv = tsv def process_tree(self, tree): @@ -22,13 +23,21 @@ def process_tree(self, tree): self.mwts += mwtoks self.tokens += len(tree.token_descendants) if mwtoks else len(tree.descendants) self.empty += len(tree.empty_nodes) + if tree.newdoc: + self.docs += 1 + if tree.newpar: + self.paragraphs += 1 def process_end(self): if self.tsv: - print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty)))) + print('\t'.join(map(str, (self.trees, self.words, self.tokens, self.mwts, self.empty, self.docs, self.paragraphs)))) else: print('%8d trees\n%8d words' % (self.trees, self.words)) if self.mwts: print('%8d multi-word tokens\n%8d tokens' % (self.mwts, self.tokens)) if self.empty: print('%8d empty nodes' % self.empty) + if self.docs: + print('%8d documents' % self.docs) + if self.paragraphs: + print('%8d paragraphs' % self.paragraphs) From c29590fefe4a045c8c33c0e8729c3a2582d1cf5f Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 2 Dec 2022 23:18:53 +0100 Subject: [PATCH 020/670] Enable separate checking of Flavio's approach to Latin morphology. --- udapi/block/ud/la/markfeatsbugs.py | 141 +++++++++++++++-------------- 1 file changed, 75 insertions(+), 66 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 96c7b682..149fcd18 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -12,15 +12,32 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): + def __init__(self, flavio=False, **kwargs): + """ + Create the ud.la.MarkFeatsBugs block instance. + + Args: + flavio=1: Accept features as defined by Flavio for treebanks he + maintains. 
By default, a more conservative set of features and + values is expected. + """ + super().__init__(**kwargs) + self.flavio = flavio + def process_node(self, node): # NOUNS ################################################################ if node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes']} + if self.flavio: + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': self.check_required_features(node, ['Gender', 'Number', 'Case']) @@ -32,13 +49,20 @@ def process_node(self, node): 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree']) - self.check_allowed_features(node, { + rf = ['Gender', 'Number', 'Case', 'Degree'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Foreign': ['Yes']}) + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio does not use Degree=Pos, hence Degree is not required. 
+ rf = [f for f in rf if f != 'Degree'] + rf.append('InflClass') + af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': self.check_required_features(node, ['PronType']) @@ -81,13 +105,19 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] }) else: - self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if self.flavio: + rf.append('InflClass') + af['PronType'].append('Con') + af['InflClass'] = ['LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -98,73 +128,52 @@ def process_node(self, node): 'NumForm': ['Digit', 'Roman'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm']) self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Word'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'NumForm': ['Word'] }) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Inf'], - 
'Polarity': ['Pos', 'Neg'] - }) - elif node.feats['VerbForm'] == 'Fin': - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Voice': ['Act'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Plur'], - 'Polarity': ['Pos', 'Neg'] - }) + rf = ['Aspect', 'VerbForm'] + af = { + 'Aspect': ['Imp', 'Perf', 'Prosp'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], + 'Polarity': ['Pos', 'Neg']} + if node.feats['VerbForm'] == 'Fin': + rf.extend(['Mood', 'Person', 'Number']) + af['Mood'] = ['Ind', 'Sub', 'Imp'] + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive + rf.extend(['Voice', 'Tense']) + af['Voice'] = ['Act', 'Pass'] + af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Part'], - 'Tense': ['Past'], - 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + af['Tense'] = ['Past'] + af['Voice'] = ['Act'] + af['Number'] = ['Sing', 'Plur'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] else: # verbal noun - self.check_required_features(node, 
['Tense', 'Number', 'Voice']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prosp'], - 'VerbForm': ['Vnoun'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular - 'Polarity': ['Pos', 'Neg'] - }) + rf.extend(['Tense', 'Voice']) + af['Tense'] = ['Past', 'Pres'] + af['Voice'] = ['Act'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + if self.flavio: + # Flavio has killed Tense in his treebanks. + rf = [f for f in rf if f != 'Tense'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'AdvType': ['Loc'] }) else: # The remaining adverbs are neither pronominal, nor compared or From 8b05a49741481d20cf4b0b4ec41bf92b4a696701 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 3 Dec 2022 12:36:35 +0100 Subject: [PATCH 021/670] Adjusted Latin feature rules. 
--- udapi/block/ud/la/markfeatsbugs.py | 209 +++++++++++++++++------------ 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 149fcd18..31d112b8 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -25,121 +25,146 @@ def __init__(self, flavio=False, **kwargs): self.flavio = flavio def process_node(self, node): + rf = [] + af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - rf = ['Gender', 'Number', 'Case'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Dim'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case']) - self.check_allowed_features(node, { + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case'] + af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'Abbr': ['Yes'], + 'Foreign': ['Yes']} + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Proper'] = ['Yes'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes': + rf = ['Gender', 'Number', 'Case', 'Degree'] af = { + 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: # Flavio does not use Degree=Pos, hence Degree is not required. rf = [f for f in rf if f != 'Degree'] - rf.append('InflClass') - af['InflClass'] = ['IndEurA', 'IndEurO', 'IndEurX'] + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Rel', 'Ind'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + } if node.feats['PronType'] == 'Prs': - if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] - }) - else: # not reflexive - if node.feats['Person'] == '3': # on, ona, ono, oni, ony - self.check_required_features(node, ['PronType', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 
'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) - else: # 1st and 2nd person do not have gender: já, ty - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + af['Reflex'] = ['Yes'] + if node.feats['Reflex'] == 'Yes': # seipsum, se + # seipsum has gender and number but se does not, so it is not required + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Person'] = ['3'] + af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + else: # not reflexive: ego, tu, is, nos + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender + if node.feats['Person'] == '3': # is, id + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + rf.extend(['Gender', 'Number']) + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['LatAnom', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - if node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Poss': ['Yes'], - 'Person': ['1', '2', '3'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - }) + rf = ['PronType', 'Gender', 'Number', 'Case'] + af = { + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Number': ['Sing', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' + rf.extend(['Poss', 'Person[psor]']) + af['PronType'] = ['Prs'] + af['Poss'] = 'Yes' + af['Person[psor]'] = ['1', '2', '3'] + af['Reflex'] = ['Yes'] + # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus). + if node.feats['Person[psor]'] != '3': + rf.append('Number[psor]') + af['Number[psor]'] = ['Sing', 'Plur'] else: - rf = ['PronType', 'Gender', 'Number', 'Case'] - af = { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Emp'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} - if self.flavio: - rf.append('InflClass') - af['PronType'].append('Con') - af['InflClass'] = ['LatPron'] - af['Form'] = ['Emp'] - self.check_required_features(node, rf) - self.check_allowed_features(node, af) + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. 
+ af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['Form'] = ['Emp'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # NUMERALS ############################################################# elif node.upos == 'NUM': - self.check_required_features(node, ['NumType', 'NumForm']) + rf = ['NumType', 'NumForm'] + af = { + 'NumType': ['Card'], + 'NumForm': ['Word', 'Roman', 'Digit'] + } # Arabic digits and Roman numerals do not have inflection features. - if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] - }) - else: - self.check_required_features(node, ['NumType', 'NumForm']) - self.check_allowed_features(node, { - 'NumType': ['Card'], - 'NumForm': ['Word'] - }) + if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + if self.flavio: + # Flavio added InflClass but not everywhere, so it is not required. + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - rf = ['Aspect', 'VerbForm'] + rf = ['VerbForm'] af = { - 'Aspect': ['Imp', 'Perf', 'Prosp'], 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], 'Polarity': ['Pos', 'Neg']} + # Main verbs have aspect but auxiliaries don't. 
+ if node.upos == 'VERB': + rf.append('Aspect') + af['Aspect'] = ['Imp', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': rf.extend(['Mood', 'Person', 'Number']) af['Mood'] = ['Ind', 'Sub', 'Imp'] @@ -150,40 +175,48 @@ def process_node(self, node): af['Voice'] = ['Act', 'Pass'] af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - rf.extend(['Tense', 'Gender', 'Number', 'Voice']) + rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case']) af['Tense'] = ['Past'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Number'] = ['Sing', 'Plur'] af['Gender'] = ['Masc', 'Fem', 'Neut'] - else: # verbal noun + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + af['Degree'] = ['Abs'] + elif node.feats['VerbForm'] == 'Vnoun': rf.extend(['Tense', 'Voice']) af['Tense'] = ['Past', 'Pres'] - af['Voice'] = ['Act'] + af['Voice'] = ['Act', 'Pass'] af['Gender'] = ['Masc', 'Fem', 'Neut'] + # else: nothing to be added form VerbForm=Inf if self.flavio: # Flavio has killed Tense in his treebanks. rf = [f for f in rf if f != 'Tense'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] + if node.feats['VerbForm'] == 'Part': + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'AdvType': ['Loc'] - }) - else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. 
- self.check_allowed_features(node, {}) + af = { + 'AdvType': ['Loc', 'Tim'], + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + } + if self.flavio: + af['Compound'] = 'Yes' + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': - self.check_allowed_features(node, { + af = { + 'PartType': ['Int'], 'Polarity': ['Neg'] - }) + } + if self.flavio: + af['Form'] = 'Emp' + self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From ab86f1b93d6e20bf4f42c18c1af9f3c22c5e4f64 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 12:07:10 +0100 Subject: [PATCH 022/670] Refined features of pronouns in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 71 +++++++++++++----------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 41d4cf09..96cf8b55 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -27,50 +27,38 @@ def process_node(self, node): 'Foreign': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': - self.check_required_features(node, ['PronType']) + rf = ['PronType', 'Case'] + af = { + 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + } if node.feats['PronType'] == 'Prs': + af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': - self.check_required_features(node, ['PronType', 'Reflex', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Reflex': ['Yes'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + af['Case'] = [c for c in af['Case'] if c != 
'Nom' and c != 'Voc'] else: # not reflexive - if node.feats['Person'] == '3': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ + rf.extend(['Person', 'Number']) + af['Person'] = ['1', '2', '3'] + af['Number'] = ['Sing', 'Plur'] + # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕ + if node.feats['Person'] == '3' and not node.lemma == 'താൻ': # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕ + rf.append('Deixis') + af['Deixis'] = ['Prox', 'Remt'] if node.feats['Number'] == 'Sing': - self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Gender', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # plural pronouns do not distinguish gender - self.check_required_features(node, ['PronType', 'Person', 'Deixis', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['3'], - 'Deixis': ['Prox', 'Remt'], - 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī - self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Prs'], - 'Person': ['1', '2'], - 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) - else: # not personal - self.check_required_features(node, ['PronType', 'Case']) - self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] - }) + rf.append('Gender') + af['Gender'] = ['Masc', 'Fem', 'Neut'] + # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form) + if 
node.feats['Gender'] == 'Neut': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + else: # plural pronouns do not distinguish gender but they do distinguish animacy + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] + elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': + rf.append('Clusivity') + af['Clusivity'] = ['In', 'Ex'] + self.check_required_features(node, rf) + self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': if node.feats['PronType'] == 'Art': @@ -82,7 +70,8 @@ def process_node(self, node): else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Deixis': ['Prox', 'Remt'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': From cd9b962cb602eced89466af00ee077afd20d63bc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 15:04:02 +0100 Subject: [PATCH 023/670] Write sentences in a HTML list. --- udapi/block/write/sentenceshtml.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/write/sentenceshtml.py diff --git a/udapi/block/write/sentenceshtml.py b/udapi/block/write/sentenceshtml.py new file mode 100644 index 00000000..e0f87241 --- /dev/null +++ b/udapi/block/write/sentenceshtml.py @@ -0,0 +1,37 @@ +"""SentencesHtml class is a writer for sentences in HTML list (could be Google-translated, remembering sentence correspondence).""" +from udapi.core.basewriter import BaseWriter + + +class SentencesHtml(BaseWriter): + """A writer of sentences in HTML list (one per item). + + Usage: + udapy write.SentencesHtml if_missing=empty < my.conllu > my.html + """ + + def __init__(self, title='Sentences from CoNLL-U', if_missing='detokenize', **kwargs): + """Create the SentencesHtml writer block. 
+ + Parameters: + if_missing: What to do if `root.text` is `None`? (default=detokenize) + * `detokenize`: use `root.compute_text()` to compute the sentence. + * `empty`: print an empty line + * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` + * `fatal`: raise an exception + """ + super().__init__(**kwargs) + self.title = title + self.if_missing = if_missing + + def before_process_document(self, document): + super().before_process_document(document) + print('\n\n\n') + print('' + self.title + '') + print('\n\n
    \n') + + def after_process_document(self, document): + print("
\n\n") + super().after_process_document(document) + + def process_tree(self, tree): + print('
  • %s
  • ' % (tree.sent_id, tree.get_sentence(self.if_missing))) From faeecb50ca7c3dfbc1130c628593c9b7031f035e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sun, 4 Dec 2022 19:30:44 +0100 Subject: [PATCH 024/670] Refined feature tests for Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 96cf8b55..2372bd23 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -57,6 +57,15 @@ def process_node(self, node): elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur': rf.append('Clusivity') af['Clusivity'] = ['In', 'Ex'] + # Interrogative pronouns, too, can be case-marked. Therefore, the + # base form must have Case=Nom. + # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan) + # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) + # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" + # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) + elif node.feats['PronType'] == 'Int': + rf.append('Animacy') + af['Animacy'] = ['Anim', 'Inan'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## @@ -101,13 +110,18 @@ def process_node(self, node): }) elif node.feats['VerbForm'] == 'Fin': if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood', 'Voice']) + # Unlike other forms, the imperative distinguishes politeness. 
+ # The verb stem serves as an informal imperative: തുറ tuṟa "open" + # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" + # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" + self.check_required_features(node, ['Mood', 'Voice', 'Polite']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Polite': ['Infm', 'Form'] }) else: self.check_required_features(node, ['Mood', 'Tense', 'Voice']) From 9f1c9adadd6b5e53aa9cf5aaea9cd8e26cdfe663 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Tue, 6 Dec 2022 10:24:48 +0100 Subject: [PATCH 025/670] Further adjusted Latin feature rules. --- udapi/block/ud/la/markfeatsbugs.py | 155 ++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 48 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 31d112b8..323f60f7 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes': + if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -41,61 +41,71 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['VerbForm'] = ['Part'] + af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - if not node.feats['Abbr'] == 'Yes': + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'NameType': ['Giv', 'Sur', 'Geo'], 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['Proper'] = ['Yes'] + af['Compound'] = 'Yes' + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': - if not node.feats['Abbr'] == 'Yes': - rf = ['Gender', 'Number', 'Case', 'Degree'] + if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: + rf = ['Gender', 'Number', 'Case'] af = { 'NumType': ['Ord', 'Dist'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'Polarity': ['Neg']} if self.flavio: # Flavio does 
not use Degree=Pos, hence Degree is not required. - rf = [f for f in rf if f != 'Degree'] + # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Degree'].append('Dim') + af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind'], + 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se + rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - af['Gender'] = ['Masc'] - af['Number'] = ['Sing'] + # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
+ af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] - af['Case'] = ['Gen', 'Dat', 'Acc', 'Loc', 'Abl'] + af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl'] else: # not reflexive: ego, tu, is, nos rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] @@ -104,22 +114,34 @@ def process_node(self, node): if node.feats['Person'] == '3': # is, id rf.append('Gender') af['Gender'] = ['Masc', 'Fem', 'Neut'] - elif re.match(r'^(Rel|Ind)$', node.feats['PronType']): + elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + elif node.feats['PronType'] == 'Ind': + rf = [f for f in rf if f != 'Case'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] + af['Compound'] = ['Yes'] + af['Polarity'] = ['Neg'] + af['Form'] = ['Emp'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## elif node.upos == 'DET': - rf = ['PronType', 'Gender', 'Number', 'Case'] + rf = ['PronType'] + if node.feats['Case']: + rf.extend(['Gender', 'Number', 'Case']) af = { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']} + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Degree': ['Cmp', 'Abs', 'Sup'], + 'Polarity': ['Neg'] + } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) af['PronType'] = ['Prs'] @@ -131,11 +153,16 @@ def process_node(self, node): rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Tot', 'Con'] + af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] if self.flavio: # Flavio added 
InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['Compound'] = ['Yes'] af['Form'] = ['Emp'] + af['NumType'] = ['Card'] + af['Degree'].append('Dim') + if re.match(r'^(unus|ambo)', node.lemma): + af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # NUMERALS ############################################################# @@ -151,50 +178,59 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: - # Flavio added InflClass but not everywhere, so it is not required. + # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ elif re.match(r'^(VERB|AUX)$', node.upos): - rf = ['VerbForm'] + rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part', 'Vnoun'], - 'Polarity': ['Pos', 'Neg']} + 'VerbForm': ['Inf', 'Fin', 'Part'], + 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], + 'Polarity': ['Neg'] + } + if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + rf.append('Tense') + af['Tense'] = ['Pres', 'Fut'] + if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + rf.append('Voice') + af['Voice'] = ['Act', 'Pass'] # Main verbs have aspect but auxiliaries don't. 
- if node.upos == 'VERB': - rf.append('Aspect') - af['Aspect'] = ['Imp', 'Perf', 'Prosp'] - if node.feats['VerbForm'] == 'Fin': + # TODO: apparently, apparently AUXs have aspect as well + # if node.upos == 'VERB': + # rf.append('Aspect') + # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] + if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) + af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - if re.match(r'^(Ind|Sub)$', node.feats['Mood']): # indicative or subjunctive - rf.extend(['Voice', 'Tense']) - af['Voice'] = ['Act', 'Pass'] - af['Tense'] = ['Past', 'Imp', 'Pres', 'Fut'] elif node.feats['VerbForm'] == 'Part': - rf.extend(['Tense', 'Gender', 'Number', 'Voice', 'Case']) - af['Tense'] = ['Past'] - af['Voice'] = ['Act', 'Pass'] + rf.extend(['Gender', 'Number', 'Case']) af['Number'] = ['Sing', 'Plur'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] - af['Degree'] = ['Abs'] - elif node.feats['VerbForm'] == 'Vnoun': - rf.extend(['Tense', 'Voice']) - af['Tense'] = ['Past', 'Pres'] - af['Voice'] = ['Act', 'Pass'] + af['Degree'] = ['Abs', 'Cmp'] af['Gender'] = ['Masc', 'Fem', 'Neut'] - # else: nothing to be added form VerbForm=Inf + af['Tense'].append('Past') + # else: nothing to be added for VerbForm=Inf if self.flavio: # Flavio has killed Tense in his treebanks. rf = [f for f in rf if f != 'Tense'] + af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI2', 'LatX'] - if node.feats['VerbForm'] == 'Part': + af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + if 'Degree' in af: + af['Degree'].append('Dim') + else: + af['Degree'] = ['Dim'] + af['Compound'] = ['Yes'] + af['Proper'] = ['Yes'] + if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] + af['VerbForm'].append('Vnoun') self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -202,20 +238,43 @@ def process_node(self, node): af = { 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'] + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Polarity': ['Neg'] } if self.flavio: - af['Compound'] = 'Yes' - af['Form'] = 'Emp' + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['NumType'] = ['Card', 'Ord'] # e.g., primum + af['VerbForm'] = ['Part'] + af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ elif node.upos == 'PART': af = { - 'PartType': ['Int'], + 'PartType': ['Int', 'Emp'], 'Polarity': ['Neg'] } if self.flavio: - af['Form'] = 'Emp' + af['Form'] = ['Emp'] + af['PronType'] = ['Dem'] + self.check_allowed_features(node, af) + # CONJUNCTIONS ######################################################### + elif re.match(r'^[CS]CONJ$', node.upos): + af = { + 'PronType': ['Rel', 'Con'], + 'Polarity': ['Neg']} + if self.flavio: + af['Compound'] = ['Yes'] + af['Form'] = ['Emp'] + af['VerbForm'] = ['Fin'] + af['NumType'] = ['Card'] + self.check_allowed_features(node, af) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + if self.flavio: + af = { + 'VerbForm': ['Part'], + 'Proper': ['Yes']} self.check_allowed_features(node, af) # THE REST: 
NO FEATURES ################################################ else: From 3de5c225d9fc8e1a56922bd13b2feea8f4ca7bf4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Tue, 6 Dec 2022 11:21:33 +0100 Subject: [PATCH 026/670] Usage: the new parameter merge=1 implemented by Martin. --- udapi/block/ud/cs/markfeatsbugs.py | 2 +- udapi/block/ud/la/markfeatsbugs.py | 2 +- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 309e7ac8..30ee90b2 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 31d112b8..74a06a07 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. 
Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2372bd23..b286a27c 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -4,7 +4,7 @@ used in filters and highlighted in text output. Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html -Windows: python udapy read.Conllu files="a.conllu,b.conllu" ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc +Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs import logging From 1544c7474cdf91aa2f1c52b3566dedf11a127e5f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 14 Dec 2022 17:17:00 +0100 Subject: [PATCH 027/670] update for newer versions of termcolor and colorama --- requirements.txt | 2 +- udapi/block/write/textmodetrees.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 647361f7..044d3af7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -colorama +colorama>=0.4.6 termcolor ufal.udpipe diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index f3f6e007..41539670 100644 
--- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -1,4 +1,5 @@ """An ASCII pretty printer of dependency trees.""" +import os import re import sys @@ -344,8 +345,12 @@ def before_process_document(self, document): super().before_process_document(document) if self.color == 'auto': self.color = sys.stdout.isatty() - if self.color: - colorama.init() + if self.color: + colorama.just_fix_windows_console() + # termcolor since 2.1 also autodetects whether sys.stdout.isatty() + # and if not, it disables the colors, so `cat i.conllu | udapy -T | less -R" + # does not work. We need to turn off termcolor's autodetection with FORCE_COLOR. + os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): print('%s = %s' % (key, value)) From 9b0d20115a4dfea531519bf54f8fe5326ac77261 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 16 Dec 2022 23:03:43 +0100 Subject: [PATCH 028/670] read.Sentences newdoc_if_empty_line=1 --- udapi/block/read/sentences.py | 14 ++++++++++++-- udapi/core/document.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py index 356e196f..9b428331 100644 --- a/udapi/block/read/sentences.py +++ b/udapi/block/read/sentences.py @@ -9,6 +9,8 @@ class Sentences(BaseReader): Args: ignore_empty_lines: if True, delete empty lines from the input. Default=False. + newdoc_if_empty_line: if True, empty lines mark document boundaries, + which are marked with `root.newdoc`. Default=False. rstrip: a set of characters to be stripped from the end of each line. Default='\r\n '. You can use rstrip='\n' if you want to preserve any space or '\r' (Carriage Return) at end of line, @@ -16,8 +18,12 @@ class Sentences(BaseReader): As most blocks do not expect whitespace other than a space to appear in the processed text, using this feature is at your own risk. 
""" - def __init__(self, ignore_empty_lines=False, rstrip='\r\n ', **kwargs): + def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False, + rstrip='\r\n ', **kwargs): + if ignore_empty_lines and newdoc_if_empty_line: + raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line") self.ignore_empty_lines = ignore_empty_lines + self.newdoc_if_empty_line = newdoc_if_empty_line self.rstrip = rstrip super().__init__(**kwargs) @@ -38,11 +44,15 @@ def read_tree(self, document=None): # (or '\r\n' if reading a Windows file on Unix machine). if line == '': return None - if self.ignore_empty_lines: + preceded_by_empty_line = False + if self.ignore_empty_lines or self.newdoc_if_empty_line: while line in {'\n', '\r\n'}: + preceded_by_empty_line = True line = self.filehandle.readline() if line == '': return None root = Root() root.text = line.rstrip(self.rstrip) + if self.newdoc_if_empty_line and preceded_by_empty_line: + root.newdoc = True return root diff --git a/udapi/core/document.py b/udapi/core/document.py index dcf146ea..d6a84f0e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -23,7 +23,7 @@ def __init__(self, filename=None, **kwargs): No pre-processing is applied, so when loading the document from a *.txt file, `Document("a.txt").nodes` will be empty and you need to run tokenization first. You can pass additional parameters for `udapi.block.read.sentences` - (`ignore_empty_lines` and `rstrip`). + (`ignore_empty_lines`, `newdoc_if_empty_line` and `rstrip`). """ self.bundles = [] self._highest_bundle_id = 0 From 83989865bf94f3ae9355364f05ac32aef84e8979 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 20 Dec 2022 11:12:47 +0100 Subject: [PATCH 029/670] bugfix logging.warning takes multiple *args to be substituted for %s, not a single argument, see https://docs.python.org/3/library/logging.html#logging.debug However, using f-strings seems to be less error-prone. 
--- udapi/block/read/conllu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index d703fb26..7e59e2f9 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -73,7 +73,7 @@ def parse_comment_line(self, line, root): if entity_match is not None: global_entity = entity_match.group(1) if self._global_entity and self._global_entity != global_entity: - logging.warning("Mismatch in global.Entity: %s != %s", (self._global_entity, global_entity)) + logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}") self._global_entity = global_entity root.comment += '$GLOBAL.ENTITY\n' return From f93d4c92a64b9aad8bcdf1d2a8045bc6ae554cc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:04:47 +0100 Subject: [PATCH 030/670] fix a bug preventing to load two conllu files into two zones BaseReader calls ``` if self.zone != 'keep': root.zone = self.zone ``` so it supposes that root.sent_id will reflect the new zone. Originally, `root.sent_id` was computed each time on the fly, but after optimization it is cached in `root._sent_id`. 
--- udapi/core/root.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/root.py b/udapi/core/root.py index 0132566a..6a5717a2 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,6 +95,12 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + if self._bundle is not None: + self._sent_id = self._bundle.address() + '/' + zone + elif self._sent_id: + self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + else: + self._sent_id = '?/' + zone @property def parent(self): From 187a2b20139a60c0ca3ad8f08325b3851a695e86 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:08:59 +0100 Subject: [PATCH 031/670] util.MarkDiff ignore_parent=1 sometimes we may not be interested in differences in the topology --- udapi/block/util/markdiff.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index 3d183f57..6c57ab36 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -9,7 +9,7 @@ class MarkDiff(Block): """Mark differences between parallel trees.""" def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc', - mark=1, add=False, print_stats=0, **kwargs): + mark=1, add=False, print_stats=0, ignore_parent=False, **kwargs): """Create the Mark block object. Params: gold_zone: Which of the zones should be treated as gold? @@ -20,6 +20,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc add: If False, node.misc attributes Mark, ToDo and Bug will be deleted before running this block, so that the marked_only option (e.g. via `udapy -TM`) prints only nodes marked by this block. print_stats: How many lines of statistics should be printed? -1 means all. 
+ ignore_parent: ignore differences in dependency parents """ super().__init__(**kwargs) self.gold_zone = gold_zone @@ -27,6 +28,7 @@ def __init__(self, gold_zone, attributes='form,lemma,upos,xpos,deprel,feats,misc self.mark = mark self.add = add self.print_stats = print_stats + self.ignore_parent = ignore_parent self.stats = collections.Counter() def process_tree(self, tree): @@ -60,7 +62,7 @@ def process_tree(self, tree): edit, pred_lo, pred_hi, gold_lo, gold_hi = diff if edit == 'equal': for p_node, g_node in zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]): - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: p_node.misc['Mark'] = self.mark g_node.misc['Mark'] = self.mark self.stats['ONLY-PARENT-CHANGED'] += 1 @@ -76,7 +78,7 @@ def process_tree(self, tree): p_value, g_value = p_node._get_attr(attr), g_node._get_attr(attr) if p_value != g_value: self.stats[f'{attr.upper()}: {p_value} -> {g_value}'] += 1 - if alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: + if not self.ignore_parent and alignment.get(p_node.parent.ord - 1) != g_node.parent.ord - 1: self.stats['PARENT-CHANGED'] += 1 pred_lo, gold_lo = pred_lo + n, gold_lo + n for node in gold_nodes[gold_lo:gold_hi]: From 2ad4922b5f9fe4196c5b67a00f42f45039f83c3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:09:42 +0100 Subject: [PATCH 032/670] write.TextModeTreesHtml prints zones side by side by default --- udapi/block/write/textmodetreeshtml.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 9f9f6aa2..7fedc1b8 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more 
info. """ - def __init__(self, color=True, title='Udapi visualization', **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -38,6 +38,7 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): """ super().__init__(color=color, **kwargs) self.title = title + self.zones_in_rows = zones_in_rows def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -82,3 +83,15 @@ def print_headers(self, root): print(escape(text)) if self.print_comments and root.comment: print('#' + self.colorize_comment(escape(root.comment)).rstrip().replace('\n', '\n#')) + + def process_bundle(self, bundle): + if self.zones_in_rows: + print("") + for tree in bundle: + if self._should_process_tree(tree): + print("") + print("
    ") + self.process_tree(tree) + print("
    ") + else: + super().process_bundle(bundle) From a49785d844e85771d499b3431cf8d8c9f3878307 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 9 Jan 2023 21:28:09 +0100 Subject: [PATCH 033/670] empty zone does not need a slash in sent_id --- udapi/core/root.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/udapi/core/root.py b/udapi/core/root.py index 6a5717a2..3e6bf62b 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -95,12 +95,13 @@ def zone(self, zone): if self._bundle: self._bundle.check_zone(zone) self._zone = zone + slashzone = '/' + zone if zone else '' if self._bundle is not None: - self._sent_id = self._bundle.address() + '/' + zone + self._sent_id = self._bundle.address() + slashzone elif self._sent_id: - self._sent_id = self._sent_id.split('/', 1)[0] + '/' + zone + self._sent_id = self._sent_id.split('/', 1)[0] + slashzone else: - self._sent_id = '?/' + zone + self._sent_id = '?' + slashzone @property def parent(self): From 5a7ccdc00b7466d1a1469fec9b2a0a63efce1880 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 11 Jan 2023 14:43:05 +0100 Subject: [PATCH 034/670] Case=Ben allowed in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index b286a27c..47437e2a 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,7 +19,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], 'Foreign': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': @@ -30,7 +30,7 @@ def process_node(self, node): rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -97,7 +97,7 @@ def process_node(self, node): 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] }) # VERBS ################################################################ elif node.upos == 'VERB': From e9fe589322d5f6d03d318862bc93ec9eba26bd85 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 18 Jan 2023 12:49:10 +0100 Subject: [PATCH 035/670] Comment: link to the issue where "interleaved" is defined. 
https://github.com/ufal/corefUD/issues/25 --- udapi/block/corefud/fixinterleaved.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/fixinterleaved.py b/udapi/block/corefud/fixinterleaved.py index c5a1b3ed..026b19f3 100644 --- a/udapi/block/corefud/fixinterleaved.py +++ b/udapi/block/corefud/fixinterleaved.py @@ -3,7 +3,9 @@ import itertools class FixInterleaved(Block): - """Fix mentions with interleaved or crossing spans.""" + """Fix mentions with interleaved or crossing spans. + https://github.com/ufal/corefUD/issues/25 + """ def __init__(self, same_entity_only=True, both_discontinuous=False, crossing_only=False, nested_same_subspan=True, **kwargs): @@ -58,8 +60,8 @@ def process_tree(self, tree): pass deleted.add(mB) - # By changing the mA.words, we could have create another error: - # making the span same as another mention. Let's fix it + # By changing the mA.words, we could have created another error: + # making the span same as another mention. Let's fix it. sA = set(mA.words) for mC in mentions: if mC in deleted or mC is mA or mC is mB: From 6a9501b6522fca2fe4d38c2fcdf8946170ae69c4 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:44:30 +0100 Subject: [PATCH 036/670] Updated feature checking for ml. 
--- udapi/block/ud/ml/markfeatsbugs.py | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 47437e2a..54119030 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -19,18 +19,21 @@ def process_node(self, node): self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'], - 'Foreign': ['Yes']}) + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Typo': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] @@ -74,13 +77,15 @@ def process_node(self, node): self.check_required_features(node, ['PronType', 'Definite']) self.check_allowed_features(node, { 'PronType': ['Art'], - 'Definite': ['Ind'] + 'Definite': ['Ind'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['PronType']) self.check_allowed_features(node, { 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], - 'Deixis': ['Prox', 'Remt'] + 'Deixis': ['Prox', 'Remt'], + 'Typo': ['Yes'] }) # NUMERALS ############################################################# elif node.upos == 'NUM': @@ -89,24 +94,27 @@ def process_node(self, 
node): if re.match(r'^(Digit|Roman)$', node.feats['NumForm']): self.check_allowed_features(node, { 'NumType': ['Card'], - 'NumForm': ['Digit', 'Roman'] + 'NumForm': ['Digit', 'Roman'], + 'Typo': ['Yes'] }) else: - self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_required_features(node, ['NumType', 'NumForm', 'Case']) self.check_allowed_features(node, { 'NumType': ['Card'], 'NumForm': ['Word'], 'Number': ['Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp'] + 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes'] }) # VERBS ################################################################ elif node.upos == 'VERB': - self.check_required_features(node, ['VerbForm', 'Voice']) + self.check_required_features(node, ['VerbForm']) if node.feats['VerbForm'] == 'Inf': self.check_allowed_features(node, { 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': if node.feats['Mood'] == 'Imp': @@ -121,26 +129,39 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], - 'Polite': ['Infm', 'Form'] + 'Polite': ['Infm', 'Form'], + 'Typo': ['Yes'] + }) + elif node.feats['Mood'] == 'Nec': + self.check_required_features(node, ['Mood', 'Voice']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Nec'], + 'Polarity': ['Pos', 'Neg'], + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: self.check_required_features(node, ['Mood', 'Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Nec'], + 'Mood': ['Ind', 'Pot'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 
'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': - self.check_required_features(node, ['Tense', 'Voice']) + self.check_required_features(node, ['Tense']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Part'], 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'] + 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) else: # verbal noun self.check_required_features(node, ['Tense', 'Voice']) @@ -151,6 +172,7 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': @@ -161,7 +183,8 @@ def process_node(self, node): 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] }) else: # indicative or subjunctive self.check_required_features(node, ['Mood', 'Tense']) @@ -171,23 +194,26 @@ def process_node(self, node): 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'] + 'Typo': ['Yes'] }) # ADVERBS ############################################################## elif node.upos == 'ADV': if node.feats['PronType'] != '': # Pronominal adverbs are neither compared nor negated. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'] + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'], + 'Typo': ['Yes'] }) else: # The remaining adverbs are neither pronominal, nor compared or # negated. 
- self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] }) # THE REST: NO FEATURES ################################################ else: - self.check_allowed_features(node, {}) + self.check_allowed_features(node, {'Typo': ['Yes']}) From 448bba23b9aa90f8741efcd7565a516a7c84c85b Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:45:28 +0100 Subject: [PATCH 037/670] bug fix --- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 54119030..4741d2fa 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -193,7 +193,7 @@ def process_node(self, node): 'VerbForm': ['Fin'], 'Mood': ['Ind', 'Sub'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 7524bd5cdbe88661eb09eb46f88bc3de07f5716e Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 14:59:48 +0100 Subject: [PATCH 038/670] Updated feature checking for ml. 
--- udapi/block/ud/ml/markfeatsbugs.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4741d2fa..be084e22 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -25,6 +25,7 @@ def process_node(self, node): # ADJECTIVES ########################################################### elif node.upos == 'ADJ': self.check_allowed_features(node, { + 'VerbForm': ['Part'], 'Foreign': ['Yes'], 'Typo': ['Yes']}) # PRONOUNS ############################################################# @@ -66,9 +67,9 @@ def process_node(self, node): # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional) # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why" # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?) - elif node.feats['PronType'] == 'Int': - rf.append('Animacy') - af['Animacy'] = ['Anim', 'Inan'] + #elif node.feats['PronType'] == 'Int': + # rf.append('Animacy') + # af['Animacy'] = ['Anim', 'Inan'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # DETERMINERS ########################################################## @@ -122,13 +123,12 @@ def process_node(self, node): # The verb stem serves as an informal imperative: തുറ tuṟa "open" # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open" # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open" - self.check_required_features(node, ['Mood', 'Voice', 'Polite']) + self.check_required_features(node, ['Mood', 'Polite']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], - 'Voice': ['Act', 'Pass', 'Cau'], 'Polite': ['Infm', 'Form'], 'Typo': ['Yes'] }) @@ -164,7 +164,9 @@ def process_node(self, node): 'Typo': ['Yes'] }) else: # verbal noun - 
self.check_required_features(node, ['Tense', 'Voice']) + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Vnoun'], From 0c0e0a257896741295c27661397e5d263aa8d1dc Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:12:49 +0100 Subject: [PATCH 039/670] AUX allows Vnoun. --- udapi/block/ud/ml/markfeatsbugs.py | 45 ++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index be084e22..4f17c45f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -174,28 +174,45 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## elif node.upos == 'AUX': self.check_required_features(node, ['VerbForm']) - if node.feats['Mood'] == 'Imp': - self.check_required_features(node, ['Mood']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Imp'], - 'Polarity': ['Pos', 'Neg'], - 'Typo': ['Yes'] - }) - else: # indicative or subjunctive - self.check_required_features(node, ['Mood', 'Tense']) + if node.feats['VerbForm'] == 'Fin': + if node.feats['Mood'] == 'Imp': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Imp'], + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # indicative or subjunctive + self.check_required_features(node, ['Mood', 'Tense']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf', 'Prog'], + 'VerbForm': ['Fin'], + 'Mood': ['Ind', 'Sub'], + 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'Polarity': ['Pos', 'Neg'], + 'Typo': ['Yes'] + }) + else: # verbal noun + # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice. + # Currently both forms are VerbForm=Vnoun. + #self.check_required_features(node, ['Tense', 'Voice']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], - 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], - 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative + 'VerbForm': ['Vnoun'], + 'Tense': ['Past', 'Pres'], + 'Gender': ['Masc', 'Fem', 'Neut'], 'Polarity': ['Pos', 'Neg'], + # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
+ 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] }) # ADVERBS ############################################################## From 94e7e85033515b101873c58e16a97dcd7b465dd9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 15:15:41 +0100 Subject: [PATCH 040/670] Foreign VERB --- udapi/block/ud/ml/markfeatsbugs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 4f17c45f..2cb4f791 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -115,6 +115,7 @@ def process_node(self, node): 'VerbForm': ['Inf'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Fin': @@ -130,6 +131,7 @@ def process_node(self, node): 'Mood': ['Imp'], 'Polarity': ['Pos', 'Neg'], 'Polite': ['Infm', 'Form'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['Mood'] == 'Nec': @@ -140,6 +142,7 @@ def process_node(self, node): 'Mood': ['Nec'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: @@ -151,6 +154,7 @@ def process_node(self, node): 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) elif node.feats['VerbForm'] == 'Part': @@ -161,6 +165,7 @@ def process_node(self, node): 'Tense': ['Past'], 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) else: # verbal noun @@ -176,6 +181,7 @@ def process_node(self, node): 'Voice': ['Act', 'Pass', 'Cau'], # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix. 
'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Foreign': ['Yes'], 'Typo': ['Yes'] }) # AUXILIARIES ########################################################## From e79bd16052f39cad08782315887df7849177ce3d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Wed, 25 Jan 2023 22:46:50 +0100 Subject: [PATCH 041/670] Conditional in Malayalam. --- udapi/block/ud/ml/markfeatsbugs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 2cb4f791..75552c36 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -32,7 +32,7 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Int'], # demonstrative pronouns are treated as third person personal pronouns + 'PronType': ['Prs', 'Int', 'Ind'], # demonstrative pronouns are treated as third person personal pronouns 'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], 'Typo': ['Yes'] } @@ -150,7 +150,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Pot'], + 'Mood': ['Ind', 'Pot', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Voice': ['Act', 'Pass', 'Cau'], From 337e7f6d159cf68bacb88529ea843c6c8b67a18d Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 00:05:13 +0100 Subject: [PATCH 042/670] Conditional in Malayalam. 
--- udapi/block/ud/ml/markfeatsbugs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 75552c36..5ca2b4fb 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -202,7 +202,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf', 'Prog'], 'VerbForm': ['Fin'], - 'Mood': ['Ind', 'Sub'], + 'Mood': ['Ind', 'Sub', 'Cnd'], 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative 'Polarity': ['Pos', 'Neg'], 'Typo': ['Yes'] From b6600ea65e001d76ffbec656382384d60511d76c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 00:07:40 +0100 Subject: [PATCH 043/670] Don't print empty tables if no trees will be printed in a given bundle Fixes #110 --- udapi/block/write/textmodetreeshtml.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 7fedc1b8..5ccceb78 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -86,12 +86,22 @@ def print_headers(self, root): def process_bundle(self, bundle): if self.zones_in_rows: - print("") + # Don't print
    if no tree will be printed in this bundle. + marked_trees = [] for tree in bundle: if self._should_process_tree(tree): + if self.print_empty: + allnodes = [tree] + tree.descendants_and_empty + else: + allnodes = tree.descendants(add_self=1) + if self.should_print_tree(tree, allnodes): + marked_trees.append(tree) + if marked_trees: + print("") + for tree in marked_trees: print("") - print("
    ") self.process_tree(tree) print("
    ") + print("") else: super().process_bundle(bundle) From b8b68bf6474751dbf5ec7205ea40936c19c5aa73 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 26 Jan 2023 09:56:25 +0100 Subject: [PATCH 044/670] Do not check foreign words for Malayalam features. --- udapi/block/ud/ml/markfeatsbugs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 5ca2b4fb..12e2ef0f 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -13,8 +13,17 @@ class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): def process_node(self, node): + # FOREIGN WORDS ######################################################## + # Do not put any restrictions on words that have Foreign=Yes. These may + # also have Lang=xx in MISC, which would mean that the official + # validator would judge them by the rules for language [xx]. But even + # if they are not fully code-switched (e.g. because they are written in + # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"), + # they still may not have the regular features of Malayalam morphology. 
+ if node.feats['Foreign'] == 'Yes': + pass # NOUNS AND PROPER NOUNS ############################################### - if re.match(r'^(NOUN|PROPN)$', node.upos): + elif re.match(r'^(NOUN|PROPN)$', node.upos): self.check_required_features(node, ['Animacy', 'Number', 'Case']) self.check_allowed_features(node, { 'Animacy': ['Anim', 'Inan'], From 1335522492d7c6cc528ab576dfb3142d4aac67e3 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 26 Jan 2023 21:59:37 +0100 Subject: [PATCH 045/670] improve definition of almost_forest in PrintMentions --- udapi/block/corefud/printmentions.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 7ed31b0d..12db433a 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -10,7 +10,7 @@ class PrintMentions(Block): def __init__(self, continuous='include', almost_continuous='include', treelet='include', forest='include', almost_forest='include', oneword='include', singleton='include', empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, - print_total=True, + print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, minimize_cross=True, color=True, attributes='form,upos,deprel', print_undef_as='_', print_doc_meta=True, print_comments=False, @@ -33,6 +33,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i random.seed(42) self.print_other_forms = print_other_forms self.print_total = print_total, + self.print_should = print_should, print_class = TextModeTreesHtml if html else TextModeTrees self.print_block = print_class( print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent, @@ -61,7 +62,9 @@ def _ok(self, condition, value): return (condition and value == 'only') or (not condition and value=='exclude') def _is_auxiliary_etc(self, 
node): - if node.udeprel in {'case', 'cc', 'punct', 'conj', 'mark', 'appos', 'vocative'}: + if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}: + return True + if node.deprel == 'advmod:emph': return True if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}: return True @@ -79,8 +82,25 @@ def _is_forest(self, mention, mwords, almost): for ch in w.children: if ch not in mwords: if not almost: + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False + # Punctuation before or after the mention span can depend on any of the mwords + # without breaking the almost_forest property. + # According to the UD guidelines, it should depend on the highest node within the phrase, + # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines. + if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]): + continue + # Some auxiliary words (e.g. prepositions) may be excluded from the mention span + # without breaking the almost_forest property, but they need to depend + # on the mention head (or if the mention is not a catena, they need to depend + # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords). + # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head), + # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest + # because "with" depends on "Mary", which is not the mention head (nor a potential mention head). if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)): + if self.print_should: + ch.misc["ShouldBeInSpanOf"] = mention.entity.eid return False return True From 0178372e381accb9c28795bcfff5f21366e48520 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 27 Jan 2023 22:49:12 +0100 Subject: [PATCH 046/670] Malayalam adpositions can have the Case feature. 
--- udapi/block/ud/ml/markfeatsbugs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py index 12e2ef0f..c2a8e0f4 100644 --- a/udapi/block/ud/ml/markfeatsbugs.py +++ b/udapi/block/ud/ml/markfeatsbugs.py @@ -242,6 +242,13 @@ def process_node(self, node): # The remaining adverbs are neither pronominal, nor compared or # negated. self.check_allowed_features(node, {'Typo': ['Yes']}) + # ADPOSITIONS ########################################################## + elif node.upos == 'ADP': + self.check_allowed_features(node, { + # Case suffixes after numbers are separate tokens, they are attached + # via the 'case' relation and they bear the Case feature (the number does not). + 'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'], + 'Typo': ['Yes']}) # PARTICLES ############################################################ elif node.upos == 'PART': self.check_allowed_features(node, { From c3da386bf36609774e34464899a048700631b4b9 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Sat, 28 Jan 2023 11:08:43 +0100 Subject: [PATCH 047/670] ud.SetTranslation (e.g. lines from Google Translate) --- udapi/block/ud/settranslation.py | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 udapi/block/ud/settranslation.py diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py new file mode 100644 index 00000000..487cca06 --- /dev/null +++ b/udapi/block/ud/settranslation.py @@ -0,0 +1,59 @@ +""" +Block SetTranslation for setting of sentence-level translation (the attribute +text_en for English translation) from a separate text file (one sentence per +line). For example, one can export the original sentences using write.SentencesHtml, +then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain +text editor, save them as translations.txt and import them using this block. 
+ +Usage: +udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu + +Author: Dan Zeman +""" +from udapi.core.block import Block +import re +import logging + +class SetTranslation(Block): + """ + Set text_en to the next available translation. + """ + + def __init__(self, file, overwrite=False, **kwargs): + """ + Create the SetTranslation block. + + Parameters: + file: the name of the text file with the translations (one sentence per line) + overwrite=1: set the translation even if the sentence already has one + (default: do not overwrite existing translations) + """ + super().__init__(**kwargs) + self.file = file + fh = open(self.file, 'r', encoding='utf-8') + self.trlines = fh.readlines() + self.nlines = len(self.trlines) + self.iline = 0 + self.overwrite = overwrite + + def process_tree(self, tree): + if self.iline < self.nlines: + translation = self.trlines[self.iline] + self.iline += 1 + comments = [] + if tree.comment: + comments = tree.comment.split('\n') + i_tr = -1 + for i in range(len(comments)): + # The initial '#' character has been stripped. + if re.match(r'\s*text_en\s*=', comments[i]): + i_tr = i + break + if i_tr >= 0: + if self.overwrite: + comments[i_tr] = ' text_en = ' + translation + else: + comments.append(' text_en = ' + translation) + tree.comment = '\n'.join(comments) + elif self.iline == self.nlines: + logging.warning('There are only %d translation lines but there are more input sentences.' 
% self.nlines) From a75ab8d8bd9754b776911c41977fbcacdcf3b521 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 00:52:29 +0100 Subject: [PATCH 048/670] first draft of a coreference-visualization writer --- udapi/block/write/corefhtml.py | 123 +++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 udapi/block/write/corefhtml.py diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py new file mode 100644 index 00000000..fc49dfb4 --- /dev/null +++ b/udapi/block/write/corefhtml.py @@ -0,0 +1,123 @@ +"""CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" +from udapi.core.basewriter import BaseWriter +from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention + +ETYPES = 'person place organization animal plant object substance time number abstract event'.split() + +class CorefHtml(BaseWriter): + + def __init__(self, path_to_js='web', **kwargs): + super().__init__(**kwargs) + self.path_to_js = path_to_js + + def process_document(self, doc): + print('') + print('Udapi CorefUD viewer') + print('') + #print('') #$(window).on("load", function() {...} + #print('') + print('') + print('\n') + + for tree in doc.trees: + self.process_tree(tree) + + print('') + print('') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + sent_mentions = [] + for mention in mentions: + mspan = mention.span + if ',' not in mspan: + sent_mentions.append(mention) + else: + entity = mention.entity + head_str = str(mention.words.index(mention.head) + 1) + subspans = mspan.split(',') + for idx,subspan in enumerate(subspans, 1): + subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' + subspan_words = span_to_nodes(tree, subspan) + fake_entity = CorefEntity(subspan_eid, entity.etype) + fake_mention = CorefMention(subspan_words, head_str, fake_entity, 
add_word_backlinks=False) + if mention._other: + fake_mention._other = mention._other + if mention._bridging and idx == 1: + fake_mention._bridging = mention._bridging + sent_mentions.append(fake_mention) + sent_mentions.sort(reverse=True) + + opened = [] + print('

    ') + for node in nodes_and_empty: + while sent_mentions and sent_mentions[-1].words[0] == node: + m = sent_mentions.pop() + e = m.entity + classes = f'{e.eid} {e.etype or "other"}' + if all(w.is_empty() for w in m.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' + print(f'', end='') + opened.append(m) + + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + print('', end='') + opened.pop() + + if not node.no_space_after: + print(' ', end='') + + print('

    ') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + +# id needs to be a valid DOM querySelector +# so it cannot contain # nor / and it cannot start with a digit +def _id(node): + if node is None: + return 'null' + return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') From e3ae1c3fb65fa62431e23c2bfff9d8534d458019 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 13:25:49 +0100 Subject: [PATCH 049/670] fix visualization of discontinuous mentions introduce CorefMentionSubspan instead of fake mentions (should be used also in store_coref_to_misc() in future) --- udapi/block/write/corefhtml.py | 40 +++++++++++----------------------- udapi/core/coref.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index fc49dfb4..890b172a 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -44,44 +44,30 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) - sent_mentions = [] + subspans = [] for mention in mentions: - mspan = mention.span - if ',' not in mspan: - sent_mentions.append(mention) - else: - entity = mention.entity - head_str = str(mention.words.index(mention.head) + 1) - subspans = mspan.split(',') - for idx,subspan in enumerate(subspans, 1): - subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' - subspan_words = span_to_nodes(tree, subspan) - fake_entity = CorefEntity(subspan_eid, entity.etype) - fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) - if mention._other: - fake_mention._other = mention._other - if mention._bridging and idx == 1: - fake_mention._bridging = mention._bridging - sent_mentions.append(fake_mention) - 
sent_mentions.sort(reverse=True) + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) opened = [] print('

    ') for node in nodes_and_empty: - while sent_mentions and sent_mentions[-1].words[0] == node: - m = sent_mentions.pop() + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + m = subspan.mention e = m.entity classes = f'{e.eid} {e.etype or "other"}' - if all(w.is_empty() for w in m.words): + if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: classes += ' singleton' - title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' - print(f'', end='') - opened.append(m) + title += f'\n{m.other}' + print(f'', end='') #data-eid="{e.eid}" + + opened.append(subspan) is_head = self._is_head(node) if is_head: diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..1a6d1f95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? 
@@ -247,6 +258,32 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + assert False + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -886,7 +923,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. 
""" if not nodes: From b78ef7eea0b76c4f41f8408d918092681d9c5fad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:16:46 +0100 Subject: [PATCH 050/670] util.Normalize: sort attributes in FEATS and MISC --- udapi/block/util/normalize.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 udapi/block/util/normalize.py diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..5b4270cc --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,40 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + # TODO: normalize also standardized comments like text, sent_id,... 
+ + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None From 90f338de077467acb4cb9ebebce68179419a0d77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:29:30 +0100 Subject: [PATCH 051/670] allow writing to node.sdeprel, add tests --- udapi/core/node.py | 8 ++++++++ udapi/core/tests/test_node.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..e188e134 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..8bc7f182 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -119,7 +119,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +145,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. 
From 5817af214df034e42cf09ef2c08f0c8d15b3a0d9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 16:31:50 +0100 Subject: [PATCH 052/670] write.CorefHtml marks subspans of discontiuous mentions with a red border --- udapi/block/write/corefhtml.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 890b172a..e0ab830b 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,26 +18,34 @@ def process_document(self, doc): #print('') print('') print('\n') + mention_ids = {} + for entity in doc.coref_entities: + for idx, mention in enumerate(entity.mentions, 1): + mention_ids[mention] = f'{entity.eid}e{idx}' + for tree in doc.trees: - self.process_tree(tree) + self.process_tree(tree, mention_ids) print('') + ' e.stopPropagation();\n});\n' + '$("span").hover(\n' + ' function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},\n' + ' function(e) {$("span").removeClass("active");}\n' + ');\n') print('') - def process_tree(self, tree): + def process_tree(self, tree, mention_ids): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -56,7 +64,7 @@ def process_tree(self, tree): subspan = subspans.pop() m = subspan.mention e = m.entity - classes = f'{e.eid} {e.etype or "other"}' + classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: From 355e7bdc32ab854827aff1f7277b069f5c5a8bc0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 17:57:48 +0100 Subject: [PATCH 053/670] write.CorefHtml shows also crossing mentions using valid (well-nested) html --- udapi/block/write/corefhtml.py | 56 +++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/udapi/block/write/corefhtml.py 
b/udapi/block/write/corefhtml.py index e0ab830b..3efe9793 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,7 +18,8 @@ def process_document(self, doc): #print('') print('') @@ -35,15 +74,37 @@ def process_document(self, doc): for tree in doc.trees: self.process_tree(tree, mention_ids) - print('') + print('') print('') def _start_subspan(self, subspan, mention_ids, crossing=False): @@ -74,8 +135,10 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) + if tree.newpar: + print('


    ') opened = [] - print('

    ') + print(f'

    ') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() From 9e11bd515e19fa59c0bdbc50654d29544b13a21b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Feb 2023 18:03:19 +0100 Subject: [PATCH 060/670] util.Normalize now normalizes also sent_id --- udapi/block/util/normalize.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 5b4270cc..298bea42 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,16 +20,33 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers + `start_sent_id`: the first sent_id number """ super().__init__(**kwargs) self.feats = feats self.misc = misc - # TODO: normalize also standardized comments like text, sent_id,... + self.sent_id = sent_id + self.next_sent_id = start_sent_id + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + if self.sent_id: + bundle.bundle_id = str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + for node in tree.descendants: + self.process_node(node) def process_node(self, node): if self.feats: From 4e1b75678dab1f2602cc26b641a31de977a98f14 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 00:47:37 +0100 Subject: [PATCH 061/670] sent_id should not be normalized by default Unlike feats and misc ordering, we can lose information this way - the original sent_id, so it is potentially dangerous. --- udapi/block/util/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 298bea42..48cd6dc1 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,12 +20,12 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. - `sent_id`: normalize sent_id so it forms a sequence of integers + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
`start_sent_id`: the first sent_id number """ super().__init__(**kwargs) From b899af14c12c7ba4c9750ba39bf5f5544783ba59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 09:53:59 +0100 Subject: [PATCH 062/670] write.Conllu path=another/directory keeps the file name, but changes the directory --- udapi/core/basewriter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..93f6463a 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,7 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os import udapi.core.coref from udapi.core.block import Block @@ -11,7 +12,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +30,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + self.path = path @property def filename(self): @@ -60,9 +62,11 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + docname = os.path.join(self.path, os.path.split(docname)[1]) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: From 
9d183c1d979c50fabff9b3a295a0d8194a09c790 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 10:14:59 +0100 Subject: [PATCH 063/670] etype mismatch is stored in mention.other["orig_etype"] which allows easier debugging --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1a13d9fb..12dda239 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -665,6 +665,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype From 5b3ed0268ccf76f5332fcce87ac0da9a42b221b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:19:33 +0100 Subject: [PATCH 064/670] allow using e.g. write.CorefHtml path='html/*.html' --- udapi/core/basewriter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 93f6463a..e17a64c3 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,11 +66,21 @@ def before_process_document(self, document): docname = document.meta.get('loaded_from', None) if docname is not None: if self.path: - docname = os.path.join(self.path, os.path.split(docname)[1]) + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, 
newline=self.newline) else: - logging.warning('overwrite=1 but document.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: From 34aa19d7d892790b81b2b79579fc4391c07a23ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:42:30 +0100 Subject: [PATCH 065/670] write.Conllu path=my_dir should be interpreted as path=my_dir/ --- udapi/core/basewriter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index e17a64c3..6e1b7446 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -30,6 +30,9 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep self.path = path @property From 301b808082254a9b45a2bd4cfe162719dc02bc23 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 4 Feb 2023 01:36:25 +0100 Subject: [PATCH 066/670] corefud.GuessSpan: add empty nodes that are causing gaps --- udapi/block/corefud/guessspan.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py index 5c3c6c12..d6093ece 100644 --- a/udapi/block/corefud/guessspan.py +++ b/udapi/block/corefud/guessspan.py @@ -4,6 +4,30 @@ class GuessSpan(Block): """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" def process_coref_mention(self, mention): - mention.words = mention.head.descendants(add_self=True) - # TODO add empty nodes that are causing gaps + mwords = mention.head.descendants(add_self=True) # TODO add heuristics from 
corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) From 2285d27f5e9444d3db7a8a0b8db227b38e5c082b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 5 Feb 2023 01:06:32 +0100 Subject: [PATCH 067/670] write.CorefHtml: distinguish entities using colors, show eid and docname --- udapi/block/write/corefhtml.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 8503854f..0a06b7e5 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -1,19 +1,21 @@ """CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" from udapi.core.basewriter import BaseWriter from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter import udapi.block.write.html ETYPES = 'person place organization animal plant object substance time number abstract event'.split() CSS = ''' .sentence 
span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} -.singleton {border-style: dotted;} +.sentence .singleton {border-style: dotted;} .crossing:before {content: "!"; display: block; background: #ffd500;} .active {border: 1px solid red !important;} -.selected {background: red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} .other {background: hsl(0, 0%, 85%);} ''' @@ -50,9 +52,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, **kwargs): + def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees + self.show_eid = show_eid + self.colors = colors def process_document(self, doc): print('') @@ -63,16 +67,25 @@ def process_document(self, doc): print('') print('\n') mention_ids = {} + entity_colors = {} + entities_of_type = Counter() for entity in doc.coref_entities: + if self.colors: + count = entities_of_type[entity.etype] + entities_of_type[entity.etype] = count + 1 + entity_colors[entity] = f'c{count % self.colors}' for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' for tree in doc.trees: - self.process_tree(tree, mention_ids) + self.process_tree(tree, mention_ids, entity_colors) print('') print('') - def _start_subspan(self, subspan, mention_ids, crossing=False): + def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention e = m.entity classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' - title = f'eid={subspan.subspan_eid}\ntype={e.etype}\nhead={m.head.form}' + title = f'eid={subspan.subspan_eid}\ntype={e.etype} 
({entity_colors[e]})\nhead={m.head.form}' + if self.colors: + classes += f' {entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -121,9 +136,11 @@ def _start_subspan(self, subspan, mention_ids, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') #data-eid="{e.eid}" + print(f'', end='') + if self.show_eid: + print(f'{subspan.subspan_eid}', end='') - def process_tree(self, tree, mention_ids): + def process_tree(self, tree, mention_ids, entity_colors): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -135,14 +152,16 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) - if tree.newpar: + if tree.newdoc: + print(f'


    {tree.newdoc if tree.newdoc is not True else ""}


    ') + elif tree.newpar: print('
    ') opened = [] print(f'

    ') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids) + self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) is_head = self._is_head(node) @@ -180,7 +199,7 @@ def process_tree(self, tree, mention_ids): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, True) + self._start_subspan(broken, mention_ids, entity_colors, True) opened.append(subspan) if not node.no_space_after: From cae7c37efe8548c2e432b108e4aa06df3b778e3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:07:42 +0100 Subject: [PATCH 068/670] `read.Conllu max_docs=3` will load only the first three documents This is nice for debugging coreference files, where we cannot load just first N sentences because there may be Bridge/SplitAnte referring to unknown eid. This way we load whole docs. --- udapi/block/read/conllu.py | 22 ++++++++++++++++++++-- udapi/core/basereader.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bba69696..d5623fba 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -81,8 +81,26 @@ def parse_comment_line(self, line, root): root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + return [self.read_tree_from_lines(s.split('\n')) for s in + self.filehandle.read().split('\n\n') if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. 
+ trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + trees.append(tree) + else: + lines.append(line) + return def read_tree(self): if self.filehandle is None: diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a3b334da..a841bf1b 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -29,6 +30,8 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -126,6 +129,11 @@ def try_fast_load(self, document): bundle, last_bundle_id = None, '' for root in trees: + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 add_to_the_last_bundle = False if self.ignore_sent_id: @@ -180,8 +188,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -204,6 +214,17 @@ def process_document(self, document): if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -222,6 +243,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. 
if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -231,6 +255,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: From ae34d8024d8ee95db6e1bf39581e44fc08bcbc73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:25:50 +0100 Subject: [PATCH 069/670] refactor code duplication --- udapi/block/write/corefhtml.py | 29 +++-------------------------- udapi/block/write/html.py | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 0a06b7e5..c7950ce9 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -50,6 +50,8 @@ }); ''' +WRITE_HTML = udapi.block.write.html.Html() + class CorefHtml(BaseWriter): def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): @@ -90,32 +92,7 @@ def process_document(self, doc): print('') print('') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..48431900 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,26 @@ def process_document(self, doc): print('\n') print('

    ') + + def print_doc_json(self, doc): print('data=[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue zone = tree.zone if first_zone: first_zone = False @@ -101,24 +111,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable From ca4d2b7f8240a0faca55f9aad6513d9a94968a08 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 19:53:25 +0100 Subject: [PATCH 070/670] write.CorefHtml: add side panel with an overview of entities --- udapi/block/write/corefhtml.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index c7950ce9..280fc213 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -6,7 +6,25 @@ ETYPES = 'person place organization animal plant object substance time number abstract event'.split() +HEADER = ''' + +Udapi CorefUD viewer + +''' +# I 
use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# so that the width of #overview can be changed by dragging the bottom right corner. +# The following lines would make the whole right border draggable: +# +# +# +#
    CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} .sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} @@ -23,10 +41,16 @@ $("span").click(function(e) { let was_selected = $(this).hasClass("selected"); $("span").removeClass("selected"); - if (!was_selected){$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); +window.onhashchange = function() { + $("span").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + $("span").hover( function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} @@ -60,10 +84,18 @@ def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): self.show_eid = show_eid self.colors = colors + def _representative_word(self, entity): + # return the first PROPN or NOUN. Or the most frequent one? + heads = [m.head for m in entity.mentions] + lemma_or_form = lambda n: n.lemma if n.lemma else n.form + for upos in ('PROPN', 'NOUN'): + nodes = [n for n in heads if n.upos == upos] + if nodes: + return lemma_or_form(nodes[0]) + return lemma_or_form(heads[0]) + def process_document(self, doc): - print('') - print('Udapi CorefUD viewer') - print('') + print(HEADER) if self.show_trees: print('') print('') - print('\n') + print('\n\n
    ') mention_ids = {} entity_colors = {} @@ -86,8 +118,21 @@ def process_document(self, doc): for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' + print('
    ') + print('' + '' + '\n') + for entity in doc.coref_entities: + print(f'' + f'' + f'') + print('
    eid#mword
    {entity.eid}{len(entity.mentions)}{self._representative_word(entity)}
    ') + print('
    ') + + print('
    ') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) + print('
    ') print('') - print('') + print('
    ') def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention @@ -113,7 +158,10 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{e.eid}" ' + print(f'', end='') if self.show_eid: print(f'{subspan.subspan_eid}', end='') From bbd702aa35fcf4e13d2a4ab2d3972a7efd89fcc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 16:22:03 +0100 Subject: [PATCH 071/670] Python glob.glob does not support {dir1,dir2} anyway --- udapi/core/files.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..c6973dad 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) From a5acaf43b1edb3468dfc493da6e7ae87f2d99966 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 17:58:45 +0100 Subject: [PATCH 072/670] ud.ComplyWithText: use node.misc['CorrectForm'] instead of node.misc['OrigForm'] which was a misleading name because the previous form value is usually not the real original form. 
--- udapi/block/ud/complywithtext.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..bacc56a2 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -34,7 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,24 +54,33 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + previous_form_attr - when changing node.form, we store the previous value + in node.misc[previous_form_attr] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + node.misc[self.previous_form_attr] = node.form def process_tree(self, root): text = root.text @@ -203,7 +212,7 @@ def solve_diff(self, nodes, form): if ' ' in form: if len(nodes) == 1 and node.form == form.replace(' ', ''): if self.allow_space(form): - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form elif self.allow_goeswith: forms = form.split() @@ -235,7 +244,7 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form From a69c7a158edb91d12d2907f6802c3104d946ee0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 18:00:46 +0100 Subject: [PATCH 073/670] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U so even if there are diffs which cannot be resolved, and thus we cannot fill SpaceAfter=No in the rest of the sentence, we must execute the "if self.fix_text:..." code, which changes the root.text (instead of changing the annotation of nodes). 
--- udapi/block/ud/complywithtext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index bacc56a2..1a13a4ec 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -121,7 +121,7 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: From fde163c32837ccc02a9b89d535be9769d4414340 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Wed, 8 Feb 2023 14:23:05 +0100 Subject: [PATCH 074/670] further adjusted Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 122 ++++++++++++++++++----------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 323f60f7..111bceb9 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -37,11 +37,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Dim'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Compound'] = ['Yes'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -76,14 +76,12 @@ def process_node(self, node): 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} if self.flavio: - # Flavio does not use Degree=Pos, hence Degree is not required. - # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -93,15 +91,16 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] @@ -122,6 +121,19 @@ def process_node(self, node): rf = [f for f in rf if f != 'Case'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + elif node.lemma in ['quicumque', 'qui', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] @@ -140,7 +152,9 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Cmp', 'Abs', 'Sup'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) @@ -152,8 +166,24 @@ def process_node(self, node): if node.feats['Person[psor]'] != '3': rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] - else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 
'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + af['PronType'].append('Rel') + elif node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] @@ -170,8 +200,8 @@ def process_node(self, node): rf = ['NumType', 'NumForm'] af = { 'NumType': ['Card'], - 'NumForm': ['Word', 'Roman', 'Digit'] - } + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -186,40 +216,40 @@ def process_node(self, node): elif re.match(r'^(VERB|AUX)$', node.upos): rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] } - if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') - af['Tense'] = ['Pres', 'Fut'] - if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB': rf.append('Voice') af['Voice'] = ['Act', 'Pass'] - # Main verbs have aspect but auxiliaries don't. 
- # TODO: apparently, apparently AUXs have aspect as well - # if node.upos == 'VERB': - # rf.append('Aspect') - # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) - af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] elif node.feats['VerbForm'] == 'Part': rf.extend(['Gender', 'Number', 'Case']) - af['Number'] = ['Sing', 'Plur'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] af['Degree'] = ['Abs', 'Cmp'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] - af['Tense'].append('Past') - # else: nothing to be added for VerbForm=Inf + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') if self.flavio: - # Flavio has killed Tense in his treebanks. - rf = [f for f in rf if f != 'Tense'] - af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] if 'Degree' in af: @@ -228,23 +258,22 @@ def process_node(self, node): af['Degree'] = ['Dim'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] - if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] - af['VerbForm'].append('Vnoun') + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': af = { - 'AdvType': ['Loc', 'Tim'], + 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['NumType'] = ['Card', 'Ord'] # e.g., primum af['VerbForm'] = ['Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) @@ -262,7 +291,8 @@ def process_node(self, node): elif re.match(r'^[CS]CONJ$', node.upos): af = { 'PronType': ['Rel', 'Con'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'Compound': ['Yes']} if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] @@ -271,10 +301,14 @@ def process_node(self, node): self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } if self.flavio: - af = { - 'VerbForm': ['Part'], - 'Proper': ['Yes']} + af['VerbForm'] = ['Part'], + af['Proper'] = ['Yes'] self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 29fb09caccd678560845ea3d80b2027145231c90 Mon Sep 17 00:00:00 2001 From: 
Martin Popel Date: Wed, 8 Feb 2023 18:04:56 +0100 Subject: [PATCH 075/670] improve ud.ComplyWithText for KorKor --- udapi/block/ud/complywithtext.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 1a13a4ec..02904731 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,6 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -54,6 +55,14 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). previous_form_attr - when changing node.form, we store the previous value in node.misc[previous_form_attr] (so no information is lost). 
Default="CorrectForm" because we expect that the previous value @@ -62,6 +71,7 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ the original spelling with typos as found in the raw text. CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. """ super().__init__(**kwargs) self.fix_text = fix_text @@ -70,17 +80,20 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.max_mwt_length = max_mwt_length self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``"): + if node.form not in ("''", "``") and self.previous_form_attr: node.misc[self.previous_form_attr] = node.form + if self.previous_form_attr == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text @@ -190,18 +203,38 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc['Added'] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -210,20 +243,25 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1 and node_form == form.replace(' ', ''): if self.allow_space(form): self.store_previous_form(node) node.form = form elif self.allow_goeswith: + self.store_previous_form(node) forms = form.split() node.form = forms[0] + node.feats['Typo'] = 'Yes' for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + new = node.create_child(form=split_form, deprel='goeswith', upos='X') new.shift_after_node(node) else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -244,8 +282,13 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_previous_form(node) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc['Added'] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): From d5a1a2a756ef13629984eb40af7b5853dbd8c7a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:06:45 +0100 Subject: [PATCH 076/670] udapy hints when using a wrong block name or parameter name thanks to @michnov for this idea --- udapi/core/block.py | 23 +++++++++++++++++++---- udapi/core/run.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..fdcad9fa 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -73,7 +88,7 @@ def process_document(self, document): p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -85,8 +100,8 @@ def process_document(self, document): if p_bundle or p_tree or p_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..418baca6 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,26 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bname = [c for c in dir(module) if c.lower() == sname][0] + blocks.append(f"{pname}.{bname}") + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
@@ -92,8 +112,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + blocks = _blocks_in_a_package(package_name) + if not blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available block in {package_name} are:\n" + + "\n".join(_blocks_in_a_package(package_name))) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. From 49ed44d2e309523cdf3361c599934d5dbf58a2a8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:23:36 +0100 Subject: [PATCH 077/670] read.XY files='!*.conllu' should iterated over sorted files glob.glob() returns files in an arbitrary order (as `ls -U`) --- udapi/core/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index c6973dad..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -65,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': From 1a4241104709e7647cf75ff84dbc68df3428fbe0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Feb 2023 23:49:11 +0100 Subject: [PATCH 078/670] improve ud.ComplyWithText (for KorKor) --- udapi/block/ud/complywithtext.py | 70 ++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 25 deletions(-) diff --git 
a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 02904731..c850018e 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,8 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_attr='CorrectForm', **kwargs): + previous_form_label='CorrectForm', previous_text_label='CorrectText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -63,8 +64,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") node2(form="in", deprel="goeswith", upos="X", parent=node1) node3(form="law", deprel="goeswith", upos="X", parent=node1). - previous_form_attr - when changing node.form, we store the previous value - in node.misc[previous_form_attr] (so no information is lost). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). Default="CorrectForm" because we expect that the previous value (i.e. the value of node.form before applying this block) contained the corrected spelling, while root.text contains @@ -72,6 +73,12 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
+ previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. + Default="CorrectText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text @@ -81,7 +88,9 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct self.allow_hyphen_goeswith = allow_hyphen_goeswith - self.previous_form_attr = previous_form_attr + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): @@ -90,9 +99,9 @@ def allow_space(form): def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``") and self.previous_form_attr: - node.misc[self.previous_form_attr] = node.form - if self.previous_form_attr == 'CorrectForm': + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': node.feats['Typo'] = 'Yes' def process_tree(self, root): @@ -140,7 +149,8 @@ def process_tree(self, root): if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -152,6 +162,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if 
text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -208,12 +222,11 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): elif edit == 'insert': forms = text[text_lo:text_hi].split(' ') if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: - #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') next_node = char_nodes[tree_lo] for f in reversed(forms): new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') new.shift_before_node(next_node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: logging.warning('Unable to insert nodes\n%s', _diff2str(diff, tree_chars, text)) @@ -246,18 +259,26 @@ def solve_diff(self, nodes, form): node_form = node.form if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: node_form = node_form.replace('-', '') - if len(nodes) == 1 and node_form == form.replace(' ', ''): - if self.allow_space(form): - self.store_previous_form(node) - node.form = form - elif self.allow_goeswith: - self.store_previous_form(node) - forms = form.split() - node.form = forms[0] - node.feats['Typo'] = 'Yes' - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos='X') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in 
reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: @@ -283,9 +304,10 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): - new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: self.store_previous_form(node) node.form = form @@ -313,6 +335,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) From 3abb76df036f7aa2e8f39437aa7d5b80032ae850 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:08:12 +0100 Subject: [PATCH 079/670] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U even if the raw texts include double spaces or no-break spaces (TODO: alternatively, we could annotate these using SpacesAfter). 
--- udapi/block/ud/complywithtext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index c850018e..351ebc01 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -109,9 +109,13 @@ def process_tree(self, root): if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return From 0c6f946802345cc670ece9663fc7007ff05efd73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:09:36 +0100 Subject: [PATCH 080/670] corefud.PrintMentions should show Entity annotations in MISC by default --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 12db433a..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -12,7 +12,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): From f9dd071481e49944fe6c70629bf9d56a90bd86d6 Mon Sep 17 00:00:00 2001 From: Martin 
Popel Date: Fri, 10 Feb 2023 14:27:46 +0100 Subject: [PATCH 081/670] keep newdoc and global.Entity when using read.Conllu sent_id_filter=regex The global.Entity comment will be read automatically by read.Conllu and then inserted automatically by write.Conllu, but only for trees with tree.newdoc, so we need to keep this annotation as well (move it to the new first tree in a given document). --- udapi/core/basereader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a841bf1b..71d57159 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -97,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): From b036d572af97a9f06482ccdcd7e90cfe4f0f5655 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 15:15:48 +0100 Subject: [PATCH 082/670] update ord of empty nodes when deleting preceding nonempty nodes TODO: add tests, solve also deleting of empty nodes --- udapi/core/node.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 618e75eb..8a764498 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -516,6 +516,7 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). 
""" self._parent._children.remove(self) + empty_follows = None if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -523,6 +524,16 @@ def remove(self, children=None): self._parent._children.extend(self._children) self._parent._children.sort() self._children.clear() + elif self._root.empty_nodes: + will_be_removed = self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.empty: + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) @@ -536,14 +547,29 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + for empty in self._root.empty_nodes: + if empty > self: + empty.ord = round(empty.ord - 1, 1) else: # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
+ if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty.ord + (empty.ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" From 6c289d3bda8134a683f6362198888ee920520203 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 16:32:51 +0100 Subject: [PATCH 083/670] ud.ComplyWithText: the previous root.text value is better described as OrigText Unlike the previous node.form values, it is (usually) the original raw text including typos etc, so the label "CorrectText" was completely misleading. --- udapi/block/ud/complywithtext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 351ebc01..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,7 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_label='CorrectForm', previous_text_label='CorrectText', + previous_form_label='CorrectForm', previous_text_label='OrigText', added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -74,8 +74,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
previous_text_label - when we are not able to adapt the annotation to match root.text - and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. - Default="CorrectText". When setting this parameter to an empty string, + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, no values will be stored to root.comment. added_label - when creating new nodes because allow_add_punct=True, we mark these nodes as new_node.misc[added_label] = 1. Default="Added". From 043f4d73745a0155db76d5f4776d77f7ceeeba8a Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Fri, 17 Feb 2023 16:47:25 +0100 Subject: [PATCH 084/670] minor changes in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 111bceb9..fde3b0bd 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -27,8 +27,11 @@ def __init__(self, flavio=False, **kwargs): def process_node(self, node): rf = [] af = {} + # PROIEL-specific: greek words without features + if node.lemma == 'greek.expression': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': + elif node.upos == 'NOUN': if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { @@ -125,14 +128,14 @@ def process_node(self, node): af['PronType'] = [] if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: 
af['PronType'].append('Rcp') rf.remove('Case') - elif node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['quicumque', 'qui', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. @@ -176,7 +179,7 @@ def process_node(self, node): af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') - if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') elif node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') From e84741a6e78acaaf13739945bd17814d569e3601 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:06:56 +0100 Subject: [PATCH 085/670] Remove NOCOREF entities e.g. from AnCora. --- udapi/block/corefud/removenocorefentities.py | 21 ++++++++++++++++++++ udapi/core/coref.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 udapi/block/corefud/removenocorefentities.py diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..8baba086 --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). 
We may want to remove such entities from datasets
+    that are used to train coreference resolvers, to prevent the resolvers from
+    thinking that all members of a NOCOREF cluster are coreferential. That is
+    what this block does.
+    """
+
+    def process_document(self, doc):
+        entities = doc.coref_entities
+        if not entities:
+            return
+        doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)]
diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 12dda239..4cd656f1 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -300,7 +300,7 @@ def __init__(self, eid, etype=None):
         self.split_ante = []
 
     def __lt__(self, another):
-        """Does this CorefEntity precedes (word-order wise) `another` entity?
+        """Does this CorefEntity precede (word-order wise) `another` entity?
 
         This method defines a total ordering of all entities
         by the first mention of each entity (see `CorefMention.__lt__`).
From 16c3a48ed3eb7861757092649a6ece22b893151c Mon Sep 17 00:00:00 2001
From: Dan Zeman
Date: Fri, 17 Feb 2023 22:27:19 +0100
Subject: [PATCH 086/670] Another method of removing entities.
--- udapi/block/corefud/removenocorefentities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py index 8baba086..4551873c 100644 --- a/udapi/block/corefud/removenocorefentities.py +++ b/udapi/block/corefud/removenocorefentities.py @@ -18,4 +18,4 @@ def process_document(self, doc): entities = doc.coref_entities if not entities: return - doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} From 8b442889aca3c1b881d7d53896d1eb0547635cfa Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 15:52:18 +0100 Subject: [PATCH 087/670] CorefUD: counting sentence sequences with no coref annotation --- udapi/block/corefud/countgaps.py | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 udapi/block/corefud/countgaps.py diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..c8ee8d76 --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,67 @@ +from udapi.core.block import Block +from collections import Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = Counter() + + def _report_stats(self, counter=None, header_id=None): + if not counter: + counter = self._total_counter + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in empty_seqs: 
+ counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counter = Counter() + empty_seqs = [] + curr_seq = [] + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + newdoc = tree.newdoc + empty_seqs = [] + curr_seq = [] + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + elif curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + + if self.report_per_file: + self._report_stats(file_counter, header_id="FULL DOC") + + self._total_counter.update(file_counter) + + def process_end(self): + if self.report_total: + self._report_stats(header_id="TOTAL") From 716461fe3b67711f71a8cee028668fe34ceffef0 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 19:22:33 +0100 Subject: [PATCH 088/670] besides sequences, counting also paragraphs with no coref mentions --- udapi/block/corefud/countgaps.py | 63 +++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py index c8ee8d76..fc45540a 100644 --- a/udapi/block/corefud/countgaps.py +++ b/udapi/block/corefud/countgaps.py @@ -1,5 +1,5 @@ from udapi.core.block import Block -from collections import Counter +from collections import defaultdict, Counter class CountGaps(Block): """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" @@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, 
report_per_file=True, report_total=T self.report_per_newdoc = report_per_newdoc self.report_per_file = report_per_file self.report_total = report_total - self._total_counter = Counter() + self._total_counter = defaultdict(Counter) - def _report_stats(self, counter=None, header_id=None): - if not counter: - counter = self._total_counter + def _report_stats(self, counter, header_id=None): if header_id: print(f"============ {header_id} ============") for key in sorted(counter): print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") def _count_empty_seqs(self, empty_seqs): counter = Counter() @@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs): return counter def process_document(self, doc): - file_counter = Counter() + file_counters = defaultdict(Counter) empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True newdoc = None for i, tree in enumerate(doc.trees): if tree.newdoc: if i: if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") newdoc = tree.newdoc empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True has_mention = any(node.coref_mentions for node in tree.descendants) if not has_mention: curr_seq.append(tree.sent_id) - elif curr_seq: - 
empty_seqs.append(curr_seq) - curr_seq = [] + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") if self.report_per_file: - self._report_stats(file_counter, header_id="FULL DOC") + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") - self._total_counter.update(file_counter) + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) def process_end(self): if self.report_total: - self._report_stats(header_id="TOTAL") + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") From c147469f5a4a9267902974846c6ff2d804447cdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 00:25:12 +0100 Subject: [PATCH 089/670] write.CorefHtml add visualization menu show: eid, trees, line breaks, paragraphs --- udapi/block/write/corefhtml.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 280fc213..20f68291 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,7 +11,7 @@ Udapi CorefUD 
viewer ''' -# I use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} # so that the width of #overview can be changed by dragging the bottom right corner. # The following lines would make the whole right border draggable: # @@ -25,9 +25,19 @@ display: grid; border-right: double; padding: 5px; width: 20em; background: #ddd; border-radius: 5px; } +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence .tree span {border: none; padding: 0; display:inline;} .sentence span .eid {display:block; font-size: 10px;} -.showtree {float:left; margin: 5px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} .sentence .singleton {border-style: dotted;} @@ -55,16 +65,22 @@ function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} ); + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + ''' SCRIPT_SHOWTREE = ''' $(".sentence").each(function(index){ var sent_id = this.id; - $(this).before( + $(this).prepend( $("
    ') print('
    ') + print('\n' + '\n') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) print('
    ') @@ -180,7 +203,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if tree.newdoc: print(f'

    {tree.newdoc if tree.newdoc is not True else ""}


    ') elif tree.newpar: - print('
    ') + print('
    ') opened = [] print(f'

    ') for node in nodes_and_empty: @@ -188,7 +211,7 @@ def process_tree(self, tree, mention_ids, entity_colors): subspan = subspans.pop() self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) - + is_head = self._is_head(node) if is_head: print('', end='') @@ -199,7 +222,7 @@ def process_tree(self, tree, mention_ids, entity_colors): print('', end='') if is_head: print('', end='') - + while opened and opened[-1].words[-1] == node: print('', end='') opened.pop() @@ -229,7 +252,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if not node.no_space_after: print(' ', end='') - + print('

    ') def _is_head(self, node): From 0b30f5b75ab2a53ed5e0425d536094dee5c56f02 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 02:53:43 +0100 Subject: [PATCH 090/670] more visualization options --- udapi/block/write/corefhtml.py | 65 +++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 20f68291..fd500e7d 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,13 +11,7 @@ Udapi CorefUD viewer ''' -# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} -# so that the width of #overview can be changed by dragging the bottom right corner. -# The following lines would make the whole right border draggable: -# -# -# -#
    + CSS = ''' #wrap {display: flex; align-items: flex-start;} #main {width: 100%; padding: 5px; background: white; z-index:100;} @@ -27,15 +21,19 @@ } #main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} #menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} #menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} .change .b1 {transform: translate(0, 9px) rotate(-45deg);} .change .b2 {opacity: 0;} .change .b3 {transform: translate(0, -9px) rotate(45deg);} -.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} -.sentence .tree span {border: none; padding: 0; display:inline;} -.sentence span .eid {display:block; font-size: 10px;} +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} .showtree {margin: 5px; user-select: none;} .display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} @@ -48,22 +46,22 @@ ''' SCRIPT_BASE = ''' -$("span").click(function(e) { +$(".m").click(function(e) { let was_selected = $(this).hasClass("selected"); - $("span").removeClass("selected"); + $(".m").removeClass("selected"); if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); window.onhashchange = function() { - $("span").removeClass("selected"); + $(".m").removeClass("selected"); var fragment = window.location.hash.substring(1); if (fragment) {$("." 
+ fragment).addClass("selected");} } -$("span").hover( - function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, - function(e) {$("span").removeClass("active");} +$(".m").hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} ); function menuclick(x) { @@ -94,10 +92,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): + def __init__(self, show_trees=True, show_eid=False, show_etype=False, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees self.show_eid = show_eid + self.show_etype = show_etype self.colors = colors def _representative_word(self, entity): @@ -120,6 +119,10 @@ def process_document(self, doc): if self.colors: for i in range(self.colors): print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}') + if not self.show_eid: + print('.eid {display: none;}') + if not self.show_etype: + print('.etype {display: none;}') print('') print('\n\n
    ') @@ -146,13 +149,19 @@ def process_document(self, doc): print('
    ') print('
    ') - print('\n' '\n') - for tree in doc.trees: - self.process_tree(tree, mention_ids, entity_colors) - print('
    ') - print('') print('
    ') - def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): + def _start_subspan(self, subspan, crossing=False): m = subspan.mention e = m.entity - classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"} m' + classes = f'{e.eid} {self._mention_ids[m]} {e.etype or "other"} m' title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' if self.colors: - classes += f' {entity_colors[e]}' + classes += f' {self._entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -252,7 +303,7 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): f'{subspan.subspan_eid}' f' {e.etype}', end='') - def process_tree(self, tree, mention_ids, entity_colors): + def process_tree(self, tree): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -273,7 +324,7 @@ def process_tree(self, tree, mention_ids, entity_colors): for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids, entity_colors) + self._start_subspan(subspan) opened.append(subspan) is_head = self._is_head(node) @@ -311,7 +362,7 @@ def process_tree(self, tree, mention_ids, entity_colors): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, entity_colors, True) + self._start_subspan(broken, True) opened.append(subspan) if not node.no_space_after: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 48431900..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,7 +79,9 @@ def process_document(self, doc): print('\n') print('
    ') def print_doc_json(self, doc): - print('data=[') + print('[') for (bundle_number, bundle) in enumerate(doc, 1): if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -116,7 +122,7 @@ def print_doc_json(self, doc): print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') + print(']') @staticmethod From 327bb6f9083f6131b4f986dac9b56f2570957f60 Mon Sep 17 00:00:00 2001 From: Federica Gamba Date: Thu, 30 Mar 2023 12:22:27 +0200 Subject: [PATCH 095/670] adjustments in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index fde3b0bd..dce4592d 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -28,7 +28,8 @@ def process_node(self, node): rf = [] af = {} # PROIEL-specific: greek words without features - if node.lemma == 'greek.expression': + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: pass # NOUNS ################################################################ elif node.upos == 'NOUN': @@ -41,12 +42,14 @@ def process_node(self, node): 'Degree': ['Dim'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'VerbForm': ['Part']} + 'VerbForm': ['Part', 'Vnoun']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) @@ -61,10 +64,10 @@ def process_node(self, node): 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - af['Compound'] = 'Yes' + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] - if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### @@ -72,7 +75,7 @@ def process_node(self, node): if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: rf = ['Gender', 'Number', 'Case'] af = { - 'NumType': ['Ord', 'Dist'], + 'NumType': ['Dist', 'Mult', 'Ord'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], @@ -83,9 +86,10 @@ def process_node(self, node): 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) @@ -112,10 +116,10 @@ def process_node(self, node): rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - # 1st and 2nd person do not have gender + # 3rd person must have gender if node.feats['Person'] == '3': # is, id rf.append('Gender') - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -126,20 +130,20 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] # lexical check of PronTypes af['PronType'] = [] - if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - if node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['qui', 'quicumque', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, 
so it is not required. - af['InflClass'] = ['LatAnom', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] af['Compound'] = ['Yes'] af['Polarity'] = ['Neg'] af['Form'] = ['Emp'] @@ -175,25 +179,26 @@ def process_node(self, node): if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: if not af['PronType'] == ['Prs']: af['PronType'].append('Prs') - elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') - elif node.lemma in ['qui', 'quantus', 'quot']: + if node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') - elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: af['PronType'].append('Dem') - elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: 
af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] af['Compound'] = ['Yes'] af['Form'] = ['Emp'] af['NumType'] = ['Card'] af['Degree'].append('Dim') + af['PronType'].append('Art') if re.match(r'^(unus|ambo)', node.lemma): af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) @@ -202,7 +207,7 @@ def process_node(self, node): elif node.upos == 'NUM': rf = ['NumType', 'NumForm'] af = { - 'NumType': ['Card'], + 'NumType': ['Card', 'Ord'], 'NumForm': ['Word', 'Roman', 'Digit'], 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. @@ -212,7 +217,9 @@ def process_node(self, node): af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ @@ -227,7 +234,7 @@ def process_node(self, node): if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] - if node.upos == 'VERB': + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): rf.append('Voice') af['Voice'] = ['Act', 'Pass'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive @@ -255,6 +262,7 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] if 'Degree' in af: af['Degree'].append('Dim') else: @@ -262,7 +270,12 @@ def process_node(self, node): af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -271,13 +284,13 @@ def process_node(self, node): 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], - 'NumType': ['Card', 'Ord'], # e.g., primum + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['VerbForm'] = ['Part'] + af['VerbForm'] = ['Fin', 'Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ @@ -289,6 +302,7 @@ def process_node(self, node): if self.flavio: af['Form'] = ['Emp'] af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) # CONJUNCTIONS ######################################################### elif re.match(r'^[CS]CONJ$', node.upos): @@ -301,6 +315,8 @@ def process_node(self, node): af['Form'] = ['Emp'] af['VerbForm'] = ['Fin'] af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': @@ -310,9 +326,13 @@ def process_node(self, node): 
'Abbr': ['Yes'] } if self.flavio: - af['VerbForm'] = ['Part'], + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From 1ddfce4aec593e222a0e3d26e8f74acf561d1356 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 31 Mar 2023 19:42:35 +0200 Subject: [PATCH 096/670] gzip the docs/* json and html files --- udapi/block/write/corefhtml.py | 49 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index cd0db1e5..6129b335 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -17,6 +17,7 @@ from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention from collections import Counter import udapi.block.write.html +import gzip import sys import os @@ -26,6 +27,7 @@ Udapi CorefUD viewer + ''' CSS = ''' @@ -87,21 +89,26 @@ $("#main-menu").toggle(); } -function load_doc(doc_num) { +async function load_doc(doc_num) { loading_now = true; - console.log("loading doc" + doc_num + ".html"); - $.get(docs_dir + "/doc" + doc_num + ".html", function(data){ - $("#main").append(data); - add_mention_listeners($("#doc" + doc_num + " .m")); - $("#doc" + doc_num + " .sentence").each(add_show_tree_button); - loading_now = false; - }).fail(function(){ + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ if (! load_fail_reported) { load_fail_reported = true; - alert("Cannot load " + docs_dir + "/doc" + doc_num - + ".html\\nLocal files do not support lazy loading. 
Run a web server 'python -m http.server'"); + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); } - }); + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + loading_now = false; } var docs_loaded = 1; @@ -126,7 +133,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("