From 3755ee90d38ec0b1fbc0e9fbbef3ecf7da70fc22 Mon Sep 17 00:00:00 2001 From: Francis Tyers Date: Mon, 20 Feb 2017 22:36:54 +0100 Subject: [PATCH 0001/1374] Update vislcg.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the бир нәрсе problem, but leaves a problem with multiwords... e.g. 2-3 екеуі де _ _ _ _ _ _ _ _ 2 екеуі екеу _ num px3sp|nom 1 appos _ _ 3 де "да _ postadv _ 2 advmod _ _ --- udapi/block/read/vislcg.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 3c5852d7..d5906650 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -23,6 +23,7 @@ def read_tree(self, document=None): break if line[0] == '#': # Are comments allowed in VISL-cg? + # FMT: Yes :) continue if line[0].isspace(): @@ -60,15 +61,17 @@ def read_tree(self, document=None): raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) return root - + @staticmethod def _node(line, root): - fields = shlex.split(line) - lemma = fields[0] - xpos = fields[1] - feats_list = fields[2:-2] + delim = line.rfind('"'); + lemma = line[2:delim] + fields = line[delim+1:].split() + xpos = fields[0] + feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' deprel = fields[-2][1:] parent_ord = int(fields[-1].split('->')[1]) node = root.create_child(lemma=lemma, xpos=xpos, feats=feats, deprel=deprel) return node, parent_ord + From c1348ab59dee8bf72bbbb25afe590e6f641ccf3c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 20 Feb 2017 21:01:17 +0100 Subject: [PATCH 0002/1374] ud.MarkBugs tests= If we want to apply just one or two tests. 
--- udapi/block/ud/markbugs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 37fd94bd..be3235f8 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -31,22 +31,31 @@ class MarkBugs(Block): """Block for checking suspicious/wrong constructions in UD v2.""" - def __init__(self, save_stats=True, skip=None, **kwargs): + def __init__(self, save_stats=True, tests=None, skip=None, **kwargs): """Create the MarkBugs block object. Args: save_stats: store the bug statistics overview into `document.misc["bugs"]`? - skip: a regex. If `re.search(skip, short_msg)` the node is not reported. + tests: a regex of tests to include. + If `not re.search(tests, short_msg)` the node is not reported. + You can use e.g. `tests=aux-chain|cop-upos` to apply only those two tests. + Default = None (or empty string or '.*') which all tests. + skip: a regex of tests to exclude. + If `re.search(skip, short_msg)` the node is not reported. You can use e.g. `skip=no-(VerbForm|NumType|PronType)`. + This has higher priority than the `tests` regex. Default = None (or empty string) which means no skipping. 
""" super().__init__(**kwargs) self.save_stats = save_stats self.stats = collections.Counter() + self.tests_re = re.compile(tests) if (tests is not None and tests != '') else None self.skip_re = re.compile(skip) if (skip is not None and skip != '') else None def log(self, node, short_msg, long_msg): """Log node.address() + long_msg and add ToDo=short_msg to node.misc.""" + if self.tests_re is not None and not self.tests_re.search(short_msg): + return if self.skip_re is not None and self.skip_re.search(short_msg): return logging.debug('node %s %s: %s', node.address(), short_msg, long_msg) From 58ba3e6ecb48300b434a487df6635e8b0be5ad8d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 19:13:31 +0100 Subject: [PATCH 0003/1374] node.sdeprel for language-specific deprel subtype --- udapi/core/node.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index fd87bc13..7666c428 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -106,6 +106,21 @@ def udeprel(self): """ return self.deprel.split(':')[0] if self.deprel is not None else None + @property + def sdeprel(self): + """Return the language-specific part of dependency relation. + + E.g. if deprel = `acl:relcl` then sdeprel = `relcl`. + If deprel=`acl` then sdeprel = empty string. + If deprel is `None` then `node.sdeprel` will return `None` as well. + """ + if self.deprel is None: + return None + parts = self.deprel.split(':', 1) + if len(parts) == 2: + return parts[1] + return '' + @property def feats(self): """Property for morphological features stored as a `Feats` object. From ea524a647102ba70850320fe8a2b66e5991a9a7c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 19:14:04 +0100 Subject: [PATCH 0004/1374] convert e.g. 
dobj:lvc to obj:lvc --- udapi/block/ud/convert1to2.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index 72d08ab8..ce549256 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -114,15 +114,21 @@ def change_upos_copula(node): if node.deprel == 'cop' and node.upos not in ("AUX", "PRON"): node.upos = "AUX" - @staticmethod - def change_deprel_simple(node): + def change_deprel_simple(self, node): """mwe→fixed, dobj→obj, *pass→*:pass, name→flat, foreign→flat+Foreign=Yes.""" - if node.deprel == 'foreign': + if node.udeprel == 'foreign': node.feats['Foreign'] = 'Yes' + udeprel, sdeprel = node.udeprel, node.sdeprel try: - node.deprel = DEPREL_CHANGE[node.deprel] + node.deprel = DEPREL_CHANGE[udeprel] except KeyError: - pass + return + if sdeprel: + if ':' in node.deprel: + self.log(node, 'deprel', 'deprel=%s:%s new_deprel=%s but %s is lost' % + (udeprel, sdeprel, node.deprel, sdeprel)) + else: + node.deprel += ':' + sdeprel def change_neg(self, node): """neg→advmod/det/ToDo + Polarity=Neg. 
From 0146cf4e4977df300e5ab9ab4fe98878fcce97ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 20:04:02 +0100 Subject: [PATCH 0005/1374] fix read.Vislcg Mea culpa: `line.lstrip(line)` is not `line = line.lstrip()` --- udapi/block/read/conllu.py | 2 +- udapi/block/read/vislcg.py | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8c80a779..79ccfaea 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -72,7 +72,7 @@ def parse_comment_line(line, root): root.newdoc = value return - root.comment = root.comment + line[1:] + "\n" + root.comment += line[1:] + "\n" # pylint: disable=too-many-locals,too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index d5906650..3d8b7637 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -1,6 +1,4 @@ """Vislcg is a reader block the VISL-cg format.""" -import shlex - from udapi.core.basereader import BaseReader from udapi.core.root import Root @@ -22,13 +20,11 @@ def read_tree(self, document=None): if line == '': break if line[0] == '#': - # Are comments allowed in VISL-cg? - # FMT: Yes :) + root.comment += line[1:] + "\n" continue if line[0].isspace(): - line.lstrip(line) - node, parent_ord = self._node(line, root) + node, parent_ord = self._node(line.lstrip(), root) words.append(node) parents.append(parent_ord) else: @@ -61,12 +57,16 @@ def read_tree(self, document=None): raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) return root - + @staticmethod def _node(line, root): - delim = line.rfind('"'); - lemma = line[2:delim] - fields = line[delim+1:].split() + # line contains "lemma" xpos feat1 feat2 .. 
featN @deprel #ord->parent.ord + # Lemma can contain spaces, but quotes within lemma are not escaped, + # so we cannot use fields = shlex.split(line) + # Let's hope that xpos, feats and deprel do not contain any quotes. + end_quote_pos = line.rfind('"'); + lemma = line[2:end_quote_pos] + fields = line[end_quote_pos+1:].split() xpos = fields[0] feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' @@ -74,4 +74,3 @@ def _node(line, root): parent_ord = int(fields[-1].split('->')[1]) node = root.create_child(lemma=lemma, xpos=xpos, feats=feats, deprel=deprel) return node, parent_ord - From 1aa0cdcf8abd08dd10140f09c402461ffea433fa Mon Sep 17 00:00:00 2001 From: Francis Tyers Date: Tue, 21 Feb 2017 20:11:30 +0100 Subject: [PATCH 0006/1374] Update vislcg.py From the second character, not the third, e.g. \t"foo --- udapi/block/read/vislcg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 3d8b7637..4c2eb618 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -65,7 +65,7 @@ def _node(line, root): # so we cannot use fields = shlex.split(line) # Let's hope that xpos, feats and deprel do not contain any quotes. 
end_quote_pos = line.rfind('"'); - lemma = line[2:end_quote_pos] + lemma = line[1:end_quote_pos] fields = line[end_quote_pos+1:].split() xpos = fields[0] feats_list = fields[3:-2] From bc37e233002c30647a3542f2c47aed93169d2bbe Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 13:50:02 +0100 Subject: [PATCH 0007/1374] prevent "BrokenPipeError: [Errno 32] Broken pipe" --- bin/udapy | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/udapy b/bin/udapy index 03b9e3d2..773661c8 100755 --- a/bin/udapy +++ b/bin/udapy @@ -66,5 +66,13 @@ if __name__ == "__main__": args.scenario = args.scenario + ['marked_only=1'] if args.no_color: args.scenario = args.scenario + ['color=0'] + runner = Run(args) - runner.execute() + # udapy is often piped to head etc., e.g. + # `seq 1000 | udapy -s read.Sentences | head` + # Let's prevent Python from reporting (with distracting stacktrace) + # "BrokenPipeError: [Errno 32] Broken pipe" + try: + runner.execute() + except (BrokenPipeError, IOError): + pass From 0380d3d719e40af81a7cad9b38bf2991b0c12df2 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 14:37:56 +0100 Subject: [PATCH 0008/1374] UD_Galician-specific conversion of UDv1 to UDv2 or rather fixing the errors in the UDv1.4 --- udapi/block/ud/gl/__init__.py | 0 udapi/block/ud/gl/to2.py | 59 +++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 udapi/block/ud/gl/__init__.py create mode 100644 udapi/block/ud/gl/to2.py diff --git a/udapi/block/ud/gl/__init__.py b/udapi/block/ud/gl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py new file mode 100644 index 00000000..f5f0f451 --- /dev/null +++ b/udapi/block/ud/gl/to2.py @@ -0,0 +1,59 @@ +"""Block ud.gl.To2 UD_Galician-specific conversion of UDv1 to UDv2 + +Author: Martin Popel +""" +from udapi.core.block import Block + +ADP_HEAD_PREFERENCES = { + 'NOUN': 10, + 'PRON': 9, + 
'ADJ': 8, + 'VERB': 8, + 'PUNCT': -10, +} + +class To2(Block): + """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician.""" + + def process_node(self, node): + + # UD_Galician v1.4 uses incorrectly deprel=cop not for the copula verb, + # but for its complement (typically ADJ) and also copula is the head. + if node.deprel == 'cop': + copula = node.parent + # In UDv2 discussions it has been decided that only a limited set of verbs + # can be annotated as copula. For Spanish, "estar" was questionable, but accepted. + # I guess in Galician it is the same. The rest (considerar, resultar, quedar,...) + # should not be annotated as copulas. Luckily, in UD_Galician v1.4 they are + # governing the clause, so no change of topology is needed, just deprel=xcomp. + if copula.lemma in ('ser', 'estar'): + node.parent = copula.parent + for cop_child in copula.children: + cop_child.parent = node + copula.parent = node + node.deprel = copula.deprel + copula.deprel = 'cop' + else: + node.deprel = 'xcomp' + + # Prepositions should depend on the noun, not vice versa. + # This is easy to fix, but unfortunatelly, there are many nodes with deprel=case + # which are not actually prepostions or case markes, but standard NOUNs, VERBs etc. + # These are left as ToDo. + if node.deprel == 'case' and node.children: + if node.upos not in ('ADP', 'CONJ', 'PART'): + node.misc['ToDo'] = 'case-upos' + else: + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] + + # Punctuation should have no children. 
+ if node.deprel == 'punct' and node.children and node.upos == 'PUNCT': + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] From 41eaf1f28e07227aaebb5fca2b1f91c437faa146 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 17:08:54 +0100 Subject: [PATCH 0009/1374] allow e.g. sent_id_filter=101 --- udapi/core/basereader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 0146ca90..bb126ae7 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -21,7 +21,7 @@ def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8', self.finished = False self.sent_id_filter = None if sent_id_filter is not None: - self.sent_id_filter = re.compile(sent_id_filter) + self.sent_id_filter = re.compile(str(sent_id_filter)) logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs From 5fe11898506d2920ae571105e70045ae54a3582e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 17:31:27 +0100 Subject: [PATCH 0010/1374] pylint --- udapi/block/read/vislcg.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 4c2eb618..26b3d787 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -27,20 +27,21 @@ def read_tree(self, document=None): node, parent_ord = self._node(line.lstrip(), root) words.append(node) parents.append(parent_ord) - else: - if words: - words[0].form = form - if len(words) > 1: - split_forms = form.split() - if len(words) == len(split_forms): - for word, split_form in zip(words, split_forms): - word.form = split_form - else: - for word in words[1:]: - word.form = '_' - root.create_multiword_token(words, form=form) - words = [] - 
form = line[2:-2] + continue + + if words: + words[0].form = form + if len(words) > 1: + split_forms = form.split() + if len(words) == len(split_forms): + for word, split_form in zip(words, split_forms): + word.form = split_form + else: + for word in words[1:]: + word.form = '_' + root.create_multiword_token(words, form=form) + words = [] + form = line[2:-2] if words: words[0].form = form @@ -64,7 +65,7 @@ def _node(line, root): # Lemma can contain spaces, but quotes within lemma are not escaped, # so we cannot use fields = shlex.split(line) # Let's hope that xpos, feats and deprel do not contain any quotes. - end_quote_pos = line.rfind('"'); + end_quote_pos = line.rfind('"') lemma = line[1:end_quote_pos] fields = line[end_quote_pos+1:].split() xpos = fields[0] From 19db31ab1094280cf45d97b49dbe34db1b8c08ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 18:31:13 +0100 Subject: [PATCH 0011/1374] block for fixing punct nodes with children --- udapi/block/ud/fixpunctchild.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 udapi/block/ud/fixpunctchild.py diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py new file mode 100644 index 00000000..a9f16b8c --- /dev/null +++ b/udapi/block/ud/fixpunctchild.py @@ -0,0 +1,9 @@ +"""Block ud.FixPunctChild for making sure punctuation nodes have no children.""" +from udapi.core.block import Block + +class FixPunctChild(Block): + """Make sure punct nodes have no children by rehanging the children upwards.""" + + def process_node(self, node): + while node.parent.deprel == 'punct': + node.parent = node.parent.parent From c31300d1427316145685dd65ec77414b44d5f0d1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 23:03:34 +0100 Subject: [PATCH 0012/1374] deprel=punct iff upos=PUNCT --- udapi/block/ud/markbugs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index be3235f8..14b07797 
100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,6 +118,9 @@ def process_node(self, node): if parent.deprel == 'punct': self.log(node, 'punct-child', 'parent.deprel=punct') + if upos == 'PUNCT' and deprel != 'punct': + self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct (but %s)' % deprel) + # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -147,7 +150,7 @@ def process_node(self, node): if upos == 'SYM' and form.isalpha(): self.log(node, 'sym-alpha', "upos=SYM but all form chars are alphabetical: " + form) - if upos == 'PUNCT' and any(char.isalpha() for char in form): + if upos == 'PUNCT' and any(char.isalpha() for char in form): self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form) def after_process_document(self, document): From c07fa8b03dda39b53adf6437d70a2dec74146753 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 15:55:24 +0100 Subject: [PATCH 0013/1374] add `cc-upos` test, make `punct-deprel` test less strict --- udapi/block/ud/markbugs.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 14b07797..1897c39c 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -66,7 +66,7 @@ def log(self, node, short_msg, long_msg): node.misc['Bug'] = short_msg self.stats[short_msg] += 1 - # pylint: disable=too-many-branches + # pylint: disable=too-many-branches, too-many-statements def process_node(self, node): form, deprel, upos, feats = node.form, node.deprel, node.upos, node.feats parent = node.parent @@ -118,9 +118,6 @@ def process_node(self, node): if parent.deprel == 'punct': self.log(node, 'punct-child', 'parent.deprel=punct') - if upos == 'PUNCT' and deprel != 'punct': - 
self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct (but %s)' % deprel) - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -153,6 +150,17 @@ def process_node(self, node): if upos == 'PUNCT' and any(char.isalpha() for char in form): self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form) + if upos == 'PUNCT' and deprel not in ('punct', 'fixed', 'goeswith', 'root'): + self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)' + % deprel) + + # http://universaldependencies.org/u/dep/cc.html says + # "cc is the relation between a conjunct and a preceding + # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." + # No other upos is allowed in the documentation, although e.g. PART is common in the data. + if deprel == 'cc' and upos != 'CCONJ': + self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) + def after_process_document(self, document): total = 0 message = 'ud.MarkBugs Error Overview:' From 51e3e9853525cad6e17031deab455a11324d41e6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 20:36:48 +0100 Subject: [PATCH 0014/1374] don't silence all IOError I had included IOError because SO says, Windows raise it instead of BrokenPipeError, but now I see it was not a good idea. 
--- bin/udapy | 5 +++-- udapi/block/read/addsentences.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/udapy b/bin/udapy index 773661c8..c756c5cb 100755 --- a/bin/udapy +++ b/bin/udapy @@ -48,7 +48,8 @@ elif args.quiet: else: level = logging.INFO -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=level) +logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', + level=level) # Process and provide the scenario. if __name__ == "__main__": @@ -74,5 +75,5 @@ if __name__ == "__main__": # "BrokenPipeError: [Errno 32] Broken pipe" try: runner.execute() - except (BrokenPipeError, IOError): + except BrokenPipeError: pass diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py index 75c4ac7d..67c79ee8 100644 --- a/udapi/block/read/addsentences.py +++ b/udapi/block/read/addsentences.py @@ -34,7 +34,7 @@ def process_document(self, document): for bundle in document.bundles: line = self.filehandle.readline() if line == '': - raise IOError('File does not have enoush lines') + raise IOError('File does not have enough lines') root = bundle.get_tree(zone=self.zone) root.text = line.rstrip() self.finished = not self.files.has_next_file() From 30ac74b178638ec849718b30e1e09da8f911b4e5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 21:18:10 +0100 Subject: [PATCH 0015/1374] escape html special chars in write.TextModeTreesHtml --- udapi/block/write/textmodetrees.py | 14 +++++++++----- udapi/block/write/textmodetreeshtml.py | 9 +++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 3c9f7308..a2c949cc 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -214,17 +214,21 @@ def process_tree(self, root): stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) # Print headers (if required) and the tree itself + 
self.print_headers(root) + for line in self.lines: + print(line) + + if self.add_empty_line: + print('') + + def print_headers(self, root): + """Print sent_id, text and other comments related to the tree.""" if self.print_sent_id: print('# sent_id = ' + root.address()) if self.print_text: print("# text = " + (root.get_sentence() if root.is_root() else root.compute_text())) if self.print_comments and root.comment: print('#' + self.colorize_comment(root.comment.rstrip().replace('\n', '\n#'))) - for line in self.lines: - print(line) - - if self.add_empty_line: - print('') def _ends(self, idx, chars): return bool(self.lines[idx] and self.lines[idx][-1] in chars) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index ecb0efb8..21bd8e92 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -72,3 +72,12 @@ def colorize_comment(self, comment): def colorize_attr(attr, value, marked): """Return a string with color markup for a given attr and its value.""" return "%s" % (attr, escape(value)) + + def print_headers(self, root): + if self.print_sent_id: + print('# sent_id = ' + escape(root.address())) + if self.print_text: + text = "# text = " + (root.get_sentence() if root.is_root() else root.compute_text()) + print(escape(text)) + if self.print_comments and root.comment: + print(escape('#' + self.colorize_comment(root.comment.rstrip().replace('\n', '\n#')))) From 23d7814f187a6224b17a68b596cbe8e29ab91345 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 21:24:43 +0100 Subject: [PATCH 0016/1374] skip `appos-chain`, add `list-chain` and `list-rightheaded` --- udapi/block/ud/markbugs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 1897c39c..44cfb5cc 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -71,11 +71,20 @@ def process_node(self, node): form, 
deprel, upos, feats = node.form, node.deprel, node.upos, node.feats parent = node.parent - for dep in ('aux', 'fixed', 'appos', 'goeswith'): + for dep in ('aux', 'fixed', 'goeswith', 'list'): if deprel == dep and parent.deprel == dep: self.log(node, dep + '-chain', dep + ' dependencies should not form a chain.') - for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith'): + # 'appos-chain' is more difficult to test because nested appositions are allowed. + # The commented-out code below prevents just some of the false alarms + # (those where changing the nested appos into flat would result in non-projectivity). + # Unfortunatelly, there are still too many false alarms, so let's skip this test completely. + # It seems that multiple appositions as siblings are much less common than nested. + # if deprel == 'appos' and parent.deprel == 'appos': + # if not node.precedes(parent.children[-1]): + # self.log(node, 'appos-chain', 'appos should not form a chain except when nested.') + + for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith', 'list'): if deprel == dep and node.precedes(parent): self.log(node, dep + '-rightheaded', dep + ' relations should be left-headed, not right.') From e9474fe8c2846630f74b9f68a17b7b30a3127dc4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 15 Mar 2017 22:56:39 +0100 Subject: [PATCH 0017/1374] DualDict can be initialized with string or dict this allows us to use e.g. 
`root.create_multiword_token(nodes, mwt_form, another_node.misc)` instead of unintuitive `root.create_multiword_token(nodes, mwt_form, str(another_node.misc))` --- udapi/core/dualdict.py | 15 +++++++-------- udapi/core/mwt.py | 2 +- udapi/core/node.py | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a68cb0bb..2b98f45c 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -28,14 +28,13 @@ class DualDict(collections.abc.MutableMapping): """ __slots__ = ['_string', '_dict'] - def __init__(self, string=None, *args, **kwargs): - if string is not None: - if args: - raise ValueError('If string is specified, no other arg is allowed ' + str(args)) - if kwargs: - raise ValueError('If string is specified, no other kwarg is allowed ' + str(kwargs)) - self._dict = dict(args, **kwargs) - self._string = string + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) def __str__(self): if self._string is None: diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 8a623804..19bbdbb5 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -9,7 +9,7 @@ class MWT(object): def __init__(self, words=None, form=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form - self._misc = DualDict(string=misc) + self._misc = DualDict(misc) self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 diff --git a/udapi/core/node.py b/udapi/core/node.py index 7666c428..0d745baa 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -85,9 +85,9 @@ def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many- self.lemma = lemma self.upos = upos self.xpos = xpos - self._feats = Feats(string=feats) + self._feats = 
Feats(feats) self.deprel = deprel - self._misc = DualDict(string=misc) + self._misc = DualDict(misc) self._raw_deps = '_' self._deps = None self._parent = None From a364bd76736fff4975c4af46913cac3e4ce7576d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 16 Mar 2017 19:19:13 +0100 Subject: [PATCH 0018/1374] new blocks ud.RemoveMwt and ud.cs.AddMwt Plus abstract base class ud.AddMwt. --- udapi/block/ud/addmwt.py | 69 ++++++++++++++++++++++++++++++++ udapi/block/ud/cs/__init__.py | 0 udapi/block/ud/cs/addmwt.py | 74 +++++++++++++++++++++++++++++++++++ udapi/block/ud/removemwt.py | 37 ++++++++++++++++++ 4 files changed, 180 insertions(+) create mode 100644 udapi/block/ud/addmwt.py create mode 100644 udapi/block/ud/cs/__init__.py create mode 100644 udapi/block/ud/cs/addmwt.py create mode 100644 udapi/block/ud/removemwt.py diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py new file mode 100644 index 00000000..6d74d4a2 --- /dev/null +++ b/udapi/block/ud/addmwt.py @@ -0,0 +1,69 @@ +"""Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" +from udapi.core.block import Block + +class AddMwt(Block): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def process_node(self, node): + analysis = self.multiword_analysis(node) + if analysis is None: + return + orig_attr = {} + for attr in 'form lemma upos xpos feats deprel misc'.split(): + orig_attr[attr] = getattr(node, attr) + + forms = analysis['form'].split() + main = analysis.get('main', 0) + parent = node if analysis.get('shape', '') == 'subtree' else node.parent + nodes = [] + for form in forms[0:main]: + new_node = parent.create_child(form=form) + new_node.shift_before_node(node) + nodes.append(new_node) + node.form = forms[main] + nodes.append(node) + for form in forms[main+1:]: + new_node = parent.create_child(form=form) + new_node.shift_after_node(nodes[-1]) + nodes.append(new_node) + + if orig_attr['form'].isupper(): + for new_node in 
nodes: + new_node.form = new_node.form.upper() + elif orig_attr['form'][0].isupper(): + nodes[0].form = nodes[0].form.title() + + for attr in 'lemma upos xpos feats deprel misc'.split(): + if attr in analysis: + values = analysis[attr].split() + for i, new_node in enumerate(nodes): + if values[i] == '*': + setattr(new_node, attr, orig_attr[attr]) + else: + setattr(new_node, attr, values[i]) + + mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc']) + node.misc = None + self.postprocess_mwt(mwt) + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token. + + An example return value is:: + { + 'form': 'aby bych', + 'lemma': 'aby být', + 'upos': 'SCONJ AUX', + 'xpos': 'J,------------- Vc-S---1-------', + 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin', # _ means empty FEATS + 'deprel': '* aux', # * means keep the original deprel + 'main': 0, # which of the two words will inherit the original children (if any) + 'shape': 'siblings', # the newly created nodes will be siblings or alternatively + #'shape': 'subtree', # the main-indexed node will be the head + } + """ + raise NotImplementedError('multiword_analysis must be overriden in subclasses') + + def postprocess_mwt(self, mwt): + """Optional postprocessing of newly created MWTs.""" + pass diff --git a/udapi/block/ud/cs/__init__.py b/udapi/block/ud/cs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py new file mode 100644 index 00000000..17e0648c --- /dev/null +++ b/udapi/block/ud/cs/addmwt.py @@ -0,0 +1,74 @@ +"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" +import udapi.block.ud.addmwt + +MWTS = { + 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 
'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, +} +for v in MWTS.values(): + v['upos'] = 'SCONJ AUX' + number = '-' + if 'Sing' in v['feats']: + number = 'S' + elif 'Plur' in v['feats']: + number = 'P' + person = '-' + if 'Person=1' in v['feats']: + person = '1' + elif 'Person=2' in v['feats']: + person = '2' + + v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) + v['deprel'] = '* aux' + v['lemma'] = v['form'].split()[0] + ' být' + v['main'] = 0 + v['shape'] = 'siblings' + +# nač -> na + co +for prep in 'na za o'.split(): + MWTS[prep + 'č'] = { + 'form': prep + ' co', + 'lemma': prep + ' co', + 'upos': 'ADP PRON', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + # There is no VerbType=verbconj in the UD_Czech data. + # The purpose of this rule is rather to show that + # it is possible to write such "dynamic" rules + # (which cannot be included in static MWTS). 
+ if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': + return { + 'form': node.form.lower()[:-1] + ' neboť', + 'lemma': '* neboť', + 'upos': '* CCONJ', + 'xpos': 'Vt-S---3P-NA--2 J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + return None + + def postprocess_mwt(self, mwt): + if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB': + mwt.words[1].parent = mwt.words[0].parent.parent diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py new file mode 100644 index 00000000..462e9fbd --- /dev/null +++ b/udapi/block/ud/removemwt.py @@ -0,0 +1,37 @@ +"""Block ud.RemoveMwt for removing multi-word tokens.""" +from udapi.core.block import Block + +class RemoveMwt(Block): + """Substitute MWTs with one word representing the whole MWT.""" + + def process_tree(self, root): + for mwt in root.multiword_tokens: + words = mwt.words + words[0].form = mwt.form + words[0].misc = mwt.misc + words[0].upos = self.guess_upos(words) + words[0].feats = self.guess_feats(words) + words[0].deprel = self.guess_deprel(words) + mwt.remove() + for word in words[1:]: + word.remove(children='rehang') + + @staticmethod + def guess_upos(words): + """UPOS of the whole MWT""" + return words[0].upos + + @staticmethod + def guess_deprel(words): + """DEPREL of the whole MWT""" + return words[0].deprel + # Alternatively, we could define deprel subtypes + #return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) + + @staticmethod + def guess_feats(words): + """FEATS of the whole MWT""" + feats = words[0].feats + for word in words[1:]: + feats.update(word.feats) + return feats From f0dff5a2d3f79c7ab6f71d7fa4c99b06012307e5 Mon Sep 17 00:00:00 2001 From: Prokopis Prokopidis Date: Fri, 17 Mar 2017 14:31:17 +0200 Subject: [PATCH 0019/1374] add new block ud.el.AddMwt --- udapi/block/ud/el/__init__.py | 0 udapi/block/ud/el/addmwt.py | 48 +++++++++++++++++++++++++++++++++++ 2 files 
changed, 48 insertions(+) create mode 100644 udapi/block/ud/el/__init__.py create mode 100644 udapi/block/ud/el/addmwt.py diff --git a/udapi/block/ud/el/__init__.py b/udapi/block/ud/el/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py new file mode 100644 index 00000000..fd16a0f5 --- /dev/null +++ b/udapi/block/ud/el/addmwt.py @@ -0,0 +1,48 @@ +"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs.""" +import udapi.block.ud.addmwt + +MWTS = { + 'στη': {'form' : 'σ τη', + 'lemma' : 'σε ο', + 'upos' : 'ADP DET', + 'xpos' : 'AsPpSp AtDf', + 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', + 'deprel': 'case det', + 'main' : 0, # which of the two words will inherit the original children (if any) + 'shape' : 'siblings', # the newly created nodes will be siblings}, + }, + 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ 
Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, +} + +#for v in MWTS.values(): +# pass + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + # Write a rule for ambigous prep+article MWTs + if node.form.lower() == 'στο' and node.feats['Gender'] == 'Masc': + return { + 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', + 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', + } + elif node.form.lower() == 'στο': + return { + 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', + 'feats': '_ Case=Acc|Gender=Neut|Number=Sing', + } + + return None + +# def postprocess_mwt(self, mwt): +# pass From f060ccc335d4d7201f667b7477697f1e2a57bc00 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:13:59 +0100 Subject: [PATCH 0020/1374] introduce node.feats.copy(), ud.AddMwt supports e.g. 
Case=* rule --- udapi/block/ud/addmwt.py | 9 ++++++++- udapi/core/dualdict.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 6d74d4a2..653cfa26 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -9,8 +9,10 @@ def process_node(self, node): if analysis is None: return orig_attr = {} - for attr in 'form lemma upos xpos feats deprel misc'.split(): + for attr in 'form lemma upos xpos deprel'.split(): orig_attr[attr] = getattr(node, attr) + orig_attr['feats'] = node.feats.copy() + orig_attr['misc'] = node.misc.copy() forms = analysis['form'].split() main = analysis.get('main', 0) @@ -39,6 +41,11 @@ def process_node(self, node): for i, new_node in enumerate(nodes): if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) + elif attr == 'feats' and '*' in values[i]: + new_node.feats = values[i] + for feat_name, feat_value in list(new_node.feats.items()): + if feat_value == '*': + new_node.feats[feat_name] = orig_attr['feats'][feat_name] else: setattr(new_node, attr, values[i]) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index 2b98f45c..edad9c37 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -1,5 +1,6 @@ """DualDict is a dict with lazily synchronized string representation.""" import collections.abc +import copy class DualDict(collections.abc.MutableMapping): """DualDict class serves as dict with lazily synchronized string representation. @@ -93,6 +94,10 @@ def clear(self): self._string = '_' self._dict.clear() + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + def set_mapping(self, value): """Set the mapping from a dict or string. 
From 5b814e651d820361599da28cb75d19e179c5b2b4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:17:36 +0100 Subject: [PATCH 0021/1374] simpify ud.el.AddMwt by the newly introduced Gender=* --- udapi/block/ud/el/addmwt.py | 41 +++++++++---------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index fd16a0f5..d61f7616 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -2,47 +2,26 @@ import udapi.block.ud.addmwt MWTS = { - 'στη': {'form' : 'σ τη', - 'lemma' : 'σε ο', - 'upos' : 'ADP DET', + 'στη': {'form' : 'σ τη', + 'lemma' : 'σε ο', + 'upos' : 'ADP DET', 'xpos' : 'AsPpSp AtDf', - 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', - 'deprel': 'case det', + 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', + 'deprel': 'case det', 'main' : 0, # which of the two words will inherit the original children (if any) - 'shape' : 'siblings', # the newly created nodes will be siblings}, + 'shape' : 'siblings', # the newly created nodes will be siblings}, }, - 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 
'shape': 'siblings'}, 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στο': {'form': 'σ το', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=*|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, } -#for v in MWTS.values(): -# pass - class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - analysis = MWTS.get(node.form.lower(), None) - if analysis is not None: - return analysis - - # Write a rule for ambigous prep+article MWTs - if node.form.lower() == 'στο' and node.feats['Gender'] == 'Masc': - return { - 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', - 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', - } - elif node.form.lower() == 'στο': - return { - 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', - 'feats': '_ Case=Acc|Gender=Neut|Number=Sing', - } - - return None - -# def postprocess_mwt(self, mwt): -# pass + return MWTS.get(node.form.lower(), None) From 843c1dfa02aa0f0be62f2e6aa52c84ba980b1967 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:31:42 +0100 Subject: [PATCH 0022/1374] fix pylint warning (line length) and further simplify --- udapi/block/ud/el/addmwt.py | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py 
index d61f7616..39e23620 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -1,24 +1,32 @@ -"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs.""" +"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. + +Notice that this should be used only for converting existing conllu files. +Ideally a tokenizer should have already split the MWTs. +Also notice that this block does not deal with the relatively rare +PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs. +""" import udapi.block.ud.addmwt MWTS = { - 'στη': {'form' : 'σ τη', - 'lemma' : 'σε ο', - 'upos' : 'ADP DET', - 'xpos' : 'AsPpSp AtDf', - 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', - 'deprel': 'case det', - 'main' : 0, # which of the two words will inherit the original children (if any) - 'shape' : 'siblings', # the newly created nodes will be siblings}, - }, - 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp 
AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στο': {'form': 'σ το', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=*|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, + 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, + 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Gender=*|Number=Sing'}, } +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = 'σε ο' + v['upos'] = 'ADP DET' + v['xpos'] = 'AsPpSp AtDf' + v['deprel'] = 'case det' + # The following are the default values + #v['main'] = 0 # which of the two words will inherit the original children (if any) + #v['shape'] = 'siblings', # the newly created nodes will be siblings + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" From 292591e3745567f6c8692cd7ea6f85d2733a044e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 22:14:52 +0100 Subject: [PATCH 0023/1374] =?UTF-8?q?de/projectivization=20=C3=A0=20la=20N?= =?UTF-8?q?ivre=20&=20Nilsson=20(2005)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/transform/__init__.py | 0 udapi/block/transform/deproj.py | 42 +++++++++++++++++++++ udapi/block/transform/proj.py | 62 +++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 udapi/block/transform/__init__.py create mode 100644 udapi/block/transform/deproj.py create mode 100644 udapi/block/transform/proj.py 
diff --git a/udapi/block/transform/__init__.py b/udapi/block/transform/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py new file mode 100644 index 00000000..3a6dcda5 --- /dev/null +++ b/udapi/block/transform/deproj.py @@ -0,0 +1,42 @@ +"""Block Deproj for deprojectivization of pseudo-projective trees à la Nivre & Nilsson (2005). + +See ud.transform.Proj for details. +TODO: implement also path and head+path strategies. +""" +from udapi.core.block import Block + +class Deproj(Block): + """De-projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', label='misc', **kwargs): + """Create the Deproj block object.""" + super().__init__(**kwargs) + self.strategy = strategy + self.label = label + + def process_node(self, node): + if self.label == 'misc': + label = node.misc['pproj'] + elif self.label == 'deprel': + parts = node.sdeprel.split('+', 1) + if len(parts) == 2: + label = parts[1] + node.deprel = node.udeprel + (':' + parts[0] if parts[0] else '') + else: + label = '' + else: + raise(ValueError('Unknown parameter label=%s' % self.label)) + if label == '': + return + reconstructed_parent = self.head_strategy(node, label) + if reconstructed_parent: + node.parent = reconstructed_parent + + def head_strategy(self, node, label): + queue = [n for n in node.parent.children if n!=node] # TODO deque + while queue: + adept = queue.pop(0) + if adept.udeprel == label: + return adept + queue.extend(adept.children) + return None diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py new file mode 100644 index 00000000..f15e46e9 --- /dev/null +++ b/udapi/block/transform/proj.py @@ -0,0 +1,62 @@ +"""Block Proj for (pseudo-)projectivization of trees à la Nivre & Nilsson (2005). + +See http://www.aclweb.org/anthology/P/P05/P05-1013.pdf. 
+This block tries to replicate Malt parser's projectivization: +http://www.maltparser.org/userguide.html#singlemalt_proj +http://www.maltparser.org/optiondesc.html#pproj-marking_strategy + +TODO: implement also path and head+path strategies. +TODO: Sometimes it would be better (intuitively) + to lower the gap-node (if its whole subtree is in the gap + and if this does not cause more non-projectivities) + rather than to lift several nodes whose parent-edge crosses this gap. + We would need another label value (usually the lowering is of depth 1), + but the advantage is that reconstruction of lowered edges + during deprojectivization is simple and needs no heuristics. +""" +from udapi.core.block import Block + +class Proj(Block): + """Projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', lifting_order='deepest', label='misc', **kwargs): + """Create the Proj block object.""" + super().__init__(**kwargs) + self.lifting_order = lifting_order + self.strategy = strategy + self.label = label + + def process_tree(self, tree): + nonprojs = [self.nonproj_info(n) for n in tree.descendants if n.is_nonprojective()] + for nonproj in sorted(nonprojs, key=lambda info: info[0]): + self.lift(nonproj[1]) + + def nonproj_info(self, node): + if self.lifting_order == 'shortest': + return (abs(node.ord - node.parent.ord), node) + orig_parent = node.parent + node.parent = node.parent.parent + depth = 1 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + node.parent = orig_parent + return (-depth, node) + + def lift(self, node): + orig_parent = node.parent + depth = 0 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + if depth == 0: + return + self.mark(node, orig_parent.udeprel) + + def mark(self, node, label): + if self.label == 'misc': + node.misc['pproj'] = label + elif self.label == 'deprel': + node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label) + else: + 
raise(ValueError('Unknown parameter label=%s' % self.label)) From 2e2617eec9310d67d750a763177b067224383c2a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 18 Mar 2017 15:14:42 +0100 Subject: [PATCH 0024/1374] add draft of blocks eval.Parsing and eval.LcsF1 --- udapi/block/eval/__init__.py | 0 udapi/block/eval/lcsf1.py | 201 +++++++++++++++++++++++++++++++++++ udapi/block/eval/parsing.py | 35 ++++++ 3 files changed, 236 insertions(+) create mode 100644 udapi/block/eval/__init__.py create mode 100644 udapi/block/eval/lcsf1.py create mode 100644 udapi/block/eval/parsing.py diff --git a/udapi/block/eval/__init__.py b/udapi/block/eval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py new file mode 100644 index 00000000..09d84e61 --- /dev/null +++ b/udapi/block/eval/lcsf1.py @@ -0,0 +1,201 @@ +"""Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" +from udapi.core.basewriter import BaseWriter + +class LcsF1(BaseWriter): + """Evaluate differences between sentences (in different zones) with P/R/F1.""" + + def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): + """Create the LcsF1 block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attributes = attributes + self.focus = focus + self.details = details + self._stats = {} + self.correct, self.pred, self.gold = 0, 0, 0 + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + #self._stats['zones'][tree.zone] += 1 + + attrs = self.attributes.split(',') + pred_tokens = ['_'.join(n.get_attrs(attrs)) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(attrs)) for n in gold_tree.descendants] + common = find_lcs(pred_tokens, gold_tokens) + + # my $focus = $self->focus; + # if ($focus ne '.*') { + # @common = grep {/$focus/} @common; + # @pred_tokens = grep {/$focus/} @pred_tokens; + # 
@gold_tokens = grep {/$focus/} @gold_tokens; + # } + + self.correct += len(common) + self.pred += len(pred_tokens) + self.gold += len(gold_tokens) + + # if ($self->details){ + # $self->_stats->{C}{$_}++ for (@common); + # $self->_stats->{P}{$_}++ for (@pred_tokens); + # $self->_stats->{G}{$_}++ for (@gold_tokens); + # $self->_stats->{T}{$_}++ for (@gold_tokens, @pred_tokens); + # } + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + # my %pred_zones = %{$self->_stats->{zones}}; + # my @pz = keys %pred_zones; + # if (!@pz) { + # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones='.$self->zones; + # } elsif (@pz > 1){ + # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " + # . 'The results are mixed together. Check the parameter zones='.$self->zones; + # } + # say "Comparing predicted trees (zone=@pz) with gold trees (zone=" + # . $self->gold_zone . "), sentences=$pred_zones{$pz[0]}"; + # + # if ($self->details){ + # say '=== Details ==='; + # my $total_count = $self->_stats->{T}; + # my @tokens = sort {$total_count->{$b} <=> $total_count->{$a}} keys %{$total_count}; + # splice @tokens, $self->details; + # printf "%-10s %5s %5s %5s %6s %6s %6s\n", qw(token pred gold corr prec rec F1); + # foreach my $token (@tokens){ + # my ($p, $g, $c) = map {$self->_stats->{$_}{$token}||0} (qw(P G C)); + # my $pr = $c / ($p || 1); + # my $re = $c / ($g || 1); + # my $f = 2 * $pr * $re / (($pr + $re)||1); + # printf "%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%\n", + # $token, $p, $g, $c, 100*$pr, 100*$re, 100*$f + # } + # say '=== Totals ===' + # } + + + print("%-9s = %7d\n"*3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) + # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero + # my $prec = $correct / $pred; + # my $rec = $correct / $gold; + # my $f1 = 2 * $prec * $rec / (($prec + $rec)||1); + # printf "%-9s 
= %6.2f%%\n"x3, precision=>100*$prec, recall=>100*$rec, F1=>100*$f1; + + +# difflib.SequenceMatcher does not compute LCS, so let's implement it here +# TODO: make faster by trimming common prefix and sufix +def find_lcs(x, y): + m, n = len(x), len(y) + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m+1): + for j in range(1, n+1): + C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m-1] == y[n-1]: + lcs[index-1] = x[m-1] + m, n, index = m-1, n-1, index-1 + elif C[m-1][n] > C[m][n-1]: + m -= 1 + else: + n -= 1 + return lcs + + +''' +Udapi::Block::Eval::LcsF1 - evaluate differences between sentences with P/R/F1 + +=head1 SYNOPSIS + + Eval::LcsF1 zones=en_pred gold_zone=en_gold to=results.txt + + # prints something like + predicted = 210 + gold = 213 + correct = 210 + precision = 100.00% + recall = 98.59% + F1 = 99.29% + + Eval::LcsF1 gold_zone=y attributes=form,upos focus='^(?i:an?|the)_DET$' details=4 + + # prints something like + === Details === + token pred gold corr prec rec F1 + the_DET 711 213 188 26.44% 88.26% 40.69% + The_DET 82 25 19 23.17% 76.00% 35.51% + a_DET 0 62 0 0.00% 0.00% 0.00% + an_DET 0 16 0 0.00% 0.00% 0.00% + === Totals === + predicted = 793 + gold = 319 + correct = 207 + precision = 26.10% + recall = 64.89% + F1 = 37.23% + +=head1 DESCRIPTION + +This block finds differences between nodes of trees in two zones +and reports the overall precision, recall and F1. +The two zones are "predicted" (on which this block is applied) +and "gold" (which needs to be specified with parameter C). + +This block also reports the number of total nodes in the predicted zone +and in the gold zone and the number of "correct" nodes, +that is predicted nodes which are also in the gold zone. +By default two nodes are considered "the same" if they have the same C
, +but it is possible to check also for other nodes' attributes +(with parameter C). + +As usual: + + precision = correct / predicted + recall = correct / gold + F1 = 2 * precision * recall / (precision + recall) + +The implementation is based on finding the longest common subsequence (LCS) +between the nodes in the two trees. +This means that the two zones do not need to be explicitly word-aligned. + +=head1 PARAMETERS + +=head2 zones + +Which zone contains the "predicted" trees? +Make sure that you specify just one zone. +If you leave the default value "all" and the document contains more zones, +the results will be mixed, which is most likely not what you wanted. +Exception: If the document conaints just two zones (predicted and gold trees), +you can keep the default value "all" because this block +will skip comparison of the gold zone with itself. + +=head2 gold_zone + +Which zone contains the gold-standard trees? + +=head2 attributes + +comma separated list of attributes which should be checked +when deciding whether two nodes are equivalent in LCS + +=head2 focus + +Regular expresion constraining the tokens we are interested in. +If more attributes were specified in the C parameter, +their values are concatenated with underscore, so C should reflect that +e.g. C. + +For case-insensitive focus use e.g. C +(which is equivalent to C) + +=head2 details + +Print also detailed statistics for each token (matching the C). +The value of this parameter C
specifies the number of tokens to include. +The tokens are sorted according to the sum of their I and I counts. + +''' diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py new file mode 100644 index 00000000..3c7f5da8 --- /dev/null +++ b/udapi/block/eval/parsing.py @@ -0,0 +1,35 @@ +"""Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens.""" +from udapi.core.basewriter import BaseWriter + +class Parsing(BaseWriter): + """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" + + def __init__(self, gold_zone, **kwargs): + """Create the eval.Parsing block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.correct_las, self.correct_uas, self.total = 0, 0, 0 + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + if len(pred_nodes) != len(gold_nodes): + raise ValueError('The sentences do not match (%d vs. 
%d nodes)' + % (len(pred_nodes), len(gold_nodes))) + + self.total += len(pred_nodes) + for pred_node, gold_node in zip(pred_nodes, gold_nodes): + if pred_node.parent.ord == gold_node.parent.ord: + self.correct_uas += 1 + if pred_node.deprel == gold_node.deprel: + self.correct_las += 1 + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + print('nodes = %d' % self.total) + print('UAS = %6.2f' % (100 * self.correct_uas / self.total)) + print('LAS = %6.2f' % (100 * self.correct_las / self.total)) From 5477d3c739644df10de6fce34fee8d7f40081177 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 16:41:58 +0100 Subject: [PATCH 0025/1374] simple tokenization: morpho.TokenizeOnWhitespace --- udapi/block/eval/lcsf1.py | 8 ++-- udapi/block/morpho/__init__.py | 0 udapi/block/morpho/tokenizeonwhitespace.py | 46 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 udapi/block/morpho/__init__.py create mode 100644 udapi/block/morpho/tokenizeonwhitespace.py diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py index 09d84e61..2c085779 100644 --- a/udapi/block/eval/lcsf1.py +++ b/udapi/block/eval/lcsf1.py @@ -1,7 +1,7 @@ """Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" from udapi.core.basewriter import BaseWriter -class LcsF1(BaseWriter): +class LcsF1(BaseWriter): # pylint: disable=too-many-instance-attributes """Evaluate differences between sentences (in different zones) with P/R/F1.""" def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): @@ -50,7 +50,8 @@ def process_end(self): # my %pred_zones = %{$self->_stats->{zones}}; # my @pz = keys %pred_zones; # if (!@pz) { - # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones='.$self->zones; + # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones=' + # . 
$self->zones; # } elsif (@pz > 1){ # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " # . 'The results are mixed together. Check the parameter zones='.$self->zones; @@ -76,7 +77,8 @@ def process_end(self): # } - print("%-9s = %7d\n"*3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) + print("%-9s = %7d\n"*3 + % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero # my $prec = $correct / $pred; # my $rec = $correct / $gold; diff --git a/udapi/block/morpho/__init__.py b/udapi/block/morpho/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/morpho/tokenizeonwhitespace.py b/udapi/block/morpho/tokenizeonwhitespace.py new file mode 100644 index 00000000..ef6f5d6f --- /dev/null +++ b/udapi/block/morpho/tokenizeonwhitespace.py @@ -0,0 +1,46 @@ +"""Block morpho.TokenizeOnWhitespace""" +from udapi.core.block import Block + +class TokenizeOnWhitespace(Block): + """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return string + + def process_tree(self, root): + if root.children: + raise ValueError('Tree %s is already tokenized.' % root) + sentence = ' '.join(root.text.split()) + tokens = self.tokenize_sentence(sentence).split() + for i, token in enumerate(tokens, 1): + space_after = False + + # Delete the token from the begining of the sentence. + if sentence.startswith(token): + sentence = sentence[len(token):] + # This is the expected case. The sentence starts with the token. + # If it is followed by a space, delete the space and set space_after=True. + if not len(sentence): + space_after = True + elif sentence.startswith(' '): + space_after = True + sentence = sentence[1:] + else: + # The token (returned from tokenization) does not match the start of sentence. + # E.g. '. . . 
word' is tokenized as '... word'. + # Let's delete the start of sentence anyway, + # using a non-greedy regex and the expected next token + # returned from the tokenization. + # my $next_token = $tokens[$i+1]; + # my ($first, $rest) = ($sentence =~ /^(.*?)(\Q$next_token\E.*)$/); + # $no_space_after = 1 if (defined $first && $first !~ /\s$/); + # $sentence = $rest if (defined $rest); + raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence)) + + # create a new node + node = root.create_child(form=token) + node.ord = i + if not space_after: + node.misc = 'SpaceAfter=No' From ca3e822d25656b4c59e17865ec934363df9a9d7e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 16:43:19 +0100 Subject: [PATCH 0026/1374] draft of UDPipe integration --- udapi/block/udpipe/__init__.py | 0 udapi/block/udpipe/base.py | 180 +++++++++++++++++++++++++++++++++ udapi/block/udpipe/en.py | 10 ++ udapi/tool/__init__.py | 0 udapi/tool/udpipe.py | 40 ++++++++ 5 files changed, 230 insertions(+) create mode 100644 udapi/block/udpipe/__init__.py create mode 100644 udapi/block/udpipe/base.py create mode 100644 udapi/block/udpipe/en.py create mode 100644 udapi/tool/__init__.py create mode 100644 udapi/tool/udpipe.py diff --git a/udapi/block/udpipe/__init__.py b/udapi/block/udpipe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py new file mode 100644 index 00000000..914c160f --- /dev/null +++ b/udapi/block/udpipe/base.py @@ -0,0 +1,180 @@ +"""Block udpipe.Base for tagging and parsing using UDPipe.""" +from udapi.core.block import Block +from udapi.tool.udpipe import UDPipe + +KNOWN_MODELS = { + 'grc': 'models/udpipe/2.0/ancient_greek-proiel-ud-2.0-conll17-170315.udpipe', + 'grc_proiel': 'models/udpipe/2.0/ancient_greek-ud-2.0-conll17-170315.udpipe', + 'ar': 'models/udpipe/2.0/arabic-ud-2.0-conll17-170315.udpipe', + 'eu': 'models/udpipe/2.0/basque-ud-2.0-conll17-170315.udpipe', + 'bg': 
'models/udpipe/2.0/bulgarian-ud-2.0-conll17-170315.udpipe', + 'ca': 'models/udpipe/2.0/catalan-ud-2.0-conll17-170315.udpipe', + 'zh': 'models/udpipe/2.0/chinese-ud-2.0-conll17-170315.udpipe', + 'hr': 'models/udpipe/2.0/croatian-ud-2.0-conll17-170315.udpipe', + 'cs_cac': 'models/udpipe/2.0/czech-cac-ud-2.0-conll17-170315.udpipe', + 'cs_cltt': 'models/udpipe/2.0/czech-cltt-ud-2.0-conll17-170315.udpipe', + 'cs': 'models/udpipe/2.0/czech-ud-2.0-conll17-170315.udpipe', + 'da': 'models/udpipe/2.0/danish-ud-2.0-conll17-170315.udpipe', + 'nl_lassysmall': 'models/udpipe/2.0/dutch-lassysmall-ud-2.0-conll17-170315.udpipe', + 'nl': 'models/udpipe/2.0/dutch-ud-2.0-conll17-170315.udpipe', + 'en_lines': 'models/udpipe/2.0/english-lines-ud-2.0-conll17-170315.udpipe', + 'en_partut': 'models/udpipe/2.0/english-partut-ud-2.0-conll17-170315.udpipe', + 'en': 'models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe', + 'et': 'models/udpipe/2.0/estonian-ud-2.0-conll17-170315.udpipe', + 'fi_ftb': 'models/udpipe/2.0/finnish-ftb-ud-2.0-conll17-170315.udpipe', + 'fi': 'models/udpipe/2.0/finnish-ud-2.0-conll17-170315.udpipe', + 'fr_partut': 'models/udpipe/2.0/french-partut-ud-2.0-conll17-170315.udpipe', + 'fr_sequoia': 'models/udpipe/2.0/french-sequoia-ud-2.0-conll17-170315.udpipe', + 'fr': 'models/udpipe/2.0/french-ud-2.0-conll17-170315.udpipe', + 'gl_treegal': 'models/udpipe/2.0/galician-treegal-ud-2.0-conll17-170315.udpipe', + 'gl': 'models/udpipe/2.0/galician-ud-2.0-conll17-170315.udpipe', + 'de': 'models/udpipe/2.0/german-ud-2.0-conll17-170315.udpipe', + 'got': 'models/udpipe/2.0/gothic-ud-2.0-conll17-170315.udpipe', + 'el': 'models/udpipe/2.0/greek-ud-2.0-conll17-170315.udpipe', + 'he': 'models/udpipe/2.0/hebrew-ud-2.0-conll17-170315.udpipe', + 'hi': 'models/udpipe/2.0/hindi-ud-2.0-conll17-170315.udpipe', + 'hu': 'models/udpipe/2.0/hungarian-ud-2.0-conll17-170315.udpipe', + 'id': 'models/udpipe/2.0/indonesian-ud-2.0-conll17-170315.udpipe', + 'ga': 
'models/udpipe/2.0/irish-ud-2.0-conll17-170315.udpipe', + 'it_partut': 'models/udpipe/2.0/italian-partut-ud-2.0-conll17-170315.udpipe', + 'it': 'models/udpipe/2.0/italian-ud-2.0-conll17-170315.udpipe', + 'ja': 'models/udpipe/2.0/japanese-ud-2.0-conll17-170315.udpipe', + 'kk': 'models/udpipe/2.0/kazakh-ud-2.0-conll17-170315.udpipe', + 'ko': 'models/udpipe/2.0/korean-ud-2.0-conll17-170315.udpipe', + 'la_ittb': 'models/udpipe/2.0/latin-ittb-ud-2.0-conll17-170315.udpipe', + 'la_proiel': 'models/udpipe/2.0/latin-proiel-ud-2.0-conll17-170315.udpipe', + 'la': 'models/udpipe/2.0/latin-ud-2.0-conll17-170315.udpipe', + 'lv': 'models/udpipe/2.0/latvian-ud-2.0-conll17-170315.udpipe', + 'no_bokmaal': 'models/udpipe/2.0/norwegian-bokmaal-ud-2.0-conll17-170315.udpipe', + 'no_nynorsk': 'models/udpipe/2.0/norwegian-nynorsk-ud-2.0-conll17-170315.udpipe', + 'cu': 'models/udpipe/2.0/old_church_slavonic-ud-2.0-conll17-170315.udpipe', + 'fa': 'models/udpipe/2.0/persian-ud-2.0-conll17-170315.udpipe', + 'pl': 'models/udpipe/2.0/polish-ud-2.0-conll17-170315.udpipe', + 'pt_br': 'models/udpipe/2.0/portuguese-br-ud-2.0-conll17-170315.udpipe', + 'pt': 'models/udpipe/2.0/portuguese-ud-2.0-conll17-170315.udpipe', + 'ro': 'models/udpipe/2.0/romanian-ud-2.0-conll17-170315.udpipe', + 'ru_syntagrus': 'models/udpipe/2.0/russian-syntagrus-ud-2.0-conll17-170315.udpipe', + 'ru': 'models/udpipe/2.0/russian-ud-2.0-conll17-170315.udpipe', + 'sk': 'models/udpipe/2.0/slovak-ud-2.0-conll17-170315.udpipe', + 'sl_sst': 'models/udpipe/2.0/slovenian-sst-ud-2.0-conll17-170315.udpipe', + 'sl': 'models/udpipe/2.0/slovenian-ud-2.0-conll17-170315.udpipe', + 'es_ancora': 'models/udpipe/2.0/spanish-ancora-ud-2.0-conll17-170315.udpipe', + 'es': 'models/udpipe/2.0/spanish-ud-2.0-conll17-170315.udpipe', + 'sv_lines': 'models/udpipe/2.0/swedish-lines-ud-2.0-conll17-170315.udpipe', + 'sv': 'models/udpipe/2.0/swedish-ud-2.0-conll17-170315.udpipe', + 'tr': 'models/udpipe/2.0/turkish-ud-2.0-conll17-170315.udpipe', + 'uk': 
'models/udpipe/2.0/ukrainian-ud-2.0-conll17-170315.udpipe', + 'ur': 'models/udpipe/2.0/urdu-ud-2.0-conll17-170315.udpipe', + 'ug': 'models/udpipe/2.0/uyghur-ud-2.0-conll17-170315.udpipe', + 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', +} + +class Base(Block): + """Base class for all UDPipe blocks.""" + + # pylint: disable=too-many-arguments + def __init__(self, model=None, model_alias=None, + tokenize=False, tag=True, parse=True, **kwargs): + """Create the udpipe.En block object.""" + super().__init__(**kwargs) + self.model, self.model_alias = model, model_alias + self._tool = None + self.tokenize, self.tag, self.parse = tokenize, tag, parse + + @property + def tool(self): + """Return the tool (UDPipe in this case), created lazily.""" + if self._tool: + return self._tool + if not self.model: + if not self.model_alias: + raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!') + self.model = KNOWN_MODELS[self.model_alias] + self._tool = UDPipe(model=self.model) + return self._tool + + def process_tree(self, root): + tok, tag, par = self.tokenize, self.tag, self.parse + if not tok and tag and par: + return self.tool.tag_parse_tree(root) + # TODO + # return $self->tool->tokenize_tag_parse_tree($root) if $tok && $tag && $par; + # return $self->tool->tokenize_tag_tree($root) if $tok && $tag && !$par; + # return $self->tool->tokenize_tree($root) if $tok && !$tag && !$par; + # return $self->tool->tag_parse_tree($root) if !$tok && $tag && $par; + # return $self->tool->tag_tree($root) if !$tok && $tag && !$par; + # return $self->tool->parse_tree($root) if !$tok && !$tag && $par; + raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + +''' +Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD + +=head1 SYNOPSIS + + # from the command line + echo John loves Mary | udapi.pl Read::Sentences UDPipe::Base model_alias=en Write::TextModeTrees + + # in scenario + UDPipe::Base 
model=/home/me/english-ud-1.2-160523.udpipe + UDPipe::Base model_alias=en + UDPipe::EN # shortcut for the above + UDPipe::EN tokenize=1 tag=1 parse=0 + +=head1 DESCRIPTION + +This block loads L<Udapi::Tool::UDPipe> (a wrapper for the UDPipe C++ tool) with +the given C<model> for analysis into the Universal Dependencies (UD) style. +UDPipe can do tokenization, tagging (plus lemmatization and universal features) +and parsing (with deprel labels) and users of this block can select which of the +subtasks should be done using parameters C<tokenize>, C<tag> and C<parse>. +The default is to do all three. + +=head1 TODO + +UDPipe can also do sentence segmentation, but L<Udapi::Tool::UDPipe> does not support it yet. + +Similarly with multi-word tokens. + +=head1 PARAMETERS + +=head2 C<model> + +Path to the model file within Udapi share +(or relative path starting with "./" or absolute path starting with "/"). +This parameter is required if C<model_alias> is not supplied. + +=head2 C<model_alias> + +The C<model> parameter can be omitted if this parameter is supplied. +Currently available model aliases are: + +B. + +They correspond to paths where the language code in the alias is substituted +with the respective language name, e.g. B<en> expands to +C<models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe>. + +=head1 tokenize + +Do tokenization, i.e. create new nodes with attributes +C<form>, C<misc> (if SpaceAfter=No) and C<ord>. +The sentence string is taken from the root's attribute C<text>. + +=head1 tag + +Fill node attributes: C<lemma>, C<upos>, C<xpos> and C<feats>. +On the input, just the attribute C<form> is expected. + +=head1 parse + +Fill node attributes: C<deprel> and rehang the nodes to their parent. +On the input, attributes C<form>, C<lemma>, C<upos> and C<feats> are expected. 
+ +=head1 SEE ALSO + +L + +L +''' diff --git a/udapi/block/udpipe/en.py b/udapi/block/udpipe/en.py new file mode 100644 index 00000000..7cb74a25 --- /dev/null +++ b/udapi/block/udpipe/en.py @@ -0,0 +1,10 @@ +"""Block udpipe.En for tagging and parsing English.""" +from udapi.block.udpipe.base import Base + + +class En(Base): + """Tag and parse English.""" + + def __init__(self, **kwargs): + """Create the udpipe.En block object.""" + super().__init__(model_alias='en', **kwargs) diff --git a/udapi/tool/__init__.py b/udapi/tool/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py new file mode 100644 index 00000000..dbfb1e74 --- /dev/null +++ b/udapi/tool/udpipe.py @@ -0,0 +1,40 @@ +'''Wrapper for UDPipe (more pythonic than ufal.udpipe).''' +import io +import os + +from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module +from udapi.block.read.conllu import Conllu as ConlluReader + +class UDPipe: + '''Wrapper for UDPipe (more pythonic than ufal.udpipe).''' + + def __init__(self, model): + """Create the UDPipe tool object.""" + self.model = model + path = self.model_path() + self.tool = Model.load(path) + if not self.tool: + raise IOError("Cannot load model from file '%s'" % path) + self.error = ProcessingError() + self.conllu_reader = ConlluReader() + + def model_path(self): + """Return absolute path to the model file to be loaded.""" + if self.model.startswith('/') or self.model.startswith('.'): + return self.model + elif os.environ.get('UDAPI_DATA'): + return os.environ['UDAPI_DATA'] + '/' + self.model + else: + return os.environ.get('HOME') + '/' + self.model + + def tag_parse_tree(self, root): + """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') + in_data = " ".join([n.form for n in root.descendants]) + out_data = pipeline.process(in_data, self.error) + if 
self.error.occurred(): + raise IOError("UDPipe error " + self.error.message) + self.conllu_reader.files.filehandle = io.StringIO(out_data) + parsed = self.conllu_reader.read_tree() + # pylint: disable=protected-access + root._children, root._descendants = parsed._children, parsed._descendants From 6eb2a6e236756c1cb621f7d4a43c5b28e9c9aa4a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:40:55 +0100 Subject: [PATCH 0027/1374] alternative loading of the UDPipe-parsed trees which preserves MISC --- udapi/tool/udpipe.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index dbfb1e74..c4487e23 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -35,6 +35,16 @@ def tag_parse_tree(self, root): if self.error.occurred(): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) - parsed = self.conllu_reader.read_tree() - # pylint: disable=protected-access - root._children, root._descendants = parsed._children, parsed._descendants + parsed_root = self.conllu_reader.read_tree() + nodes = [root] + root.descendants + for parsed_node in parsed_root.descendants: + node = nodes[parsed_node.ord] + node.parent = nodes[parsed_node.parent.ord] + for attr in 'upos xpos lemma feats'.split(): + setattr(node, attr, getattr(parsed_node, attr)) + + # TODO: benchmark which solution is the fastest one. E.g. 
we could also do + #for node, parsed_node in zip(root.descendants, parsed_root.descendants): + # parsed_node.misc = node.misc + ## pylint: disable=protected-access + #root._children, root._descendants = parsed_root._children, parsed_root._descendants From e2c17e69ad5db4dffb7674bb2b182dad9cf9d201 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:42:34 +0100 Subject: [PATCH 0028/1374] add tokenize.Simple Also * rename morpho.TokenizeOnWhitespace tokenize.OnWhitespace * let the `tokenize_sentence` method return directly the list of tokens --- udapi/block/{morpho => tokenize}/__init__.py | 0 .../onwhitespace.py} | 8 ++++---- udapi/block/tokenize/simple.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) rename udapi/block/{morpho => tokenize}/__init__.py (100%) rename udapi/block/{morpho/tokenizeonwhitespace.py => tokenize/onwhitespace.py} (92%) create mode 100644 udapi/block/tokenize/simple.py diff --git a/udapi/block/morpho/__init__.py b/udapi/block/tokenize/__init__.py similarity index 100% rename from udapi/block/morpho/__init__.py rename to udapi/block/tokenize/__init__.py diff --git a/udapi/block/morpho/tokenizeonwhitespace.py b/udapi/block/tokenize/onwhitespace.py similarity index 92% rename from udapi/block/morpho/tokenizeonwhitespace.py rename to udapi/block/tokenize/onwhitespace.py index ef6f5d6f..544c4da6 100644 --- a/udapi/block/morpho/tokenizeonwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -1,19 +1,19 @@ -"""Block morpho.TokenizeOnWhitespace""" +"""Block tokenize.OnWhitespace""" from udapi.core.block import Block -class TokenizeOnWhitespace(Block): +class OnWhitespace(Block): """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" @staticmethod def tokenize_sentence(string): """A method to be overriden in subclasses.""" - return string + return string.split() def process_tree(self, root): if root.children: raise ValueError('Tree %s is already tokenized.' 
% root) sentence = ' '.join(root.text.split()) - tokens = self.tokenize_sentence(sentence).split() + tokens = self.tokenize_sentence(sentence) for i, token in enumerate(tokens, 1): space_after = False diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py new file mode 100644 index 00000000..82403cee --- /dev/null +++ b/udapi/block/tokenize/simple.py @@ -0,0 +1,12 @@ +"""Block tokenize.Simple""" +import re + +from udapi.block.tokenize.onwhitespace import OnWhitespace + +class Simple(OnWhitespace): + """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No.""" + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return re.findall(r'\w+|[^\w\s]', string) From c46092f25d5014eae744a9c9ec3835d44310b9d0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:46:47 +0100 Subject: [PATCH 0029/1374] revert back to the old UDPipe models (UDv1.2) We can switch to the new UDv2.0 models once UDPipe 1.1 is released on PyPI --- udapi/block/udpipe/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 914c160f..eea4b64d 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -3,6 +3,11 @@ from udapi.tool.udpipe import UDPipe KNOWN_MODELS = { + 'en': 'models/udpipe/english-ud-1.2-160523.udpipe', +} + +# TODO use the new models once UDPipe 1.1 is published and available on PyPI as ufal.udpipe +V2_KNOWN_MODELS = { 'grc': 'models/udpipe/2.0/ancient_greek-proiel-ud-2.0-conll17-170315.udpipe', 'grc_proiel': 'models/udpipe/2.0/ancient_greek-ud-2.0-conll17-170315.udpipe', 'ar': 'models/udpipe/2.0/arabic-ud-2.0-conll17-170315.udpipe', From a142807b160b59920d28fd4d0f82d9884de71823 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 22:59:19 +0100 Subject: [PATCH 0030/1374] UDPipe tokenizer (without segmenter) --- udapi/block/udpipe/base.py | 4 +++- udapi/tool/udpipe.py | 43 
+++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index eea4b64d..d72cbf16 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -79,7 +79,7 @@ class Base(Block): # pylint: disable=too-many-arguments def __init__(self, model=None, model_alias=None, - tokenize=False, tag=True, parse=True, **kwargs): + tokenize=True, tag=True, parse=True, **kwargs): """Create the udpipe.En block object.""" super().__init__(**kwargs) self.model, self.model_alias = model, model_alias @@ -100,6 +100,8 @@ def tool(self): def process_tree(self, root): tok, tag, par = self.tokenize, self.tag, self.parse + if tok and tag and par: + return self.tool.tokenize_tag_parse_tree(root) if not tok and tag and par: return self.tool.tag_parse_tree(root) # TODO diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index c4487e23..2fdbc337 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -2,7 +2,7 @@ import io import os -from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module +from ufal.udpipe import Model, Pipeline, ProcessingError, Sentence # pylint: disable=no-name-in-module from udapi.block.read.conllu import Conllu as ConlluReader class UDPipe: @@ -17,6 +17,7 @@ def __init__(self, model): raise IOError("Cannot load model from file '%s'" % path) self.error = ProcessingError() self.conllu_reader = ConlluReader() + self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) def model_path(self): """Return absolute path to the model file to be loaded.""" @@ -48,3 +49,43 @@ def tag_parse_tree(self, root): # parsed_node.misc = node.misc ## pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants + + def tokenize_tag_parse_tree(self, root): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.""" + if root.children: + raise 
ValueError('Tree already contained nodes before tokenization') + + # tokenization (I cannot turn off segmenter, so I need to join the segments) + self.tokenizer.setText(root.text) + u_sentence = Sentence() + is_another = self.tokenizer.nextSentence(u_sentence) + u_words = u_sentence.words + n_words = u_words.size() - 1 + if is_another: + u_sent_cont = Sentence() + while self.tokenizer.nextSentence(u_sent_cont): + n_cont = u_sent_cont.words.size() - 1 + for i in range(1, n_cont+1): + u_w = u_sent_cont.words[i] + n_words += 1 + u_w.id = n_words + u_words.append(u_w) + + # tagging and parsing + self.tool.tag(u_sentence, Model.DEFAULT) + self.tool.parse(u_sentence, Model.DEFAULT) + + # converting UDPipe nodes to Udapi nodes + heads, nodes = [], [root] + for i in range(1, u_words.size()): + u_w = u_words[i] + node = root.create_child( + form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, + xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, + ) + node.misc = u_w.misc + heads.append(u_w.head) + nodes.append(node) + for node in nodes[1:]: + head = heads.pop(0) + node.parent = nodes[head] From 22a1acdcdb1b9b1d31ab6b8bc4f612e7c290343f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Mar 2017 11:29:16 +0100 Subject: [PATCH 0031/1374] add util.Mark for marking nodes `util.Mark node='XY'` is a shortcut for `util.Eval node='if XY: node.misc["Mark"] = self.mark'` and in combination with `udapy -TM` it is also a shortcut for `util.Filter keep_tree_if_node='XY'` But I think it is useful enough to become a separate block. 
--- udapi/block/util/filter.py | 2 +- udapi/block/util/mark.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 udapi/block/util/mark.py diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index 88bedb76..b812fb64 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -15,7 +15,7 @@ class Filter(Block): # keep only trees which contain ToDo|Bug nodes udapy -s util.Filter keep_tree_if_node='re.match("ToDo|Bug", str(node.misc))' < in > filtered - # keep only non-projective trees, annotate non-projective edges with Mark=nofeats and show. + # keep only non-projective trees, annotate non-projective edges with Mark=nonproj and show. udapy -T util.Filter keep_tree_if_node='node.is_nonprojective()' mark=nonproj < in | less -R # delete trees which contain deprel=remnant diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py new file mode 100644 index 00000000..42052336 --- /dev/null +++ b/udapi/block/util/mark.py @@ -0,0 +1,30 @@ +"""util.Mark is a special block for marking nodes specified by parameters.""" +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +# We need eval in this block +# pylint: disable=eval-used +class Mark(Block): + """Mark nodes specified by parameters. + + Example usage from command line:: + # see non-projective trees with non-projective edges highlighted + udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R + """ + def __init__(self, node, mark=1, **kwargs): + """Create the Mark block object. + + Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be marked. + + `mark`: the node will be marked with `Mark=` in `node.misc`. Default=1. 
+ """ + super().__init__(**kwargs) + self.mark = mark + self.node = node + + def process_node(self, node): + if eval(self.node): + node.misc['Mark'] = self.mark From 60a5c645a52bd6f2262faa144d42584fee4a84c4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Mar 2017 11:36:27 +0100 Subject: [PATCH 0032/1374] template for tutorial.Adpositions --- udapi/block/tutorial/__init__.py | 0 udapi/block/tutorial/adpositions.py | 33 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 udapi/block/tutorial/__init__.py create mode 100644 udapi/block/tutorial/adpositions.py diff --git a/udapi/block/tutorial/__init__.py b/udapi/block/tutorial/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py new file mode 100644 index 00000000..cc8bafba --- /dev/null +++ b/udapi/block/tutorial/adpositions.py @@ -0,0 +1,33 @@ +"""tutorial.Adpositions block template. + +Example usage: +for a in */sample.conllu; do + printf '%50s ' $a; + udapy tutorial.Adpositions < $a; +done | tee results.txt + +# What are the English postpositions? 
+cat UD_English/sample.conllu | udapy -TM util.Mark \ + node='node.upos == "ADP" and node.parent.precedes(node)' | less -R +""" +from udapi.core.block import Block + +class Adpositions(Block): + """Compute the number of prepositions and postpositions.""" + + def __init__(self, **kwargs): + """Create the Adpositions block object.""" + super().__init__(**kwargs) + self.prepositions = 0 + self.postpositions = 0 + + def process_node(self, node): + # TODO: Your task: distinguish prepositions and postpositions + if node.upos == "ADP": + self.prepositions += 1 + + def process_end(self): + total = self.prepositions + self.postpositions or 1 + prep = 100 * self.prepositions / total + post = 100 * self.postpositions / total + print("prepositions %5.1f%%, postpositions %5.1f%%" % (prep, post)) From 17d6a83cd5cee6c47bffa2e1a2fda73c7e98438e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Mar 2017 02:47:53 +0100 Subject: [PATCH 0033/1374] add punct-nonproj test to ud.MarkBugs --- udapi/block/ud/markbugs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 44cfb5cc..f785f556 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -163,6 +163,9 @@ def process_node(self, node): self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)' % deprel) + if upos == 'PUNCT' and node.is_nonprojective(): + self.log(node, 'punct-nonproj', 'upos=PUNCT and edge is non-projective') + # http://universaldependencies.org/u/dep/cc.html says # "cc is the relation between a conjunct and a preceding # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." 
From 70d6d86b3c611e05b4b7ea8d87529ea028a0d0db Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 17:59:14 +0200 Subject: [PATCH 0034/1374] generate documentation with Sphinx --- README.md | 1 + docs/.gitignore | 4 ++ docs/Makefile | 20 ++++++ docs/api.rst | 17 +++++ docs/conf.py | 176 +++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 24 +++++++ docs/install.rst | 21 ++++++ 7 files changed, 263 insertions(+) create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/api.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/install.rst diff --git a/README.md b/README.md index 721d1354..d0666b70 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ Python framework for processing Universal Dependencies data [![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) ## Requirements - You need Python 3.3 or higher. diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..a1d82581 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,4 @@ +_build +udapi.rst +udapi.*.rst +modules.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..17d5375a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Udapi +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 00000000..0857cc98 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,17 @@ +.. _api: + +================= +API Documentation +================= + +``udapi`` package +======================== + +.. automodule:: udapi + :members: + +------------------------ + +**Sub-modules** + +.. toctree:: modules diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..3e7864a5 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Udapi documentation build configuration file, created by +# sphinx-quickstart on Mon Mar 27 17:08:03 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. 
+# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Udapi' +copyright = '2017, Martin Popel' +author = 'Martin Popel' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0' +# The full version, including alpha/beta/rc tags. +release = '1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'alabaster' +import sphinx_rtd_theme +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Udapidoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Udapi.tex', 'Udapi Documentation', + 'Martin Popel', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'udapi', 'Udapi Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Udapi', 'Udapi Documentation', + author, 'Udapi', 'API and framework for processing Universal Dependencies', + 'Miscellaneous'), +] + + +def run_apidoc(_): + + cur_dir = os.path.abspath(os.path.dirname(__file__)) + print(cur_dir) + module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) + print(module) + + from sphinx.apidoc import main + main(['--separate', '-o', cur_dir, module, '--force']) + +def setup(app): + app.connect('builder-inited', run_apidoc) + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..78a2d540 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. Udapi documentation master file, created by + sphinx-quickstart on Mon Mar 27 17:08:03 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Udapi's documentation! +================================= + +Udapi is a framework providing an API for processing +`Universal Dependencies `_ data. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + install + api + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 00000000..14f81527 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,21 @@ +.. _instalation: + +============ +Installation +============ + +You need Python 3.3 or higher, pip3 and git. + + +Let's clone the git repo to ``~/udapi-python/``, install dependencies +and setup ``$PATH`` and ``$PYTHONPATH`` accordingly: + +.. 
code-block:: bash + + cd + git clone https://github.com/udapi/udapi-python.git + pip3 install --user -r udapi-python/requirements.txt + echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc + echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc + echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc + source ~/.bashrc # or open new bash From 30f308f7d2401b5af579ffa0fba0fd097b35df99 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 19:14:24 +0200 Subject: [PATCH 0035/1374] link udapi.readthedocs.io --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d0666b70..7183cd6c 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Python framework for processing Universal Dependencies data [![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) +[![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements - You need Python 3.3 or higher. 
From 4cc3e13e7c7b89180da2662b37f9634e78b32656 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 20:49:30 +0200 Subject: [PATCH 0036/1374] fix rst syntax (work in progress) --- udapi/block/transform/proj.py | 15 ++--- udapi/block/tutorial/adpositions.py | 13 +++-- udapi/block/ud/addmwt.py | 1 + udapi/block/ud/convert1to2.py | 13 +++-- udapi/block/ud/el/addmwt.py | 2 +- udapi/block/ud/ro/setspaceafter.py | 22 ++++--- udapi/block/write/html.py | 24 ++++---- udapi/block/write/sdparse.py | 28 ++++----- udapi/block/write/textmodetrees.py | 91 +++++++++++++++-------------- udapi/block/write/tikz.py | 15 ++--- udapi/block/write/vislcg.py | 64 ++++++++++---------- 11 files changed, 153 insertions(+), 135 deletions(-) diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py index f15e46e9..6254b917 100644 --- a/udapi/block/transform/proj.py +++ b/udapi/block/transform/proj.py @@ -6,13 +6,14 @@ http://www.maltparser.org/optiondesc.html#pproj-marking_strategy TODO: implement also path and head+path strategies. + TODO: Sometimes it would be better (intuitively) - to lower the gap-node (if its whole subtree is in the gap - and if this does not cause more non-projectivities) - rather than to lift several nodes whose parent-edge crosses this gap. - We would need another label value (usually the lowering is of depth 1), - but the advantage is that reconstruction of lowered edges - during deprojectivization is simple and needs no heuristics. +to lower the gap-node (if its whole subtree is in the gap +and if this does not cause more non-projectivities) +rather than to lift several nodes whose parent-edge crosses this gap. +We would need another label value (usually the lowering is of depth 1), +but the advantage is that reconstruction of lowered edges +during deprojectivization is simple and needs no heuristics. 
""" from udapi.core.block import Block @@ -59,4 +60,4 @@ def mark(self, node, label): elif self.label == 'deprel': node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label) else: - raise(ValueError('Unknown parameter label=%s' % self.label)) + raise ValueError('Unknown parameter label=%s' % self.label) diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py index cc8bafba..cf2ad514 100644 --- a/udapi/block/tutorial/adpositions.py +++ b/udapi/block/tutorial/adpositions.py @@ -1,14 +1,15 @@ """tutorial.Adpositions block template. -Example usage: -for a in */sample.conllu; do +Example usage:: + + for a in */sample.conllu; do printf '%50s ' $a; udapy tutorial.Adpositions < $a; -done | tee results.txt + done | tee results.txt -# What are the English postpositions? -cat UD_English/sample.conllu | udapy -TM util.Mark \ - node='node.upos == "ADP" and node.parent.precedes(node)' | less -R + # What are the English postpositions? + cat UD_English/sample.conllu | udapy -TM util.Mark \ + node='node.upos == "ADP" and node.parent.precedes(node)' | less -R """ from udapi.core.block import Block diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 653cfa26..eab2158b 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -57,6 +57,7 @@ def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token. An example return value is:: + { 'form': 'aby bych', 'lemma': 'aby být', diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index ce549256..e389e476 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -275,13 +275,16 @@ def fix_remnants_in_tree(self, root): Remnant's parent is always the correlate (same-role) node. Usually, correlate's parent is the head of the whole ellipsis subtree, i.e. the first conjunct. However, sometimes remnants are deeper, e.g. - 'Over 300 Iraqis are reported dead and 500 wounded.' 
with edges: - nsubjpass(reported, Iraqis) - nummod(Iraqis, 300) - remnant(300, 500) + 'Over 300 Iraqis are reported dead and 500 wounded.' with edges:: + + nsubjpass(reported, Iraqis) + nummod(Iraqis, 300) + remnant(300, 500) + Let's expect all remnants in one tree are part of the same ellipsis structure. + TODO: theoretically, there may be more ellipsis structures with remnants in one tree, - but I have no idea how to distinguish them from the deeper-remnants cases. + but I have no idea how to distinguish them from the deeper-remnants cases. """ remnants = [n for n in root.descendants if n.deprel == 'remnant'] if not remnants: diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 39e23620..81a98836 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -3,7 +3,7 @@ Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare -PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs. +``PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο")`` MWTs. """ import udapi.block.ud.addmwt diff --git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py index 80bfda8f..bc18f364 100644 --- a/udapi/block/ud/ro/setspaceafter.py +++ b/udapi/block/ud/ro/setspaceafter.py @@ -1,7 +1,8 @@ """Block ud.ro.SetSpaceAfter for heuristic setting of SpaceAfter=No in Romanian. -Usage: -udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu +Usage:: + + udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu Author: Martin Popel """ @@ -13,13 +14,16 @@ class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian. Romanian uses many contractions, e.g. 
- raw | meaning | tokenized | lemmatized - -------|---------|-----------|----------- - n-ar | nu ar | n- ar | nu avea - să-i | să îi | să -i | să el - într-o | în o | într- o | întru un - nu-i | nu îi | nu -i | nu el - nu-i | nu e | nu -i | nu fi + + ======= ======= ========= ========== + raw meaning tokenized lemmatized + ======= ======= ========= ========== + n-ar nu ar n- ar nu avea + să-i să îi să -i să el + într-o în o într- o întru un + nu-i nu îi nu -i nu el + nu-i nu e nu -i nu fi + ======= ======= ========= ========== Detokenization is quite simple: no space after word-final hyphen and before word-initial hyphen. There are just two exceptions, I have found: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 2fd76bf7..85b8dcc9 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -5,17 +5,19 @@ class Html(BaseWriter): """A writer for HTML+JavaScript+SVG visualization of dependency trees. - Usage: - # from the command line - udapy write.Html < file.conllu > file.html - firefox file.html - - # for offline use, we need to download first three JavaScript libraries - wget https://code.jquery.com/jquery-2.1.4.min.js - wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js - wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js - udapy write.Html path_to_js=. < file.conllu > file.html - firefox file.html + .. code-block:: bash + + # from the command line + udapy write.Html < file.conllu > file.html + firefox file.html + + For offline use, we need to download first three JavaScript libraries:: + + wget https://code.jquery.com/jquery-2.1.4.min.js + wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js + wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js + udapy write.Html path_to_js=. 
< file.conllu > file.html + firefox file.html This writer produces an html file with drawings of the dependency trees in the document (there are buttons for selecting which bundle will be shown). diff --git a/udapi/block/write/sdparse.py b/udapi/block/write/sdparse.py index 209938b5..60b78d6d 100644 --- a/udapi/block/write/sdparse.py +++ b/udapi/block/write/sdparse.py @@ -8,33 +8,35 @@ class Sdparse(BaseWriter): """A writer of files in the Stanford dependencies format, suitable for Brat visualization. Usage: - udapy write.Sdparse print_upos=0 < in.conllu + ``udapy write.Sdparse print_upos=0 < in.conllu`` Example output:: - ~~~ sdparse - Corriere Sport da pagina 23 a pagina 26 - name(Corriere, Sport) - case(pagina-4, da) - nmod(Corriere, pagina-4) - nummod(pagina-4, 23) - case(pagina-7, a) - nmod(Corriere, pagina-7) - nummod(pagina-7, 26) - ~~~ + ~~~ sdparse + Corriere Sport da pagina 23 a pagina 26 + name(Corriere, Sport) + case(pagina-4, da) + nmod(Corriere, pagina-4) + nummod(pagina-4, 23) + case(pagina-7, a) + nmod(Corriere, pagina-7) + nummod(pagina-7, 26) + ~~~ To visualize it, use embedded Brat, e.g. go to - http://universaldependencies.org/visualization.html#editing + http://universaldependencies.org/visualization.html#editing. Click the edit button and paste the output of this writer excluding the `~~~` marks. Notes: - Original Stanford dependencies format (http://nlp.stanford.edu/software/dependencies_manual.pdf) + The original `Stanford dependencies format + `_ allows explicit specification of the root dependency, e.g. `root(ROOT-0, makes-8)`. However, this is not allowed by Brat, so this writer does not print it. UD v2.0 allows tokens with spaces, but I am not aware of any Brat support. 
Alternatives: + * `write.Conllu` Brat recently supports also the CoNLL-U input * `write.TextModeTrees` may be more readable/useful in some usecases * `write.Html` dtto, press "Save as SVG" button, convert to pdf diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index a2c949cc..b5968b3a 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -20,64 +20,67 @@ class TextModeTrees(BaseWriter): """An ASCII pretty printer of dependency trees. - SYNOPSIS - # from command line (visualize CoNLL-U files) - udapy write.TextModeTrees color=1 < file.conllu | less -R + .. code-block:: bash - # is scenario (examples of other parameters) - write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 - write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 + # from the command line (visualize CoNLL-U files) + udapy write.TextModeTrees color=1 < file.conllu | less -R + + In scenario (examples of other parameters):: + + write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 + write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 - DESCRIPTION This block prints dependency trees in plain-text format. - For example the following CoNLL-U file (with tabs instead of spaces) - - 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ - 2 saw see VERB VBD Tense=Past 0 root _ _ - 3 a a DET DT Definite=Ind 4 det _ _ - 4 dog dog NOUN NN Number=Sing 2 dobj _ _ - 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No - 6 , , PUNCT , _ 2 punct _ _ - 7 which which DET WDT PronType=Rel 10 nsubj _ _ - 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ - 9 a a DET DT Definite=Ind 10 det _ _ - 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No - 11 . . PUNCT . 
_ 2 punct _ _ - - will be printed (with the default parameters) as - ─┮ - │ ╭─╼ I PRON nsubj - ╰─┾ saw VERB root - │ ╭─╼ a DET det - ├────────────────────────┾ dog NOUN dobj - ├─╼ today NOUN nmod:tmod │ - ├─╼ , PUNCT punct │ - │ │ ╭─╼ which DET nsubj - │ │ ├─╼ was VERB cop - │ │ ├─╼ a DET det - │ ╰─┶ boxer NOUN acl:relcl - ╰─╼ . PUNCT punct + For example the following CoNLL-U file (with tabs instead of spaces):: + + 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ + 2 saw see VERB VBD Tense=Past 0 root _ _ + 3 a a DET DT Definite=Ind 4 det _ _ + 4 dog dog NOUN NN Number=Sing 2 dobj _ _ + 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No + 6 , , PUNCT , _ 2 punct _ _ + 7 which which DET WDT PronType=Rel 10 nsubj _ _ + 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ + 9 a a DET DT Definite=Ind 10 det _ _ + 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No + 11 . . PUNCT . _ 2 punct _ _ + + will be printed (with the default parameters) as:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ├────────────────────────┾ dog NOUN dobj + ├─╼ today NOUN nmod:tmod │ + ├─╼ , PUNCT punct │ + │ │ ╭─╼ which DET nsubj + │ │ ├─╼ was VERB cop + │ │ ├─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct Some non-projective trees cannot be printed witout crossing edges. - TextModeTrees uses a special "bridge" symbol ─╪─ to mark this: - ─┮ - │ ╭─╼ 1 - ├─╪───┮ 2 - ╰─┶ 3 │ - ╰─╼ 4 - - By default parameter `color=auto`, so if the output is printed to the console + TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: + + ─┮ + │ ╭─╼ 1 + ├─╪───┮ 2 + ╰─┶ 3 │ + ╰─╼ 4 + + By default parameter ``color=auto``, so if the output is printed to the console (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted (by reveresing the background and foreground colors). 
This block's method `process_tree` can be called on any node (not only root), - which is useful for printing subtrees using `node.print_subtree()`, + which is useful for printing subtrees using ``node.print_subtree()``, which is internally implemented using this block. SEE ALSO - `write.TextModeTreesHtml` + :py:class:`.TextModeTreesHtml` """ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 4eb69221..7e95454e 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -7,19 +7,20 @@ class Tikz(BaseWriter): r"""A writer of files in the LaTeX with tikz-dependency format. - Usage: - udapy write.Tikz < my.conllu > my.tex - pdflatex my.tex - xdg-open my.pdf + Usage:: + + udapy write.Tikz < my.conllu > my.tex + pdflatex my.tex + xdg-open my.pdf Long sentences may result in too large pictures. You can tune the width (in addition to changing fontsize or using minipage and rescaling) with - ``\begin{deptext}[column sep=0.2cm]`` + ``\begin{deptext}[column sep=0.2cm]`` or individually for each word: - ``My \&[.5cm] dog \& etc.`` + ``My \&[.5cm] dog \& etc.`` By default, the height of the horizontal segment of a dependency edge is proportional to the distance between the linked words. 
You can tune the height with: - ``\depedge[edge unit distance=1.5ex]{9}{1}{deprel}`` + ``\depedge[edge unit distance=1.5ex]{9}{1}{deprel}`` See `tikz-dependency documentation `_ diff --git a/udapi/block/write/vislcg.py b/udapi/block/write/vislcg.py index 9179eeb2..5dd5f52b 100644 --- a/udapi/block/write/vislcg.py +++ b/udapi/block/write/vislcg.py @@ -19,44 +19,44 @@ class Vislcg(BaseWriter): See https://visl.sdu.dk/visl/vislcg-doc.html Usage: - udapy write.Vislcg < in.conllu > out.vislcg + ``udapy write.Vislcg < in.conllu > out.vislcg`` Example output:: - "<Қыз>" - "қыз" n nom @nsubj #1->3 - "<оның>" - "ол" prn pers p3 sg gen @nmod:poss #2->3 - "<қарындасы>" - "қарындас" n px3sp nom @parataxis #3->8 - "е" cop aor p3 sg @cop #4->3 - "<,>" - "," cm @punct #5->8 - "<ол>" - "ол" prn pers p3 sg nom @nsubj #6->8 - "<бес>" - "бес" num @nummod #7->8 - "<жаста>" - "жас" n loc @root #8->0 - "е" cop aor p3 sg @cop #9->8 - "<.>" - "." sent @punct #10->8 + "<Қыз>" + "қыз" n nom @nsubj #1->3 + "<оның>" + "ол" prn pers p3 sg gen @nmod:poss #2->3 + "<қарындасы>" + "қарындас" n px3sp nom @parataxis #3->8 + "е" cop aor p3 sg @cop #4->3 + "<,>" + "," cm @punct #5->8 + "<ол>" + "ол" prn pers p3 sg nom @nsubj #6->8 + "<бес>" + "бес" num @nummod #7->8 + "<жаста>" + "жас" n loc @root #8->0 + "е" cop aor p3 sg @cop #9->8 + "<.>" + "." sent @punct #10->8 Example input:: - # text = Қыз оның қарындасы, ол бес жаста. - 1 Қыз қыз _ n nom 3 nsubj _ _ - 2 оның ол _ prn pers|p3|sg|gen 3 nmod:poss _ _ - 3-4 қарындасы _ _ _ _ _ _ _ _ - 3 қарындасы қарындас _ n px3sp|nom 8 parataxis _ _ - 4 _ е _ cop aor|p3|sg 3 cop _ _ - 5 , , _ cm _ 8 punct _ _ - 6 ол ол _ prn pers|p3|sg|nom 8 nsubj _ _ - 7 бес бес _ num _ 8 nummod _ _ - 8-9 жаста _ _ _ _ _ _ _ _ - 8 жаста жас _ n loc 0 root _ _ - 9 _ е _ cop aor|p3|sg 8 cop _ _ - 10 . . _ sent _ 8 punct _ _ + # text = Қыз оның қарындасы, ол бес жаста. 
+ 1 Қыз қыз _ n nom 3 nsubj _ _ + 2 оның ол _ prn pers|p3|sg|gen 3 nmod:poss _ _ + 3-4 қарындасы _ _ _ _ _ _ _ _ + 3 қарындасы қарындас _ n px3sp|nom 8 parataxis _ _ + 4 _ е _ cop aor|p3|sg 3 cop _ _ + 5 , , _ cm _ 8 punct _ _ + 6 ол ол _ prn pers|p3|sg|nom 8 nsubj _ _ + 7 бес бес _ num _ 8 nummod _ _ + 8-9 жаста _ _ _ _ _ _ _ _ + 8 жаста жас _ n loc 0 root _ _ + 9 _ е _ cop aor|p3|sg 8 cop _ _ + 10 . . _ sent _ 8 punct _ _ """ def process_tree(self, tree): From bc09e9e5039f7aadcfdc2fe759ed253c17fa130c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 28 Mar 2017 15:12:36 +0200 Subject: [PATCH 0037/1374] new tutorial task template --- udapi/block/tutorial/addcommas.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 udapi/block/tutorial/addcommas.py diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py new file mode 100644 index 00000000..8c04e242 --- /dev/null +++ b/udapi/block/tutorial/addcommas.py @@ -0,0 +1,22 @@ +"""tutorial.AddCommas block template.""" +from udapi.core.block import Block + +class AddCommas(Block): + """Heuristically insert nodes for missing commas.""" + + def process_node(self, node): + if self.should_add_comma_before(node): + comma = node.create_child(form=',', deprel='punct', upos='PUNCT') + comma.shift_before_node(node) + + def should_add_comma_before(self, node): + # TODO: Your task: implement some heuristics + prev_node = node.prev_node + if prev_node is None: + return False + if prev_node.lemma == 'however': + return True + if any(n.deprel == 'appos' for n in prev_node.children): + return True + + return False From 48e0ca452b0d42d87c3237146441d11521cb9419 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Mar 2017 22:20:00 +0200 Subject: [PATCH 0038/1374] block parameter `zones` --- udapi/core/block.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index 7326f7f8..453b1d65 100644 --- 
a/udapi/core/block.py +++ b/udapi/core/block.py @@ -5,8 +5,8 @@ class Block(object): """The smallest processing unit for processing Universal Dependencies data.""" - def __init__(self, **kwargs): - pass + def __init__(self, zones='all'): + self.zones = zones def process_start(self): """A hook method that is executed before processing UD data""" @@ -28,7 +28,8 @@ def process_tree(self, tree): def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: - self.process_tree(tree) + if self._should_process_tree(tree): + self.process_tree(tree) def process_document(self, document): """Process a UD document""" @@ -44,3 +45,12 @@ def before_process_document(self, document): def after_process_document(self, document): """This method is called after each process_document.""" pass + + def _should_process_tree(self, tree): + if self.zones == 'all': + return True + if self.zones == '' and tree.zone == '': + return True + if tree.zone in self.zones.split(','): + return True + return False From 20ebcec7551fab001d94036f64d718a4c4681a56 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Mar 2017 23:10:26 +0200 Subject: [PATCH 0039/1374] implement all the functionality of eval.F1 --- udapi/block/eval/f1.py | 191 +++++++++++++++++++++++++++++++++++ udapi/block/eval/lcsf1.py | 203 -------------------------------------- 2 files changed, 191 insertions(+), 203 deletions(-) create mode 100644 udapi/block/eval/f1.py delete mode 100644 udapi/block/eval/lcsf1.py diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py new file mode 100644 index 00000000..90be94ec --- /dev/null +++ b/udapi/block/eval/f1.py @@ -0,0 +1,191 @@ +"""Block eval.F1 for evaluating differences between sentences with P/R/F1. 
+ +``eval.F1 zones=en_pred gold_zone=en_gold details=0`` +prints something like:: + + predicted = 210 + gold = 213 + correct = 210 + precision = 100.00% + recall = 98.59% + F1 = 99.29% + +``eval.F1 gold_zone=y attributes=form,upos focus='(?i:an?|the)_DET' details=4`` +prints something like:: + + === Details === + token pred gold corr prec rec F1 + the_DET 711 213 188 26.44% 88.26% 40.69% + The_DET 82 25 19 23.17% 76.00% 35.51% + a_DET 0 62 0 0.00% 0.00% 0.00% + an_DET 0 16 0 0.00% 0.00% 0.00% + === Totals === + predicted = 793 + gold = 319 + correct = 207 + precision = 26.10% + recall = 64.89% + F1 = 37.23% + +This block finds differences between nodes of trees in two zones +and reports the overall precision, recall and F1. +The two zones are "predicted" (on which this block is applied) +and "gold" (which needs to be specified with parameter ``gold``). + +This block also reports the number of total nodes in the predicted zone +and in the gold zone and the number of "correct" nodes, +that is predicted nodes which are also in the gold zone. +By default two nodes are considered "the same" if they have the same ``form``, +but it is possible to check also for other nodes' attributes +(with parameter ``attributes``). + +As usual:: + + precision = correct / predicted + recall = correct / gold + F1 = 2 * precision * recall / (precision + recall) + +The implementation is based on finding the longest common subsequence (LCS) +between the nodes in the two trees. +This means that the two zones do not need to be explicitly word-aligned. +""" +from collections import Counter +import logging +import re + +from udapi.core.basewriter import BaseWriter + +# pylint: disable=too-many-instance-attributes,invalid-name +class F1(BaseWriter): + """Evaluate differences between sentences (in different zones) with P/R/F1. + + Args: + zones: Which zone contains the "predicted" trees? + Make sure that you specify just one zone. 
+ If you leave the default value "all" and the document contains more zones, + the results will be mixed, which is most likely not what you wanted. + Exception: If the document conaints just two zones (predicted and gold trees), + you can keep the default value "all" because this block + will skip comparison of the gold zone with itself. + + gold_zone: Which zone contains the gold-standard trees? + + attributes: comma separated list of attributes which should be checked + when deciding whether two nodes are equivalent in LCS + + focus: Regular expresion constraining the tokens we are interested in. + If more attributes were specified in the ``attributes`` parameter, + their values are concatenated with underscore, so ``focus`` should reflect that + e.g. ``attributes=form,upos focus='(a|the)_DET'``. + For case-insensitive focus use e.g. ``focus='(?i)the'`` + (which is equivalent to ``focus='[Tt][Hh][Ee]'``). + + details: Print also detailed statistics for each token (matching the ``focus``). + The value of this parameter ``details`` specifies the number of tokens to include. + The tokens are sorted according to the sum of their *predicted* and *gold* counts. 
+ """ + + def __init__(self, gold_zone, attributes='form', focus=None, details=4, **kwargs): + """Create the eval.F1 block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attrs = attributes.split(',') + self.focus = None + if focus is not None: + self.focus = re.compile(focus) + self.details = details + self.correct, self.pred, self.gold = 0, 0, 0 + self.visited_zones = Counter() + if details: + self._common = Counter() + self._pred = Counter() + self._gold = Counter() + self._total = Counter() + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + self.visited_zones[tree.zone] += 1 + + pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] + common = find_lcs(pred_tokens, gold_tokens) + + if self.focus is not None: + common = [x for x in common if self.focus.fullmatch(x)] + pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] + gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] + + self.correct += len(common) + self.pred += len(pred_tokens) + self.gold += len(gold_tokens) + + if self.details: + for x in common: + self._common[x] += 1 + for x in gold_tokens: + self._gold[x] += 1 + self._total[x] += 1 + for x in pred_tokens: + self._pred[x] += 1 + self._total[x] += 1 + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + if not self.visited_zones: + logging.warning('Block eval.F1 was not applied to any zone. ' + 'Check the parameter zones=%s', self.zones) + elif len(self.visited_zones) > 1: + logging.warning('Block eval.F1 was applied to more than one zone %s. ' + 'The results are mixed together. 
Check the parameter zones=%s', + list(self.visited_zones.elements()), self.zones) + print('Comparing predicted trees (zone=%s) with gold trees (zone=%s), sentences=%d' + % (next(self.visited_zones.elements()), self.gold_zone, + self.visited_zones.most_common(1)[0][1])) + if self.details: + print('=== Details ===') + print('%-10s %5s %5s %5s %6s %6s %6s' + % ('token', 'pred', 'gold', 'corr', 'prec', 'rec', 'F1')) + tokens = self._total.most_common(self.details) + for token, _ in tokens: + _prec = self._common[token] / (self._pred[token] or 1) + _rec = self._common[token] / (self._gold[token] or 1) + _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1) + print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%' + % (token, self._pred[token], self._gold[token], self._common[token], + 100*_prec, 100*_rec, 100*_f1)) + print('=== Totals ===') + + print("%-9s = %7d\n"*3 + % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='') + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + f1 = 2 * precision * recall / ((precision + recall) or 1) + print("%-9s = %6.2f%%\n" * 3 + % ('precision', 100*precision, 'recall', 100*recall, 'F1', 100*f1), end='') + + +# difflib.SequenceMatcher does not compute LCS, so let's implement it here +# TODO: make faster by trimming common prefix and sufix +def find_lcs(x, y): + """Find longest common subsequence.""" + m, n = len(x), len(y) + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m+1): + for j in range(1, n+1): + C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m-1] == y[n-1]: + lcs[index-1] = x[m-1] + m, n, index = m-1, n-1, index-1 + elif C[m-1][n] > C[m][n-1]: + m -= 1 + else: + n -= 1 + return lcs diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py deleted file mode 100644 index 2c085779..00000000 --- 
a/udapi/block/eval/lcsf1.py +++ /dev/null @@ -1,203 +0,0 @@ -"""Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" -from udapi.core.basewriter import BaseWriter - -class LcsF1(BaseWriter): # pylint: disable=too-many-instance-attributes - """Evaluate differences between sentences (in different zones) with P/R/F1.""" - - def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): - """Create the LcsF1 block object.""" - super().__init__(**kwargs) - self.gold_zone = gold_zone - self.attributes = attributes - self.focus = focus - self.details = details - self._stats = {} - self.correct, self.pred, self.gold = 0, 0, 0 - - def process_tree(self, tree): - gold_tree = tree.bundle.get_tree(self.gold_zone) - if tree == gold_tree: - return - #self._stats['zones'][tree.zone] += 1 - - attrs = self.attributes.split(',') - pred_tokens = ['_'.join(n.get_attrs(attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(attrs)) for n in gold_tree.descendants] - common = find_lcs(pred_tokens, gold_tokens) - - # my $focus = $self->focus; - # if ($focus ne '.*') { - # @common = grep {/$focus/} @common; - # @pred_tokens = grep {/$focus/} @pred_tokens; - # @gold_tokens = grep {/$focus/} @gold_tokens; - # } - - self.correct += len(common) - self.pred += len(pred_tokens) - self.gold += len(gold_tokens) - - # if ($self->details){ - # $self->_stats->{C}{$_}++ for (@common); - # $self->_stats->{P}{$_}++ for (@pred_tokens); - # $self->_stats->{G}{$_}++ for (@gold_tokens); - # $self->_stats->{T}{$_}++ for (@gold_tokens, @pred_tokens); - # } - - def process_end(self): - # Redirect the default filehandle to the file specified by self.files - self.before_process_document(None) - - # my %pred_zones = %{$self->_stats->{zones}}; - # my @pz = keys %pred_zones; - # if (!@pz) { - # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones=' - # . 
$self->zones; - # } elsif (@pz > 1){ - # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " - # . 'The results are mixed together. Check the parameter zones='.$self->zones; - # } - # say "Comparing predicted trees (zone=@pz) with gold trees (zone=" - # . $self->gold_zone . "), sentences=$pred_zones{$pz[0]}"; - # - # if ($self->details){ - # say '=== Details ==='; - # my $total_count = $self->_stats->{T}; - # my @tokens = sort {$total_count->{$b} <=> $total_count->{$a}} keys %{$total_count}; - # splice @tokens, $self->details; - # printf "%-10s %5s %5s %5s %6s %6s %6s\n", qw(token pred gold corr prec rec F1); - # foreach my $token (@tokens){ - # my ($p, $g, $c) = map {$self->_stats->{$_}{$token}||0} (qw(P G C)); - # my $pr = $c / ($p || 1); - # my $re = $c / ($g || 1); - # my $f = 2 * $pr * $re / (($pr + $re)||1); - # printf "%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%\n", - # $token, $p, $g, $c, 100*$pr, 100*$re, 100*$f - # } - # say '=== Totals ===' - # } - - - print("%-9s = %7d\n"*3 - % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) - # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero - # my $prec = $correct / $pred; - # my $rec = $correct / $gold; - # my $f1 = 2 * $prec * $rec / (($prec + $rec)||1); - # printf "%-9s = %6.2f%%\n"x3, precision=>100*$prec, recall=>100*$rec, F1=>100*$f1; - - -# difflib.SequenceMatcher does not compute LCS, so let's implement it here -# TODO: make faster by trimming common prefix and sufix -def find_lcs(x, y): - m, n = len(x), len(y) - C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m+1): - for j in range(1, n+1): - C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) - index = C[m][n] - lcs = [None] * index - while m > 0 and n > 0: - if x[m-1] == y[n-1]: - lcs[index-1] = x[m-1] - m, n, index = m-1, n-1, index-1 - elif C[m-1][n] > C[m][n-1]: - m -= 1 - else: - n -= 1 - return lcs - - -''' -Udapi::Block::Eval::LcsF1 - evaluate differences 
between sentences with P/R/F1 - -=head1 SYNOPSIS - - Eval::LcsF1 zones=en_pred gold_zone=en_gold to=results.txt - - # prints something like - predicted = 210 - gold = 213 - correct = 210 - precision = 100.00% - recall = 98.59% - F1 = 99.29% - - Eval::LcsF1 gold_zone=y attributes=form,upos focus='^(?i:an?|the)_DET$' details=4 - - # prints something like - === Details === - token pred gold corr prec rec F1 - the_DET 711 213 188 26.44% 88.26% 40.69% - The_DET 82 25 19 23.17% 76.00% 35.51% - a_DET 0 62 0 0.00% 0.00% 0.00% - an_DET 0 16 0 0.00% 0.00% 0.00% - === Totals === - predicted = 793 - gold = 319 - correct = 207 - precision = 26.10% - recall = 64.89% - F1 = 37.23% - -=head1 DESCRIPTION - -This block finds differences between nodes of trees in two zones -and reports the overall precision, recall and F1. -The two zones are "predicted" (on which this block is applied) -and "gold" (which needs to be specified with parameter C). - -This block also reports the number of total nodes in the predicted zone -and in the gold zone and the number of "correct" nodes, -that is predicted nodes which are also in the gold zone. -By default two nodes are considered "the same" if they have the same C, -but it is possible to check also for other nodes' attributes -(with parameter C). - -As usual: - - precision = correct / predicted - recall = correct / gold - F1 = 2 * precision * recall / (precision + recall) - -The implementation is based on finding the longest common subsequence (LCS) -between the nodes in the two trees. -This means that the two zones do not need to be explicitly word-aligned. - -=head1 PARAMETERS - -=head2 zones - -Which zone contains the "predicted" trees? -Make sure that you specify just one zone. -If you leave the default value "all" and the document contains more zones, -the results will be mixed, which is most likely not what you wanted. 
-Exception: If the document conaints just two zones (predicted and gold trees), -you can keep the default value "all" because this block -will skip comparison of the gold zone with itself. - -=head2 gold_zone - -Which zone contains the gold-standard trees? - -=head2 attributes - -comma separated list of attributes which should be checked -when deciding whether two nodes are equivalent in LCS - -=head2 focus - -Regular expresion constraining the tokens we are interested in. -If more attributes were specified in the C parameter, -their values are concatenated with underscore, so C should reflect that -e.g. C. - -For case-insensitive focus use e.g. C -(which is equivalent to C) - -=head2 details - -Print also detailed statistics for each token (matching the C). -The value of this parameter C
specifies the number of tokens to include. -The tokens are sorted according to the sum of their I and I counts. - -''' From 09d964dbbcbbaa2fb64b84f2ab61305f771ece48 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 20:01:10 +0200 Subject: [PATCH 0040/1374] keep root.sent_id and root.bundle.bundle_id in sync and both are writeable --- udapi/core/basereader.py | 1 - udapi/core/bundle.py | 15 +++++++++++++-- udapi/core/root.py | 5 +++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index bb126ae7..f9e22fbb 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -128,7 +128,6 @@ def process_document(self, document): root.zone = parts[1] add_to_the_last_bundle = bundle_id == last_bundle_id last_bundle_id = bundle_id - root.sent_id = None if self.zone != 'keep': root.zone = self.zone diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index 0a94c799..d29b2268 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -16,13 +16,24 @@ class Bundle(object): Trees in one bundle are distinguished by a zone label. 
""" - __slots__ = ["trees", "number", "bundle_id", "_document"] + __slots__ = ["trees", "number", "_bundle_id", "_document"] def __init__(self, bundle_id=None, document=None): self.trees = [] - self.bundle_id = bundle_id + self._bundle_id = bundle_id self._document = document + @property + def bundle_id(self): + """ID of this bundle.""" + return self._bundle_id + + @bundle_id.setter + def bundle_id(self, bundle_id): + self._bundle_id = bundle_id + for tree in self.trees: + tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access + def __str__(self): if self.bundle_id is None: return 'bundle without id' diff --git a/udapi/core/root.py b/udapi/core/root.py index 5b141391..e5435c59 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -44,6 +44,11 @@ def sent_id(self): @sent_id.setter def sent_id(self, sent_id): + if self._bundle is not None: + parts = sent_id.split('/', 1) + self._bundle.bundle_id = parts[0] + if len(parts) == 2: + self.zone = parts[1] self._sent_id = sent_id @property From a1e0abbb6af36b322d5647bde27ec2de7ed7b022 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 21:16:11 +0200 Subject: [PATCH 0041/1374] util.FindBug --- udapi/block/util/findbug.py | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 udapi/block/util/findbug.py diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py new file mode 100644 index 00000000..d01221b7 --- /dev/null +++ b/udapi/block/util/findbug.py @@ -0,0 +1,60 @@ +"""Block util.FindBug for debugging. + +Usage: +If block xy.Z fails with a Python exception, +insert "util.FindBug block=" into the scenario, +e.g. to debug ``second.Block``, use + +udapy first.Block util.FindBug block=second.Block > bug.conllu + +This will create the file bug.conllu with the bundle, which caused the bug. 
+""" +import copy +import logging + +from udapi.core.basewriter import BaseWriter +from udapi.block.write.conllu import Conllu +from udapi.core.run import _parse_block_name + +class FindBug(BaseWriter): + """Debug another block by finding a minimal testcase conllu file.""" + + def __init__(self, block, first_error_only=True, **kwargs): + """Args: block, first_error_only""" + super().__init__(**kwargs) + self.block = block + self.first_error_only = first_error_only + + def process_document(self, document): + sub_path, class_name = _parse_block_name(self.block) + module = "udapi.block." + sub_path + "." + class_name.lower() + try: + command = "from " + module + " import " + class_name + " as b" + logging.debug("Trying to run command: %s", command) + exec(command) # pylint: disable=exec-used + except Exception: + logging.warning("Error when trying import the block %s", self.block) + raise + + command = "b()" # TODO params as kwargs + logging.debug("Trying to evaluate this: %s", command) + new_block = eval(command) # pylint: disable=eval-used + + doc_copy = copy.deepcopy(document) + writer = Conllu(files=self.orig_files) + + for bundle_no, bundle in enumerate(doc_copy.bundles, 1): + logging.debug('Block %s processing bundle #%d (id=%s)', + self.block, bundle_no, bundle.bundle_id) + try: + new_block.process_bundle(bundle) + except Exception as exc: # pylint: disable=broad-except + logging.warning('util.FindBug found a problem in bundle %d in block %s: %r', + bundle_no, self.block, exc) + logging.warning('Printing a minimal example to %s', self.orig_files) + + for tree in document.bundles[bundle_no-1].trees: + writer.process_tree(tree) + + if self.first_error_only: + raise From cf35654eee088384f41043c506d2cca1f00fb189 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 23:04:32 +0200 Subject: [PATCH 0042/1374] ud.Google2ud draft of conversion script --- udapi/block/ud/google2ud.py | 93 +++++++++++++++++++++++++ udapi/block/ud/setspaceafterfromtext.py | 2 + 
2 files changed, 95 insertions(+) create mode 100644 udapi/block/ud/google2ud.py diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py new file mode 100644 index 00000000..12081ecc --- /dev/null +++ b/udapi/block/ud/google2ud.py @@ -0,0 +1,93 @@ +"""Block ud.google2ud for converting Google Universal Dependency Treebank into UD. + +Usage: +udapy -s ud.Google2ud ud.SetSpaceAfterFromText ud.Convert1to2 < google.conllu > ud2.conllu +""" +from udapi.block.ud.convert1to2 import Convert1to2 + +DEPREL_CHANGE = { + "ROOT": "root", + "prep": "case", + "p": "punct", + "poss": "nmod:poss", + "ps": "case", + "num": "nummod", # TODO ?? + "number": "nummod", # TODO ?? + "tmod": "nmod:tmod", + "vmod": "acl", + "rcmod": "acl:relcl", + "npadvmod": "advmod", + "prt": "compound:prt", + "preconj": "cc:preconj", + "predet": "det:predet", + "gmod": "amod", + "gobj": "obj", +} + + +class Google2ud(Convert1to2): + """Convert Google Universal Dependency Treebank into UD style.""" + + def process_tree(self, root): + comment_lines = root.comment.split("\n") + root.sent_id = comment_lines[0].strip() + root.text = comment_lines[1].strip() + root.comment = '' + + for node in root.descendants: + self.process_node(node) + + # This needs to be executed after all other deprels are converted + for node in root.descendants: + if node.deprel in ('acomp', 'attr'): # TODO not sure about attr + copula = node.parent + node.parent = copula.parent + node.deprel = copula.deprel + copula.parent = node + copula.deprel = 'cop' + for child in copula.children: + child.parent = node + + def process_node(self, node): + orig_feats = dict(node.feats) + node.feats = None + for name, value in orig_feats.items(): + if value != 'false': + name = name.split('/')[1].capitalize() + node.misc[name] = value.capitalize() + + if node.misc['Proper'] and node.upos == 'NOUN': + node.upos = 'PROPN' + del node.misc['Proper'] + + try: + node.deprel = DEPREL_CHANGE[node.deprel] + except KeyError: + pass + + if 
node.deprel == 'nn': + if node.upos == 'PROPN' and node.parent.upos == 'PROPN': + node.deprel = 'flat' + else: + node.deprel = 'compound' + elif node.deprel in ('pobj', 'pcomp'): + if node.parent.deprel == 'case': + preposition = node.parent + node.parent = preposition.parent + preposition.parent = node + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + # ud.Convert1to2 will change 'nmod' to 'obl' if needed + else: + self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case') + node.deprel = 'obj' + elif node.deprel == 'infmod': + node.deprel = 'xcomp' + node.feats['VerbForm'] = 'Inf' + elif node.deprel == 'partmod': + node.deprel = 'ccomp' + node.feats['VerbForm'] = 'Part' + + if node.upos == '.': + node.upos = 'PUNCT' + elif node.upos == 'PRT': + node.upos = 'PART' diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index 0c4d8d9d..3dcd12f2 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,8 @@ class SetSpaceAfterFromText(Block): def process_tree(self, root): text = root.text + if text is None: + raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) computed = root.compute_text() if text == computed: return From 7dfdf9bcc629024103b6bd9b2558a7a28783a970 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Apr 2017 15:17:21 +0200 Subject: [PATCH 0043/1374] adapting for other languages --- udapi/block/ud/google2ud.py | 109 +++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 12081ecc..ad7e1436 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -1,9 +1,10 @@ """Block ud.google2ud for converting Google Universal Dependency Treebank into UD. 
Usage: -udapy -s ud.Google2ud ud.SetSpaceAfterFromText ud.Convert1to2 < google.conllu > ud2.conllu +udapy -s ud.Google2ud < google.conllu > ud2.conllu """ from udapi.block.ud.convert1to2 import Convert1to2 +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText DEPREL_CHANGE = { "ROOT": "root", @@ -24,18 +25,62 @@ "gobj": "obj", } +FEATS_CHANGE = { + "proper=false": "", + "case=prep": "", + "gender=unsp_g": "", + "voice=unsp_v": "", + "number=unsp_n": "", + "tense=unsp_t": "", + "reciprocity=non-rcp": "", + "reciprocity=rcp": "PronType=Rcp", + "aspect=imperf": "Aspect=Imp", + "form=long": "Variant=Long", + "form=short": "Variant=Short", + "person=reflex": "Reflex=Yes", + "case=reflex": "Reflex=Yes", + "gender=pl_tantum": "Number=Ptan", + "gender_antecedent=fem_a": "Gender=Fem", + "gender_antecedent=masc_a": "Gender=Masc", + "gender_antecedent=neut_a": "Gender=Neut", + "number_antecedent=sing_a": "Number=Sing", + "number_antecedent=plur_a": "Number=Plur", + "person_antecedent=1_a": "Person=1", + "person_antecedent=2_a": "Person=2", + "person_antecedent=3_a": "Person=3", + "definiteness=def": "Definite=Def", + "definiteness=indef": "Definite=Ind", + "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? + "mood=sub2": "Mood=Sub", + "tense=cnd": "Mood=Cnd", + "degree=sup_a": "Degree=Abs", + "degree=sup_r": "Degree=Sup", + "case=obl": "Case=Acc", +} class Google2ud(Convert1to2): """Convert Google Universal Dependency Treebank into UD style.""" + def __init__(self, lang='unk', **kwargs): + """Create the Google2ud block instance. + + See ``Convert1to2`` for all the args. 
+ """ + super().__init__(**kwargs) + self.lang = lang + self._spaceafter_block = SetSpaceAfterFromText() + def process_tree(self, root): comment_lines = root.comment.split("\n") - root.sent_id = comment_lines[0].strip() + root.sent_id = comment_lines[0].strip().replace(' ', '-') root.text = comment_lines[1].strip() root.comment = '' for node in root.descendants: - self.process_node(node) + self.fix_feats(node) + self.fix_upos(node) + self.fix_deprel(node) + #self.fix_quotes(node) # This needs to be executed after all other deprels are converted for node in root.descendants: @@ -48,18 +93,52 @@ def process_tree(self, root): for child in copula.children: child.parent = node - def process_node(self, node): + # call ud.SetSpaceAfterFromText + self._spaceafter_block.process_tree(root) + + # call ud.Convert1to2 + super().process_tree(root) + + @staticmethod + def fix_feats(node): + """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE.""" orig_feats = dict(node.feats) node.feats = None for name, value in orig_feats.items(): - if value != 'false': - name = name.split('/')[1].capitalize() - node.misc[name] = value.capitalize() + name = name.split('/')[1] + if name == 'inflection_type': + node.misc['InflectionType'] = value.capitalize() + continue + if "antecedent" in name and node.upos == 'PRON': + node.feats["PronType"] = "Prs" + new = FEATS_CHANGE.get(name + '=' + value) + if new is not None: + if new != '': + new_name, new_value = new.split('=') + node.feats[new_name] = new_value + else: + node.feats[name.capitalize()] = value.capitalize() + + def fix_upos(self, node): + """PRT→PART, .→PUNCT, NOUN+Proper→PROPN.""" + if node.upos == '.': + node.upos = 'PUNCT' + elif node.upos == 'PRT': + node.upos = 'PART' + if node.feats['Proper']: + if node.upos == 'NOUN': + node.upos = 'PROPN' + if node.feats['Proper'] != 'True': + self.log(node, 'unexpected-proper', 'Proper=' + node.feats['Proper']) + else: + node.misc['Proper'] = node.feats['Proper'] + del 
node.feats['Proper'] - if node.misc['Proper'] and node.upos == 'NOUN': - node.upos = 'PROPN' - del node.misc['Proper'] + def fix_deprel(self, node): + """Convert Google dependency relations to UD deprels. + Change topology where needed. + """ try: node.deprel = DEPREL_CHANGE[node.deprel] except KeyError: @@ -87,7 +166,9 @@ def process_node(self, node): node.deprel = 'ccomp' node.feats['VerbForm'] = 'Part' - if node.upos == '.': - node.upos = 'PUNCT' - elif node.upos == 'PRT': - node.upos = 'PART' + def fix_quotes(self, node): + """Reconstruct the original quotes.""" + if node.xpos == '``': + node.form = '„' if self.lang == 'de' else '"' + elif node.xpos == "''": + node.form = '“' if self.lang == 'de' else '"' From bf81ad21d4bf161806e11eb795a1bf722119940d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:28:37 +0200 Subject: [PATCH 0044/1374] node.get_attrs() more powerfull --- udapi/core/node.py | 59 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 0d745baa..e717ccbe 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -465,15 +465,68 @@ def is_leaf(self): """Is this node a leaf, ie. 
a node without any children?""" return not self.children + def _get_attr(self, name): # pylint: disable=too-many-return-statements + if name == 'dir': + if self.parent.is_root(): + return 'root' + return 'left' if self.precedes(self.parent) else 'right' + if name == 'edge': + if self.parent.is_root(): + return 0 + return self.ord - self.parent.ord + if name == 'children': + return len(self.children) + if name == 'siblings': + return len(self.parent.children) - 1 + if name == 'depth': + value = 0 + tmp = self + while not tmp.is_root(): + tmp = tmp.parent + value += 1 + return value + if name == 'feats_split': + return str(self.feats).split('|') + return getattr(self, name) + def get_attrs(self, attrs, undefs=None, stringify=True): - """Return multiple attributes, possibly subsitituting empty ones. + """Return multiple attributes or pseudo-attributes, possibly substituting empty ones. + + Pseudo-attributes: + p_xy is the (pseudo) attribute xy of the parent node. + c_xy is a list of the (pseudo) attributes xy of the children nodes. + dir: 'left' = the node is a left child of its parent, + 'right' = the node is a rigth child of its parent, + 'root' = the node's parent is the technical root. + edge: length of the edge to parent (`node.ord - node.parent.ord`) or 0 if parent is root + children: number of children nodes. + siblings: number of siblings nodes. + depth: depth in the dependency tree (technical root has depth=0, highest word has depth=1). + feats_split: list of name=value formatted strings of the FEATS. Args: - attrs: A list of attribute names, e.g. ['form', 'lemma']. + attrs: A list of attribute names, e.g. ``['form', 'lemma', 'p_upos']``. undefs: A value to be used instead of None for empty (undefined) values. 
stringify: Apply `str()` on each value (except for None) """ - values = [getattr(self, name) for name in attrs] + values = [] + for name in attrs: + if name.startswith('p_'): + if name == 'p_feats_split': + values.extend(self.parent._get_attr(name[2:])) + else: + values.append(self.parent._get_attr(name[2:])) + elif name.startswith('c_'): + for child in self.children: + if name == 'c_feats_split': + values.extend(child._get_attr(name[2:])) + else: + values.append(child._get_attr(name[2:])) + elif name == 'feats_split': + values.extend(self._get_attr(name)) + else: + values.append(self._get_attr(name)) + if undefs is not None: values = [x if x is not None else undefs for x in values] if stringify: From 81c0c872d4fe6b96fe23651418fb1a996c3d2d43 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:29:53 +0200 Subject: [PATCH 0045/1374] util.See prints useful statistics on matching nodes --- udapi/block/util/see.py | 119 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 udapi/block/util/see.py diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py new file mode 100644 index 00000000..57cdf81c --- /dev/null +++ b/udapi/block/util/see.py @@ -0,0 +1,119 @@ +"""Block util.See prints statistics about the nodes matching a given condition. 
+ +Example usage from the command line:: + +udapy util.See node='node.is_nonprojective()' n=3 \ + stats=dir,children,c_upos,p_lemma,deprel,feats_split < in.conllu + +Example output:: + +node.is_nonprojective() +matches 245 out of 35766 nodes (0.7%) in 174 out of 1478 trees (11.8%) +=== dir (2 values) === + right 193 78% delta=+37% + left 52 21% delta=-33% +=== children (9 values) === + 0 64 26% delta=-38% + 2 58 23% delta=+14% + 3 38 15% delta= +7% +=== c_upos (15 values) === + NOUN 118 23% delta= +4% + DET 61 12% delta= -3% + PROPN 47 9% delta= +1% +=== p_lemma (187 values) === + il 5 2% delta= +1% + fonction 4 1% delta= +1% + écrire 4 1% delta= +1% +=== deprel (22 values) === + appos 41 16% delta=+15% + conj 41 16% delta=+13% + punct 36 14% delta= +4% +=== feats_split (20 values) === + Number=Sing 114 21% delta= +2% + Gender=Masc 81 15% delta= +3% + _ 76 14% delta= -6% + +In addition to absolute counts for each value, the percentage within matching nodes is printed +and a delta relative to percentage within all nodes. +This helps to highlight what is special about the matching nodes. +""" +from collections import Counter +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +STATS = 'dir,edge,depth,children,siblings,p_upos,p_lemma,c_upos,form,lemma,upos,deprel,feats_split' + +# We need eval in this block +# pylint: disable=eval-used +class See(Block): + """Print statistics about the nodes specified by the parameter `node`.""" + + def __init__(self, node, n=5, stats=STATS, **kwargs): + """Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be considered "matching". + `n`: Top n values will be printed for each statistic. + `stats`: a list of comma-separated statistics to be printed. 
+ A statistic can be an attribute (`form`, `lemma`) or a pseudo-attribute + (`depth` = depth of a node in dependency tree, + `children` = number of children nodes, + `p_lemma` = lemma of a parent node, etc). + See `udapi.core.Node.get_attrs` for a full list of statistics. + """ + super().__init__(**kwargs) + self.node = node + self.n_limit = n + self.stats = stats.split(',') + self.match = dict() + self.every = dict() + for stat in self.stats: + self.match[stat] = Counter() + self.every[stat] = Counter() + self.overall = Counter() + + def process_tree(self, root): + self.overall['trees'] += 1 + tree_match = False + for node in root.descendants: + matching = self.process_node(node) + self.overall['nodes'] += 1 + if matching: + self.overall['matching_nodes'] += 1 + if not tree_match: + self.overall['matching_trees'] += 1 + tree_match = True + + def process_node(self, node): + matching = eval(self.node) + for stat in self.stats: + for value in node.get_attrs([stat], undefs=''): + self.every[stat][value] += 1 + self.every[stat]['T O T A L'] += 1 + if matching: + self.match[stat][value] += 1 + self.match[stat]['T O T A L'] += 1 + return matching + + def process_end(self): + print(self.node) + print("matches %d out of %d nodes (%.1f%%) in %d out of %d trees (%.1f%%)" + % (self.overall['matching_nodes'], + self.overall['nodes'], + self.overall['matching_nodes'] * 100 / self.overall['nodes'], + self.overall['matching_trees'], + self.overall['trees'], + self.overall['matching_trees'] * 100 / self.overall['trees'])) + for stat in self.stats: + vals = len(self.match[stat].keys()) - 1 + print("=== %s (%d value%s) ===" % (stat, vals, 's' if vals > 1 else '')) + match_total = self.match[stat]['T O T A L'] or 1 + every_total = self.every[stat]['T O T A L'] or 1 + for value, match_count in self.match[stat].most_common(self.n_limit + 1): + if value == 'T O T A L': + continue + every_count = self.every[stat][value] + match_perc = 100 * match_count / match_total + every_perc = 100 * 
every_count / every_total + print("%15s %5d %3d%% delta=%+3d%%" + % (value, match_count, match_perc, match_perc - every_perc)) From 884cf18511f22aefe476d159833c9ae70bb9a2e0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:34:48 +0200 Subject: [PATCH 0046/1374] draft of Indonesian conversion --- udapi/block/ud/google2ud.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index ad7e1436..95ee5fa2 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -23,6 +23,9 @@ "predet": "det:predet", "gmod": "amod", "gobj": "obj", + "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 + "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both + "redup": "compound:plur", } FEATS_CHANGE = { @@ -56,6 +59,7 @@ "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", "case=obl": "Case=Acc", + "tense=impf": "Tense=Imp", } class Google2ud(Convert1to2): @@ -134,6 +138,21 @@ def fix_upos(self, node): node.misc['Proper'] = node.feats['Proper'] del node.feats['Proper'] + # Indonesian uses prefixes (me, di, ber, ke,...) and suffixes (an, kan, i,...), + # which are written without spaces with the main word/stem (according to the raw text). + # These could be treated as syntactic words and annotated using multi-word tokens. + # However, there is no annotation about their dependency relations (just suff, pref) + # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style. + if node.upos == 'AFFIX': + if node.deprel == 'suff': + node.prev_node.form += node.form + elif node.deprel == 'pref': + node.next_node.form = node.form + node.next_node.form + else: + self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) + return + node.remove(children='rehang') + def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. 
From fc46d4e40f3e56d108dea97facfe8fb31be6c9af Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 8 Apr 2017 01:02:05 +0200 Subject: [PATCH 0047/1374] ud.Google2ud update --- udapi/block/ud/google2ud.py | 77 +++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 95ee5fa2..510c6b2f 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -9,11 +9,12 @@ DEPREL_CHANGE = { "ROOT": "root", "prep": "case", + "ncomp": "case", # TODO ? "p": "punct", "poss": "nmod:poss", "ps": "case", - "num": "nummod", # TODO ?? - "number": "nummod", # TODO ?? + "num": "nummod", + "number": "nummod", # TODO ? "tmod": "nmod:tmod", "vmod": "acl", "rcmod": "acl:relcl", @@ -26,15 +27,35 @@ "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both "redup": "compound:plur", + "oblcomp": "obl", + "mes": "dep", # TODO ? 
+ "mwn": "compound:n", # nominal multi-word + "mwa": "compound:a", # adjectival multi-word + "mwv": "compound:v", # verbal multi-word + "asp": "aux", # aspectual particle + "rcmodrel": "mark:relcl", + "auxcaus": "aux", # redundant with Voice=Cau + "topic": "dep", + "possessive": "case", + "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words + # TODO: "ref" - in basic dependencies it should be rehanged and relabelled + "conjv": "compound:conjv", } FEATS_CHANGE = { "proper=false": "", "case=prep": "", + "case=unsp_c": "", "gender=unsp_g": "", + "gender_antecedent=unsp_g": "", "voice=unsp_v": "", "number=unsp_n": "", + "number_antecedent=unsp_n": "", "tense=unsp_t": "", + "mood=unsp_m": "", + "animacy=unsp_r": "", + "aspect=unsp_a": "", + "case=rel": "", # redundant with rcmodrel (mark:relcl) "reciprocity=non-rcp": "", "reciprocity=rcp": "PronType=Rcp", "aspect=imperf": "Aspect=Imp", @@ -42,24 +63,30 @@ "form=short": "Variant=Short", "person=reflex": "Reflex=Yes", "case=reflex": "Reflex=Yes", + "case=dir": "Case=Nom", "gender=pl_tantum": "Number=Ptan", - "gender_antecedent=fem_a": "Gender=Fem", - "gender_antecedent=masc_a": "Gender=Masc", - "gender_antecedent=neut_a": "Gender=Neut", - "number_antecedent=sing_a": "Number=Sing", - "number_antecedent=plur_a": "Number=Plur", - "person_antecedent=1_a": "Person=1", - "person_antecedent=2_a": "Person=2", - "person_antecedent=3_a": "Person=3", + "gender_antecedent=fem_a": "Gender[psor]=Fem", + "gender_antecedent=masc_a": "Gender[psor]=Masc", + "gender_antecedent=neut_a": "Gender[psor]=Neut", + "number_antecedent=sing_a": "Number[psor]=Sing", + "number_antecedent=plur_a": "Number[psor]=Plur", + "person_antecedent=1_a": "Person[psor]=1", + "person_antecedent=2_a": "Person[psor]=2", + "person_antecedent=3_a": "Person[psor]=3", "definiteness=def": "Definite=Def", "definiteness=indef": "Definite=Ind", "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? 
"mood=sub2": "Mood=Sub", + "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) "tense=cnd": "Mood=Cnd", "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", "case=obl": "Case=Acc", "tense=impf": "Tense=Imp", + "animacy=rat": "Animacy=Hum", + "animacy=irrat": "Animacy=Nhum", + "honorific=hon": "Polite=Form", + "mood=psm": "Tense=Fut", # TODO ? } class Google2ud(Convert1to2): @@ -108,7 +135,7 @@ def fix_feats(node): """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE.""" orig_feats = dict(node.feats) node.feats = None - for name, value in orig_feats.items(): + for name, value in sorted(orig_feats.items()): name = name.split('/')[1] if name == 'inflection_type': node.misc['InflectionType'] = value.capitalize() @@ -143,15 +170,18 @@ def fix_upos(self, node): # These could be treated as syntactic words and annotated using multi-word tokens. # However, there is no annotation about their dependency relations (just suff, pref) # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style. + # Chinese AFFIXes are more tricky to convert. + # It seems these words are quite often tagged as PART in UD_Chinese. if node.upos == 'AFFIX': if node.deprel == 'suff': node.prev_node.form += node.form + node.remove(children='rehang') elif node.deprel == 'pref': node.next_node.form = node.form + node.next_node.form + node.remove(children='rehang') else: self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) - return - node.remove(children='rehang') + node.upos = 'PART' def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. 
@@ -169,12 +199,26 @@ def fix_deprel(self, node): else: node.deprel = 'compound' elif node.deprel in ('pobj', 'pcomp'): - if node.parent.deprel == 'case': + if node.parent.deprel in ('case', 'prep'): preposition = node.parent node.parent = preposition.parent preposition.parent = node - node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + # ud.Convert1to2 will change 'nmod' to 'obl' if needed + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + + # Prepositions should not have any children (except for deprel=fixed/mwe), see + # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words. + # Unfortunatelly, there are many annotation errors and it is almost always better + # to rehang the extra children (at least to prevent spurious non-projectivities). + # In case of PUNCTuation it is surely correct. + # Otherwise, let's mark it as ToDo. + for extra_prep_child in preposition.children: + if extra_prep_child.udeprel in ('fixed', 'mwe'): + continue + extra_prep_child.parent = node + if extra_prep_child.upos != 'PUNCT': + self.log(extra_prep_child, 'ex-adp-child', 'was an extra adposition child') else: self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case') node.deprel = 'obj' @@ -184,6 +228,9 @@ def fix_deprel(self, node): elif node.deprel == 'partmod': node.deprel = 'ccomp' node.feats['VerbForm'] = 'Part' + elif node.deprel == 'suff': + node.misc['OrigDeprel'] = 'suff' + node.deprel = 'dep' def fix_quotes(self, node): """Reconstruct the original quotes.""" From 2530a11fd75454ec67565701e9564b2fa242baf9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 8 Apr 2017 18:00:35 +0200 Subject: [PATCH 0048/1374] autopep8 --- udapi/block/demo/rehangprepositions.py | 1 + udapi/block/eval/f1.py | 24 ++++++++------- udapi/block/eval/parsing.py | 1 + udapi/block/read/addsentences.py | 2 ++ udapi/block/read/conllu.py | 6 ++-- udapi/block/read/vislcg.py | 3 +- 
udapi/block/tokenize/onwhitespace.py | 1 + udapi/block/tokenize/simple.py | 1 + udapi/block/transform/deproj.py | 3 +- udapi/block/transform/proj.py | 1 + udapi/block/tutorial/addcommas.py | 5 ++-- udapi/block/tutorial/adpositions.py | 1 + udapi/block/ud/addmwt.py | 3 +- udapi/block/ud/bg/removedotafterabbr.py | 1 + udapi/block/ud/convert1to2.py | 5 ++-- udapi/block/ud/cs/addmwt.py | 1 + udapi/block/ud/el/addmwt.py | 5 ++-- udapi/block/ud/fixpunctchild.py | 1 + udapi/block/ud/ga/to2.py | 1 + udapi/block/ud/gl/to2.py | 1 + udapi/block/ud/goeswithfromtext.py | 3 +- udapi/block/ud/google2ud.py | 40 ++++++++++++++----------- udapi/block/ud/he/fixneg.py | 1 + udapi/block/ud/markbugs.py | 1 + udapi/block/ud/removemwt.py | 3 +- udapi/block/ud/ro/fixneg.py | 3 +- udapi/block/ud/ro/setspaceafter.py | 3 +- udapi/block/ud/ru/fixremnant.py | 1 + udapi/block/ud/setspaceafter.py | 5 ++-- udapi/block/ud/setspaceafterfromtext.py | 1 + udapi/block/ud/splitunderscoretokens.py | 1 + udapi/block/udpipe/base.py | 1 + udapi/block/util/eval.py | 8 +++-- udapi/block/util/filter.py | 9 +++--- udapi/block/util/findbug.py | 11 +++---- udapi/block/util/mark.py | 5 +++- udapi/block/util/see.py | 4 ++- udapi/block/util/wc.py | 1 + udapi/block/write/conllu.py | 2 +- udapi/block/write/html.py | 5 ++-- udapi/block/write/textmodetrees.py | 10 ++++--- udapi/block/write/textmodetreeshtml.py | 7 +++-- udapi/block/write/tikz.py | 2 +- udapi/block/write/vislcg.py | 1 + udapi/core/basereader.py | 3 +- udapi/core/basewriter.py | 1 + udapi/core/bundle.py | 2 +- udapi/core/dualdict.py | 1 + udapi/core/feats.py | 1 + udapi/core/files.py | 3 +- udapi/core/mwt.py | 4 +-- udapi/core/node.py | 38 ++++++++++++----------- udapi/core/root.py | 3 +- udapi/core/run.py | 10 ++++--- udapi/core/tests/test_node.py | 4 +-- udapi/tool/udpipe.py | 13 ++++---- 56 files changed, 170 insertions(+), 108 deletions(-) diff --git a/udapi/block/demo/rehangprepositions.py b/udapi/block/demo/rehangprepositions.py index 
8d641b49..d25e29bc 100644 --- a/udapi/block/demo/rehangprepositions.py +++ b/udapi/block/demo/rehangprepositions.py @@ -4,6 +4,7 @@ class RehangPrepositions(Block): """This block takes all prepositions (upos=ADP) and rehangs them above their parent.""" + def process_node(self, node): if node.upos == "ADP": origparent = node.parent diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 90be94ec..a4f93a1b 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -56,6 +56,8 @@ from udapi.core.basewriter import BaseWriter # pylint: disable=too-many-instance-attributes,invalid-name + + class F1(BaseWriter): """Evaluate differences between sentences (in different zones) with P/R/F1. @@ -156,17 +158,17 @@ def process_end(self): _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1) print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%' % (token, self._pred[token], self._gold[token], self._common[token], - 100*_prec, 100*_rec, 100*_f1)) + 100 * _prec, 100 * _rec, 100 * _f1)) print('=== Totals ===') - print("%-9s = %7d\n"*3 + print("%-9s = %7d\n" * 3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='') - pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero precision = self.correct / pred recall = self.correct / gold f1 = 2 * precision * recall / ((precision + recall) or 1) print("%-9s = %6.2f%%\n" * 3 - % ('precision', 100*precision, 'recall', 100*recall, 'F1', 100*f1), end='') + % ('precision', 100 * precision, 'recall', 100 * recall, 'F1', 100 * f1), end='') # difflib.SequenceMatcher does not compute LCS, so let's implement it here @@ -175,16 +177,16 @@ def find_lcs(x, y): """Find longest common subsequence.""" m, n = len(x), len(y) C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m+1): - for j in range(1, n+1): - C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + for i in range(1, m + 1): + for j in 
range(1, n + 1): + C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) index = C[m][n] lcs = [None] * index while m > 0 and n > 0: - if x[m-1] == y[n-1]: - lcs[index-1] = x[m-1] - m, n, index = m-1, n-1, index-1 - elif C[m-1][n] > C[m][n-1]: + if x[m - 1] == y[n - 1]: + lcs[index - 1] = x[m - 1] + m, n, index = m - 1, n - 1, index - 1 + elif C[m - 1][n] > C[m][n - 1]: m -= 1 else: n -= 1 diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py index 3c7f5da8..86d7b089 100644 --- a/udapi/block/eval/parsing.py +++ b/udapi/block/eval/parsing.py @@ -1,6 +1,7 @@ """Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens.""" from udapi.core.basewriter import BaseWriter + class Parsing(BaseWriter): """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py index 67c79ee8..f7f8d764 100644 --- a/udapi/block/read/addsentences.py +++ b/udapi/block/read/addsentences.py @@ -3,6 +3,8 @@ # pylint: disable=abstract-method # read_tree() does not need to be installed here + + class AddSentences(BaseReader): """A reader for adding plain-text sentences (one sentence per line) files. 
diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 79ccfaea..ba10d163 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,6 +12,7 @@ RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') + class Conllu(BaseReader): """A reader of the CoNLL-U files.""" @@ -49,7 +50,6 @@ def __init__(self, strict=False, separator='tab', self.strict = strict self.separator = separator - @staticmethod def parse_comment_line(line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" @@ -112,7 +112,7 @@ def read_tree(self, document=None): empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], xpos=fields[4], feats=fields[5], misc=fields[9]) empty.ord = fields[0] - empty.raw_deps = fields[8] # TODO + empty.raw_deps = fields[8] # TODO continue node = root.create_child() @@ -163,7 +163,7 @@ def read_tree(self, document=None): # Create multi-word tokens. for fields in mwts: range_start, range_end = fields[0].split('-') - words = nodes[int(range_start):int(range_end)+1] + words = nodes[int(range_start):int(range_end) + 1] root.create_multiword_token(words, form=fields[1], misc=fields[-1]) return root diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 26b3d787..9ad272e3 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -2,6 +2,7 @@ from udapi.core.basereader import BaseReader from udapi.core.root import Root + class Vislcg(BaseReader): """A reader of the VISL-cg format, suitable for VISL Constraint Grammer Parser.""" @@ -67,7 +68,7 @@ def _node(line, root): # Let's hope that xpos, feats and deprel do not contain any quotes. 
end_quote_pos = line.rfind('"') lemma = line[1:end_quote_pos] - fields = line[end_quote_pos+1:].split() + fields = line[end_quote_pos + 1:].split() xpos = fields[0] feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py index 544c4da6..5451b3a1 100644 --- a/udapi/block/tokenize/onwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -1,6 +1,7 @@ """Block tokenize.OnWhitespace""" from udapi.core.block import Block + class OnWhitespace(Block): """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py index 82403cee..f7010d13 100644 --- a/udapi/block/tokenize/simple.py +++ b/udapi/block/tokenize/simple.py @@ -3,6 +3,7 @@ from udapi.block.tokenize.onwhitespace import OnWhitespace + class Simple(OnWhitespace): """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No.""" diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py index 3a6dcda5..581f5a6b 100644 --- a/udapi/block/transform/deproj.py +++ b/udapi/block/transform/deproj.py @@ -5,6 +5,7 @@ """ from udapi.core.block import Block + class Deproj(Block): """De-projectivize the trees à la Nivre & Nilsson (2005).""" @@ -33,7 +34,7 @@ def process_node(self, node): node.parent = reconstructed_parent def head_strategy(self, node, label): - queue = [n for n in node.parent.children if n!=node] # TODO deque + queue = [n for n in node.parent.children if n != node] # TODO deque while queue: adept = queue.pop(0) if adept.udeprel == label: diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py index 6254b917..6e284b4c 100644 --- a/udapi/block/transform/proj.py +++ b/udapi/block/transform/proj.py @@ -17,6 +17,7 @@ """ from udapi.core.block import Block + class Proj(Block): """Projectivize the trees à la Nivre & Nilsson (2005).""" diff --git 
a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py index 8c04e242..ccc26a66 100644 --- a/udapi/block/tutorial/addcommas.py +++ b/udapi/block/tutorial/addcommas.py @@ -1,10 +1,11 @@ """tutorial.AddCommas block template.""" from udapi.core.block import Block + class AddCommas(Block): """Heuristically insert nodes for missing commas.""" - def process_node(self, node): + def process_node(self, node): if self.should_add_comma_before(node): comma = node.create_child(form=',', deprel='punct', upos='PUNCT') comma.shift_before_node(node) @@ -18,5 +19,5 @@ def should_add_comma_before(self, node): return True if any(n.deprel == 'appos' for n in prev_node.children): return True - + return False diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py index cf2ad514..9c4e131b 100644 --- a/udapi/block/tutorial/adpositions.py +++ b/udapi/block/tutorial/adpositions.py @@ -13,6 +13,7 @@ """ from udapi.core.block import Block + class Adpositions(Block): """Compute the number of prepositions and postpositions.""" diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index eab2158b..ffa78bbb 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,6 +1,7 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block + class AddMwt(Block): """Detect and mark MWTs (split them into words and add the words to the tree).""" @@ -24,7 +25,7 @@ def process_node(self, node): nodes.append(new_node) node.form = forms[main] nodes.append(node) - for form in forms[main+1:]: + for form in forms[main + 1:]: new_node = parent.create_child(form=form) new_node.shift_after_node(nodes[-1]) nodes.append(new_node) diff --git a/udapi/block/ud/bg/removedotafterabbr.py b/udapi/block/ud/bg/removedotafterabbr.py index d1d94628..a132dad1 100644 --- a/udapi/block/ud/bg/removedotafterabbr.py +++ b/udapi/block/ud/bg/removedotafterabbr.py @@ -7,6 +7,7 @@ """ from udapi.core.block 
import Block + class RemoveDotAfterAbbr(Block): """Block for deleting extra PUNCT nodes after abbreviations. diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index e389e476..53529afb 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -23,9 +23,10 @@ "csubjpass": "csubj:pass", "auxpass": "aux:pass", "name": "flat:name", - "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS + "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS } + class Convert1to2(Block): """Block for converting UD v1 to UD v2.""" @@ -45,7 +46,7 @@ def __init__(self, skip='', save_stats=True, **kwargs): self.skip = {k for k in skip.split(',')} self.save_stats = save_stats - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches """Apply all the changes on the current tree. This method is automatically called on each tree by Udapi. 
diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 17e0648c..4c203ddc 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -43,6 +43,7 @@ 'shape': 'subtree', } + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 81a98836..8381c69f 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -24,8 +24,9 @@ v['xpos'] = 'AsPpSp AtDf' v['deprel'] = 'case det' # The following are the default values - #v['main'] = 0 # which of the two words will inherit the original children (if any) - #v['shape'] = 'siblings', # the newly created nodes will be siblings + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py index a9f16b8c..07ef3eb3 100644 --- a/udapi/block/ud/fixpunctchild.py +++ b/udapi/block/ud/fixpunctchild.py @@ -1,6 +1,7 @@ """Block ud.FixPunctChild for making sure punctuation nodes have no children.""" from udapi.core.block import Block + class FixPunctChild(Block): """Make sure punct nodes have no children by rehanging the children upwards.""" diff --git a/udapi/block/ud/ga/to2.py b/udapi/block/ud/ga/to2.py index 4d8506e1..dbf093a9 100644 --- a/udapi/block/ud/ga/to2.py +++ b/udapi/block/ud/ga/to2.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class To2(Block): """Block for fixing the remaining cases (after ud.Convert1to2) in UD_Irish.""" diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py index f5f0f451..81a17c64 100644 --- a/udapi/block/ud/gl/to2.py +++ b/udapi/block/ud/gl/to2.py @@ -12,6 +12,7 @@ 'PUNCT': -10, } + 
class To2(Block): """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician.""" diff --git a/udapi/block/ud/goeswithfromtext.py b/udapi/block/ud/goeswithfromtext.py index 64e1d99f..fe419fa2 100644 --- a/udapi/block/ud/goeswithfromtext.py +++ b/udapi/block/ud/goeswithfromtext.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class GoeswithFromText(Block): """Block for splitting nodes and attaching via goeswith according to the the sentence text. @@ -96,6 +97,6 @@ def process_tree(self, root): else: last_node.misc['SpaceAfter'] = 'No' else: - assert False # we have checked the whole sentence already + assert False # we have checked the whole sentence already if text: logging.warning('Extra text "%s" in tree %s', text, root) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 510c6b2f..b082c0af 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -9,12 +9,12 @@ DEPREL_CHANGE = { "ROOT": "root", "prep": "case", - "ncomp": "case", # TODO ? + "ncomp": "case", # TODO ? "p": "punct", "poss": "nmod:poss", "ps": "case", "num": "nummod", - "number": "nummod", # TODO ? + "number": "nummod", # TODO ? "tmod": "nmod:tmod", "vmod": "acl", "rcmod": "acl:relcl", @@ -24,20 +24,20 @@ "predet": "det:predet", "gmod": "amod", "gobj": "obj", - "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 - "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both + "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 + "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both "redup": "compound:plur", "oblcomp": "obl", - "mes": "dep", # TODO ? - "mwn": "compound:n", # nominal multi-word - "mwa": "compound:a", # adjectival multi-word - "mwv": "compound:v", # verbal multi-word - "asp": "aux", # aspectual particle + "mes": "dep", # TODO ? 
+ "mwn": "compound:n", # nominal multi-word + "mwa": "compound:a", # adjectival multi-word + "mwv": "compound:v", # verbal multi-word + "asp": "aux", # aspectual particle "rcmodrel": "mark:relcl", - "auxcaus": "aux", # redundant with Voice=Cau + "auxcaus": "aux", # redundant with Voice=Cau "topic": "dep", "possessive": "case", - "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words + "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words # TODO: "ref" - in basic dependencies it should be rehanged and relabelled "conjv": "compound:conjv", } @@ -55,7 +55,7 @@ "mood=unsp_m": "", "animacy=unsp_r": "", "aspect=unsp_a": "", - "case=rel": "", # redundant with rcmodrel (mark:relcl) + "case=rel": "", # redundant with rcmodrel (mark:relcl) "reciprocity=non-rcp": "", "reciprocity=rcp": "PronType=Rcp", "aspect=imperf": "Aspect=Imp", @@ -75,9 +75,9 @@ "person_antecedent=3_a": "Person[psor]=3", "definiteness=def": "Definite=Def", "definiteness=indef": "Definite=Ind", - "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? + "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? "mood=sub2": "Mood=Sub", - "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) + "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) "tense=cnd": "Mood=Cnd", "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", @@ -86,9 +86,10 @@ "animacy=rat": "Animacy=Hum", "animacy=irrat": "Animacy=Nhum", "honorific=hon": "Polite=Form", - "mood=psm": "Tense=Fut", # TODO ? + "mood=psm": "Tense=Fut", # TODO ? 
} + class Google2ud(Convert1to2): """Convert Google Universal Dependency Treebank into UD style.""" @@ -111,11 +112,11 @@ def process_tree(self, root): self.fix_feats(node) self.fix_upos(node) self.fix_deprel(node) - #self.fix_quotes(node) + # self.fix_quotes(node) # This needs to be executed after all other deprels are converted for node in root.descendants: - if node.deprel in ('acomp', 'attr'): # TODO not sure about attr + if node.deprel in ('acomp', 'attr'): # TODO not sure about attr copula = node.parent node.parent = copula.parent node.deprel = copula.deprel @@ -183,6 +184,9 @@ def fix_upos(self, node): self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) node.upos = 'PART' + if node.upos == 'PUNCT' and node.form in ('$', '£'): + node.upos = 'SYM' + def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. @@ -205,7 +209,7 @@ def fix_deprel(self, node): preposition.parent = node # ud.Convert1to2 will change 'nmod' to 'obl' if needed - node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp # Prepositions should not have any children (except for deprel=fixed/mwe), see # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words. 
diff --git a/udapi/block/ud/he/fixneg.py b/udapi/block/ud/he/fixneg.py index 5062854c..15325990 100644 --- a/udapi/block/ud/he/fixneg.py +++ b/udapi/block/ud/he/fixneg.py @@ -6,6 +6,7 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Hebrew.""" diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index f785f556..6c63e93f 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -28,6 +28,7 @@ 'VERB': 'VerbForm', } + class MarkBugs(Block): """Block for checking suspicious/wrong constructions in UD v2.""" diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py index 462e9fbd..99c37b4d 100644 --- a/udapi/block/ud/removemwt.py +++ b/udapi/block/ud/removemwt.py @@ -1,6 +1,7 @@ """Block ud.RemoveMwt for removing multi-word tokens.""" from udapi.core.block import Block + class RemoveMwt(Block): """Substitute MWTs with one word representing the whole MWT.""" @@ -26,7 +27,7 @@ def guess_deprel(words): """DEPREL of the whole MWT""" return words[0].deprel # Alternatively, we could define deprel subtypes - #return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) + # return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) @staticmethod def guess_feats(words): diff --git a/udapi/block/ud/ro/fixneg.py b/udapi/block/ud/ro/fixneg.py index a22131b2..68888aa6 100644 --- a/udapi/block/ud/ro/fixneg.py +++ b/udapi/block/ud/ro/fixneg.py @@ -6,13 +6,14 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Romanian.""" def process_node(self, node): if node.deprel == "neg": if node.upos == "PRON" and node.form == "ne": - node.feats = 'Polarity=Neg' # delete other features + node.feats = 'Polarity=Neg' # delete other features elif node.upos != "ADJ": logging.warning("Strange node %s with deprel=neg", node) node.upos = "ADV" diff 
--git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py index bc18f364..6c4b27e3 100644 --- a/udapi/block/ud/ro/setspaceafter.py +++ b/udapi/block/ud/ro/setspaceafter.py @@ -10,6 +10,7 @@ import udapi.block.ud.setspaceafter + class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian. @@ -37,7 +38,7 @@ def process_tree(self, root): # Mark contractions like -i, -și, -l, -urilor, but not negative numbers like -12,3. # Store SpaceAfter=No to the previous node. - next_form = nodes[i+1].form + next_form = nodes[i + 1].form if re.match('-.*[^0-9,.]', next_form): self.mark_no_space(node) diff --git a/udapi/block/ud/ru/fixremnant.py b/udapi/block/ud/ru/fixremnant.py index d94b0e5c..b41431db 100644 --- a/udapi/block/ud/ru/fixremnant.py +++ b/udapi/block/ud/ru/fixremnant.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class FixRemnant(Block): """ad-hoc fixing the remaining cases (after ud.Convert1to2) of deprel=remnant in UD_Russian.""" diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py index 00193770..e796bf0d 100644 --- a/udapi/block/ud/setspaceafter.py +++ b/udapi/block/ud/setspaceafter.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class SetSpaceAfter(Block): """Block for heuristic setting of the SpaceAfter=No MISC attribute.""" @@ -40,14 +41,14 @@ def process_tree(self, root): not_after += '“' for i, node in enumerate(nodes[:-1]): - next_form = nodes[i+1].form + next_form = nodes[i + 1].form if node.form in self.not_after or next_form in not_before: self.mark_no_space(node) if matching_quotes and node.form == '"': if odd_indexed_quote: self.mark_no_space(node) elif i: - self.mark_no_space(nodes[i-1]) + self.mark_no_space(nodes[i - 1]) odd_indexed_quote = not odd_indexed_quote if matching_quotes and nodes[-1].form == '"': diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py 
index 3dcd12f2..e3a1f90f 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 25caeb3b..094f181a 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -8,6 +8,7 @@ import logging from udapi.core.block import Block + class SplitUnderscoreTokens(Block): """Block for spliting tokens with underscores and attaching the new nodes using deprel=flat. diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index d72cbf16..78563abb 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -74,6 +74,7 @@ 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', } + class Base(Block): """Base class for all UDPipe blocks.""" diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index c5fa04f2..9bde12bf 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -5,10 +5,12 @@ from udapi.core.block import Block -pp = pprint.pprint # pylint: disable=invalid-name +pp = pprint.pprint # pylint: disable=invalid-name # We need exec in this block and the variables this etc. are not unused but provided for the exec # pylint: disable=exec-used,unused-variable + + class Eval(Block): r"""Special block for evaluating code given by parameters. 
@@ -66,7 +68,7 @@ def process_document(self, document): if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: for bundle in doc.bundles: - #TODO if self._should_process_bundle(bundle): + # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) def process_bundle(self, bundle): @@ -83,7 +85,7 @@ def process_bundle(self, bundle): if self.tree or self.node: trees = bundle.trees for tree in trees: - #TODO if not self._should_process_tree(tree): continue + # TODO if not self._should_process_tree(tree): continue self.process_tree(tree) if self.after_bundle: diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index b812fb64..6d4118d6 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -1,10 +1,12 @@ """Filter is a special block for keeping/deleting subtrees specified by parameters.""" -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block # We need eval in this block # pylint: disable=eval-used + + class Filter(Block): """Special block for keeping/deleting subtrees specified by parameters. 
@@ -25,8 +27,7 @@ class Filter(Block): udapy -s util.Filter delete_subtree='node.deprel == "remnant"' < in > filtered """ - - def __init__(self, # pylint: disable=too-many-arguments + def __init__(self, # pylint: disable=too-many-arguments delete_tree=None, delete_tree_if_node=None, delete_subtree=None, keep_tree=None, keep_tree_if_node=None, keep_subtree=None, mark=None, **kwargs): @@ -72,7 +73,7 @@ def __init__(self, # pylint: disable=too-many-arguments self.keep_subtree = keep_subtree self.mark = mark - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches root = tree if self.delete_tree is not None: diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index d01221b7..e05afe76 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -16,6 +16,7 @@ from udapi.block.write.conllu import Conllu from udapi.core.run import _parse_block_name + class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" @@ -31,14 +32,14 @@ def process_document(self, document): try: command = "from " + module + " import " + class_name + " as b" logging.debug("Trying to run command: %s", command) - exec(command) # pylint: disable=exec-used + exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "b()" # TODO params as kwargs logging.debug("Trying to evaluate this: %s", command) - new_block = eval(command) # pylint: disable=eval-used + new_block = eval(command) # pylint: disable=eval-used doc_copy = copy.deepcopy(document) writer = Conllu(files=self.orig_files) @@ -48,12 +49,12 @@ def process_document(self, document): self.block, bundle_no, bundle.bundle_id) try: new_block.process_bundle(bundle) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # pylint: 
disable=broad-except logging.warning('util.FindBug found a problem in bundle %d in block %s: %r', bundle_no, self.block, exc) logging.warning('Printing a minimal example to %s', self.orig_files) - for tree in document.bundles[bundle_no-1].trees: + for tree in document.bundles[bundle_no - 1].trees: writer.process_tree(tree) if self.first_error_only: diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py index 42052336..02682a92 100644 --- a/udapi/block/util/mark.py +++ b/udapi/block/util/mark.py @@ -1,10 +1,12 @@ """util.Mark is a special block for marking nodes specified by parameters.""" -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block # We need eval in this block # pylint: disable=eval-used + + class Mark(Block): """Mark nodes specified by parameters. @@ -12,6 +14,7 @@ class Mark(Block): # see non-projective trees with non-projective edges highlighted udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R """ + def __init__(self, node, mark=1, **kwargs): """Create the Mark block object. diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index 57cdf81c..aa7131b7 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -38,7 +38,7 @@ This helps to highlight what is special about the matching nodes. 
""" from collections import Counter -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block @@ -46,6 +46,8 @@ # We need eval in this block # pylint: disable=eval-used + + class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index df9db3f4..403daf5f 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -1,6 +1,7 @@ """Wc is a special block for printing statistics (word count etc).""" from udapi.core.block import Block + class Wc(Block): """Special block for printing statistics (word count etc).""" diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 8c65f0fd..6c2dc314 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -15,7 +15,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.node_attributes = ["ord", "form", "lemma", "upos", "xpos", "feats", "parent", "deprel", "raw_deps", "misc"] - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches nodes = tree.descendants # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0 diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 85b8dcc9..ec33b0fd 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -73,7 +73,7 @@ def process_document(self, doc): js_t_v = self.path_to_js + '/js-treex-view.js' print('') - print('Udapi viewer') # TODO doc.loaded_from + print('Udapi viewer') # TODO doc.loaded_from for js_file in (jquery, fsaver, js_t_v): print('' % js_file) print('\n') @@ -134,7 +134,7 @@ def print_node(node): multiline_feats = feats.replace('|', r'\n') print(',{{"id":{id_node},"parent":{id_parent},"order":{order},{firstson_str}{rbrother_str}' 
'"data":{{"ord":{order},"form":"{form}","lemma":"{lemma}","upos":"{upos}",' - '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps + '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps '"misc":"{misc}","id":"{address}"}},' '"labels":["{form}","#{{#bb0000}}{upos}","#{{#0000bb}}{deprel}"],' '"hint":"lemma={lemma}\\n{multiline_feats}"}}'.format(**locals())) @@ -151,6 +151,7 @@ def _id(node): return 'null' return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + def _esc(string): if string is None: string = '' diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index b5968b3a..d32a5fe1 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -17,6 +17,8 @@ # Too many instance variables, arguments, branches... # I don't see how to fix this while not making the code less readable or more difficult to use. # pylint: disable=R0902,R0912,R0913,R0914 + + class TextModeTrees(BaseWriter): """An ASCII pretty printer of dependency trees. 
@@ -181,17 +183,17 @@ def process_tree(self, root): # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = [0,] * (1 + len(root.root.descendants)) + self._gaps = [0, ] * (1 + len(root.root.descendants)) self._compute_gaps(root) # Precompute lines for printing - stack = [root,] + stack = [root, ] while stack: node = stack.pop() children = node.children(add_self=1) min_idx, max_idx = self._index_of[children[0].ord], self._index_of[children[-1].ord] - max_length = max([self.lengths[i] for i in range(min_idx, max_idx+1)]) - for idx in range(min_idx, max_idx+1): + max_length = max([self.lengths[i] for i in range(min_idx, max_idx + 1)]) + for idx in range(min_idx, max_idx + 1): idx_node = allnodes[idx] filler = '─' if self._ends(idx, '─╭╰├╪') else ' ' self._add(idx, filler * (max_length - self.lengths[idx])) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 21bd8e92..4da977a5 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -1,5 +1,5 @@ """An ASCII pretty printer of colored dependency trees in HTML.""" -from html import escape # pylint: disable=no-name-in-module +from html import escape # pylint: disable=no-name-in-module from udapi.block.write.textmodetrees import TextModeTrees @@ -15,6 +15,7 @@ mark {box-shadow: 0px 0px 0px 1px red; font-weight: bold;} ''' + class TextModeTreesHtml(TextModeTrees): """An ASCII pretty printer of colored dependency trees in HTML. @@ -41,7 +42,8 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, # we need to skip this, but call BaseWriter's method which redirects stdout. 
- super(TextModeTrees, self).before_process_document(document) #pylint: disable=bad-super-call + super(TextModeTrees, self).before_process_document( + document) # pylint: disable=bad-super-call print('\n\n\n') print('' + self.title + '') print('') + print('\n') + + for tree in doc.trees: + self.process_tree(tree) + + print('') + print('') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + sent_mentions = [] + for mention in mentions: + mspan = mention.span + if ',' not in mspan: + sent_mentions.append(mention) + else: + entity = mention.entity + head_str = str(mention.words.index(mention.head) + 1) + subspans = mspan.split(',') + for idx,subspan in enumerate(subspans, 1): + subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' + subspan_words = span_to_nodes(tree, subspan) + fake_entity = CorefEntity(subspan_eid, entity.etype) + fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) + if mention._other: + fake_mention._other = mention._other + if mention._bridging and idx == 1: + fake_mention._bridging = mention._bridging + sent_mentions.append(fake_mention) + sent_mentions.sort(reverse=True) + + opened = [] + print('

') + for node in nodes_and_empty: + while sent_mentions and sent_mentions[-1].words[0] == node: + m = sent_mentions.pop() + e = m.entity + classes = f'{e.eid} {e.etype or "other"}' + if all(w.is_empty() for w in m.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' + print(f'', end='') + opened.append(m) + + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + print('', end='') + opened.pop() + + if not node.no_space_after: + print(' ', end='') + + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + +# id needs to be a valid DOM querySelector +# so it cannot contain # nor / and it cannot start with a digit +def _id(node): + if node is None: + return 'null' + return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') From e3ae1c3fb65fa62431e23c2bfff9d8534d458019 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 13:25:49 +0100 Subject: [PATCH 0753/1374] fix visualization of discontinuous mentions introduce CorefMentionSubspan instead of fake mentions (should be used also in store_coref_to_misc() in future) --- udapi/block/write/corefhtml.py | 40 +++++++++++----------------------- udapi/core/coref.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index fc49dfb4..890b172a 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -44,44 +44,30 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) - sent_mentions = [] + subspans = [] for mention in mentions: - mspan = mention.span - if ',' not in mspan: - sent_mentions.append(mention) - else: - entity = mention.entity - head_str = str(mention.words.index(mention.head) + 1) - subspans = mspan.split(',') - for idx,subspan in enumerate(subspans, 1): - subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' - subspan_words = span_to_nodes(tree, subspan) - fake_entity = CorefEntity(subspan_eid, entity.etype) - fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) - if mention._other: - fake_mention._other = mention._other - if mention._bridging and idx == 1: - fake_mention._bridging = mention._bridging - sent_mentions.append(fake_mention) - 
sent_mentions.sort(reverse=True) + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) opened = [] print('

') for node in nodes_and_empty: - while sent_mentions and sent_mentions[-1].words[0] == node: - m = sent_mentions.pop() + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + m = subspan.mention e = m.entity classes = f'{e.eid} {e.etype or "other"}' - if all(w.is_empty() for w in m.words): + if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: classes += ' singleton' - title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' - print(f'', end='') - opened.append(m) + title += f'\n{m.other}' + print(f'', end='') #data-eid="{e.eid}" + + opened.append(subspan) is_head = self._is_head(node) if is_head: diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..1a6d1f95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? 
@@ -247,6 +258,32 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + assert False + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -886,7 +923,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. 
""" if not nodes: From b78ef7eea0b76c4f41f8408d918092681d9c5fad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:16:46 +0100 Subject: [PATCH 0754/1374] util.Normalize: sort attributes in FEATS and MISC --- udapi/block/util/normalize.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 udapi/block/util/normalize.py diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..5b4270cc --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,40 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + # TODO: normalize also standardized comments like text, sent_id,... 
+ + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None From 90f338de077467acb4cb9ebebce68179419a0d77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:29:30 +0100 Subject: [PATCH 0755/1374] allow writing to node.sdeprel, add tests --- udapi/core/node.py | 8 ++++++++ udapi/core/tests/test_node.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..e188e134 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..8bc7f182 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -119,7 +119,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +145,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. 
From 5817af214df034e42cf09ef2c08f0c8d15b3a0d9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 16:31:50 +0100 Subject: [PATCH 0756/1374] write.CorefHtml marks subspans of discontiuous mentions with a red border --- udapi/block/write/corefhtml.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 890b172a..e0ab830b 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,26 +18,34 @@ def process_document(self, doc): #print('') print('') print('\n') + mention_ids = {} + for entity in doc.coref_entities: + for idx, mention in enumerate(entity.mentions, 1): + mention_ids[mention] = f'{entity.eid}e{idx}' + for tree in doc.trees: - self.process_tree(tree) + self.process_tree(tree, mention_ids) print('') + ' e.stopPropagation();\n});\n' + '$("span").hover(\n' + ' function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},\n' + ' function(e) {$("span").removeClass("active");}\n' + ');\n') print('') - def process_tree(self, tree): + def process_tree(self, tree, mention_ids): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -56,7 +64,7 @@ def process_tree(self, tree): subspan = subspans.pop() m = subspan.mention e = m.entity - classes = f'{e.eid} {e.etype or "other"}' + classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: From 355e7bdc32ab854827aff1f7277b069f5c5a8bc0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 17:57:48 +0100 Subject: [PATCH 0757/1374] write.CorefHtml shows also crossing mentions using valid (well-nested) html --- udapi/block/write/corefhtml.py | 56 +++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/udapi/block/write/corefhtml.py 
b/udapi/block/write/corefhtml.py index e0ab830b..3efe9793 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,7 +18,8 @@ def process_document(self, doc): #print('') print('') @@ -35,15 +74,37 @@ def process_document(self, doc): for tree in doc.trees: self.process_tree(tree, mention_ids) - print('') + print('') print('') def _start_subspan(self, subspan, mention_ids, crossing=False): @@ -74,8 +135,10 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) + if tree.newpar: + print('


') opened = [] - print('

') + print(f'

') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() From 9e11bd515e19fa59c0bdbc50654d29544b13a21b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Feb 2023 18:03:19 +0100 Subject: [PATCH 0764/1374] util.Normalize now normalizes also sent_id --- udapi/block/util/normalize.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 5b4270cc..298bea42 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,16 +20,33 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers + `start_sent_id`: the first sent_id number """ super().__init__(**kwargs) self.feats = feats self.misc = misc - # TODO: normalize also standardized comments like text, sent_id,... + self.sent_id = sent_id + self.next_sent_id = start_sent_id + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + if self.sent_id: + bundle.bundle_id = str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + for node in tree.descendants: + self.process_node(node) def process_node(self, node): if self.feats: From 4e1b75678dab1f2602cc26b641a31de977a98f14 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 00:47:37 +0100 Subject: [PATCH 0765/1374] sent_id should not be normalized by default Unlike feats and misc ordering, we can lose information this way - the original sent_id, so it is potentially dangerous. --- udapi/block/util/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 298bea42..48cd6dc1 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,12 +20,12 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. - `sent_id`: normalize sent_id so it forms a sequence of integers + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
`start_sent_id`: the first sent_id number """ super().__init__(**kwargs) From b899af14c12c7ba4c9750ba39bf5f5544783ba59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 09:53:59 +0100 Subject: [PATCH 0766/1374] write.Conllu path=another/directory keeps the file name, but changes the directory --- udapi/core/basewriter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..93f6463a 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,7 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os import udapi.core.coref from udapi.core.block import Block @@ -11,7 +12,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +30,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + self.path = path @property def filename(self): @@ -60,9 +62,11 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + docname = os.path.join(self.path, os.path.split(docname)[1]) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: From 
9d183c1d979c50fabff9b3a295a0d8194a09c790 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 10:14:59 +0100 Subject: [PATCH 0767/1374] etype mismatch is stored in mention.other["orig_etype"] which allows easier debugging --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1a13d9fb..12dda239 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -665,6 +665,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype From 5b3ed0268ccf76f5332fcce87ac0da9a42b221b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:19:33 +0100 Subject: [PATCH 0768/1374] allow using e.g. write.CorefHtml path='html/*.html' --- udapi/core/basewriter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 93f6463a..e17a64c3 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,11 +66,21 @@ def before_process_document(self, document): docname = document.meta.get('loaded_from', None) if docname is not None: if self.path: - docname = os.path.join(self.path, os.path.split(docname)[1]) + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, 
newline=self.newline) else: - logging.warning('overwrite=1 but document.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: From 34aa19d7d892790b81b2b79579fc4391c07a23ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:42:30 +0100 Subject: [PATCH 0769/1374] write.Conllu path=my_dir should be interpreted as path=my_dir/ --- udapi/core/basewriter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index e17a64c3..6e1b7446 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -30,6 +30,9 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep self.path = path @property From 301b808082254a9b45a2bd4cfe162719dc02bc23 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 4 Feb 2023 01:36:25 +0100 Subject: [PATCH 0770/1374] corefud.GuessSpan: add empty nodes that are causing gaps --- udapi/block/corefud/guessspan.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py index 5c3c6c12..d6093ece 100644 --- a/udapi/block/corefud/guessspan.py +++ b/udapi/block/corefud/guessspan.py @@ -4,6 +4,30 @@ class GuessSpan(Block): """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" def process_coref_mention(self, mention): - mention.words = mention.head.descendants(add_self=True) - # TODO add empty nodes that are causing gaps + mwords = mention.head.descendants(add_self=True) # TODO add heuristics from 
corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) From 2285d27f5e9444d3db7a8a0b8db227b38e5c082b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 5 Feb 2023 01:06:32 +0100 Subject: [PATCH 0771/1374] write.CorefHtml: distinguish entities using colors, show eid and docname --- udapi/block/write/corefhtml.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 8503854f..0a06b7e5 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -1,19 +1,21 @@ """CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" from udapi.core.basewriter import BaseWriter from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter import udapi.block.write.html ETYPES = 'person place organization animal plant object substance time number abstract event'.split() CSS = ''' 
.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} -.singleton {border-style: dotted;} +.sentence .singleton {border-style: dotted;} .crossing:before {content: "!"; display: block; background: #ffd500;} .active {border: 1px solid red !important;} -.selected {background: red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} .other {background: hsl(0, 0%, 85%);} ''' @@ -50,9 +52,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, **kwargs): + def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees + self.show_eid = show_eid + self.colors = colors def process_document(self, doc): print('') @@ -63,16 +67,25 @@ def process_document(self, doc): print('') print('\n') mention_ids = {} + entity_colors = {} + entities_of_type = Counter() for entity in doc.coref_entities: + if self.colors: + count = entities_of_type[entity.etype] + entities_of_type[entity.etype] = count + 1 + entity_colors[entity] = f'c{count % self.colors}' for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' for tree in doc.trees: - self.process_tree(tree, mention_ids) + self.process_tree(tree, mention_ids, entity_colors) print('') print('') - def _start_subspan(self, subspan, mention_ids, crossing=False): + def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention e = m.entity classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' - title = f'eid={subspan.subspan_eid}\ntype={e.etype}\nhead={m.head.form}' + title = f'eid={subspan.subspan_eid}\ntype={e.etype} 
({entity_colors[e]})\nhead={m.head.form}' + if self.colors: + classes += f' {entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -121,9 +136,11 @@ def _start_subspan(self, subspan, mention_ids, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') #data-eid="{e.eid}" + print(f'', end='') + if self.show_eid: + print(f'{subspan.subspan_eid}', end='') - def process_tree(self, tree, mention_ids): + def process_tree(self, tree, mention_ids, entity_colors): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -135,14 +152,16 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) - if tree.newpar: + if tree.newdoc: + print(f'


{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: print('
') opened = [] print(f'

') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids) + self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) is_head = self._is_head(node) @@ -180,7 +199,7 @@ def process_tree(self, tree, mention_ids): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, True) + self._start_subspan(broken, mention_ids, entity_colors, True) opened.append(subspan) if not node.no_space_after: From cae7c37efe8548c2e432b108e4aa06df3b778e3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:07:42 +0100 Subject: [PATCH 0772/1374] `read.Conllu max_docs=3` will load only the first three documents This is nice for debugging coreference files, where we cannot load just first N sentences because there may be Bridge/SplitAnte referring to unknown eid. This way we load whole docs. --- udapi/block/read/conllu.py | 22 ++++++++++++++++++++-- udapi/core/basereader.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bba69696..d5623fba 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -81,8 +81,26 @@ def parse_comment_line(self, line, root): root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + return [self.read_tree_from_lines(s.split('\n')) for s in + self.filehandle.read().split('\n\n') if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. 
+ trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + trees.append(tree) + else: + lines.append(line) + return def read_tree(self): if self.filehandle is None: diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a3b334da..a841bf1b 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -29,6 +30,8 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -126,6 +129,11 @@ def try_fast_load(self, document): bundle, last_bundle_id = None, '' for root in trees: + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 add_to_the_last_bundle = False if self.ignore_sent_id: @@ -180,8 +188,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -204,6 +214,17 @@ def process_document(self, document): if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -222,6 +243,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. 
if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -231,6 +255,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: From ae34d8024d8ee95db6e1bf39581e44fc08bcbc73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:25:50 +0100 Subject: [PATCH 0773/1374] refactor code duplication --- udapi/block/write/corefhtml.py | 29 +++-------------------------- udapi/block/write/html.py | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 0a06b7e5..c7950ce9 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -50,6 +50,8 @@ }); ''' +WRITE_HTML = udapi.block.write.html.Html() + class CorefHtml(BaseWriter): def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): @@ -90,32 +92,7 @@ def process_document(self, doc): print('') print('') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..48431900 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,26 @@ def process_document(self, doc): print('\n') print('

') + + def print_doc_json(self, doc): print('data=[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue zone = tree.zone if first_zone: first_zone = False @@ -101,24 +111,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable From ca4d2b7f8240a0faca55f9aad6513d9a94968a08 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 19:53:25 +0100 Subject: [PATCH 0774/1374] write.CorefHtml: add side panel with an overview of entities --- udapi/block/write/corefhtml.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index c7950ce9..280fc213 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -6,7 +6,25 @@ ETYPES = 'person place organization animal plant object substance time number abstract event'.split() +HEADER = ''' + +Udapi CorefUD viewer + +''' +# I 
use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# so that the width of #overview can be changed by dragging the bottom right corner. +# The following lines would make the whole right border draggable: +# +# +# +#
CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} .sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} @@ -23,10 +41,16 @@ $("span").click(function(e) { let was_selected = $(this).hasClass("selected"); $("span").removeClass("selected"); - if (!was_selected){$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); +window.onhashchange = function() { + $("span").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + $("span").hover( function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} @@ -60,10 +84,18 @@ def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): self.show_eid = show_eid self.colors = colors + def _representative_word(self, entity): + # return the first PROPN or NOUN. Or the most frequent one? + heads = [m.head for m in entity.mentions] + lemma_or_form = lambda n: n.lemma if n.lemma else n.form + for upos in ('PROPN', 'NOUN'): + nodes = [n for n in heads if n.upos == upos] + if nodes: + return lemma_or_form(nodes[0]) + return lemma_or_form(heads[0]) + def process_document(self, doc): - print('') - print('Udapi CorefUD viewer') - print('') + print(HEADER) if self.show_trees: print('') print('') - print('\n') + print('\n\n
') mention_ids = {} entity_colors = {} @@ -86,8 +118,21 @@ def process_document(self, doc): for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' + print('
') + print('' + '' + '\n') + for entity in doc.coref_entities: + print(f'' + f'' + f'') + print('
eid#mword
{entity.eid}{len(entity.mentions)}{self._representative_word(entity)}
') + print('
') + + print('
') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) + print('
') print('') - print('') + print('
') def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention @@ -113,7 +158,10 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{e.eid}" ' + print(f'', end='') if self.show_eid: print(f'{subspan.subspan_eid}', end='') From bbd702aa35fcf4e13d2a4ab2d3972a7efd89fcc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 16:22:03 +0100 Subject: [PATCH 0775/1374] Python glob.glob does not support {dir1,dir2} anyway --- udapi/core/files.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..c6973dad 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) From a5acaf43b1edb3468dfc493da6e7ae87f2d99966 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 17:58:45 +0100 Subject: [PATCH 0776/1374] ud.ComplyWithText: use node.misc['CorrectForm'] instead of node.misc['OrigForm'] which was a misleading name because the previous form value is usually not the real original form. 
--- udapi/block/ud/complywithtext.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..bacc56a2 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -34,7 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,24 +54,33 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + previous_form_attr - when changing node.form, we store the previous value + in node.misc[previous_form_attr] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + node.misc[self.previous_form_attr] = node.form def process_tree(self, root): text = root.text @@ -203,7 +212,7 @@ def solve_diff(self, nodes, form): if ' ' in form: if len(nodes) == 1 and node.form == form.replace(' ', ''): if self.allow_space(form): - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form elif self.allow_goeswith: forms = form.split() @@ -235,7 +244,7 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form From a69c7a158edb91d12d2907f6802c3104d946ee0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 18:00:46 +0100 Subject: [PATCH 0777/1374] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U so even if there are diffs which cannot be resolved, and thus we cannot fill SpaceAfter=No in the rest of the sentence, we must execute the "if self.fix_text:..." code, which changes the root.text (instead of changing the annotation of nodes). 
--- udapi/block/ud/complywithtext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index bacc56a2..1a13a4ec 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -121,7 +121,7 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: From fde163c32837ccc02a9b89d535be9769d4414340 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Wed, 8 Feb 2023 14:23:05 +0100 Subject: [PATCH 0778/1374] further adjusted Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 122 ++++++++++++++++++----------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 323f60f7..111bceb9 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -37,11 +37,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Dim'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Compound'] = ['Yes'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -76,14 +76,12 @@ def process_node(self, node): 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} if self.flavio: - # Flavio does not use Degree=Pos, hence Degree is not required. - # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -93,15 +91,16 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] @@ -122,6 +121,19 @@ def process_node(self, node): rf = [f for f in rf if f != 'Case'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + elif node.lemma in ['quicumque', 'qui', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] @@ -140,7 +152,9 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Cmp', 'Abs', 'Sup'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) @@ -152,8 +166,24 @@ def process_node(self, node): if node.feats['Person[psor]'] != '3': rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] - else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 
'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + af['PronType'].append('Rel') + elif node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] @@ -170,8 +200,8 @@ def process_node(self, node): rf = ['NumType', 'NumForm'] af = { 'NumType': ['Card'], - 'NumForm': ['Word', 'Roman', 'Digit'] - } + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -186,40 +216,40 @@ def process_node(self, node): elif re.match(r'^(VERB|AUX)$', node.upos): rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] } - if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') - af['Tense'] = ['Pres', 'Fut'] - if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB': rf.append('Voice') af['Voice'] = ['Act', 'Pass'] - # Main verbs have aspect but auxiliaries don't. 
- # TODO: apparently, apparently AUXs have aspect as well - # if node.upos == 'VERB': - # rf.append('Aspect') - # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) - af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] elif node.feats['VerbForm'] == 'Part': rf.extend(['Gender', 'Number', 'Case']) - af['Number'] = ['Sing', 'Plur'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] af['Degree'] = ['Abs', 'Cmp'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] - af['Tense'].append('Past') - # else: nothing to be added for VerbForm=Inf + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') if self.flavio: - # Flavio has killed Tense in his treebanks. - rf = [f for f in rf if f != 'Tense'] - af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] if 'Degree' in af: @@ -228,23 +258,22 @@ def process_node(self, node): af['Degree'] = ['Dim'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] - if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] - af['VerbForm'].append('Vnoun') + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': af = { - 'AdvType': ['Loc', 'Tim'], + 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['NumType'] = ['Card', 'Ord'] # e.g., primum af['VerbForm'] = ['Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) @@ -262,7 +291,8 @@ def process_node(self, node): elif re.match(r'^[CS]CONJ$', node.upos): af = { 'PronType': ['Rel', 'Con'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'Compound': ['Yes']} if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] @@ -271,10 +301,14 @@ def process_node(self, node): self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } if self.flavio: - af = { - 'VerbForm': ['Part'], - 'Proper': ['Yes']} + af['VerbForm'] = ['Part'], + af['Proper'] = ['Yes'] self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 29fb09caccd678560845ea3d80b2027145231c90 Mon Sep 17 00:00:00 2001 From: 
Martin Popel Date: Wed, 8 Feb 2023 18:04:56 +0100 Subject: [PATCH 0779/1374] improve ud.ComplyWithText for KorKor --- udapi/block/ud/complywithtext.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 1a13a4ec..02904731 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,6 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -54,6 +55,14 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). previous_form_attr - when changing node.form, we store the previous value in node.misc[previous_form_attr] (so no information is lost). 
Default="CorrectForm" because we expect that the previous value @@ -62,6 +71,7 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ the original spelling with typos as found in the raw text. CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. """ super().__init__(**kwargs) self.fix_text = fix_text @@ -70,17 +80,20 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.max_mwt_length = max_mwt_length self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``"): + if node.form not in ("''", "``") and self.previous_form_attr: node.misc[self.previous_form_attr] = node.form + if self.previous_form_attr == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text @@ -190,18 +203,38 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc['Added'] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -210,20 +243,25 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1 and node_form == form.replace(' ', ''): if self.allow_space(form): self.store_previous_form(node) node.form = form elif self.allow_goeswith: + self.store_previous_form(node) forms = form.split() node.form = forms[0] + node.feats['Typo'] = 'Yes' for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + new = node.create_child(form=split_form, deprel='goeswith', upos='X') new.shift_after_node(node) else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -244,8 +282,13 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_previous_form(node) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc['Added'] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): From d5a1a2a756ef13629984eb40af7b5853dbd8c7a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:06:45 +0100 Subject: [PATCH 0780/1374] udapy hints when using a wrong block name or parameter name thanks to @michnov for this idea --- udapi/core/block.py | 23 +++++++++++++++++++---- udapi/core/run.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..fdcad9fa 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -73,7 +88,7 @@ def process_document(self, document): p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -85,8 +100,8 @@ def process_document(self, document): if p_bundle or p_tree or p_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..418baca6 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,26 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bname = [c for c in dir(module) if c.lower() == sname][0] + blocks.append(f"{pname}.{bname}") + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
@@ -92,8 +112,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + blocks = _blocks_in_a_package(package_name) + if not blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available block in {package_name} are:\n" + + "\n".join(_blocks_in_a_package(package_name))) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. From 49ed44d2e309523cdf3361c599934d5dbf58a2a8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:23:36 +0100 Subject: [PATCH 0781/1374] read.XY files='!*.conllu' should iterated over sorted files glob.glob() returns files in an arbitrary order (as `ls -U`) --- udapi/core/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index c6973dad..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -65,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': From 1a4241104709e7647cf75ff84dbc68df3428fbe0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Feb 2023 23:49:11 +0100 Subject: [PATCH 0782/1374] improve ud.ComplyWithText (for KorKor) --- udapi/block/ud/complywithtext.py | 70 ++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 25 deletions(-) diff --git 
a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 02904731..c850018e 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,8 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_attr='CorrectForm', **kwargs): + previous_form_label='CorrectForm', previous_text_label='CorrectText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -63,8 +64,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") node2(form="in", deprel="goeswith", upos="X", parent=node1) node3(form="law", deprel="goeswith", upos="X", parent=node1). - previous_form_attr - when changing node.form, we store the previous value - in node.misc[previous_form_attr] (so no information is lost). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). Default="CorrectForm" because we expect that the previous value (i.e. the value of node.form before applying this block) contained the corrected spelling, while root.text contains @@ -72,6 +73,12 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
+ previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. + Default="CorrectText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text @@ -81,7 +88,9 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct self.allow_hyphen_goeswith = allow_hyphen_goeswith - self.previous_form_attr = previous_form_attr + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): @@ -90,9 +99,9 @@ def allow_space(form): def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``") and self.previous_form_attr: - node.misc[self.previous_form_attr] = node.form - if self.previous_form_attr == 'CorrectForm': + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': node.feats['Typo'] = 'Yes' def process_tree(self, root): @@ -140,7 +149,8 @@ def process_tree(self, root): if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -152,6 +162,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if 
text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -208,12 +222,11 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): elif edit == 'insert': forms = text[text_lo:text_hi].split(' ') if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: - #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') next_node = char_nodes[tree_lo] for f in reversed(forms): new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') new.shift_before_node(next_node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: logging.warning('Unable to insert nodes\n%s', _diff2str(diff, tree_chars, text)) @@ -246,18 +259,26 @@ def solve_diff(self, nodes, form): node_form = node.form if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: node_form = node_form.replace('-', '') - if len(nodes) == 1 and node_form == form.replace(' ', ''): - if self.allow_space(form): - self.store_previous_form(node) - node.form = form - elif self.allow_goeswith: - self.store_previous_form(node) - forms = form.split() - node.form = forms[0] - node.feats['Typo'] = 'Yes' - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos='X') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in 
reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: @@ -283,9 +304,10 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): - new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: self.store_previous_form(node) node.form = form @@ -313,6 +335,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) From 3abb76df036f7aa2e8f39437aa7d5b80032ae850 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:08:12 +0100 Subject: [PATCH 0783/1374] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U even if the raw texts include double spaces or no-break spaces (TODO: alternatively, we could annotate these using SpacesAfter). 
--- udapi/block/ud/complywithtext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index c850018e..351ebc01 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -109,9 +109,13 @@ def process_tree(self, root): if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return From 0c6f946802345cc670ece9663fc7007ff05efd73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:09:36 +0100 Subject: [PATCH 0784/1374] corefud.PrintMentions should show Entity annotations in MISC by default --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 12db433a..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -12,7 +12,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): From f9dd071481e49944fe6c70629bf9d56a90bd86d6 Mon Sep 17 00:00:00 2001 From: Martin 
Popel Date: Fri, 10 Feb 2023 14:27:46 +0100 Subject: [PATCH 0785/1374] keep newdoc and global.Entity when using read.Conllu sent_id_filter=regex The global.Entity comment will be read automatically by read.Conllu and then inserted automatically by write.Conllu, but only for trees with tree.newdoc, so we need to keep this annotation as well (move it to the new first tree in a given document). --- udapi/core/basereader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a841bf1b..71d57159 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -97,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): From b036d572af97a9f06482ccdcd7e90cfe4f0f5655 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 15:15:48 +0100 Subject: [PATCH 0786/1374] update ord of empty nodes when deleting preceding nonempty nodes TODO: add tests, solve also deleting of empty nodes --- udapi/core/node.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 618e75eb..8a764498 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -516,6 +516,7 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). 
""" self._parent._children.remove(self) + empty_follows = None if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -523,6 +524,16 @@ def remove(self, children=None): self._parent._children.extend(self._children) self._parent._children.sort() self._children.clear() + elif self._root.empty_nodes: + will_be_removed = self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.empty: + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) @@ -536,14 +547,29 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + for empty in self._root.empty_nodes: + if empty > self: + empty.ord = round(empty.ord - 1, 1) else: # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
+ if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty.ord + (empty.ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" From 6c289d3bda8134a683f6362198888ee920520203 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 16:32:51 +0100 Subject: [PATCH 0787/1374] ud.ComplyWithText: the previous root.text value is better described as OrigText Unlike the previous node.form values, it is (usually) the original raw text including typos etc, so the label "CorrectText" was completely misleading. --- udapi/block/ud/complywithtext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 351ebc01..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,7 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_label='CorrectForm', previous_text_label='CorrectText', + previous_form_label='CorrectForm', previous_text_label='OrigText', added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -74,8 +74,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
previous_text_label - when we are not able to adapt the annotation to match root.text - and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. - Default="CorrectText". When setting this parameter to an empty string, + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, no values will be stored to root.comment. added_label - when creating new nodes because allow_add_punct=True, we mark these nodes as new_node.misc[added_label] = 1. Default="Added". From 043f4d73745a0155db76d5f4776d77f7ceeeba8a Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Fri, 17 Feb 2023 16:47:25 +0100 Subject: [PATCH 0788/1374] minor changes in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 111bceb9..fde3b0bd 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -27,8 +27,11 @@ def __init__(self, flavio=False, **kwargs): def process_node(self, node): rf = [] af = {} + # PROIEL-specific: greek words without features + if node.lemma == 'greek.expression': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': + elif node.upos == 'NOUN': if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { @@ -125,14 +128,14 @@ def process_node(self, node): af['PronType'] = [] if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 
'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - elif node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['quicumque', 'qui', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. @@ -176,7 +179,7 @@ def process_node(self, node): af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') - if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') elif node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') From e84741a6e78acaaf13739945bd17814d569e3601 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:06:56 +0100 Subject: [PATCH 0789/1374] Remove NOCOREF entities e.g. from AnCora. --- udapi/block/corefud/removenocorefentities.py | 21 ++++++++++++++++++++ udapi/core/coref.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 udapi/block/corefud/removenocorefentities.py diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..8baba086 --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). 
We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. + """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 12dda239..4cd656f1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -300,7 +300,7 @@ def __init__(self, eid, etype=None): self.split_ante = [] def __lt__(self, another): - """Does this CorefEntity precedes (word-order wise) `another` entity? + """Does this CorefEntity precede (word-order wise) `another` entity? This method defines a total ordering of all entities by the first mention of each entity (see `CorefMention.__lt__`). From 16c3a48ed3eb7861757092649a6ece22b893151c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:27:19 +0100 Subject: [PATCH 0790/1374] Another method of removing entities. 
--- udapi/block/corefud/removenocorefentities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py index 8baba086..4551873c 100644 --- a/udapi/block/corefud/removenocorefentities.py +++ b/udapi/block/corefud/removenocorefentities.py @@ -18,4 +18,4 @@ def process_document(self, doc): entities = doc.coref_entities if not entities: return - doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} From 8b442889aca3c1b881d7d53896d1eb0547635cfa Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 15:52:18 +0100 Subject: [PATCH 0791/1374] CorefUD: counting sentence sequences with no coref annotation --- udapi/block/corefud/countgaps.py | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 udapi/block/corefud/countgaps.py diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..c8ee8d76 --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,67 @@ +from udapi.core.block import Block +from collections import Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = Counter() + + def _report_stats(self, counter=None, header_id=None): + if not counter: + counter = self._total_counter + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in 
empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counter = Counter() + empty_seqs = [] + curr_seq = [] + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + newdoc = tree.newdoc + empty_seqs = [] + curr_seq = [] + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + elif curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + + if self.report_per_file: + self._report_stats(file_counter, header_id="FULL DOC") + + self._total_counter.update(file_counter) + + def process_end(self): + if self.report_total: + self._report_stats(header_id="TOTAL") From 716461fe3b67711f71a8cee028668fe34ceffef0 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 19:22:33 +0100 Subject: [PATCH 0792/1374] besides sequences, counting also paragraphs with no coref mentions --- udapi/block/corefud/countgaps.py | 63 +++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py index c8ee8d76..fc45540a 100644 --- a/udapi/block/corefud/countgaps.py +++ b/udapi/block/corefud/countgaps.py @@ -1,5 +1,5 @@ from udapi.core.block import Block -from collections import Counter +from collections import defaultdict, Counter class CountGaps(Block): """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" @@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, 
report_per_file=True, report_total=T self.report_per_newdoc = report_per_newdoc self.report_per_file = report_per_file self.report_total = report_total - self._total_counter = Counter() + self._total_counter = defaultdict(Counter) - def _report_stats(self, counter=None, header_id=None): - if not counter: - counter = self._total_counter + def _report_stats(self, counter, header_id=None): if header_id: print(f"============ {header_id} ============") for key in sorted(counter): print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") def _count_empty_seqs(self, empty_seqs): counter = Counter() @@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs): return counter def process_document(self, doc): - file_counter = Counter() + file_counters = defaultdict(Counter) empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True newdoc = None for i, tree in enumerate(doc.trees): if tree.newdoc: if i: if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") newdoc = tree.newdoc empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True has_mention = any(node.coref_mentions for node in tree.descendants) if not has_mention: curr_seq.append(tree.sent_id) - elif curr_seq: - 
empty_seqs.append(curr_seq) - curr_seq = [] + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") if self.report_per_file: - self._report_stats(file_counter, header_id="FULL DOC") + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") - self._total_counter.update(file_counter) + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) def process_end(self): if self.report_total: - self._report_stats(header_id="TOTAL") + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") From c147469f5a4a9267902974846c6ff2d804447cdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 00:25:12 +0100 Subject: [PATCH 0793/1374] write.CorefHtml add visualization menu show: eid, trees, line breaks, paragraphs --- udapi/block/write/corefhtml.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 280fc213..20f68291 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,7 +11,7 @@ Udapi CorefUD 
viewer ''' -# I use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} # so that the width of #overview can be changed by dragging the bottom right corner. # The following lines would make the whole right border draggable: # @@ -25,9 +25,19 @@ display: grid; border-right: double; padding: 5px; width: 20em; background: #ddd; border-radius: 5px; } +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence .tree span {border: none; padding: 0; display:inline;} .sentence span .eid {display:block; font-size: 10px;} -.showtree {float:left; margin: 5px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} .sentence .singleton {border-style: dotted;} @@ -55,16 +65,22 @@ function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} ); + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + ''' SCRIPT_SHOWTREE = ''' $(".sentence").each(function(index){ var sent_id = this.id; - $(this).before( + $(this).prepend( $("
') print('
') + print('\n' + '\n') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) print('
') @@ -180,7 +203,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if tree.newdoc: print(f'

{tree.newdoc if tree.newdoc is not True else ""}


') elif tree.newpar: - print('
') + print('
') opened = [] print(f'

') for node in nodes_and_empty: @@ -188,7 +211,7 @@ def process_tree(self, tree, mention_ids, entity_colors): subspan = subspans.pop() self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) - + is_head = self._is_head(node) if is_head: print('', end='') @@ -199,7 +222,7 @@ def process_tree(self, tree, mention_ids, entity_colors): print('', end='') if is_head: print('', end='') - + while opened and opened[-1].words[-1] == node: print('', end='') opened.pop() @@ -229,7 +252,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if not node.no_space_after: print(' ', end='') - + print('

') def _is_head(self, node): From 0b30f5b75ab2a53ed5e0425d536094dee5c56f02 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 02:53:43 +0100 Subject: [PATCH 0794/1374] more visualization options --- udapi/block/write/corefhtml.py | 65 +++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 20f68291..fd500e7d 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,13 +11,7 @@ Udapi CorefUD viewer ''' -# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} -# so that the width of #overview can be changed by dragging the bottom right corner. -# The following lines would make the whole right border draggable: -# -# -# -#
+ CSS = ''' #wrap {display: flex; align-items: flex-start;} #main {width: 100%; padding: 5px; background: white; z-index:100;} @@ -27,15 +21,19 @@ } #main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} #menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} #menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} .change .b1 {transform: translate(0, 9px) rotate(-45deg);} .change .b2 {opacity: 0;} .change .b3 {transform: translate(0, -9px) rotate(45deg);} -.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} -.sentence .tree span {border: none; padding: 0; display:inline;} -.sentence span .eid {display:block; font-size: 10px;} +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} .showtree {margin: 5px; user-select: none;} .display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} @@ -48,22 +46,22 @@ ''' SCRIPT_BASE = ''' -$("span").click(function(e) { +$(".m").click(function(e) { let was_selected = $(this).hasClass("selected"); - $("span").removeClass("selected"); + $(".m").removeClass("selected"); if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); window.onhashchange = function() { - $("span").removeClass("selected"); + $(".m").removeClass("selected"); var fragment = window.location.hash.substring(1); if (fragment) {$("." 
+ fragment).addClass("selected");} } -$("span").hover( - function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, - function(e) {$("span").removeClass("active");} +$(".m").hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} ); function menuclick(x) { @@ -94,10 +92,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): + def __init__(self, show_trees=True, show_eid=False, show_etype=False, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees self.show_eid = show_eid + self.show_etype = show_etype self.colors = colors def _representative_word(self, entity): @@ -120,6 +119,10 @@ def process_document(self, doc): if self.colors: for i in range(self.colors): print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}') + if not self.show_eid: + print('.eid {display: none;}') + if not self.show_etype: + print('.etype {display: none;}') print('') print('\n\n
') @@ -146,13 +149,19 @@ def process_document(self, doc): print('
') print('
') - print('\n' '\n') - for tree in doc.trees: - self.process_tree(tree, mention_ids, entity_colors) - print('
') - print('') print('
') - def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): + def _start_subspan(self, subspan, crossing=False): m = subspan.mention e = m.entity - classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"} m' + classes = f'{e.eid} {self._mention_ids[m]} {e.etype or "other"} m' title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' if self.colors: - classes += f' {entity_colors[e]}' + classes += f' {self._entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -252,7 +303,7 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): f'{subspan.subspan_eid}' f' {e.etype}', end='') - def process_tree(self, tree, mention_ids, entity_colors): + def process_tree(self, tree): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -273,7 +324,7 @@ def process_tree(self, tree, mention_ids, entity_colors): for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids, entity_colors) + self._start_subspan(subspan) opened.append(subspan) is_head = self._is_head(node) @@ -311,7 +362,7 @@ def process_tree(self, tree, mention_ids, entity_colors): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, entity_colors, True) + self._start_subspan(broken, True) opened.append(subspan) if not node.no_space_after: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 48431900..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,7 +79,9 @@ def process_document(self, doc): print('\n') print('
') def print_doc_json(self, doc): - print('data=[') + print('[') for (bundle_number, bundle) in enumerate(doc, 1): if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -116,7 +122,7 @@ def print_doc_json(self, doc): print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') + print(']') @staticmethod From 327bb6f9083f6131b4f986dac9b56f2570957f60 Mon Sep 17 00:00:00 2001 From: Federica Gamba Date: Thu, 30 Mar 2023 12:22:27 +0200 Subject: [PATCH 0799/1374] adjustments in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index fde3b0bd..dce4592d 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -28,7 +28,8 @@ def process_node(self, node): rf = [] af = {} # PROIEL-specific: greek words without features - if node.lemma == 'greek.expression': + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: pass # NOUNS ################################################################ elif node.upos == 'NOUN': @@ -41,12 +42,14 @@ def process_node(self, node): 'Degree': ['Dim'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'VerbForm': ['Part']} + 'VerbForm': ['Part', 'Vnoun']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) @@ -61,10 +64,10 @@ def process_node(self, node): 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - af['Compound'] = 'Yes' + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] - if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### @@ -72,7 +75,7 @@ def process_node(self, node): if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: rf = ['Gender', 'Number', 'Case'] af = { - 'NumType': ['Ord', 'Dist'], + 'NumType': ['Dist', 'Mult', 'Ord'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], @@ -83,9 +86,10 @@ def process_node(self, node): 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) @@ -112,10 +116,10 @@ def process_node(self, node): rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - # 1st and 2nd person do not have gender + # 3rd person must have gender if node.feats['Person'] == '3': # is, id rf.append('Gender') - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -126,20 +130,20 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] # lexical check of PronTypes af['PronType'] = [] - if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - if node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['qui', 'quicumque', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, 
so it is not required. - af['InflClass'] = ['LatAnom', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] af['Compound'] = ['Yes'] af['Polarity'] = ['Neg'] af['Form'] = ['Emp'] @@ -175,25 +179,26 @@ def process_node(self, node): if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: if not af['PronType'] == ['Prs']: af['PronType'].append('Prs') - elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') - elif node.lemma in ['qui', 'quantus', 'quot']: + if node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') - elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: af['PronType'].append('Dem') - elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: 
af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] af['Compound'] = ['Yes'] af['Form'] = ['Emp'] af['NumType'] = ['Card'] af['Degree'].append('Dim') + af['PronType'].append('Art') if re.match(r'^(unus|ambo)', node.lemma): af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) @@ -202,7 +207,7 @@ def process_node(self, node): elif node.upos == 'NUM': rf = ['NumType', 'NumForm'] af = { - 'NumType': ['Card'], + 'NumType': ['Card', 'Ord'], 'NumForm': ['Word', 'Roman', 'Digit'], 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. @@ -212,7 +217,9 @@ def process_node(self, node): af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ @@ -227,7 +234,7 @@ def process_node(self, node): if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] - if node.upos == 'VERB': + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): rf.append('Voice') af['Voice'] = ['Act', 'Pass'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive @@ -255,6 +262,7 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] if 'Degree' in af: af['Degree'].append('Dim') else: @@ -262,7 +270,12 @@ def process_node(self, node): af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -271,13 +284,13 @@ def process_node(self, node): 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], - 'NumType': ['Card', 'Ord'], # e.g., primum + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['VerbForm'] = ['Part'] + af['VerbForm'] = ['Fin', 'Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ @@ -289,6 +302,7 @@ def process_node(self, node): if self.flavio: af['Form'] = ['Emp'] af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) # CONJUNCTIONS ######################################################### elif re.match(r'^[CS]CONJ$', node.upos): @@ -301,6 +315,8 @@ def process_node(self, node): af['Form'] = ['Emp'] af['VerbForm'] = ['Fin'] af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': @@ -310,9 +326,13 @@ def process_node(self, node): 
'Abbr': ['Yes'] } if self.flavio: - af['VerbForm'] = ['Part'], + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From 1ddfce4aec593e222a0e3d26e8f74acf561d1356 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 31 Mar 2023 19:42:35 +0200 Subject: [PATCH 0800/1374] gzip the docs/* json and html files --- udapi/block/write/corefhtml.py | 49 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index cd0db1e5..6129b335 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -17,6 +17,7 @@ from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention from collections import Counter import udapi.block.write.html +import gzip import sys import os @@ -26,6 +27,7 @@ Udapi CorefUD viewer + ''' CSS = ''' @@ -87,21 +89,26 @@ $("#main-menu").toggle(); } -function load_doc(doc_num) { +async function load_doc(doc_num) { loading_now = true; - console.log("loading doc" + doc_num + ".html"); - $.get(docs_dir + "/doc" + doc_num + ".html", function(data){ - $("#main").append(data); - add_mention_listeners($("#doc" + doc_num + " .m")); - $("#doc" + doc_num + " .sentence").each(add_show_tree_button); - loading_now = false; - }).fail(function(){ + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ if (! load_fail_reported) { load_fail_reported = true; - alert("Cannot load " + docs_dir + "/doc" + doc_num - + ".html\\nLocal files do not support lazy loading. 
Run a web server 'python -m http.server'"); + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); } - }); + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + loading_now = false; } var docs_loaded = 1; @@ -126,7 +133,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("