From cd77a939b37327671aaec68333c3b683969e0840 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Nov 2023 09:44:39 +0100 Subject: [PATCH 001/514] write.Sentences newdoc=1 newpar=1 --- udapi/block/write/sentences.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/udapi/block/write/sentences.py b/udapi/block/write/sentences.py index 60eb6bec..70553d7d 100644 --- a/udapi/block/write/sentences.py +++ b/udapi/block/write/sentences.py @@ -3,13 +3,14 @@ class Sentences(BaseWriter): - """A writer of plain-text sentences (one per line). + """A writer of plain-text sentences (one sentence per line). Usage: udapy write.Sentences if_missing=empty < my.conllu > my.txt + udapy write.Sentences newdoc=1 newpar=1 < my.conllu > my.txt """ - def __init__(self, if_missing='detokenize', **kwargs): + def __init__(self, if_missing='detokenize', newdoc=None, newpar=None, **kwargs): """Create the Sentences writer block. Parameters: @@ -18,9 +19,21 @@ def __init__(self, if_missing='detokenize', **kwargs): * `empty`: print an empty line * `warn_detokenize`, `warn_empty`: in addition emit a warning via `logging.warning()` * `fatal`: raise an exception + newdoc: What to do if `root.newdoc` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) + newpar: What to do if `root.newpar` is not None? (default=None) + * None: ignore it + * True: print an empty_line (except for the first tree, i.e. bundle.number==1) """ super().__init__(**kwargs) self.if_missing = if_missing + self.newdoc = newdoc + self.newpar = newpar def process_tree(self, tree): + if self.newdoc and tree.newdoc and tree.bundle.number > 1: + print() + if self.newpar and tree.newpar and tree.bundle.number > 1: + print() print(tree.get_sentence(self.if_missing)) From a5a42f64ad0f635300366e8790efc8d6942c8810 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Nov 2023 15:20:01 +0100 Subject: [PATCH 002/514] util.MarkDiff: joining None results in an error --- udapi/block/util/markdiff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/markdiff.py b/udapi/block/util/markdiff.py index f5f7b17d..e102ca9c 100644 --- a/udapi/block/util/markdiff.py +++ b/udapi/block/util/markdiff.py @@ -62,8 +62,8 @@ def process_tree(self, tree): if len(pred_nodes) != len(gold_nodes) and self.mark_attr: tree.add_comment(f'{self.mark_attr} = {self.mark}') gold_tree.add_comment(f'{self.mark_attr} = {self.mark}') - pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in pred_nodes] - gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_nodes] + pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in pred_nodes] + gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs="_")) for n in gold_nodes] matcher = difflib.SequenceMatcher(None, pred_tokens, gold_tokens, autojunk=False) diffs = list(matcher.get_opcodes()) From ac7562984291d0af72f8d9fba6e8e6ffae4d5d95 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 11 Jan 2024 17:11:11 +0100 Subject: [PATCH 003/514] ud.FixCompoundName: If a treebank connects person names using compound, convert it to flat:name. --- udapi/block/ud/fixcompoundname.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 udapi/block/ud/fixcompoundname.py diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py new file mode 100644 index 00000000..43eb6578 --- /dev/null +++ b/udapi/block/ud/fixcompoundname.py @@ -0,0 +1,36 @@ +""" +Block ud.FixCompoundName finds compound relations between PROPN nodes and converts +them to flat:name. This is not necessarily correct in all situations. The difference +between compound and flat is that compound allows to distinguish head and modifier. +Multiword person names (given name and surname, or various other patterns) typically +should be analyzed as flat but there are treebanks that incorrectly use compound +for person names. This block can be used to fix them. +""" +from udapi.core.block import Block +import logging + + +class FixCompoundName(Block): + """ + Converts a compound relation between two PROPN nodes into a flat relation. + Compounds of a PROPN and a non-PROPN will be left alone, although they are + suspicious, too. + """ + + def process_node(self, node): + if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + # See if there are other PROPN compound siblings. + namewords = [x for x in node.siblings if x.upos == 'PROPN' and x.udeprel == 'compound'] + namewords.append(node.parent) + namewords = sorted(namewords, key=lambda x: x.ord) + ###!!! We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. + if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. + if namewords[0] != node.parent: + namewords[0].parent = node.parent.parent + namewords[0].deprel = node.parent.deprel + for i in range(len(namewords)-1): + namewords[i+1].parent = namewords[0] + namewords[i+1].deprel = 'flat:name' From 02698410b6c7c5a17b092afbbc0044ab1a0b5515 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 11 Jan 2024 18:11:33 +0100 Subject: [PATCH 004/514] Improved block ud.FixCompoundName. --- udapi/block/ud/fixcompoundname.py | 37 +++++++++++++++++++------------ 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py index 43eb6578..74610757 100644 --- a/udapi/block/ud/fixcompoundname.py +++ b/udapi/block/ud/fixcompoundname.py @@ -7,6 +7,7 @@ for person names. This block can be used to fix them. """ from udapi.core.block import Block +import regex as re import logging @@ -19,18 +20,26 @@ class FixCompoundName(Block): def process_node(self, node): if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN': + origparent = node.parent + grandparent = origparent.parent + outdeprel = origparent.deprel # See if there are other PROPN compound siblings. - namewords = [x for x in node.siblings if x.upos == 'PROPN' and x.udeprel == 'compound'] - namewords.append(node.parent) - namewords = sorted(namewords, key=lambda x: x.ord) - ###!!! We currently cannot transform enhanced dependencies. - ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. - if len(node.deps) > 0: - logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') - # The first name word will be the technical head. If it is the current parent, fine. - if namewords[0] != node.parent: - namewords[0].parent = node.parent.parent - namewords[0].deprel = node.parent.deprel - for i in range(len(namewords)-1): - namewords[i+1].parent = namewords[0] - namewords[i+1].deprel = 'flat:name' + namewords = sorted([x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)], key=lambda y: y.ord) + # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. + # This is wrong but it is also different from personal names we are targeting here. + # Hence, we will skip "names" that contain numbers. + if len([x for x in namewords if re.search(r"\d", x.form)]) == 0: + #logging.info(str([x.misc['Translit'] for x in namewords])) + ###!!! We currently cannot transform enhanced dependencies. + ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. + if len(node.deps) > 0: + logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.') + # The first name word will be the technical head. If it is the current parent, fine. + head = namewords[0] + rest = namewords[1:] + if head != origparent: + head.parent = grandparent + head.deprel = outdeprel + for n in rest: + n.parent = head + n.deprel = 'flat:name' From a5f5d90603669cb9bee389b96d93e1bf85935d70 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 31 Jan 2024 17:16:28 +0100 Subject: [PATCH 005/514] WIP: corefud.Link2Cluster for converting link-based coreference annotation to CorefUD --- udapi/block/corefud/link2cluster.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 udapi/block/corefud/link2cluster.py diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py new file mode 100644 index 00000000..4e296507 --- /dev/null +++ b/udapi/block/corefud/link2cluster.py @@ -0,0 +1,37 @@ +from udapi.core.block import Block + +class Link2Cluster(Block): + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.""" + + def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs): + super().__init__(**kwargs) + self.id_attr = id_attr + self.ante_id = ante_attr + + def process_document(self, doc): + id2node = {} + links = [] + for node in doc.nodes: + this_id = node.misc[self.id_attr] + if this_id != '': + id2node[this_id] = node + ante_id = node.misc[self.ante_attr] + if ante_id != '': + links.append([ante_id, this_id]) + + # sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this. + node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True) for link in links] + + # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecendent). + node_links.sort() + + # Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating + # and we can prevent the need for merging two entities. + for this_node, ante_node in node_links: + assert not this_node.mentions + if ante_node.mentions: + ante_node.entities[0].create_mention(head=this_node, words=[this_node]) + else: + entity = this_node.root.document.create_coref_entity() + entity.create_mention(head=ante_node, words=[ante_node]) + entity.create_mention(head=this_node, words=[this_node]) From 3f9dd8417c7bf6dc387ef7abdde9443527971c70 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 1 Feb 2024 02:01:36 +0100 Subject: [PATCH 006/514] corefud.Link2Cluster prepared to convert PROIEL files --- udapi/block/corefud/link2cluster.py | 40 +++++++++++++++++++++-------- udapi/core/document.py | 4 +-- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py index 4e296507..e04380bf 100644 --- a/udapi/block/corefud/link2cluster.py +++ b/udapi/block/corefud/link2cluster.py @@ -1,12 +1,14 @@ +import logging from udapi.core.block import Block class Link2Cluster(Block): """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.""" - def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs): + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs): super().__init__(**kwargs) self.id_attr = id_attr - self.ante_id = ante_attr + self.ante_attr = ante_attr + self.delete_orig_attrs = delete_orig_attrs def process_document(self, doc): id2node = {} @@ -18,20 +20,38 @@ def process_document(self, doc): ante_id = node.misc[self.ante_attr] if ante_id != '': links.append([ante_id, this_id]) + if self.delete_orig_attrs: + for attr in (self.id_attr, self.ante_attr): + del node.misc[attr] + + for link in links: + if link[0] not in id2node: + logging.warning(f"{link[0]} is referenced in {self.ante_attr}, but not in {self.id_attr}") + links = [link for link in links if link[0] in id2node] + + # nodeA < nodeB is a shortcut for nodeA.ord < nodeB.ord + # but here we need to sort nodes from different sentences, + # so we need to compare first the bundle number and then node.ord. + sort_key = lambda node: (node.root.bundle.number, node.ord) # sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this. - node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True) for link in links] + node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True, key=sort_key) for link in links] - # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecendent). - node_links.sort() + # Makes sure the links are sorted by this_node (i.e. the anaphor, not the antecendent). + node_links.sort(key=lambda link: sort_key(link[0])) # Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating # and we can prevent the need for merging two entities. for this_node, ante_node in node_links: - assert not this_node.mentions - if ante_node.mentions: - ante_node.entities[0].create_mention(head=this_node, words=[this_node]) + assert not this_node.coref_mentions + if ante_node.coref_mentions: + ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node]) else: entity = this_node.root.document.create_coref_entity() - entity.create_mention(head=ante_node, words=[ante_node]) - entity.create_mention(head=this_node, words=[this_node]) + m_ante = entity.create_mention(head=ante_node, words=[ante_node]) + m_this = entity.create_mention(head=this_node, words=[this_node]) + for node, mention in ((ante_node, m_ante), (this_node, m_this)): + if node.misc['information-status']: + mention.other['infstat'] = node.misc['information-status'] + if self.delete_orig_attrs: + del node.misc['information-status'] diff --git a/udapi/core/document.py b/udapi/core/document.py index 8507d2f1..5f2bdf0b 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -159,9 +159,9 @@ def create_coref_entity(self, eid=None, etype=None): self._load_coref() if not eid: counter = 1 - while self._eid_to_entity.get(f'c{counter}'): + while self._eid_to_entity.get(f'e{counter}'): counter += 1 - eid = f'c{counter}' + eid = f'e{counter}' elif self._eid_to_entity.get(eid): raise ValueError("Entity with eid=%s already exists", eid) entity = udapi.core.coref.CorefEntity(eid, etype) From ed822b8c4597e606ea4632d56412ba3db901db42 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Thu, 1 Feb 2024 17:32:51 +0100 Subject: [PATCH 007/514] Better implementation following suggestions by @martinpopel --- udapi/block/ud/fixcompoundname.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py index 74610757..90596e35 100644 --- a/udapi/block/ud/fixcompoundname.py +++ b/udapi/block/ud/fixcompoundname.py @@ -24,11 +24,12 @@ def process_node(self, node): grandparent = origparent.parent outdeprel = origparent.deprel # See if there are other PROPN compound siblings. - namewords = sorted([x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)], key=lambda y: y.ord) + # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.) + namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)] # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds. # This is wrong but it is also different from personal names we are targeting here. # Hence, we will skip "names" that contain numbers. - if len([x for x in namewords if re.search(r"\d", x.form)]) == 0: + if any(re.search(r"\d", x.form) for x in namewords): #logging.info(str([x.misc['Translit'] for x in namewords])) ###!!! We currently cannot transform enhanced dependencies. ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies. From bb9e553df751db1fe2d01c691f1f600674f12f07 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 3 Feb 2024 21:38:03 +0100 Subject: [PATCH 008/514] improve udpipe.Base online=1 - bugfix: `model` was ignored when tokenize=0 - allow more combinations, e.g. tokenize=0 tag=1 parse=0 or tokenize=0 tag=0 parse=1 where the existing tags/parses are reused - for redoing the tokenization, you can use tokenize=1 delete_nodes=1 which first deletes the existing nodes and then creates them again using UDPipe's tokenizer --- udapi/block/udpipe/base.py | 21 +++++++++++++-------- udapi/tool/udpipeonline.py | 25 +++++++++++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 448dbb60..d94f8cc5 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -120,12 +120,14 @@ class Base(Block): # pylint: disable=too-many-arguments def __init__(self, model=None, model_alias=None, online=False, - tokenize=True, tag=True, parse=True, resegment=False, **kwargs): + tokenize=True, tag=True, parse=True, resegment=False, + delete_nodes=False, **kwargs): """Create the udpipe.En block object.""" super().__init__(**kwargs) self.model, self.model_alias, self.online = model, model_alias, online self._tool = None self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment + self.delete_nodes = delete_nodes @property def tool(self): @@ -146,16 +148,19 @@ def tool(self): return self._tool def process_document(self, doc): - tok, tag, par = self.tokenize, self.tag, self.parse + tok, tag, par, reseg = self.tokenize, self.tag, self.parse, self.resegment old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: for tree in bundle: new_bundles.append(bundle) if self._should_process_tree(tree): + if self.delete_nodes: + for subroot in tree.children: + subroot.remove() if tok: - new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=self.resegment, - tag=self.tag, parse=self.parse) + new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, + tag=tag, parse=par) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' @@ -164,9 +169,9 @@ def process_document(self, doc): new_tree.zone = tree.zone new_bundle.add_tree(new_tree) new_bundles.append(new_bundle) - elif not tok and tag and par: - self.tool.tag_parse_tree(tree) - elif not tok and not tag and not par and self.resegment: + elif not tok and not reseg and (tag or par): + self.tool.tag_parse_tree(tree, tag=tag, parse=par) + elif not tok and reseg and not tag and not par: sentences = self.tool.segment_text(tree.text) if len(sentences) > 1: orig_bundle_id = bundle.bundle_id @@ -178,7 +183,7 @@ def process_document(self, doc): new_tree.text = sentence new_bundles.append(new_bundle) else: - raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}") doc.bundles = new_bundles ''' diff --git a/udapi/tool/udpipeonline.py b/udapi/tool/udpipeonline.py index 2b78a45a..26f31a73 100644 --- a/udapi/tool/udpipeonline.py +++ b/udapi/tool/udpipeonline.py @@ -62,21 +62,34 @@ def perform_request(self, params, method="process"): return response["result"] - def tag_parse_tree(self, root): + def tag_parse_tree(self, root, tag=True, parse=True): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + if not tag and not parse: + raise ValueError('tag_parse_tree(root, tag=False, parse=False) does not make sense.') descendants = root.descendants if not descendants: return in_data = " ".join([n.form for n in descendants]) - out_data = self.perform_request(params={"data": in_data, "input":"horizontal", "tagger":"", "parser":""}) - conllu_reader = ConlluReader() + params = {"model": self.model, "data": in_data, "input":"horizontal", "tagger":""} + if tag: + attrs = 'upos xpos lemma feats'.split() + else: + attrs = [] + if parse: + params["parser"] = "" + attrs.append('deprel') + + out_data = self.perform_request(params=params) + conllu_reader = ConlluReader(empty_parent="ignore") conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = conllu_reader.read_tree() - root.flatten() + if parse: + root.flatten() for parsed_node in parsed_root.descendants: node = descendants[parsed_node.ord - 1] - node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root - for attr in 'upos xpos lemma feats deprel'.split(): + if parse: + node.parent = descendants[parsed_node.parent.ord - 1] if parsed_node.parent.ord else root + for attr in attrs: setattr(node, attr, getattr(parsed_node, attr)) def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): From d780aa9f092adbffed003960b9764d3ee3dd4a91 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 4 Feb 2024 01:59:23 +0100 Subject: [PATCH 009/514] write.CorefHtml shows sent_id --- udapi/block/write/corefhtml.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 6129b335..cc956ade 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -61,6 +61,7 @@ .active {border: 1px solid red !important;} .selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} .other {background: hsl(0, 0%, 85%);} +.sent_id {display: none; background: #ddd; border-radius: 3px;} ''' SCRIPT_BASE = ''' @@ -133,7 +134,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("