are ordinal numbers starting from the one specified by the `start` parameter.
+ This block can be applied on multiple documents within one udapy call.
+ For example, to re-index eid in all conllu files in the current directory
+ (keeping the IDs unique across all the files), use:
+ `udapy read.Conllu files='!*.conllu' corefud.IndexClusters write.Conllu overwrite=1`
+
+ Parameters:
+ -----------
+ start : int
+ the starting index (default=1)
+ prefix : str
+ prefix of the IDs before the number (default="e")
+ """
+
+ def __init__(self, start=1, prefix='e'):
+ self.start = start
+ self.prefix = prefix
+
+ def process_document(self, doc):
+ entities = doc.coref_entities
+ if not entities:
+ return
+ new_eid_to_entity = {}
+ for idx, entity in enumerate(entities, self.start):
+ new_eid = self.prefix + str(idx)
+ entity.eid = new_eid
+ new_eid_to_entity[new_eid] = entity
+ self.start = idx + 1
+ doc._eid_to_entity = new_eid_to_entity
diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py
new file mode 100644
index 00000000..08296531
--- /dev/null
+++ b/udapi/block/corefud/link2cluster.py
@@ -0,0 +1,137 @@
+import logging
+from udapi.core.block import Block
+
+class Link2Cluster(Block):
+ """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.
+
+ Params:
+ id_attr: name of the attribute in MISC that stores the original-format IDs of nodes
+ ante_attr: name of the attribute in MISC that stores the ID of the antecedent
+ of the current node (in the same format as `id_attr`).
+ delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
+ (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
+ bridge_attr, bridge_relation_attr if these are used). Default=True.
+ infstat_attr: name of the attribute in MISC that stores the information status of a given mention
+ Will be stored in `mention.other['infstat']`. Use None for ignoring this.
+ coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention
+ Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
+ bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
+ of the current node/mention (in the same format as `id_attr`).
+ Default=None, i.e. ignore this parameter.
+ bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
+ (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
+ eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
+ The main goal of this parameter is to make eid unique across multiple documents.
+ If you use eid_counter=0, this feature will be turned off,
+ so entities will be created using `root.document.create_coref_entity()`,
+ with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
+ """
+ def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
+ infstat_attr='information-status', coreftype_attr='coreftype',
+ bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
+ super().__init__(**kwargs)
+ self.id_attr = id_attr
+ self.ante_attr = ante_attr
+ self.delete_orig_attrs = delete_orig_attrs
+ self.infstat_attr = infstat_attr
+ self.coreftype_attr = coreftype_attr
+ self.bridge_attr = bridge_attr
+ self.bridge_relation_attr = bridge_relation_attr
+ self.eid_counter = int(eid_counter)
+
+ def _new_entity(self, doc):
+ if not self.eid_counter:
+ return doc.create_coref_entity()
+ entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
+ self.eid_counter += 1
+ return entity
+
+ def _new_mention(self, entity, node):
+ mention = entity.create_mention(head=node, words=[node])
+ if self.infstat_attr and node.misc[self.infstat_attr]:
+ mention.other['infstat'] = node.misc[self.infstat_attr]
+ if self.delete_orig_attrs:
+ del node.misc[self.infstat_attr]
+ if self.coreftype_attr and node.misc[self.coreftype_attr]:
+ mention.other['coreftype'] = node.misc[self.coreftype_attr]
+ if self.delete_orig_attrs:
+ del node.misc[self.coreftype_attr]
+ return mention
+
+ def process_document(self, doc):
+ id2node = {}
+ links = []
+ bridges = []
+ for node in doc.nodes_and_empty:
+ this_id = node.misc[self.id_attr]
+ if this_id != '':
+ id2node[this_id] = node
+ ante_id = node.misc[self.ante_attr]
+ if ante_id != '':
+ if ante_id == this_id:
+ logging.warning(f"{node} has a self-reference {self.ante_attr}={ante_id}")
+ else:
+ links.append([ante_id, this_id])
+ if self.delete_orig_attrs:
+ for attr in (self.id_attr, self.ante_attr):
+ del node.misc[attr]
+ if self.bridge_attr:
+ bridge_id = node.misc[self.bridge_attr]
+ if bridge_id != '':
+ if bridge_id == this_id:
+ logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}")
+ else:
+ bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]])
+ if self.delete_orig_attrs:
+ for attr in (self.bridge_attr, self.bridge_relation_attr):
+ del node.misc[attr]
+
+ # It seems faster&simpler to process the links in any order and implement entity merging,
+ # rather than trying to sort the links so that no entity merging is needed.
+ for ante_id, this_id in links:
+ if ante_id not in id2node:
+ logging.warning(f"{ante_id} is referenced in {self.ante_attr}, but not in {self.id_attr}")
+ else:
+ ante_node, this_node = id2node[ante_id], id2node[this_id]
+ if not this_node.coref_mentions and not ante_node.coref_mentions:
+ # None of the nodes is part of any mention/entity. Let's create them.
+ entity = self._new_entity(this_node.root.document)
+ self._new_mention(entity, ante_node)
+ self._new_mention(entity, this_node)
+ elif this_node.coref_mentions and ante_node.coref_mentions:
+ # Both of the nodes are part of mentions in different entities.
+ # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity).
+ # While the official API supports "stealing" a single mention (m.entity = another_entity),
+ # the implementation below using _mentions and _entity is a bit faster.
+ e_ante, e_this = this_node.coref_entities[0], ante_node.coref_entities[0]
+ assert e_ante != e_this
+ for mention in e_ante.mentions:
+ mention._entity = e_this
+ e_this._mentions.extend(e_ante.mentions)
+ e_this._mentions.sort()
+ e_ante._mentions.clear()
+ else:
+ # Only one of the nodes is part of an entity. Let's add the second one to this entity.
+ if ante_node.coref_mentions:
+ self._new_mention(ante_node.coref_entities[0], this_node)
+ else:
+ self._new_mention(this_node.coref_entities[0], ante_node)
+
+ # Bridging
+ for ante_id, this_id, relation in bridges:
+ if ante_id not in id2node:
+ logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}")
+ else:
+ ante_node, this_node = id2node[ante_id], id2node[this_id]
+ if ante_node.coref_mentions:
+ m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node)
+ e_ante = m_ante.entity
+ else:
+ e_ante = self._new_entity(ante_node.root.document)
+ m_ante = self._new_mention(e_ante, ante_node)
+ if this_node.coref_mentions:
+ m_this = next(m for m in this_node.coref_mentions if m.head is this_node)
+ else:
+ e_this = self._new_entity(this_node.root.document)
+ m_this = self._new_mention(e_this, this_node)
+ m_this.bridging.append((e_ante, relation))
diff --git a/udapi/block/corefud/load.py b/udapi/block/corefud/load.py
new file mode 100644
index 00000000..92773dc2
--- /dev/null
+++ b/udapi/block/corefud/load.py
@@ -0,0 +1,12 @@
+from udapi.core.block import Block
+import udapi.core.coref
+
+class Load(Block):
+ """Load coreference-related MISC attributes into memory. Allow lenient mode by strict=0."""
+
+ def __init__(self, strict=True):
+ self.strict = strict
+
+ def process_document(self, doc):
+ if doc._eid_to_entity is None:
+ udapi.core.coref.load_coref_from_misc(doc, self.strict)
diff --git a/udapi/block/corefud/markcrossing.py b/udapi/block/corefud/markcrossing.py
new file mode 100644
index 00000000..8064e67f
--- /dev/null
+++ b/udapi/block/corefud/markcrossing.py
@@ -0,0 +1,39 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+import logging
+
+class MarkCrossing(Block):
+ """Find mentions with crossing spans."""
+
+ def __init__(self, same_entity_only=False, continuous_only=False, print_form=False,
+ log=True, mark=True, **kwargs):
+ super().__init__(**kwargs)
+ self.same_entity_only = same_entity_only
+ self.continuous_only = continuous_only
+ self.print_form = print_form
+ self.log = log
+ self.mark = mark
+ self._logged = {}
+
+ def _print(self, mention):
+ if self.print_form:
+ return ' '.join([w.form for w in mention.words])
+ else:
+ return mention.span
+
+ def process_node(self, node):
+ if len(node.coref_mentions) > 1:
+ for mA, mB in itertools.combinations(node.coref_mentions, 2):
+ if not (set(mA.words) <= set(mB.words)) and not (set(mB.words) <= set(mA.words)):
+ if self.same_entity_only and mA.entity != mB.entity:
+ continue
+ if self.continuous_only and (',' in mA.span or ',' in mB.span):
+ continue
+ if self.mark:
+ node.misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+ if self.log:
+ cross_id = node.root.sent_id + mA.span + mB.span
+ if cross_id not in self._logged:
+ self._logged[cross_id] = True
+ print(f"crossing mentions at {node}: {self._print(mA)} + {self._print(mB)}")
diff --git a/udapi/block/corefud/markinterleaved.py b/udapi/block/corefud/markinterleaved.py
new file mode 100644
index 00000000..c00f73b1
--- /dev/null
+++ b/udapi/block/corefud/markinterleaved.py
@@ -0,0 +1,45 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+
+class MarkInterleaved(Block):
+ """Find mentions with interleaved spans."""
+
+ def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False,
+ log=True, mark=True, **kwargs):
+ super().__init__(**kwargs)
+ self.same_entity_only = same_entity_only
+ self.both_discontinuous = both_discontinuous
+ self.print_form = print_form
+ self.log = log
+ self.mark = mark
+
+ def _print(self, mention):
+ if self.print_form:
+ return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words])
+ else:
+ return mention.entity.eid + ':' + mention.span
+
+ def process_tree(self, tree):
+ mentions = set()
+ for node in tree.descendants_and_empty:
+ for m in node.coref_mentions:
+ mentions.add(m)
+ if len(mentions) > 1:
+ for mA, mB in itertools.combinations(mentions, 2):
+ if set(mA.words).intersection(set(mB.words)):
+ continue
+ if mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]:
+ continue
+ if mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]:
+ continue
+ if self.same_entity_only and mA.entity != mB.entity:
+ continue
+ if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span):
+ continue
+ if self.mark:
+ for w in mA.words + mB.words:
+ w.misc['Mark'] = 1
+ mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+ if self.log:
+ print(f"interleaved mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}")
diff --git a/udapi/block/corefud/marknested.py b/udapi/block/corefud/marknested.py
new file mode 100644
index 00000000..8db8a657
--- /dev/null
+++ b/udapi/block/corefud/marknested.py
@@ -0,0 +1,44 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+
+class MarkNested(Block):
+ """Find nested mentions."""
+
+ def __init__(self, same_entity_only=True, both_discontinuous=False, multiword_only=False,
+ print_form=False, log=True, mark=True, **kwargs):
+ super().__init__(**kwargs)
+ self.same_entity_only = same_entity_only
+ self.both_discontinuous = both_discontinuous
+ self.multiword_only = multiword_only
+ self.print_form = print_form
+ self.log = log
+ self.mark = mark
+
+ def _print(self, mention):
+ if self.print_form:
+ return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words])
+ else:
+ return mention.entity.eid + ':' + mention.span
+
+ def process_tree(self, tree):
+ mentions = set()
+ for node in tree.descendants_and_empty:
+ for m in node.coref_mentions:
+ mentions.add(m)
+ for mA, mB in itertools.combinations(mentions, 2):
+ if self.same_entity_only and mA.entity != mB.entity:
+ continue
+ if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span):
+ continue
+ sA, sB = set(mA.words), set(mB.words)
+ if not (sA <= sB) and not (sB <= sA):
+ continue
+ if self.multiword_only and (len(sA) == 1 or len(sB) == 1):
+ continue
+ if self.mark:
+ for w in mA.words + mB.words:
+ w.misc['Mark'] = 1
+ mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+ if self.log:
+ print(f"nested mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}")
diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py
new file mode 100644
index 00000000..cc63b387
--- /dev/null
+++ b/udapi/block/corefud/markpairs.py
@@ -0,0 +1,138 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+from collections import Counter
+import logging
+
+class MarkPairs(Block):
+ """Find pairs of coreference mentions within the same sentence with given properties.
+ Mark these pairs of mentions (using `misc["Mark"]`), so they can be further
+ processed or printed.
+
+ Usage:
+ # Find pairs of mentions of the same entity within the same sentence:
+ cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R
+
+ Properties:
+ same_entity - both mentions belong to the same entity (cluster)
+ both_continuous - both mentions have continuous spans
+ both_discontinuous - both mentions have discontinuous spans
+ nested - span of one mention is nested in (i.e. a subset of) the span of the other mention
+ crossing - spans are crossing (i.e. intersecting, but neither is subset of the other)
+ interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other)
+ same_head - the same node is a head of both mentions
+ same_span - both mentions have the same span (which is invalid according to UD's validate.py)
+ same_subspan - at least one of the mentions is discontinuous and one of its subspans
+ is also a subspan (or span) of the other mention
+
+
+ You can combine any number of properties.
+ Each property can have one of the three values:
+ include - this is the default value: include pairs with this property, i.e. ignore the property
+ exclude - exclude (from the marking) pairs of mentions with this property
+ only - pairs of mentions without this property will be excluded
+
+ As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g.
+ nested=only same_head=exclude
+ can be written as
+ nested=1 same_head=-1
+ """
+
+ def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0,
+ nested=0, crossing=0, interleaved=0,
+ same_head=0, same_span=0, same_subspan=0,
+ print_form=False, print_total=True, log=True, mark=True, **kwargs):
+ super().__init__(**kwargs)
+
+
+ self.same_entity = self._convert(same_entity)
+ self.both_continuous = self._convert(both_continuous)
+ self.both_discontinuous = self._convert(both_discontinuous)
+ self.nested = self._convert(nested)
+ self.crossing = self._convert(crossing)
+ self.interleaved = self._convert(interleaved)
+ self.same_head = self._convert(same_head)
+ self.same_span = self._convert(same_span)
+ self.same_subspan = self._convert(same_subspan)
+
+ self.print_form = print_form
+ self.print_total = print_total
+ self.log = log
+ self.mark = mark
+ self.counter = Counter()
+
+ def _convert(self, value):
+ if value in {-1, 0, 1}:
+ return value
+ if value == 'include':
+ return 0
+ if value == 'only':
+ return 1
+ if value == 'exclude':
+ return -1
+ raise ValueError('unknown value ' + value)
+
+ def _ok(self, condition, value):
+ if value == 0:
+ return True
+ return (condition and value == 1) or (not condition and value==-1)
+
+ def _print(self, mention):
+ if self.print_form:
+ return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words])
+ else:
+ return mention.entity.eid + ':' + mention.span
+
+ def process_tree(self, tree):
+ mentions = set()
+ for node in tree.descendants_and_empty:
+ for m in node.coref_mentions:
+ mentions.add(m)
+ self.counter['mentions'] += len(mentions)
+
+ for mA, mB in itertools.combinations(mentions, 2):
+ self.counter['pairs'] += 1
+ if not self._ok(mA.entity == mB.entity, self.same_entity):
+ continue
+ if not self._ok(mA.head == mB.head, self.same_head):
+ continue
+
+ if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan:
+ sA, sB = mA.span, mB.span
+ cA, cB = ',' not in sA, ',' not in sB
+ if not self._ok(cA and cB, self.both_continuous):
+ continue
+ if not self._ok(not cA and not cB, self.both_discontinuous):
+ continue
+ if not self._ok(sA == sB, self.same_span):
+ continue
+ if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan):
+ continue
+
+ if self.nested or self.crossing or self.interleaved:
+ wA, wB = set(mA.words), set(mB.words)
+ if not self._ok(wA <= wB or wB <= wA, self.nested):
+ continue
+ if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing):
+ continue
+ if self.interleaved:
+ a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0]
+ b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0]
+ if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved):
+ continue
+
+ self.counter['matching'] += 1
+ if self.mark:
+ for w in mA.words + mB.words:
+ w.misc['Mark'] = 1
+ mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+ if self.log:
+ logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}")
+
+ def after_process_document(self, doc):
+ if self.print_total:
+ #if self.max_trees and seen_trees > self.max_trees:
+ # print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
+ msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}'
+ logging.info(msg)
+ doc.meta["corefud.MarkPairs"] = msg
diff --git a/udapi/block/corefud/marksamesubspan.py b/udapi/block/corefud/marksamesubspan.py
new file mode 100644
index 00000000..f3cfd7b3
--- /dev/null
+++ b/udapi/block/corefud/marksamesubspan.py
@@ -0,0 +1,45 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+
+class MarkSameSubSpan(Block):
+ """Find mentions with the same subspan."""
+
+ def __init__(self, same_entity_only=False, both_discontinuous=False, print_form=False, nested_only=False,
+ log=True, mark=True, **kwargs):
+ super().__init__(**kwargs)
+ self.same_entity_only = same_entity_only
+ self.both_discontinuous = both_discontinuous
+ self.nested_only = nested_only
+ self.print_form = print_form
+ self.log = log
+ self.mark = mark
+
+ def _print(self, mention):
+ if self.print_form:
+ return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words])
+ else:
+ return mention.entity.eid + ':' + mention.span
+
+ def process_tree(self, tree):
+ mentions = set()
+ for node in tree.descendants_and_empty:
+ for m in node.coref_mentions:
+ mentions.add(m)
+ if len(mentions) > 1:
+ for mA, mB in itertools.combinations(mentions, 2):
+ if self.same_entity_only and mA.entity != mB.entity:
+ continue
+ if self.both_discontinuous and (',' not in mA.span or ',' not in mB.span):
+ continue
+ sA, sB = set(mA.words), set(mB.words)
+ if self.nested_only and not (sA <= sB) and not (sB <= sA):
+ continue
+ if not set(mA.span.split(',')).intersection(set(mB.span.split(','))):
+ continue
+ if self.mark:
+ for w in mA.words + mB.words:
+ w.misc['Mark'] = 1
+ mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}"
+ if self.log:
+ print(f"same-subspan mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}")
diff --git a/udapi/block/corefud/mergesamespan.py b/udapi/block/corefud/mergesamespan.py
new file mode 100644
index 00000000..61b613cb
--- /dev/null
+++ b/udapi/block/corefud/mergesamespan.py
@@ -0,0 +1,52 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import itertools
+import logging
+
+class MergeSameSpan(Block):
+ """
+ Multiple same-span mentions are considered invalid in CoNLL-U, whether they
+ belong to the same entity or not. If they occur, merge them into one.
+ Note: We currently do not have mentions across sentence boundaries in the
+ CorefUD data, so this block processes one sentence at a time.
+ """
+
+ def __init__(self, same_entity_only=False, **kwargs):
+ super().__init__(**kwargs)
+ self.same_entity_only = same_entity_only
+
+ def process_tree(self, tree):
+ mentions = set()
+ for node in tree.descendants_and_empty:
+ for m in node.coref_mentions:
+ mentions.add(m)
+
+ for mA, mB in itertools.combinations(mentions, 2):
+ if self.same_entity_only and mA.entity != mB.entity:
+ continue
+ # Reduce non-determinism in which mention is removed:
+ # If the mentions belong to different entities, sort them by entity ids.
+ if mA.entity.eid > mB.entity.eid:
+ mA, mB = mB, mA
+
+ sA, sB = set(mA.words), set(mB.words)
+ if sA != sB:
+ continue
+
+ # If the mentions belong to different entities, we should merge the
+ # entities first, i.e., pick one entity as the survivor, move the
+ # mentions from the other entity to this entity, and remove the
+ # other entity.
+ if mA.entity != mB.entity:
+ logging.warning(f"Merging same-span mentions that belong to different entities: {mA.entity.eid} vs. {mB.entity.eid}")
+ ###!!! TODO: As of now, changing the entity of a mention is not supported in the API.
+ #for m in mB.entity.mentions:
+ # m.entity = mA.entity
+ # Remove mention B. It may have been removed earlier because of
+ # another duplicate, that is the purpose of try-except.
+ ###!!! TODO: If we remove a singleton, we are destroying the entity. Then we must also handle possible bridging and split antecedents pointing to that entity!
+ mB.words = []
+ try:
+ mB.entity.mentions.remove(mB)
+ except ValueError:
+ pass
diff --git a/udapi/block/corefud/miscstats.py b/udapi/block/corefud/miscstats.py
new file mode 100644
index 00000000..dee358d6
--- /dev/null
+++ b/udapi/block/corefud/miscstats.py
@@ -0,0 +1,35 @@
+from udapi.core.block import Block
+from collections import Counter
+import re
+
+class MiscStats(Block):
+ """Block corefud.MiscStats prints 10 most frequent values of each attribute stored in the MISC field"""
+
+ def __init__(self, maxvalues=10, **kwargs):
+
+ """Create the corefud.MiscStats
+
+ Args:
+ maxvalues: the number of most frequent values
+ to be printed for each attribute.
+
+ """
+ super().__init__(**kwargs)
+ self.maxvalues = maxvalues
+ self.valuecounter = {}
+ self.totalcounter = Counter()
+
+ def process_node(self,node):
+ for attrname in node.misc:
+ shortattrname = re.sub(r'\[\d+\]',r'',attrname)
+ if not shortattrname in self.valuecounter:
+ self.valuecounter[shortattrname] = Counter()
+ self.valuecounter[shortattrname][node.misc[attrname]] += 1
+ self.totalcounter[shortattrname] += 1
+
+ def process_end(self):
+ for attrname in self.valuecounter:
+ print()
+ print(attrname+"\t"+str(self.totalcounter[attrname]))
+ for value,freq in self.valuecounter[attrname].most_common(self.maxvalues):
+ print("\t"+str(value)+"\t"+str(freq))
diff --git a/udapi/block/corefud/miscstatstex.py b/udapi/block/corefud/miscstatstex.py
new file mode 100644
index 00000000..25d3751a
--- /dev/null
+++ b/udapi/block/corefud/miscstatstex.py
@@ -0,0 +1,44 @@
+from udapi.core.block import Block
+from collections import Counter
+import re
+
+class MiscStatsTex(Block):
+ """Block corefud.MiscStatsTex prints 10 most frequent values of each attribute stored in the MISC field"""
+
+ def __init__(self, maxvalues=10, **kwargs):
+
+ """Create the corefud.MiscStatsTex
+
+ Args:
+ maxvalues: the number of most frequent values
+ to be printed for each attribute.
+
+ """
+ super().__init__(**kwargs)
+ self.maxvalues = maxvalues
+ self.valuecounter = {}
+ self.totalcounter = Counter()
+
+ def process_node(self,node):
+ for attrname in node.misc:
+ shortattrname = re.sub(r'\[\d+\]',r'',attrname)
+ if not shortattrname in self.valuecounter:
+ self.valuecounter[shortattrname] = Counter()
+ self.valuecounter[shortattrname][node.misc[attrname]] += 1
+ self.totalcounter[shortattrname] += 1
+
+ def process_end(self):
+ for attrname in self.valuecounter:
+
+ total = self.totalcounter[attrname]
+ distrvalues = []
+
+ for value,freq in self.valuecounter[attrname].most_common(self.maxvalues):
+ value = re.sub(r'_',r'\\_',value)
+ distrvalues.append(f'\\attr{{{str(value)}}} {100*freq/total:2.1f}~\\%')
+
+ attrname = re.sub(r'_',r'\\_',attrname)
+ print(f" \\item attribute \\attr{{{attrname}}}, {total:,} occurrences, values: "+", ".join(distrvalues))
+# print(f" \\item attribute \\attr\{{attrname}\}, {str(total)} occurrences, distribution of values: "+", ".join(distrvalues))
+
+
diff --git a/udapi/block/corefud/movehead.py b/udapi/block/corefud/movehead.py
new file mode 100644
index 00000000..00a32e9f
--- /dev/null
+++ b/udapi/block/corefud/movehead.py
@@ -0,0 +1,95 @@
+import logging
+from collections import Counter
+from udapi.core.block import Block
+from udapi.core.node import find_minimal_common_treelet
+
+class MoveHead(Block):
+ """Block corefud.MoveHead moves the head to the highest node in each mention."""
+
+ def __init__(self, bugs='warn', keep_head_if_possible=True, **kwargs):
+ self.counter = Counter()
+ self.bugs = bugs
+ self.keep_head_if_possible = keep_head_if_possible
+ super().__init__(**kwargs)
+
+ def _eparents(self, node):
+ if node._raw_deps != '_':
+ return [d['parent'] for d in node.deps]
+ if node.parent:
+ return [node.parent]
+ return []
+
+ def find_head(self, mention):
+ mwords = set(mention.words)
+
+ # First, check the simplest case: no empty words and a treelet in basic dependencies.
+ basic_heads = [w for w in mention.words if not w.parent or not w.parent in mwords]
+ assert basic_heads
+ if len(basic_heads) == 1:
+ return basic_heads[0], 'treelet'
+
+ # Second, check also enhanced dependencies (but only within basic_heads for simplicity).
+ enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))]
+ if not enh_heads:
+ enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))]
+ if not enh_heads:
+ return mention.head, 'cycle'
+ if len(enh_heads) == 1:
+ return enh_heads[0], 'treelet'
+
+ # Third, find non-empty parents (ancestors in future) of empty nodes.
+ empty_nodes, non_empty = [], []
+ for w in enh_heads:
+ (empty_nodes if w.is_empty() else non_empty).append(w)
+ if empty_nodes:
+ for empty_node in empty_nodes:
+ parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()]
+ if parents:
+ if parents[0] not in non_empty:
+ non_empty.append(parents[0])
+ else:
+ # TODO we should climb up, but preventing cycles
+ # We could also introduce empty_node.nonempty_ancestor
+ if 'warn' in self.bugs:
+ logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
+ if 'mark' in self.bugs:
+ node.misc['Bug'] = 'no-parent-of-empty'
+ non_empty.sort()
+
+ # Fourth, check if there is a node within the enh_heads governing all the mention nodes
+ # and forming thus a "gappy treelet", where the head is clearly the "highest" node.
+ (highest, added_nodes) = find_minimal_common_treelet(*non_empty)
+ if highest in enh_heads:
+ return highest, 'gappy'
+ if highest in mwords:
+ if 'warn' in self.bugs:
+ logging.warning(f"Strange mention {mention.head} with highest node {highest}")
+ if 'mark' in self.bugs:
+ highest.misc['Bug'] = 'highest-in-mwords'
+ mention.head.misc['Bug'] = 'highest-head'
+
+ # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
+ if self.keep_head_if_possible and mention.head in enh_heads:
+ return mention.head, 'nontreelet'
+
+ # Finally, return the word-order-wise first head candidate as the head.
+ return enh_heads[0], 'nontreelet'
+
+ def process_coref_mention(self, mention):
+ self.counter['total'] += 1
+ if len(mention.words) < 2:
+ self.counter['single-word'] += 1
+ else:
+ new_head, category = self.find_head(mention)
+ self.counter[category] += 1
+ if new_head is mention.head:
+ self.counter[category + '-kept'] += 1
+ else:
+ self.counter[category + '-moved'] += 1
+ mention.head = new_head
+
+ def process_end(self):
+ logging.info("corefud.MoveHead overview of mentions:")
+ total = self.counter['total']
+ for key, value in self.counter.most_common():
+ logging.info(f"{key:>16} = {value:6} ({100*value/total:5.1f}%)")
diff --git a/udapi/block/corefud/printentities.py b/udapi/block/corefud/printentities.py
new file mode 100644
index 00000000..7230c6a5
--- /dev/null
+++ b/udapi/block/corefud/printentities.py
@@ -0,0 +1,55 @@
+import re
+import os.path
+from udapi.core.block import Block
+from collections import Counter, defaultdict
+
+class PrintEntities(Block):
+ """Block corefud.PrintEntities prints all mentions of a given entity."""
+
+ def __init__(self, eid_re=None, min_mentions=0, print_ranges=True, mark_head=True,
+ aggregate_mentions=True, **kwargs):
+ """Params:
+ eid_re: regular expression constraining ID of the entities to be printed
+ min_mentions: print only entities with at least N mentions
+ print_ranges: print also addresses of all mentions
+ (compactly, using the longest common prefix of sent_id)
+ mark_head: mark the head (e.g. as "red **car**")
+ """
+ super().__init__(**kwargs)
+ self.eid_re = re.compile(str(eid_re)) if eid_re else None
+ self.min_mentions = min_mentions
+ self.print_ranges = print_ranges
+ self.mark_head = mark_head
+ self.aggregate_mentions = aggregate_mentions
+
+ def process_document(self, doc):
+ if 'docname' in doc.meta:
+ print(f"Coref entities in document {doc.meta['docname']}:")
+ for entity in doc.coref_entities:
+ if self.eid_re and not self.eid_re.match(entity.eid):
+ continue
+ if len(entity.mentions) < self.min_mentions:
+ continue
+ print(f" {entity.eid} has {len(entity.mentions)} mentions:")
+ if self.aggregate_mentions:
+ counter = Counter()
+ ranges = defaultdict(list)
+ for mention in entity.mentions:
+ forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words])
+ counter[forms] += 1
+ if self.print_ranges:
+ ranges[forms].append(mention.head.root.address() + ':' +mention.span)
+ for form, count in counter.most_common():
+ print(f"{count:4}: {form}")
+ if self.print_ranges:
+ if count == 1:
+ print(' ' + ranges[form][0])
+ else:
+ prefix = os.path.commonprefix(ranges[form])
+ print(f' {prefix} ({" ".join(f[len(prefix):] for f in ranges[form])})')
+ else:
+ for mention in entity.mentions:
+ forms = ' '.join([f"**{w.form}**" if self.mark_head and w is mention.head else w.form for w in mention.words])
+ print(' ' + forms)
+ if self.print_ranges:
+ print(f" {mention.head.root.address()}:{mention.span}")
diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py
new file mode 100644
index 00000000..d011f686
--- /dev/null
+++ b/udapi/block/corefud/printmentions.py
@@ -0,0 +1,186 @@
+import random
+from collections import Counter
+from udapi.core.block import Block
+from udapi.block.write.textmodetreeshtml import TextModeTreesHtml
+from udapi.block.write.textmodetrees import TextModeTrees
+
class PrintMentions(Block):
    """Print mentions that have selected properties.

    Each filtering parameter (continuous, almost_continuous, treelet, forest,
    almost_forest, oneword, singleton, empty) accepts one of three values:
    'include' (default; do not filter on this property),
    'only' (or 1/True; print only mentions having the property) and
    'exclude' (or 0/False; print only mentions lacking the property).

    Matching mentions are printed as trees via write.TextModeTrees
    (or write.TextModeTreesHtml if html=True) with the mention words marked.
    Other parameters:
        max_trees: stop after printing this many mentions (0 = no limit)
        shuffle: print the mentions in a (fixed-seed) random order
        print_other_forms: print up to N other surface forms of the same entity
        print_total: print a final summary line with matching/all counts
        print_should: mark nodes that should be in a mention span (MISC ShouldBeInSpanOf)
    The remaining parameters are passed through to the TextModeTrees writer.
    """

    def __init__(self, continuous='include', almost_continuous='include', treelet='include',
                 forest='include', almost_forest='include', oneword='include', singleton='include',
                 empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5,
                 print_total=True, print_should=True,
                 print_sent_id=True, print_text=True, add_empty_line=True, indent=1,
                 minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc',
                 print_undef_as='_', print_doc_meta=True, print_comments=False,
                 mark='(Mark)', hints=True, layout='classic',
                 **kwargs):
        super().__init__(**kwargs)
        self.continuous = self._convert(continuous)
        self.almost_continuous = self._convert(almost_continuous)
        self.treelet = self._convert(treelet)
        self.forest = self._convert(forest)
        self.almost_forest = self._convert(almost_forest)
        self.oneword = self._convert(oneword)
        self.singleton = self._convert(singleton)
        self.empty = self._convert(empty)

        self.max_trees = max_trees
        self.html = html
        self.shuffle = shuffle
        if shuffle:
            random.seed(42)  # fixed seed, so repeated runs print the same sample
        self.print_other_forms = print_other_forms
        # BUGFIX: the original assignments had trailing commas, which stored
        # 1-tuples (always truthy), so print_total=0 and print_should=0
        # were silently ignored.
        self.print_total = print_total
        self.print_should = print_should
        print_class = TextModeTreesHtml if html else TextModeTrees
        self.print_block = print_class(
            print_sent_id=print_sent_id, print_text=print_text, add_empty_line=add_empty_line, indent=indent,
            minimize_cross=minimize_cross, color=color, attributes=attributes,
            print_undef_as=print_undef_as, print_doc_meta=print_doc_meta, print_comments=print_comments,
            mark=mark, hints=hints, layout=layout)

    def _convert(self, value):
        """Normalize a filter parameter to 'include'/'exclude'/'only'.

        Booleans are accepted too (True == 1 and False == 0 in Python).
        """
        if value in {'include', 'exclude', 'only'}:
            return value
        if value == 1:
            return 'only'
        if value == 0:
            return 'exclude'
        # BUGFIX: use an f-string, so that non-string values (e.g. 2) raise
        # this ValueError instead of a confusing TypeError on concatenation.
        raise ValueError(f'unknown value {value}')

    def before_process_document(self, document):
        # Delegate, so the wrapped writer can print document headers (relevant for HTML).
        self.print_block.before_process_document(document)

    def after_process_document(self, document):
        self.print_block.after_process_document(document)

    def _ok(self, condition, value):
        """Does a mention with property `condition` pass the filter `value`?"""
        if value == 'include':
            return True
        return (condition and value == 'only') or (not condition and value == 'exclude')

    def _is_auxiliary_etc(self, node):
        """Function words that may stay outside a mention span without breaking almost_forest."""
        if node.udeprel in {'case', 'cc', 'conj', 'mark', 'appos', 'vocative', 'discourse'}:
            return True
        if node.deprel == 'advmod:emph':
            return True
        if node.udeprel == 'dep' and node.upos in {'ADP', 'SCONJ', 'CCONJ', 'PUNCT'}:
            return True
        return False

    def _is_forest(self, mention, mwords, almost):
        """Is the mention a forest, i.e. do all children of mention words stay within the mention?

        With almost=True, sentence-external punctuation and auxiliary words
        depending on a (potential) mention head are tolerated.
        """
        for w in mention.words:
            # UD unfortunately does not use the copula-as-head style for copula constructions,
            # so e.g. in "It is my fault", "fault" is the root of the tree and all other words its children.
            # However, in the cop-as-head style, only "my" would depend on "fault" (and should be part of the mention).
            # It is difficult to tell apart which w.children are related to w and which to the copula.
            # We thus ignore these cases completely (we expect any child is potentially related to the copula).
            if any(ch.udeprel == 'cop' for ch in w.children):
                continue
            for ch in w.children:
                if ch not in mwords:
                    if not almost:
                        if self.print_should:
                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                        return False
                    # Punctuation before or after the mention span can depend on any of the mwords
                    # without breaking the almost_forest property.
                    # According to the UD guidelines, it should depend on the highest node within the phrase,
                    # i.e. on the mention head, but it is not our goal now to check UD punctuation guidelines.
                    if ch.udeprel == 'punct' and (ch < mention.words[0] or ch > mention.words[-1]):
                        continue
                    # Some auxiliary words (e.g. prepositions) may be excluded from the mention span
                    # without breaking the almost_forest property, but they need to depend
                    # on the mention head (or if the mention is not a catena, they need to depend
                    # on one of the potential heads, i.e. a node from mwords whose parent is not in mwords).
                    # For example: "A gift for (e1 John)" is almost_forest ("for" depends on "John" which is the mention head),
                    # but "(e1[1/2] John) with (e1[2/2]) Mary" is not almost_forest
                    # because "with" depends on "Mary", which is not the mention head (nor a potential mention head).
                    if not (w.parent and w.parent not in mwords and self._is_auxiliary_etc(ch)):
                        if self.print_should:
                            ch.misc["ShouldBeInSpanOf"] = mention.entity.eid
                        return False
        return True

    def _is_almost_continuous(self, mention):
        """Is the mention continuous except for gaps formed only by empty nodes?"""
        if ',' not in mention.span:
            return True
        nonempty = [w for w in mention.words if not w.is_empty()]
        if not nonempty:
            return True
        mwords = set(mention.words)
        gap_nodes = [w for w in mention.head.root.descendants if w > nonempty[0] and w < nonempty[-1] and not w in mwords]
        for gap_node in gap_nodes:
            if not gap_node.is_empty():
                return False
        return True

    def process_document(self, doc):
        """Collect mentions passing all filters and print each as a marked tree."""
        mentions = []
        for entity in doc.coref_entities:
            if self._ok(len(entity.mentions) == 1, self.singleton):
                mentions.extend(entity.mentions)
        if self.shuffle:
            random.shuffle(mentions)
        else:
            mentions.sort()

        seen_trees = 0
        for mention in mentions:
            if not self._ok(len(mention.words) == 1, self.oneword):
                continue
            # A comma in the span (e.g. "3-5,7") means the mention is discontinuous.
            if not self._ok(',' not in mention.span, self.continuous):
                continue
            # The 'include' pre-checks avoid computing the expensive properties when not needed.
            if self.almost_continuous != 'include' and not self._ok(self._is_almost_continuous(mention), self.almost_continuous):
                continue

            empty_mwords = [w for w in mention.words if w.is_empty()]
            if not self._ok(len(empty_mwords) > 0, self.empty):
                continue

            # Count potential heads: mention words whose parent lies outside the mention.
            heads, mwords = 0, set(mention.words)
            for w in mention.words:
                if w.parent:
                    heads += 0 if w.parent in mwords else 1
                else:
                    # Empty nodes have no basic-tree parent; check enhanced deps instead.
                    heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
            if not self._ok(heads <= 1, self.treelet):
                continue
            if self.forest != 'include' and not self._ok(self._is_forest(mention, mwords, False), self.forest):
                continue
            if self.almost_forest != 'include' and not self._ok(self._is_forest(mention, mwords, True), self.almost_forest):
                continue

            seen_trees += 1
            if self.max_trees and seen_trees > self.max_trees:
                # BUGFIX: the original `return`ed here, which made the footer's
                # "Only first N" branch unreachable and left the current mention
                # marked (Mark=1 stayed in MISC). Break instead, so the footer runs.
                if not self.print_total:
                    print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
                break

            for w in mention.words:
                w.misc['Mark'] = 1
            this_form = ' '.join([w.form for w in mention.words])
            print("# Mention = " + this_form)
            if self.print_other_forms:
                # Show how the same entity is expressed elsewhere in the document.
                counter = Counter()
                for m in mention.entity.mentions:
                    forms = ' '.join([w.form for w in m.words])
                    if forms != this_form:
                        counter[forms] += 1
                if counter:
                    print(f"# {min(len(counter), self.print_other_forms)} other forms:", end='')
                    for form, count in counter.most_common(self.print_other_forms):
                        print(f' "{form}"({count})', end='')
                    print()
            self.print_block.process_tree(mention.head.root)
            # Clean up the temporary marks, so they do not leak into further output.
            for w in mention.words:
                del w.misc['Mark']

        if self.print_total:
            if self.max_trees and seen_trees > self.max_trees:
                print(f'######## Only first {self.max_trees} matching mentions printed. Use max_trees=0 to see all.')
            print(f'######## Total matching/all mentions = {seen_trees} / {len(mentions)}')
diff --git a/udapi/block/corefud/removemisc.py b/udapi/block/corefud/removemisc.py
new file mode 100644
index 00000000..f132aaed
--- /dev/null
+++ b/udapi/block/corefud/removemisc.py
@@ -0,0 +1,18 @@
+from udapi.core.block import Block
+import re
+
class RemoveMisc(Block):
    """Delete selected (temporary) MISC attributes, e.g. after primary conversions."""

    def __init__(self, attrnames='', **kwargs):
        """Arg: attrnames = comma-separated list of MISC attributes to be deleted."""
        super().__init__(**kwargs)
        self.attrs4deletion = set(attrnames.split(','))

    def process_tree(self, root):
        for node in root.descendants_and_empty:
            # Indexed attributes like "Attr[1]" match the plain name "Attr".
            doomed = [a for a in node.misc if re.sub(r'\[\d+\]', '', a) in self.attrs4deletion]
            for attrname in doomed:
                del node.misc[attrname]
+
diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py
new file mode 100644
index 00000000..4551873c
--- /dev/null
+++ b/udapi/block/corefud/removenocorefentities.py
@@ -0,0 +1,21 @@
+from udapi.core.block import Block
+import udapi.core.coref
+import re
+import logging
+
class RemoveNoCorefEntities(Block):
    """Remove entities whose cluster ID marks them as not annotated for coreference.

    Some corpora (e.g., AnCora) include annotation of named entities that are
    not annotated for coreference. To distinguish them, their cluster ID starts
    with 'NOCOREF' (optionally followed by entity type, so that one cluster
    still has just one type). We may want to remove such entities from datasets
    that are used to train coreference resolvers, to prevent the resolvers from
    thinking that all members of a NOCOREF cluster are coreferential. That is
    what this block does.
    """

    def process_document(self, doc):
        entities = doc.coref_entities
        if not entities:
            return
        kept = {}
        for entity in entities:
            # Equivalent to re.match(r'^NOCOREF', entity.eid).
            if not entity.eid.startswith('NOCOREF'):
                kept[entity._eid] = entity
        doc._eid_to_entity = kept
diff --git a/udapi/block/corefud/singleparent.py b/udapi/block/corefud/singleparent.py
new file mode 100644
index 00000000..ee9b1948
--- /dev/null
+++ b/udapi/block/corefud/singleparent.py
@@ -0,0 +1,47 @@
+"""If an empty node has multiple (enhanced-deps) parents, only the highest one is kept."""
+from udapi.core.block import Block
+from collections import Counter
+from udapi.core.node import find_minimal_common_treelet
+import logging
+
class SingleParent(Block):
    """Prune enhanced deps of empty nodes so that each keeps at most one parent.

    If an empty node has multiple (enhanced-deps) parents, only the highest
    (shallowest) one is kept. Statistics about the pruning reasons are
    collected in a Counter and logged after each document.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._reasons = Counter()  # statistics about why/how deps were pruned

    def process_tree(self, tree):
        for empty in tree.empty_nodes:
            self._reasons['_empty'] += 1
            if len(empty.deps) > 1:
                self._reasons['_more-parents'] += 1
                parents = [d['parent'] for d in empty.deps]
                nonempty_parents = [p for p in parents if not p.is_empty()]
                if len(nonempty_parents) != len(parents):
                    self._reasons['empty-parent'] += 1
                    logging.warning(f"Empty node {empty} has an empty parent.")
                if not nonempty_parents:
                    # All parents are empty nodes; drop the deps entirely.
                    empty.deps = []
                    self._reasons['no-nonempty-parent'] += 1
                    continue
                (highest, added_nodes) = find_minimal_common_treelet(*nonempty_parents)
                if highest in nonempty_parents:
                    # One of the parents governs all the others; keep just that one.
                    self._reasons['one-governs'] += 1
                    empty.deps = [d for d in empty.deps if d['parent'] is highest]
                    continue
                nonempty_parents.sort(key=lambda n: n._get_attr('depth'))
                # BUGFIX: the original compared nonempty_parents[0] with itself
                # (always equal), so the 'same-depth' reason was always counted
                # and the one-highest pruning was never applied.
                if len(nonempty_parents) > 1 and nonempty_parents[0]._get_attr('depth') == nonempty_parents[1]._get_attr('depth'):
                    # Tie in depth: the choice of the kept parent is arbitrary.
                    self._reasons['same-depth'] += 1
                else:
                    self._reasons['one-highest'] += 1
                # Keep only the shallowest parent, so the node ends up with a single parent
                # (the block's contract) even in the ambiguous same-depth case.
                empty.deps = [d for d in empty.deps if d['parent'] is nonempty_parents[0]]

    def after_process_document(self, document):
        message = "\n"
        for k, v in self._reasons.most_common():
            message += f"{k}={v}\n"
        logging.info(message)
diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py
new file mode 100644
index 00000000..527159e9
--- /dev/null
+++ b/udapi/block/corefud/stats.py
@@ -0,0 +1,305 @@
+from udapi.core.block import Block
+from collections import Counter
+import re
+
class Stats(Block):
    """Block corefud.Stats prints various coreference-related statistics.

    The output is either human-readable (style=human, default) or a LaTeX
    table row per dataset/document (style=tex, tex-table, tex-doc).
    Individual statistic groups can be switched on/off with the report_*
    parameters; singletons or non-singletons can be excluded.
    """

    def __init__(self, m_len_max=5, e_len_max=5,
                 report_basics=False, report_mentions=True, report_entities=True,
                 report_details=True, report_words_per_doc=False, report_entity_range=False,
                 selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _',
                 exclude_singletons=False, exclude_nonsingletons=False, style='human',
                 per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15,
                 highlight_docnames=None,
                 **kwargs):
        super().__init__(**kwargs)
        self.m_len_max = m_len_max
        self.e_len_max = e_len_max
        self.report_basics = report_basics
        self.report_mentions = report_mentions
        self.report_entities = report_entities
        self.report_details = report_details
        self.report_words_per_doc = report_words_per_doc
        self.report_entity_range = report_entity_range
        self.exclude_singletons = exclude_singletons
        self.exclude_nonsingletons = exclude_nonsingletons
        self.style = style
        if style not in 'tex tex-table tex-doc human'.split():
            raise ValueError(f'Unknown style {style}')
        self.per_doc = per_doc
        self.max_rows_per_page = max_rows_per_page
        if docname not in 'newdoc filename'.split():
            # BUGFIX: the original message reported the (valid) style
            # instead of the offending docname value.
            raise ValueError(f'Unknown docname {docname}')
        self.docname = docname
        self.docname_len = docname_len
        self.highlight_docnames = highlight_docnames
        self._header_printed = False
        self._lines_printed = None

        self.counter = Counter()
        self.mentions = 0
        self.entities = 0
        self.singletons = 0
        self.total_nodes = 0
        self.longest_mention = 0
        self.longest_entity = 0
        self.m_words = 0
        self.selected_upos = None if selected_upos == 'all' else selected_upos.split()
        self.entity_ranges = []

    def process_document(self, doc):
        """Accumulate statistics from one document into the counters."""
        self.total_nodes += len(list(doc.nodes))
        self.counter['documents'] += 1
        node2docord, current_docord = {}, 0
        if self.report_entity_range:
            # Map each node to its ordinal within the whole document, so we can
            # measure the distance between the first and the last mention of an entity.
            for node in doc.nodes_and_empty:
                node2docord[node] = current_docord
                current_docord += 1

        for entity in doc.coref_entities:
            len_mentions = len(entity.mentions)
            if len_mentions == 1:
                self.singletons += 1
            if len_mentions == 1 and self.exclude_singletons:
                continue
            elif len_mentions > 1 and self.exclude_nonsingletons:
                continue
            if self.report_entity_range:
                self.entity_ranges.append(node2docord[entity.mentions[-1].head] - node2docord[entity.mentions[0].head])
            self.longest_entity = max(len_mentions, self.longest_entity)
            self.counter['c_total_len'] += len_mentions
            self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1

            self.entities += 1
            if not self.report_mentions and not self.report_details:
                continue
            for mention in entity.mentions:
                self.mentions += 1
                all_words = len(mention.words)
                non_empty = len([w for w in mention.words if not w.is_empty()])
                self.m_words += all_words
                self.longest_mention = max(non_empty, self.longest_mention)
                self.counter['m_total_len'] += non_empty
                self.counter[f"m_len_{min(non_empty, self.m_len_max)}"] += 1
                if self.report_details:
                    upos = 'other'
                    if not self.selected_upos or mention.head.upos in self.selected_upos:
                        upos = mention.head.upos
                    self.counter['m_head_upos_' + upos] += 1
                    self.counter['m_with_empty'] += 1 if all_words > non_empty else 0
                    self.counter['m_with_gaps'] += 1 if ',' in mention.span else 0
                    # Count potential heads: mention words whose parent lies outside the mention.
                    heads, mwords = 0, set(mention.words)
                    for w in mention.words:
                        if w.parent:
                            heads += 0 if w.parent in mwords else 1
                        else:
                            heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
                    self.counter['m_nontreelet'] += 1 if heads > 1 else 0

        if self.report_basics:
            doc_words = 0
            for tree in doc.trees:
                self.counter['sents'] += 1
                self.counter['words'] += len(tree.descendants)
                self.counter['empty'] += len(tree.empty_nodes)
                if tree.newdoc:
                    self.counter['newdocs'] += 1
                    if doc_words > self.counter['max_words_per_doc']:
                        self.counter['max_words_per_doc'] = doc_words
                    doc_words = 0
                doc_words += len(tree.descendants)
            # BUGFIX: flush the last document as well — the original updated
            # max_words_per_doc only on newdoc boundaries, so the final document
            # in each file was never taken into account.
            if doc_words > self.counter['max_words_per_doc']:
                self.counter['max_words_per_doc'] = doc_words

    def after_process_document(self, doc):
        # In per_doc mode, print the row for this document and reset all counters.
        if self.per_doc:
            self.process_end(skip=False, doc=doc)
            self.counter = Counter()
            self.mentions = 0
            self.entities = 0
            self.singletons = 0
            self.total_nodes = 0
            self.longest_mention = 0
            self.longest_entity = 0
            self.m_words = 0
            self.entity_ranges = []

    def process_end(self, skip=True, doc=None):
        """Print one row of statistics (plus the header/footer when needed)."""
        if not self._lines_printed:
            self.print_header()
            self._lines_printed = 0
        if self.per_doc:
            if skip:
                # Called at the very end of a per_doc run: just close the table.
                self.print_footer()
                return
            else:
                docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc
                if self.style.startswith('tex'):
                    if self.highlight_docnames and re.search(self.highlight_docnames, docname):
                        docname = r"\NEW " + docname
                    docname = docname.replace('_', r'\_')
                print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n')
        elif self.style.startswith('tex-'):
            print(f"{self.counter['documents']:4} documents &")
        self._lines_printed += 1

        # Guards against division by zero on empty input.
        mentions_nonzero = 1 if self.mentions == 0 else self.mentions
        entities_nonzero = 1 if self.entities == 0 else self.entities
        total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes

        columns = []
        if self.report_basics:
            columns += [('docs', f"{self.counter['newdocs']:6,}"),
                        ('sents', f"{self.counter['sents']:7,}"),
                        ('words', f"{self.counter['words']:9,}"),
                        ('empty', f"{self.counter['empty']:7,}"),]
        if self.report_words_per_doc:
            # BUGFIX: guard against ZeroDivisionError when no newdoc was seen.
            columns += [('max_words/doc', f"{self.counter['max_words_per_doc']:7,}"),
                        ('words/doc', f"{self.counter['words']/(self.counter['newdocs'] or 1):7,.0f}"),]
        if self.report_entities:
            columns += [('entities', f"{self.entities:7,}"),
                        ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
                        ('longest_entity', f"{self.longest_entity:6}"),
                        ('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")]
            if self.report_entity_range:
                self.entity_ranges.sort()
                percentile = self.entity_ranges[int(0.95 * (len(self.entity_ranges) - 1))] if self.entity_ranges else 0
                columns += [('entity_range_95percentile', f"{percentile:6,}"),]
            for i in range(1, self.e_len_max + 1):
                percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero
                columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_mentions:
            columns += [('mentions', f"{self.mentions:7,}"),
                        ('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"),
                        ('longest_mention', f"{self.longest_mention:6}"),
                        ('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")]
            if self.m_len_max:
                for i in range(0, self.m_len_max + 1):
                    percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
                    columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
        if self.report_details:
            columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"),
                        ('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"),
                        ('nontreelet', f"{100 * self.counter['m_nontreelet'] / mentions_nonzero:5.1f}"),]
            if self.selected_upos:
                upos_list = self.selected_upos + ['other']
            else:
                upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
            for upos in upos_list:
                columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}"))

        if self.style.startswith('tex'):
            print(" &".join(c[1] for c in columns), end=" \\\\\n")
        elif self.style == 'human':
            for c in columns:
                print(f"{c[0]:>15} = {c[1].strip():>10}")
        if not self.per_doc:
            self.print_footer()
        elif self._lines_printed > self.max_rows_per_page:
            # Start a new page/table, so standalone multi-page output stays readable.
            self.print_footer(False)
            self._lines_printed = 0

    def print_header(self):
        """Print the LaTeX table header (tex-table/tex-doc styles only)."""
        if not self.style.startswith('tex-'):
            return
        if self.style == 'tex-doc':
            if self._lines_printed is None:
                # Very first page: print the LaTeX document preamble.
                print(r'\documentclass[multi=mypage]{standalone}')
                print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}')
                print(r'\usepackage[table]{xcolor}\newcommand{\NEW}{\rowcolor{gray!50}}')
                print(r'\title{Udapi coreference statistics}')
                print(r'\begin{document}')
                print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')
        # lines[0] = tabular column spec, lines[1..3] = three header rows.
        lines = [r'\begin{mypage}'+"\n"+r'\begin{tabular}{@{}l ',
                 " " * self.docname_len,
                 ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8),
                 " " * self.docname_len]
        if self.report_basics:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{text size} '
            lines[2] += r'& \MC{4}{total number of} '
            lines[3] += r'& docs & sents & words &empty n.'
        if self.report_words_per_doc:
            lines[0] += "rr "
            lines[1] += r'&           & '
            lines[2] += r'&\MC{2}{words/doc}'
            lines[3] += r'& max & avg '
        if self.report_entities:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{entities} '
            lines[2] += r'& total &per 1k &\MC{2}{length}'
            lines[3] += r'& count & words & max & avg '
            if self.report_entity_range:
                lines[0] += "r "
                lines[1] += r'&       '
                lines[2] += r'& range '
                lines[3] += r'& p95   '
            if self.e_len_max:
                for i in range(1, self.e_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ")
                    lines[3] += r'& [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'
        if self.report_mentions:
            lines[0] += "rrrr "
            lines[1] += r'& \MC{4}{mentions} '
            lines[2] += r'& total &per 1k &\MC{2}{length}'
            lines[3] += r'& count & words & max & avg '
            if self.m_len_max:
                for i in range(0, self.m_len_max + 1):
                    lines[0] += "r"
                    lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ")
                    lines[3] += r'& [\%] '
                lines[0] += " "
                lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7
        if self.report_details:
            # NOTE(review): "rrrr " adds four column specs but the "mention type"
            # group has only three data columns — confirm against rendered output.
            lines[0] += "rrrr "
            lines[1] += r'& \MC{3}{mention type} '
            lines[2] += r'&w/empty& w/gap&non-tree'
            lines[3] += r'& [\%] ' * 3
            if self.selected_upos:
                upos_list = self.selected_upos + ['other']
            else:
                upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
            lines[0] += "@{~}r" * len(upos_list)
            lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
            lines[2] += ''.join(f'&{upos:7}' for upos in upos_list)
            lines[3] += r'& [\%] ' * len(upos_list)
        lines[0] += r'@{}}\toprule'
        last_col = 1
        lines[1] += r'\\'
        lines[2] += r'\\'
        lines[3] += r'\\\midrule'
        if self.report_basics:
            lines[1] += r'\cmidrule(lr){2-7}' if self.report_words_per_doc else r'\cmidrule(lr){2-5}'
            lines[2] += r'\cmidrule(lr){2-5}'
            last_col += 4
        if self.report_words_per_doc:
            lines[2] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+2}" + '}'
            last_col += 2
        if self.report_entities:
            # BUGFIX: the original had `5 if ... else 5`, so when entity-range
            # reporting was off, last_col over-counted by one and all following
            # cmidrules were shifted one column to the right.
            _cols = 5 if self.report_entity_range else 4
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+_cols}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += _cols
        if self.e_len_max:
            last_col += self.e_len_max
            # NOTE(review): the starting column 6 is hard-coded and assumes the
            # default report_* configuration — confirm for other configurations.
            lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}'
        if self.report_mentions:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
            lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
            last_col += 4
            if self.m_len_max:
                lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}'
                last_col += self.m_len_max + 1
        if self.report_details:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}"
            lines[1] += r'}\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}'
        print("\n".join(lines))

    def print_footer(self, end_doc=True):
        """Close the LaTeX table (and the document, for tex-doc style)."""
        if not self.style.startswith('tex-'):
            return
        print(r'\bottomrule\end{tabular}'+"\n"+r'\end{mypage}')
        if self.style == 'tex-doc' and end_doc:
            print(r'\end{document}')
diff --git a/udapi/block/demo/complexity.py b/udapi/block/demo/complexity.py
new file mode 100644
index 00000000..99e8a046
--- /dev/null
+++ b/udapi/block/demo/complexity.py
@@ -0,0 +1,268 @@
+"""demo.Complexity prints statistics on syntactic complexity.
+"""
+from udapi.core.basewriter import BaseWriter
+from collections import deque
+
+
def non_punct(nodes):
    """Return the given nodes without punctuation tokens."""
    return [node for node in nodes if node.upos != 'PUNCT']
+
+
def is_np(node):
    """Is the node a noun-phrase head: a noun, proper noun, or personal non-possessive pronoun?"""
    if node.upos in ("NOUN", "PROPN"):
        return True
    return node.upos == "PRON" and node.feats["PronType"] == "Prs" and not node.feats["Poss"]
+
+
def is_vp(node):
    """Is the node a verb-phrase head (a VERB or a node marked as a clause head)?

    E.g. "prosili, naléhali a žadonili" is one coordinated verb phrase
    with the head "prosili".
    TODO: should non-VERB conjuncts be included as well? E.g.
    - "vznikla a byla přijata(conj,ADJ,parent=vznikla)"
    - "je(cop,AUX) nešťastný(ADJ) a nechá(conj,VERB,parent=nešťastný) se nalákat"
    - "podařilo se to a dokladem(ClauseHead,NOUN,conj,parent=podařilo) je(cop,AUX,parent=dokladem)"
    - Restrict to ClauseHead only, or include non-finite verbs (coordinated
      infinitives/participles)? In "stihl(ClauseHead) napsat(Inf) a publikovat(Inf)",
      the infinitives are not ClauseHeads.
    - Parsing makes many errors in coordinations, so the conditions should stay strict.
    """
    if node.upos == "VERB":
        return True
    return node.misc["ClauseHead"]
+
+
def is_relcl(node):
    """Is the node a head of a relative clause?

    Unfortunately, UDPipe 2.4 produces plain acl instead of acl:relcl,
    so acl nodes with a relative-pronoun child are accepted as well.
    """
    if node.deprel == 'acl:relcl':
        return True
    if node.udeprel != 'acl':
        return False
    return any('Rel' in child.feats['PronType'] for child in node.children)
+
+
def is_postponed_nom_mod(node):
    """Is the node a nominal modifier placed after its nominal parent?

    Silvie: [(POS in {"NOUN","PROPN"} | POS == "PRON" & PronType=Prs & !Poss=Yes),
    child with higher word order than parent, deprel != "conj", parent is a NP head].

    TODO: in Czech this matches completely common constructions such as
    "vznik díla" — do we want to search for something else?
    """
    if node.udeprel == 'conj':
        return False
    return is_np(node) and node.parent.precedes(node) and is_np(node.parent)
+
+
def is_postponed_adj_mod(node):
    """Is the node an adjective postmodifying a nominal head?

    TODO: we could distinguish a bare modifier ("písní ruských") from an
    expanded one ("milenec známý z pozdějšího zpracování").
    """
    if node.upos != 'ADJ':
        return False
    return node.parent.precedes(node) and is_np(node.parent)
+
+
def is_complex_nominal(node):
    """Is the node a NP head with at least two non-function-word descendants?

    TODO: really descendants and not children? (descendants easily grow without
    bound if e.g. a subordinate clause is present)
    TODO: should the threshold be raised from 2 to at least 3?
    """
    if not is_np(node):
        return False
    ignored = ('conj', 'punct', 'case', 'cc', 'dep', 'cop')
    content_words = [d for d in node.descendants if d.deprel not in ignored]
    return len(content_words) > 1
+
+
def is_finite_clause_head(node):
    """Is the node a head of a finite clause?

    A node qualifies if it is a finite verb with a clausal deprel,
    or if it has a copula child (and is not an xcomp itself),
    or if its xcomp child has a copula child.

    * Most finite verbs with deprel=amod are parsing errors - they should have deprel=acl,
      but for better robustness we include these as well.
    * Similarly "dep" and "orphan" are mostly parsing errors.
    * TODO: by checking also nsubj/csubj children, we could find a few more real
      clause heads, but also some false positives.
    """
    # TODO appos
    clausal_deprels = {'root', 'conj', 'acl', 'advcl', 'ccomp', 'csubj', 'obl',
                       'parataxis', 'amod', 'dep', 'orphan'}
    if node.udeprel in clausal_deprels and is_finite_verb(node):
        return True
    if any(child.udeprel == 'cop' for child in node.children) and node.udeprel != 'xcomp':
        return True
    xcomp_child = next((child for child in node.children if child.udeprel == 'xcomp'), None)
    # NOTE: returns None (falsy) rather than False when there is no xcomp child.
    return xcomp_child and any(child.udeprel == 'cop' for child in xcomp_child.children)
+
+
# TODO: include also e.g. "bude(aux,AUX,parent=chovat) se chovat(VERB,VerbForm=Inf)"
def is_finite_verb(node):
    """Is the node a finite verb (or a participial ADJ with a passive auxiliary)?"""
    if node.feats['VerbForm'] not in {'Fin', 'Part'}:
        return False
    if node.upos == 'VERB':
        return True
    return node.upos == 'ADJ' and any(child.deprel == 'aux:pass' for child in node.children)
+
+
def is_adjectivized_predicate(node):
    """Is the node an adjectivized predicate, e.g. "kouřící komín", "zbitý kluk"?

    Notes on the conditions:
    - requiring a NOUN/PROPN parent rules out cases like
      "kvůli nesmyslné a stupňující(parent=nesmyslné,deprel=conj) se žárlivosti";
      "Nové pronikající(parent=Nové,deprel=amod) socialistické myšlení" is probably
      a parsing error (parent should be "myšlení"?)
    - the query would otherwise match "způsob, jakým jsou popsány",
      hence the extra condition on node.misc["ClauseHead"].
    """
    if node.feats["VerbForm"] != "Part":
        return False
    if node.upos != "ADJ":
        return False
    modifies_noun = node.parent.upos in {"NOUN", "PROPN"}
    conj_of_adj = node.udeprel == "conj" and node.parent.upos == "ADJ"
    return (modifies_noun or conj_of_adj) and not node.misc["ClauseHead"]
+
+
def is_controlled_predicate(node):
    """Is the node a controlled predicate, e.g. "Mohli jsme odejít i zůstat."?

    TODO: Should further conjuncts be included as well, e.g. "stihl napsat
    a publikovat", i.e. node.udeprel == "conj" and node.parent.udeprel == "xcomp"?
    """
    return node.deprel == "xcomp"
+
class Complexity(BaseWriter):
    """Writer printing per-sentence statistics on syntactic complexity.

    With matches=False (default), each sentence produces one tab-separated line
    of counts; with matches=True, every individual match is printed instead
    (category, sent_id, lemmas, UPOS tags, token count).
    """

    def __init__(self, matches=False, **kwargs):
        # matches: print individual matched groups instead of per-sentence counts
        super().__init__(**kwargs)
        self.matches = matches


    def report(self, category, groups, expand_type='no'):
        """Print each matched group (matches=True) or just the count as one TSV cell."""
        if self.matches:
            for group in groups:
                self.print_match(category, group, expand_type)
        else:
            print("\t" + str(len(groups)), end='')


    def expand_subtree(self, nodes, expand_type):
        """Expand a single-node group according to expand_type ('no', 'subtree', 'subtree_within_clause')."""
        if expand_type == 'no':
            return nodes
        if len(nodes) > 1:
            raise Exception("expanding more than one node not implemented yet")
        if expand_type == 'subtree':
            return nodes[0].descendants(add_self=True)
        #if expand_type == 'subtree_except_conj':
            #result = nodes
            #for child in group.children:
                #if child.udeprel != 'conj':
                    #result.extend(child.descendants(add_self=True))
            #return = sorted(result)
        if expand_type == 'subtree_within_clause':
            # Collect descendants without crossing clause boundaries (ClauseHead)
            # or coordination (conj children of the group's node).
            stack = [n for n in nodes[0].children if n.udeprel != 'conj']
            while stack:
                node = stack.pop()
                if not node.misc["ClauseHead"]:
                    nodes.append(node)
                    stack.extend(node.children())
            return sorted(nodes)
        raise ValueError("unknown expand value " + expand_type)


    def print_match(self, category, group, expand_type='no'):
        """Print one matched group as a TSV line: category, sent_id, lemmas, tags, token count."""
        nodes = self.expand_subtree(group, expand_type)
        lemmas = " ".join(n.lemma for n in nodes)
        tags = " ".join(n.upos for n in nodes)
        n_tokens = str(len(non_punct(nodes)))
        print("\t".join([category, nodes[0].root.sent_id, lemmas, tags, n_tokens]))


    def get_main_clauses(self, root):
        """Return one group per main-clause head (the root's children plus their conj co-heads)."""
        main_heads = []
        for main_head in root.children:
            main_heads.append(main_head)
            main_heads.extend(n for n in main_head.children if n.udeprel == 'conj')
        return [[n] for n in main_heads]


    def get_coord_phrase(self, root, phrase_type_function):
        """Find coordinations of phrases satisfying phrase_type_function.

        Each result is a sorted group containing the first conjunct, the other
        conjuncts and their coordinating conjunctions (cc children).
        """
        results = []
        for node in root.descendants:
            if phrase_type_function(node):
                conjuncts = [n for n in node.children if n.udeprel == 'conj' and phrase_type_function(n)]
                if conjuncts:
                    conjunctions = []
                    for conj in conjuncts:
                        # TODO multiword conjunctions (udeprel=flat)?
                        conjunctions.extend([n for n in conj.children if n.udeprel == 'cc'])
                    results.append(sorted([node] + conjuncts + conjunctions))
        return results

    # TODO: coordination of both main and subordinate clauses
    def get_t_units(self, main_heads):
        """Return t-units: a main clause combined with one of its dependent clauses (whole subtree)."""
        results = []
        for main_head in main_heads:
            main_clause = [main_head]
            dep_heads = []
            stack = main_head.children
            while stack:
                node = stack.pop()
                if node.misc["ClauseHead"]:
                    dep_heads.append(node)
                else:
                    main_clause.append(node)
                    stack.extend(node.children)
            main_clause = sorted(main_clause)

            # One t-unit per dependent clause: the main clause plus the clause's subtree.
            for dep_clause_head in dep_heads:
                results.append(main_clause + self.expand_subtree([dep_clause_head], 'subtree'))
        return results

    # TODO: a complex t-unit actually has a different definition: 3 clauses
    def get_complex_t_units(self, root):
        """Return t-units whose main clause is itself a dependent clause."""
        results = []
        for node in root.descendants:
            if node.deprel != 'root' and node.misc["ClauseHead"]:  # TODO: exclude the main clause?
                results += self.get_t_units([node])
        return results


    def process_tree(self, root):
        """Compute and print all complexity statistics for one sentence."""
        print("# " + root.text)

        allnodes = root.descendants
        # Breadth-first traversal computing for each node its tree depth and the
        # number of clause boundaries (ClauseHead nodes) on the path from the root.
        depth, clause_depth = {0: 0}, {0: 0}
        queue = deque(root.children)
        clause_heads = []
        while queue:
            node = queue.popleft()
            depth[node.ord] = depth[node.parent.ord] + 1
            clause_depth[node.ord] = clause_depth[node.parent.ord]
            if is_finite_clause_head(node):
                node.misc['ClauseHead'] = 1
                clause_heads.append(node)
                clause_depth[node.ord] += 1
            queue.extend(node.children)
        max_depth = sorted(depth.values())[-1]
        max_clause_depth = sorted(clause_depth.values())[-1]

        t_units = self.get_t_units([n for n in root.children if n.deprel == 'root'])
        total_t_units_length = sum(len(t_unit) for t_unit in t_units)
        mean_t_unit_length = total_t_units_length / (len(t_units) or 1)  # TODO: what to report when a sentence has no t-units?

        if not self.matches:
            print("\t".join(str(x) for x in [root.sent_id, len(non_punct(allnodes)), max_depth, max_clause_depth, mean_t_unit_length]), end='')

        self.report("clauses", [[n] for n in clause_heads], 'subtree')
        self.report("adjectivized_predicates", [[n] for n in allnodes if is_adjectivized_predicate(n)])
        self.report("controlled_predicates", [[n] for n in allnodes if is_controlled_predicate(n)])
        self.report("main_clauses", self.get_main_clauses(root), 'subtree_within_clause')
        self.report("coordinated_verb_phrases", self.get_coord_phrase(root, is_vp))
        self.report("coordinated_noun_phrases", self.get_coord_phrase(root, is_np))
        self.report("coordinated_adjective_phrases", self.get_coord_phrase(root, lambda n: n.upos in ("ADJ", "DET")))
        self.report("coordinated_adverb_phrases", self.get_coord_phrase(root, lambda n: n.upos == "ADV"))
        self.report("t-units", t_units)
        self.report("complex_t-units", self.get_complex_t_units(root))
        # TODO: UDPipe tagged coordinations like "básně a písně" and "rychtář a rychtářka"
        # as ADV for some reason. Check that we use the best available UDPipe model.
        self.report("relative_clauses", [[n] for n in allnodes if is_relcl(n)], 'subtree_within_clause')
        self.report("postponed_nominal_modifiers", [[n] for n in allnodes if is_postponed_nom_mod(n)])
        self.report("postponed_adjective_modifiers", [[n] for n in allnodes if is_postponed_adj_mod(n)])
        self.report("complex_nominals", [[n] for n in allnodes if is_complex_nominal(n)])

        if not self.matches:
            # TODO: for total coordinations, reporting matches probably makes no sense, only the total count?
            self.report("coordinated_phrases_total", self.get_coord_phrase(root, lambda _: True))

            # Padding with 'NONE' guarantees indices [0] and [1] exist even for short sentences.
            nonpunct_upos = [n.upos for n in non_punct(allnodes)] + ['NONE', 'NONE']
            brackets = str(len([n for n in allnodes if n.form == '(']))
            dashes = str(len([n for n in allnodes if n.form in '-–—―']))  # hyphen, en-dash, em-dash, horizontal bar
            colons = str(len([n for n in allnodes if n.form == ':']))
            semicolons = str(len([n for n in allnodes if n.form == ';']))
            print("\t", "\t".join([nonpunct_upos[0], nonpunct_upos[1], brackets, dashes, colons, semicolons]))
diff --git a/udapi/block/demo/newspeak.py b/udapi/block/demo/newspeak.py
new file mode 100644
index 00000000..6be2caf5
--- /dev/null
+++ b/udapi/block/demo/newspeak.py
@@ -0,0 +1,66 @@
+"""demo.Newspeak block for 1984-like newspeak-ization of Czech.
+
+This is just a demo/draft.
+
+Usage:
+ $ echo 'Nejhorší žena je lepší než nejlepší muž.' | \
+ udapy -q read.Sentences udpipe.Cs demo.Newspeak write.Sentences
+ Převelenedobrá žena je veledobrá než převeledobrý muž.
+"""
+from udapi.core.block import Block
+from udapi.tool.morphodita import MorphoDiTa
+
+ANTONYMS = {  # maps a lemma to its antonym; negation is then expressed with the "ne-" prefix instead
+    'špatný': 'dobrý',   # bad -> good
+    'pomalý': 'rychlý',  # slow -> fast
+    # 'muž': 'žena', this does not work because xpos contains gender,
+    # we would also need to exploit the parsing and change gender of all congruent adj children.
+}
+
+
+class Newspeak(Block):
+    """Change all comparatives to vele-x and superlatives to převele-x."""
+
+    def __init__(self, morphodita_path='models/morphodita/cs/',
+                 morphodita_model='czech-morfflex-131112.dict',
+                 **kwargs):
+        """Create the Newspeak block object and load the MorphoDiTa morphology model."""
+        super().__init__(**kwargs)
+        self.morphodita = MorphoDiTa(model=morphodita_path + morphodita_model)
+
+    def process_tree(self, tree):
+
+        # apply process_node on all nodes
+        super().process_tree(tree)
+
+        # Capitalize the first word if the original sentence started with a capital letter
+        first_node = tree.descendants[0]
+        if tree.text[0].isupper() and not first_node.form[0].isupper():
+            first_node.form = first_node.form[0].upper() + first_node.form[1:]
+
+        # Recompute the sentence string
+        tree.text = tree.compute_text()
+
+    def process_node(self, node):
+        antonym = ANTONYMS.get(node.lemma)
+        if antonym is not None:
+            if node.xpos[10] == 'N':  # PDT tag 0-based index 10 = negation, cf. slices below (was xpos[11], a typo)
+                if node.form.lower().startswith('ne'):
+                    node.lemma = antonym
+                    node.xpos = node.xpos[:10] + 'A' + node.xpos[11:]  # mark as affirmative
+                    node.form = node.form[2:]  # drop the "ne-" prefix
+            else:
+                forms = self.morphodita.forms_of_lemma(antonym, node.xpos)
+                if forms:
+                    node.lemma = antonym
+                    node.xpos = node.xpos[:10] + 'N' + node.xpos[11:]  # mark as negated
+                    node.form = 'ne' + forms[0].form  # negated antonym, e.g. dobrý -> nedobrý
+
+        degree = node.feats["Degree"]
+        if degree in ("Sup", "Cmp"):
+            new_xpos = node.xpos[:9] + '1' + node.xpos[10:]  # index 9 is the degree slot; reset to positive
+            forms = self.morphodita.forms_of_lemma(node.lemma, new_xpos)
+            if forms:
+                new_form = "vele" if degree == "Cmp" else "převele"  # newspeak degree prefixes
+                new_form += forms[0].form
+                node.form = new_form
diff --git a/udapi/block/demo/rehangprepositions.py b/udapi/block/demo/rehangprepositions.py
index 8d641b49..d25e29bc 100644
--- a/udapi/block/demo/rehangprepositions.py
+++ b/udapi/block/demo/rehangprepositions.py
@@ -4,6 +4,7 @@
class RehangPrepositions(Block):
"""This block takes all prepositions (upos=ADP) and rehangs them above their parent."""
+
def process_node(self, node):
if node.upos == "ADP":
origparent = node.parent
diff --git a/udapi/block/eval/__init__.py b/udapi/block/eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/eval/conll17.py b/udapi/block/eval/conll17.py
new file mode 100644
index 00000000..61e86383
--- /dev/null
+++ b/udapi/block/eval/conll17.py
@@ -0,0 +1,288 @@
+r"""Block&script eval.Conll17 for evaluating LAS,UAS,etc as in CoNLL2017 UD shared task.
+
+This is a reimplementation of the CoNLL2017 shared task official evaluation script,
+http://universaldependencies.org/conll17/evaluation.html
+
+The gold trees and predicted (system-output) trees need to be sentence-aligned
+e.g. using `util.ResegmentGold`.
+Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization.
+
+An example usage and output::
+
+ $ udapy read.Conllu zone=gold files=gold.conllu \
+ read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \
+ eval.Conll17
+ Metric | Precision | Recall | F1 Score | AligndAcc
+ -----------+-----------+-----------+-----------+-----------
+ Words | 27.91 | 52.17 | 36.36 | 100.00
+ UPOS | 27.91 | 52.17 | 36.36 | 100.00
+ XPOS | 27.91 | 52.17 | 36.36 | 100.00
+ Feats | 27.91 | 52.17 | 36.36 | 100.00
+ Lemma | 27.91 | 52.17 | 36.36 | 100.00
+ UAS | 16.28 | 30.43 | 21.21 | 58.33
+ LAS | 16.28 | 30.43 | 21.21 | 58.33
+ CLAS | 10.34 | 16.67 | 12.77 | 37.50
+
+
+For evaluating multiple systems and testsets (as in CoNLL2017)
+stored in `systems/system_name/testset_name.conllu` you can use::
+
+ #!/bin/bash
+ SYSTEMS=`ls systems`
+ [[ $# -ne 0 ]] && SYSTEMS=$@
+ set -x
+ set -e
+ for sys in $SYSTEMS; do
+ mkdir -p results/$sys
+ for testset in `ls systems/$sys`; do
+ udapy read.Conllu zone=gold files=gold/$testset \
+ read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \
+ util.ResegmentGold \
+ eval.Conll17 print_results=0 print_raw=1 \
+ > results/$sys/${testset%.conllu}
+ done
+ done
+ python3 `python3 -c 'import udapi.block.eval.conll17 as x; print(x.__file__)'` -r 100
+
+The last line executes this block as a script and computes bootstrap resampling with 100 resamples
+(default=1000, it is recommended to keep the default or higher value unless testing the interface).
+This prints the ranking and confidence intervals (95% by default) and also p-values for each
+pair of systems with neighboring ranks. If the difference in LAS is significant
+(according to a paired bootstrap test, by default if p < 0.05),
+a line is printed between the two systems.
+
+The output looks like::
+
+ 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001
+ ------------------------------------------------------------
+ 2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001
+ ------------------------------------------------------------
+ 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001
+ ------------------------------------------------------------
+ 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001
+ ------------------------------------------------------------
+ 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001
+ ------------------------------------------------------------
+ 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001
+ ------------------------------------------------------------
+ 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002
+ ------------------------------------------------------------
+ 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001
+ ------------------------------------------------------------
+ 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003
+ ------------------------------------------------------------
+ 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448
+ 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029
+ ------------------------------------------------------------
+ 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334
+ 13. conll17-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003
+ ------------------------------------------------------------
+ 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062
+ 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283
+ 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121
+ 17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445
+ 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166
+ 19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447
+ 20. naistCL 67.49 ± 0.15 (67.34 .. 67.63)
+
+TODO: Bootstrap currently reports only LAS, but all the other measures could be added as well.
+"""
+import argparse
+import difflib
+import logging
+import os
+import random
+import sys
+from collections import Counter
+from udapi.core.basewriter import BaseWriter
+
+CLAS_IGNORE = {'aux', 'case', 'cc', 'clf', 'cop', 'det', 'mark', 'punct'}  # function-word/punct deprels excluded from CLAS
+
+
+class Conll17(BaseWriter):
+    """Evaluate labeled and unlabeled attachment score (LAS and UAS)."""
+
+    def __init__(self, gold_zone='gold', print_raw=False, print_results=True, **kwargs):
+        """Args:
+        gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")?
+        print_raw - Print raw counts (pred, gold, Words, LAS) for each sentence.
+            This is useful for bootstrap resampling post-processing to get confidence intervals.
+        print_results - Print a table with overall results after all documents are processed.
+        """
+        super().__init__(**kwargs)
+        self.gold_zone = gold_zone
+        self.total_count = Counter()  # accumulates per-sentence counts over the whole run
+        self.print_raw = print_raw
+        self.print_results = print_results
+
+    def process_tree(self, tree):
+        gold_tree = tree.bundle.get_tree(self.gold_zone)
+        if tree == gold_tree:
+            return  # this is the gold tree itself, nothing to evaluate
+        pred_nodes = tree.descendants
+        gold_nodes = gold_tree.descendants
+        pred_forms = [n.form.lower() for n in pred_nodes]
+        gold_forms = [n.form.lower() for n in gold_nodes]
+        matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False)  # LCS word alignment
+        aligned = []
+        for diff in matcher.get_opcodes():
+            edit, pred_lo, pred_hi, gold_lo, gold_hi = diff
+            if edit == 'equal':
+                aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]))
+        align_map = {tree: gold_tree}  # roots are aligned by definition (input is sentence-aligned)
+        for p_node, g_node in aligned:
+            align_map[p_node] = g_node
+
+        count = Counter()
+        count['pred'] = len(pred_nodes)
+        count['gold'] = len(gold_nodes)
+        count['Words'] = len(aligned)
+        count['pred_clas'] = len([n for n in pred_nodes if n.udeprel not in CLAS_IGNORE])  # content words only
+        count['gold_clas'] = len([n for n in gold_nodes if n.udeprel not in CLAS_IGNORE])
+        count['alig_clas'] = len([n for _, n in aligned if n.udeprel not in CLAS_IGNORE])
+
+        for p_node, g_node in aligned:
+            for attr in ('UPOS', 'XPOS', 'Feats', 'Lemma'):
+                if p_node.get_attrs([attr.lower()]) == g_node.get_attrs([attr.lower()]):
+                    count[attr] += 1
+            if align_map.get(p_node.parent) == g_node.parent:
+                count['UAS'] += 1  # head attached correctly
+                if p_node.udeprel == g_node.udeprel:
+                    count['LAS'] += 1  # head and universal deprel both correct
+                    if g_node.udeprel not in CLAS_IGNORE:
+                        count['CLAS'] += 1  # LAS restricted to content words
+        self.total_count.update(count)
+
+        if self.print_raw:
+            scores = [str(count[s]) for s in ('pred', 'gold', 'Words', 'LAS')]
+            print(' '.join(scores))
+
+    def process_end(self):
+        if not self.print_results:
+            return
+
+        # Redirect the default filehandle to the file specified by self.files
+        self.before_process_document(None)
+
+        metrics = ('Words', 'UPOS', 'XPOS', 'Feats', 'Lemma', 'UAS', 'LAS', 'CLAS')
+        print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
+        print("-----------+-----------+-----------+-----------+-----------")
+        pred, gold = self.total_count['pred'], self.total_count['gold']
+        alig = self.total_count['Words']
+        for metric in metrics:
+            if metric == 'CLAS':
+                pred, gold = self.total_count['pred_clas'], self.total_count['gold_clas']  # CLAS uses content-word denominators
+                alig = self.total_count['alig_clas']
+            correct = self.total_count[metric]
+            precision = correct / pred if pred else 0
+            recall = correct / gold if gold else 0
+            alignacc = correct / alig if alig else 0
+            fscore = 2 * correct / (pred + gold) if pred + gold else 0  # equals harmonic mean of P and R
+            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{:10.2f}".format(
+                metric, 100 * precision, 100 * recall, 100 * fscore, 100 * alignacc))
+
+
+def prec_rec_f1(correct, pred, gold, alig=0):  # -> (precision, recall, F1, aligned-accuracy); zero-denominator safe
+    precision = correct / pred if pred else 0  # correct / predicted
+    recall = correct / gold if gold else 0  # correct / gold
+    alignacc = correct / alig if alig else 0  # correct / aligned words (0 when alig not given)
+    fscore = 2 * correct / (pred + gold) if pred + gold else 0  # harmonic mean of precision and recall
+    return precision, recall, fscore, alignacc
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dir_results", "-d", default="results", help="directory with results")
+    parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples")
+    parser.add_argument("--confidence", "-c", default=95, type=float, help="use x-percent confidence interval")
+    parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets")
+    parser.add_argument("--systems", "-s", default='all', help="comma-separated systems")
+    parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time")
+    args = parser.parse_args()
+    res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence
+    alpha = (1 - conf/100) / 2  # one-tail probability mass outside the confidence interval
+    index_lo = int(alpha * (resamples - 1))  # index of the CI lower bound in the sorted resamples
+    index_hi = resamples - 1 - index_lo  # index of the CI upper bound
+    index_mid = int(resamples / 2)  # median index
+    if args.systems == 'all':
+        systems = os.listdir(res_dir)
+    else:
+        systems = args.systems.split(',')
+    if args.tests == 'all':
+        tests = set()
+        for system in systems:
+            tests.update(os.listdir(res_dir + '/' + system))
+        tests = sorted(tests)
+    else:
+        tests = args.tests.split(',')
+    if args.randseed:
+        random.seed(args.randseed)
+    results = []
+
+    print('Loading...', file=sys.stderr)
+    for system in systems:
+        sys_results = []
+        results.append(sys_results)
+        for i_test, test in enumerate(tests):
+            filename = '/'.join((res_dir, system, test))
+            try:
+                with open(filename) as res_file:
+                    sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file])
+            except FileNotFoundError:
+                logging.warning(filename + ' not found')
+    samples = len(sys_results)  # NOTE(review): taken from the last system only; assumes all systems have equal sentence counts — confirm
+
+    print('Resampling...', file=sys.stderr)
+    boot_results = []
+    for i_resample in range(resamples):
+        print(i_resample + 1, file=sys.stderr, end='\r')
+        resample_results = []
+        boot_results.append(resample_results)
+        for i_system in range(len(systems)):
+            pred, gold, words, las = ([0] * len(tests) for _ in range(4))
+            for _ in range(samples):
+                i_test, pre, gol, wor, la_ = random.choice(results[i_system])
+                pred[i_test] += pre
+                gold[i_test] += gol
+                words[i_test] += wor
+                las[i_test] += la_
+            fscore_sum = 0
+            for i_test in range(len(tests)):
+                _prec, _rec, fscore, _aligacc = prec_rec_f1(las[i_test], pred[i_test], gold[i_test])
+                fscore_sum += fscore
+            resample_results.append(fscore_sum / len(tests))  # macro-average of LAS F1 over test sets
+    print('\n', file=sys.stderr)
+
+    sys_fscores = []
+    for i_system, system in enumerate(systems):
+        sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)])
+    final_results = []
+    sys_sys_wins = [[0] * len(systems) for x in range(len(systems))]
+    for i_system, system in enumerate(systems):
+        for j_system in range(i_system):
+            for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]):
+                if i > j:
+                    sys_sys_wins[i_system][j_system] += 1
+                elif i < j:
+                    sys_sys_wins[j_system][i_system] += 1
+        fscores = sorted(sys_fscores[i_system])
+        final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]])
+
+    sorted_systems = sorted(final_results, key=lambda x: -x[1])
+    for rank, sys_results in enumerate(sorted_systems):
+        i_system, f1_mid, f1_lo, f1_hi = sys_results
+        if rank < len(systems) - 1:
+            j_worse_sys = sorted_systems[rank + 1][0]
+            p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1)  # add-one smoothed paired-bootstrap p-value
+            p_str = " p=%.3f" % p_value
+        else:
+            p_value, p_str = 1, ""
+        print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" %
+              (rank + 1, systems[i_system],
+               100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str))
+        if p_value < (1 - conf/100):
+            print('-' * 60)
+
+
+if __name__ == "__main__":  # allow running this module as the standalone bootstrap-resampling script
+    main()
diff --git a/udapi/block/eval/conll18.py b/udapi/block/eval/conll18.py
new file mode 100644
index 00000000..22f42a42
--- /dev/null
+++ b/udapi/block/eval/conll18.py
@@ -0,0 +1,337 @@
+r"""Block&script eval.Conll18 for evaluating LAS,UAS,etc as in CoNLL2018 UD shared task.
+
+This is a reimplementation of the CoNLL2018 shared task official evaluation script,
+http://universaldependencies.org/conll18/evaluation.html
+
+The gold trees and predicted (system-output) trees need to be sentence-aligned
+e.g. using `util.ResegmentGold`.
+Unlike in `eval.Parsing`, the gold and predicted trees can have different tokenization.
+
+An example usage and output::
+
+ $ udapy read.Conllu zone=gold files=gold.conllu \
+ read.Conllu zone=pred files=pred.conllu ignore_sent_id=1 \
+ util.ResegmentGold \
+ eval.Conll18
+ Metric | Precision | Recall | F1 Score | AligndAcc
+ -----------+-----------+-----------+-----------+-----------
+ Words | 27.91 | 52.17 | 36.36 | 100.00
+ UPOS | 27.91 | 52.17 | 36.36 | 100.00
+ XPOS | 27.91 | 52.17 | 36.36 | 100.00
+ Feats | 27.91 | 52.17 | 36.36 | 100.00
+ Lemma | 27.91 | 52.17 | 36.36 | 100.00
+ UAS | 16.28 | 30.43 | 21.21 | 58.33
+ LAS | 16.28 | 30.43 | 21.21 | 58.33
+ CLAS | 10.34 | 16.67 | 12.77 | 37.50
+
+
+For evaluating multiple systems and testsets (as in CoNLL2018)
+stored in `systems/system_name/testset_name.conllu` you can use::
+
+ #!/bin/bash
+ SYSTEMS=`ls systems`
+ [[ $# -ne 0 ]] && SYSTEMS=$@
+ set -x
+ set -e
+ for sys in $SYSTEMS; do
+ mkdir -p results/$sys
+ for testset in `ls systems/$sys`; do
+ udapy read.Conllu zone=gold files=gold/$testset \
+ read.Conllu zone=pred files=systems/$sys/$testset ignore_sent_id=1 \
+ util.ResegmentGold \
+ eval.Conll18 print_results=0 print_raw=LAS \
+ > results/$sys/${testset%.conllu}
+ done
+ done
+ python3 `python3 -c 'import udapi.block.eval.conll18 as x; print(x.__file__)'` -r 100
+
+The last line executes this block as a script and computes bootstrap resampling with 100 resamples
+(default=1000, it is recommended to keep the default or higher value unless testing the interface).
+This prints the ranking and confidence intervals (95% by default) and also p-values for each
+pair of systems with neighboring ranks. If the difference in LAS is significant
+(according to a paired bootstrap test, by default if p < 0.05),
+a line is printed between the two systems.
+
+The output looks like::
+
+ 1. Stanford 76.17 ± 0.12 (76.06 .. 76.30) p=0.001
+ ------------------------------------------------------------
+ 2. C2L2 74.88 ± 0.12 (74.77 .. 75.01) p=0.001
+ ------------------------------------------------------------
+ 3. IMS 74.29 ± 0.13 (74.16 .. 74.43) p=0.001
+ ------------------------------------------------------------
+ 4. HIT-SCIR 71.99 ± 0.14 (71.84 .. 72.12) p=0.001
+ ------------------------------------------------------------
+ 5. LATTICE 70.81 ± 0.13 (70.67 .. 70.94) p=0.001
+ ------------------------------------------------------------
+ 6. NAIST-SATO 70.02 ± 0.13 (69.89 .. 70.16) p=0.001
+ ------------------------------------------------------------
+ 7. Koc-University 69.66 ± 0.13 (69.52 .. 69.79) p=0.002
+ ------------------------------------------------------------
+ 8. UFAL-UDPipe-1-2 69.36 ± 0.13 (69.22 .. 69.49) p=0.001
+ ------------------------------------------------------------
+ 9. UParse 68.75 ± 0.14 (68.62 .. 68.89) p=0.003
+ ------------------------------------------------------------
+ 10. Orange-Deskin 68.50 ± 0.13 (68.37 .. 68.62) p=0.448
+ 11. TurkuNLP 68.48 ± 0.14 (68.34 .. 68.62) p=0.029
+ ------------------------------------------------------------
+ 12. darc 68.29 ± 0.13 (68.16 .. 68.42) p=0.334
+ 13. conll18-baseline 68.25 ± 0.14 (68.11 .. 68.38) p=0.003
+ ------------------------------------------------------------
+ 14. MQuni 67.93 ± 0.13 (67.80 .. 68.06) p=0.062
+ 15. fbaml 67.78 ± 0.13 (67.65 .. 67.91) p=0.283
+ 16. LyS-FASTPARSE 67.73 ± 0.13 (67.59 .. 67.85) p=0.121
+ 17. LIMSI-LIPN 67.61 ± 0.14 (67.47 .. 67.75) p=0.445
+ 18. RACAI 67.60 ± 0.13 (67.46 .. 67.72) p=0.166
+ 19. IIT-Kharagpur 67.50 ± 0.14 (67.36 .. 67.64) p=0.447
+ 20. naistCL 67.49 ± 0.15 (67.34 .. 67.63)
+"""
+import argparse
+import difflib
+import logging
+import os
+import random
+import sys
+from collections import Counter
+from udapi.core.basewriter import BaseWriter
+
+CONTENT = {'nsubj', 'obj', 'iobj', 'csubj', 'ccomp', 'xcomp', 'obl', 'vocative', 'expl',
+           'dislocated', 'advcl', 'advmod', 'discourse', 'nmod', 'appos', 'nummod', 'acl',
+           'amod', 'conj', 'fixed', 'flat', 'compound', 'list', 'parataxis', 'orphan', 'goeswith',
+           'reparandum', 'root', 'dep'}  # content-word deprels counted in CLAS, MLAS and BLEX
+FUNCTIONAL = {'aux', 'cop', 'mark', 'det', 'clf', 'case', 'cc'}  # function-word deprels checked by MLAS
+UNIV_FEATS = {'PronType', 'NumType', 'Poss', 'Reflex', 'Foreign', 'Abbr', 'Gender', 'Animacy',
+              'Number', 'Case', 'Definite', 'Degree', 'VerbForm', 'Mood', 'Tense', 'Aspect',
+              'Voice', 'Evident', 'Polarity', 'Person', 'Polite'}  # universal features compared for UFeats/MLAS
+
+class Conll18(BaseWriter):
+    """Evaluate LAS, UAS, MLAS and BLEX."""
+
+    def __init__(self, gold_zone='gold', print_raw=False, print_results=True, print_counts=False,
+                 **kwargs):
+        """Args:
+        gold_zone - Which zone contains the gold-standard trees (the other zone contains "pred")?
+        print_raw - Print raw counts (pred, gold, aligned, correct) for each sentence.
+            This is useful for bootstrap resampling post-processing to get confidence intervals.
+            The parameter print_raw specifies a given metric
+            (UAS, LAS, MLAS, BLEX, UPOS, XPOS, Feats, Lemma) or is 0 (or False) by default.
+        print_results - Print a table with overall results after all documents are processed.
+        print_counts - Print counts of correct/gold/system instead of prec/rec/f1 for all metrics.
+        """
+        super().__init__(**kwargs)
+        self.gold_zone = gold_zone
+        self.total_count = Counter()  # accumulates per-sentence counts over the whole run
+        self.print_raw = print_raw
+        self.print_results = print_results
+        self.print_counts = print_counts
+
+    def _ufeats(self, feats):
+        return '|'.join(sorted(x for x in feats.split('|') if x.split('=', 1)[0] in UNIV_FEATS))  # keep universal feats only, order-normalized
+
+    def process_tree(self, tree):
+        gold_tree = tree.bundle.get_tree(self.gold_zone)
+        if tree == gold_tree:
+            return  # this is the gold tree itself, nothing to evaluate
+        pred_nodes = tree.descendants
+        gold_nodes = gold_tree.descendants
+        pred_forms = [n.form.lower() for n in pred_nodes]
+        gold_forms = [n.form.lower() for n in gold_nodes]
+        matcher = difflib.SequenceMatcher(None, pred_forms, gold_forms, autojunk=False)  # LCS word alignment
+        aligned = []
+        for diff in matcher.get_opcodes():
+            edit, pred_lo, pred_hi, gold_lo, gold_hi = diff
+            if edit == 'equal':
+                aligned.extend(zip(pred_nodes[pred_lo:pred_hi], gold_nodes[gold_lo:gold_hi]))
+        align_map, feats_match = {tree: gold_tree}, {}  # roots are aligned by definition (sentence-aligned input)
+        for p_node, g_node in aligned:
+            align_map[p_node] = g_node
+            feats_match[p_node] = self._ufeats(str(p_node.feats)) == self._ufeats(str(g_node.feats))
+
+        count = Counter()
+        count['pred'] = len(pred_nodes)
+        count['gold'] = len(gold_nodes)
+        count['Words'] = len(aligned)
+        count['pred_cont'] = len([n for n in pred_nodes if n.udeprel in CONTENT])  # content words only
+        count['gold_cont'] = len([n for n in gold_nodes if n.udeprel in CONTENT])
+        count['alig_cont'] = len([n for _, n in aligned if n.udeprel in CONTENT])
+
+        for p_node, g_node in aligned:
+            count['UPOS'] += 1 if p_node.upos == g_node.upos else 0
+            count['XPOS'] += 1 if p_node.xpos == g_node.xpos else 0
+            count['Lemmas'] += 1 if g_node.lemma == '_' or p_node.lemma == g_node.lemma else 0  # '_' gold lemma counts as correct
+            count['UFeats'] += 1 if feats_match[p_node] else 0
+            if feats_match[p_node] and p_node.upos == g_node.upos and p_node.xpos == g_node.xpos:
+                count['AllTags'] += 1
+            if align_map.get(p_node.parent) == g_node.parent and not p_node.misc['Rehanged']:  # NOTE(review): 'Rehanged' presumably set by util.ResegmentGold — confirm
+                count['UAS'] += 1
+                if p_node.udeprel == g_node.udeprel:
+                    count['LAS'] += 1
+                    if g_node.udeprel in CONTENT:
+                        count['CLAS'] += 1
+                        if g_node.lemma == '_' or g_node.lemma == p_node.lemma:
+                            count['BLEX'] += 1
+                        if self._morpho_match(p_node, g_node, align_map, feats_match):
+                            if not p_node.misc['FuncChildMissing']:
+                                count['MLAS'] += 1
+        self.total_count.update(count)
+
+        if self.print_raw:
+            if self.print_raw in {'CLAS', 'BLEX', 'MLAS'}:  # these metrics use content-word denominators
+                scores = [str(count[s]) for s in ('pred_cont', 'gold_cont', 'alig_cont',
+                                                  self.print_raw)]
+            else:
+                scores = [str(count[s]) for s in ('pred', 'gold', 'Words', self.print_raw)]
+            print(' '.join(scores))
+
+    def _morpho_match(self, p_node, g_node, align_map, feats_match):
+        if p_node.upos != g_node.upos or not feats_match[p_node]:
+            return False
+        p_children = [c for c in p_node.children if c.udeprel in FUNCTIONAL and not c.misc['Rehanged']]
+        g_children = [c for c in g_node.children if c.udeprel in FUNCTIONAL]
+        if len(p_children) != len(g_children):
+            return False
+        for p_child, g_child in zip(p_children, g_children):
+            if align_map.get(p_child) != g_child:
+                return False
+            if p_child.udeprel != g_child.udeprel:
+                return False
+            if p_child.upos != g_child.upos or not feats_match[p_child]:
+                return False
+        return True
+
+    def process_end(self):
+        if not self.print_results:
+            return
+
+        # Redirect the default filehandle to the file specified by self.files
+        self.before_process_document(None)
+
+        metrics = ('Words', 'UPOS', 'XPOS', 'UFeats', 'AllTags',
+                   'Lemmas', 'UAS', 'LAS', 'CLAS', 'MLAS', 'BLEX')
+        if self.print_counts:
+            print("Metric     |   Correct |      Gold | Predicted |   Aligned")
+        else:
+            print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
+        print("-----------+-----------+-----------+-----------+-----------")
+        for metric in metrics:
+            correct = self.total_count[metric]
+            if metric in {'CLAS', 'BLEX', 'MLAS'}:
+                pred, gold = self.total_count['pred_cont'], self.total_count['gold_cont']  # content-word denominators
+                alig = self.total_count['alig_cont']
+            else:
+                pred, gold = self.total_count['pred'], self.total_count['gold']
+                alig = self.total_count['Words']
+            if self.print_counts:
+                print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
+                    metric, correct, gold, pred, alig))
+            else:
+                precision, recall, fscore, alignacc = prec_rec_f1(correct, pred, gold, alig)
+                alignacc = "{:10.2f}".format(100 * alignacc) if metric != 'Words' else ""  # AligndAcc of Words would always be 100
+                print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
+                    metric, 100 * precision, 100 * recall, 100 * fscore, alignacc))
+
+
+def prec_rec_f1(correct, pred, gold, alig=0):  # -> (precision, recall, F1, aligned-accuracy); zero-denominator safe
+    precision = correct / pred if pred else 0  # correct / predicted
+    recall = correct / gold if gold else 0  # correct / gold
+    alignacc = correct / alig if alig else 0  # correct / aligned words (0 when alig not given)
+    fscore = 2 * correct / (pred + gold) if pred + gold else 0  # harmonic mean of precision and recall
+    return precision, recall, fscore, alignacc
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dir_results", "-d", default="results", help="directory with results")
+    parser.add_argument("--resamples", "-r", default=1000, type=int, help="how many resamples")
+    parser.add_argument("--confidence", "-c", default=95, type=float, help="use x-percent confidence interval")
+    parser.add_argument("--tests", "-t", default='all', help="comma-separated test sets")
+    parser.add_argument("--systems", "-s", default='all', help="comma-separated systems")
+    parser.add_argument("--randseed", default=0, type=int, help="random seed, default=sys time")
+    args = parser.parse_args()
+    res_dir, resamples, conf = args.dir_results, args.resamples, args.confidence
+    alpha = (1 - conf/100) / 2  # one-tail probability mass outside the confidence interval
+    index_lo = int(alpha * (resamples - 1))  # index of the CI lower bound in the sorted resamples
+    index_hi = resamples - 1 - index_lo  # index of the CI upper bound
+    index_mid = int(resamples / 2)  # median index
+    if args.systems == 'all':
+        systems = os.listdir(res_dir)
+    else:
+        systems = args.systems.split(',')
+    if args.tests == 'all':
+        tests = set()
+        for system in systems:
+            tests.update(os.listdir(res_dir + '/' + system))
+        tests = sorted(tests)
+    else:
+        tests = args.tests.split(',')
+    if args.randseed:
+        random.seed(args.randseed)
+    results = []
+
+    print('Loading...', file=sys.stderr)
+    for system in systems:
+        sys_results = []
+        results.append(sys_results)
+        for i_test, test in enumerate(tests):
+            filename = '/'.join((res_dir, system, test))
+            try:
+                with open(filename) as res_file:
+                    sys_results.extend([[i_test] + list(map(int, l.split())) for l in res_file])
+            except FileNotFoundError:
+                logging.warning(filename + ' not found')
+    samples = len(sys_results)  # NOTE(review): taken from the last system only; assumes all systems have equal sentence counts — confirm
+
+    print('Resampling...', file=sys.stderr)
+    boot_results = []
+    for i_resample in range(resamples):
+        print(i_resample + 1, file=sys.stderr, end='\r')
+        resample_results = []
+        boot_results.append(resample_results)
+        for i_system in range(len(systems)):
+            pred, gold, words, correct = ([0] * len(tests) for _ in range(4))
+            for _ in range(samples):
+                i_test, pre, gol, wor, corr = random.choice(results[i_system])
+                pred[i_test] += pre
+                gold[i_test] += gol
+                words[i_test] += wor
+                correct[i_test] += corr
+            fscore_sum = 0
+            for i_test in range(len(tests)):
+                _prec, _rec, fscore, _aligacc = prec_rec_f1(correct[i_test], pred[i_test], gold[i_test])
+                fscore_sum += fscore
+            resample_results.append(fscore_sum / len(tests))  # macro-average of the chosen metric's F1 over test sets
+    print('\n', file=sys.stderr)
+
+    sys_fscores = []
+    for i_system, system in enumerate(systems):
+        sys_fscores.append([boot_results[i_resample][i_system] for i_resample in range(resamples)])
+    final_results = []
+    sys_sys_wins = [[0] * len(systems) for x in range(len(systems))]
+    for i_system, system in enumerate(systems):
+        for j_system in range(i_system):
+            for i, j in zip(sys_fscores[i_system], sys_fscores[j_system]):
+                if i > j:
+                    sys_sys_wins[i_system][j_system] += 1
+                elif i < j:
+                    sys_sys_wins[j_system][i_system] += 1
+        fscores = sorted(sys_fscores[i_system])
+        final_results.append([i_system, fscores[index_mid], fscores[index_lo], fscores[index_hi]])
+
+    sorted_systems = sorted(final_results, key=lambda x: -x[1])
+    for rank, sys_results in enumerate(sorted_systems):
+        i_system, f1_mid, f1_lo, f1_hi = sys_results
+        if rank < len(systems) - 1:
+            j_worse_sys = sorted_systems[rank + 1][0]
+            p_value = (sys_sys_wins[j_worse_sys][i_system] + 1) / (resamples + 1)  # add-one smoothed paired-bootstrap p-value
+            p_str = " p=%.3f" % p_value
+        else:
+            p_value, p_str = 1, ""
+        print("%2d. %17s %5.2f ±%5.2f (%5.2f .. %5.2f)%s" %
+              (rank + 1, systems[i_system],
+               100 * f1_mid, 50 * (f1_hi - f1_lo), 100 * f1_lo, 100 * f1_hi, p_str))
+        if p_value < (1 - conf/100):
+            print('-' * 60)
+
+
+if __name__ == "__main__":  # allow running this module as the standalone bootstrap-resampling script
+    main()
diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py
new file mode 100644
index 00000000..e4889770
--- /dev/null
+++ b/udapi/block/eval/f1.py
@@ -0,0 +1,230 @@
+"""Block eval.F1 for evaluating differences between sentences with P/R/F1.
+
+``eval.F1 zones=en_pred gold_zone=en_gold details=0``
+prints something like::
+
+ predicted = 210
+ gold = 213
+ correct = 210
+ precision = 100.00%
+ recall = 98.59%
+ F1 = 99.29%
+
+``eval.F1 gold_zone=y attributes=form,upos focus='(?i:an?|the)_DET' details=4``
+prints something like::
+
+ === Details ===
+ token pred gold corr prec rec F1
+ the_DET 711 213 188 26.44% 88.26% 40.69%
+ The_DET 82 25 19 23.17% 76.00% 35.51%
+ a_DET 0 62 0 0.00% 0.00% 0.00%
+ an_DET 0 16 0 0.00% 0.00% 0.00%
+ === Totals ===
+ predicted = 793
+ gold = 319
+ correct = 207
+ precision = 26.10%
+ recall = 64.89%
+ F1 = 37.23%
+
+This block finds differences between nodes of trees in two zones
+and reports the overall precision, recall and F1.
+The two zones are "predicted" (on which this block is applied)
+and "gold" (which needs to be specified with parameter ``gold``).
+
+This block also reports the number of total nodes in the predicted zone
+and in the gold zone and the number of "correct" nodes,
+that is predicted nodes which are also in the gold zone.
+By default two nodes are considered "the same" if they have the same ``form``,
+but it is possible to check also for other nodes' attributes
+(with parameter ``attributes``).
+
+As usual::
+
+ precision = correct / predicted
+ recall = correct / gold
+ F1 = 2 * precision * recall / (precision + recall)
+
+The implementation is based on finding the longest common subsequence (LCS)
+between the nodes in the two trees.
+This means that the two zones do not need to be explicitly word-aligned.
+"""
+from collections import Counter
+import logging
+import re
+
+from udapi.core.basewriter import BaseWriter
+
+# pylint: disable=too-many-instance-attributes,invalid-name
+
+
+class F1(BaseWriter):
+ """Evaluate differences between sentences (in different zones) with P/R/F1.
+
+ Args:
+ zones: Which zone contains the "predicted" trees?
+ Make sure that you specify just one zone.
+ If you leave the default value "all" and the document contains more zones,
+ the results will be mixed, which is most likely not what you wanted.
+ Exception: If the document conaints just two zones (predicted and gold trees),
+ you can keep the default value "all" because this block
+ will skip comparison of the gold zone with itself.
+
+ gold_zone: Which zone contains the gold-standard trees?
+
+ attributes: comma separated list of attributes which should be checked
+ when deciding whether two nodes are equivalent in LCS
+
+ focus: Regular expresion constraining the tokens we are interested in.
+ If more attributes were specified in the ``attributes`` parameter,
+ their values are concatenated with underscore, so ``focus`` should reflect that
+ e.g. ``attributes=form,upos focus='(a|the)_DET'``.
+ For case-insensitive focus use e.g. ``focus='(?i)the'``
+ (which is equivalent to ``focus='[Tt][Hh][Ee]'``).
+
+ details: Print also detailed statistics for each token (matching the ``focus``).
+ The value of this parameter ``details`` specifies the number of tokens to include.
+ The tokens are sorted according to the sum of their *predicted* and *gold* counts.
+ """
+
+ def __init__(self, gold_zone, attributes='form', focus=None, details=4, **kwargs):
+ """Create the eval.F1 block object."""
+ super().__init__(**kwargs)
+ self.gold_zone = gold_zone
+ self.attrs = attributes.split(',')
+ self.focus = None
+ if focus is not None:
+ self.focus = re.compile(focus)
+ self.details = details
+ self.correct, self.pred, self.gold = 0, 0, 0
+ self.visited_zones = Counter()
+ if details:
+ self._common = Counter()
+ self._pred = Counter()
+ self._gold = Counter()
+ self._total = Counter()
+
+    def process_tree(self, tree):
+        """Compare one predicted tree with its gold counterpart and update the counts."""
+        gold_tree = tree.bundle.get_tree(self.gold_zone)
+        # Skip comparison of the gold zone with itself.
+        if tree == gold_tree:
+            return
+        self.visited_zones[tree.zone] += 1
+
+        # Each node is represented by its selected attribute values joined by underscores.
+        pred_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in tree.descendants]
+        gold_tokens = ['_'.join(n.get_attrs(self.attrs, undefs='None')) for n in gold_tree.descendants]
+
+        # lcs("abc", "acb") can be either "ab" or "ac".
+        # We want to prefer the LCS with the highest number of non-focused tokens.
+        # E.g. if focus="," then lcs("a,c", "ac,") should be "ac" and the comma should be evaluated
+        # as non-aligned, i.e. eval.F1 should return precision=recall=f1=0 for this sentence.
+        if self.focus is None:
+            common = find_lcs(pred_tokens, gold_tokens)
+        else:
+            # First align the non-focused tokens (they serve as anchors), then run
+            # LCS separately on the focused tokens between each pair of anchors.
+            nf_pred_tokens = [x for x in pred_tokens if not self.focus.fullmatch(x)]
+            nf_gold_tokens = [x for x in gold_tokens if not self.focus.fullmatch(x)]
+            nf_common = find_lcs(nf_pred_tokens, nf_gold_tokens)
+            i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], []
+            while i < len(pred_tokens) and j < len(gold_tokens):
+                # All anchors consumed: align the remaining tails directly.
+                if c == len(nf_common):
+                    common += find_lcs(pred_tokens[i:], gold_tokens[j:])
+                    break
+                # Collect the tokens preceding the next anchor in both sequences.
+                while nf_common[c] != pred_tokens[i]:
+                    un_pred.append(pred_tokens[i])
+                    i += 1
+                while nf_common[c] != gold_tokens[j]:
+                    un_gold.append(gold_tokens[j])
+                    j += 1
+                common += find_lcs(un_pred, un_gold)
+                un_pred, un_gold = [], []
+                # Skip over the run of matching anchors.
+                while c < len(nf_common) and nf_common[c] == pred_tokens[i] and nf_common[c] == gold_tokens[j]:
+                    i, j, c = i+1, j+1, c+1
+            # Only focused tokens enter the P/R/F1 statistics.
+            common = [x for x in common if self.focus.fullmatch(x)]
+            pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)]
+            gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)]
+
+        self.correct += len(common)
+        self.pred += len(pred_tokens)
+        self.gold += len(gold_tokens)
+
+        if self.details:
+            for x in common:
+                self._common[x] += 1
+            for x in gold_tokens:
+                self._gold[x] += 1
+                self._total[x] += 1
+            for x in pred_tokens:
+                self._pred[x] += 1
+                self._total[x] += 1
+
+    @property
+    def f1(self):
+        """Overall F1 score from the counts accumulated so far (0.0 when empty)."""
+        pred, gold = self.pred or 1, self.gold or 1  # prevent division by zero
+        precision = self.correct / pred
+        recall = self.correct / gold
+        return 2 * precision * recall / ((precision + recall) or 1)
+
+    def process_end(self):
+        """Print the overall statistics (and warn about suspicious zone usage)."""
+        # Redirect the default filehandle to the file specified by self.files
+        self.before_process_document(None)
+
+        if not self.visited_zones:
+            logging.warning('Block eval.F1 was not applied to any zone. '
+                            'Check the parameter zones=%s', self.zones)
+        elif len(self.visited_zones) > 1:
+            logging.warning('Block eval.F1 was applied to more than one zone %s. '
+                            'The results are mixed together. Check the parameter zones=%s',
+                            list(self.visited_zones.elements()), self.zones)
+        print('Comparing predicted trees (zone=%s) with gold trees (zone=%s), sentences=%d'
+              % (next(self.visited_zones.elements()), self.gold_zone,
+                 self.visited_zones.most_common(1)[0][1]))
+        if self.details:
+            print('=== Details ===')
+            print('%-10s %5s %5s %5s %6s %6s %6s'
+                  % ('token', 'pred', 'gold', 'corr', 'prec', 'rec', 'F1'))
+            # Most frequent tokens by combined pred+gold count, limited by self.details.
+            tokens = self._total.most_common(self.details)
+            for token, _ in tokens:
+                _prec = self._common[token] / (self._pred[token] or 1)
+                _rec = self._common[token] / (self._gold[token] or 1)
+                _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1)
+                print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%'
+                      % (token, self._pred[token], self._gold[token], self._common[token],
+                         100 * _prec, 100 * _rec, 100 * _f1))
+            print('=== Totals ===')
+
+        print("%-9s = %7d\n" * 3
+              % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='')
+        pred, gold = self.pred or 1, self.gold or 1  # prevent division by zero
+        precision = self.correct / pred
+        recall = self.correct / gold
+        f1 = 2 * precision * recall / ((precision + recall) or 1)
+        print("%-9s = %6.2f%%\n" * 3
+              % ('precision', 100 * precision, 'recall', 100 * recall, 'F1', 100 * f1), end='')
+
+
+# difflib.SequenceMatcher does not compute LCS, so let's implement it here
+def find_lcs(x, y):
+    """Find longest common subsequence of sequences x and y.
+
+    Returns a new list with the LCS elements (empty if either input is empty).
+    """
+    m, n = len(x), len(y)
+    if m == 0 or n == 0:
+        return []
+    elif x[0] == y[0]:
+        # Fast path: consume the shared prefix, then recurse on the remainders.
+        i = 1
+        while i < min(m, n) and x[i] == y[i]:
+            i += 1
+        return x[:i] + (find_lcs(x[i:], y[i:]) if i < min(m, n) else [])
+    else:
+        # Classic O(m*n) dynamic programming: fill the LCS-length table C,
+        # then backtrack from C[m][n] to recover the subsequence itself.
+        C = [[0] * (n + 1) for _ in range(m + 1)]
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j])
+        index = C[m][n]
+        lcs = [None] * index
+        while m > 0 and n > 0:
+            if x[m - 1] == y[n - 1]:
+                lcs[index - 1] = x[m - 1]
+                m, n, index = m - 1, n - 1, index - 1
+            elif C[m - 1][n] > C[m][n - 1]:
+                m -= 1
+            else:
+                n -= 1
+        return lcs
diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py
new file mode 100644
index 00000000..6923c1fb
--- /dev/null
+++ b/udapi/block/eval/parsing.py
@@ -0,0 +1,40 @@
+"""Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens."""
+from udapi.core.basewriter import BaseWriter
+
+
+class Parsing(BaseWriter):
+ """Evaluate labeled and unlabeled attachment score (LAS and UAS)."""
+
+ def __init__(self, gold_zone, **kwargs):
+ """Create the eval.Parsing block object."""
+ super().__init__(**kwargs)
+ self.gold_zone = gold_zone
+ self.correct_las, self.correct_ulas, self.correct_uas, self.total = 0, 0, 0, 0
+
+ def process_tree(self, tree):
+ gold_tree = tree.bundle.get_tree(self.gold_zone)
+ if tree == gold_tree:
+ return
+ pred_nodes = tree.descendants
+ gold_nodes = gold_tree.descendants
+ if len(pred_nodes) != len(gold_nodes):
+ raise ValueError('The sentences do not match (%d vs. %d nodes)'
+ % (len(pred_nodes), len(gold_nodes)))
+
+ self.total += len(pred_nodes)
+ for pred_node, gold_node in zip(pred_nodes, gold_nodes):
+ if pred_node.parent.ord == gold_node.parent.ord:
+ self.correct_uas += 1
+ if pred_node.deprel == gold_node.deprel:
+ self.correct_las += 1
+ if pred_node.udeprel == gold_node.udeprel:
+ self.correct_ulas += 1
+
+
+ def process_end(self):
+ # Redirect the default filehandle to the file specified by self.files
+ self.before_process_document(None)
+ print('nodes = %d' % self.total)
+ print('UAS = %6.2f' % (100 * self.correct_uas / self.total))
+ print('LAS (deprel) = %6.2f' % (100 * self.correct_las / self.total))
+ print('LAS (udeprel) = %6.2f' % (100 * self.correct_ulas / self.total))
diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py
new file mode 100644
index 00000000..7d362c7f
--- /dev/null
+++ b/udapi/block/msf/case.py
@@ -0,0 +1,448 @@
+"""
+Morphosyntactic features (UniDive):
+Derive a MS Case feature from morphological case and adposition.
+"""
+from udapi.core.block import Block
+import logging
+
+class Case(Block):
+
+ adposmap = {
+ 'v+Loc': 'Ine',
+ 'uvnitř+Gen': 'Ine',
+ 'uvnitř+': 'Ine',
+ 'mezi_uvnitř+Gen': 'Ine', # annotation error?
+ 'uprostřed+Gen': 'Ces',
+ 'mezi+Ins': 'Int',
+ 'mezi+Nom': 'Int', # annotation error
+ 'mezi+Voc': 'Int', # annotation error
+ 'vně+Gen': 'Ext',
+ 'stranou+Gen': 'Ext',
+ 'stranou+Dat': 'Ext',
+ 'na+Loc': 'Ade',
+ 'na_mimo+Loc': 'Ade', # annotation error?
+ 'na_úroveň+Gen': 'Ade',
+ 'na_úroveň+': 'Ade',
+ 'v_proces+Gen': 'Ade', # ???
+ 'v_rámec+Gen': 'Ade', # ???
+ 'v_rámec+': 'Ade', # ???
+ 'v_řada+Gen': 'Ade', # ???
+ 'z_oblast+Gen': 'Ade', # ???
+ 'vedle+Gen': 'Apu',
+ 'u+Gen': 'Chz',
+ 'kolem+Gen': 'Cir',
+ 'kol+Gen': 'Cir',
+ 'dokola+Gen': 'Cir',
+ 'okolo+Gen': 'Cir',
+ 'v_oblast+Gen': 'Cir',
+ 'v_oblast+': 'Cir',
+ 'blízko+Dat': 'Prx',
+ 'blízko+Gen': 'Prx',
+ 'blízko+': 'Prx',
+ 'nedaleko+Gen': 'Prx',
+ 'daleko+Gen': 'Prx', # lemma of 'nedaleko'
+ 'poblíž+Gen': 'Prx',
+ 'daleko_od+Gen': 'Dst',
+ 'nad+Ins': 'Sup',
+ 'pod+Ins': 'Sub',
+ 'vespod+Gen': 'Sub',
+ 'před+Ins': 'Ant',
+ 'vpředu+Gen': 'Ant',
+ 'na_čelo+Gen': 'Ant',
+ 'v_čelo+Gen': 'Ant',
+ 'v_čelo+': 'Ant',
+ 'za+Ins': 'Pst',
+ 'naproti+Dat': 'Opp',
+ 'od+Gen': 'Abl',
+ 'od+Dat': 'Abl', # annotation error
+ 'směr_od+Gen': 'Abl',
+ 'z_strana+Gen': 'Abl',
+ 'z_strana+': 'Abl',
+ 'z+Gen': 'Ela',
+ 'z+Nom': 'Ela', # annotation error
+ 'z+Dat': 'Ela', # annotation error
+ 'zevnitř+Gen': 'Ela',
+ 'zprostřed+Gen': 'Cne',
+ 's+Gen': 'Del',
+ 'zpod+Gen': 'Sbe',
+ 'zpoza+Gen': 'Pse',
+ 'po+Loc': 'Per',
+ 'cesta+Gen': 'Per',
+ 'cesta+Ins': 'Per',
+ 'napříč+Gen': 'Crs',
+ 'napříč+Ins': 'Crs',
+ 'podél+Gen': 'Lng',
+ 'skrz+Acc': 'Inx',
+ 'přes+Acc': 'Spx',
+ 'přes+Nom': 'Spx', # annotation error
+ 'ob+Acc': 'Cix',
+ 'po+Acc': 'Ter',
+ 'po+Nom': 'Ter', # annotation error
+ 'po+Gen': 'Ter', # annotation error
+ 'do+Gen': 'Ill',
+ 'do+Acc': 'Ill', # annotation error
+ 'do_/+Gen': 'Ill',
+ 'dovnitř+Gen': 'Ill',
+ 'doprostřed+Gen': 'Cnl',
+ 'mezi+Acc': 'Itl',
+ 'na+Acc': 'All',
+ 'na+Nom': 'All', # annotation error
+ 'na+Gen': 'All', # annotation error
+ 'k+Dat': 'Apl',
+ 'k+Nom': 'Apl', # annotation error
+ 'vstříc+Dat': 'Apl',
+ 'do_oblast+Gen': 'Apl',
+ 'směr+': 'Apl',
+ 'směr_k+Dat': 'Apl',
+ 'směr_k+': 'Apl',
+ 'směr_na+Acc': 'Apl',
+ 'v_směr_k+Dat': 'Apl',
+ 'nad+Acc': 'Spl',
+ 'nad+Nom': 'Spl', # annotation error
+ 'pod+Acc': 'Sbl',
+ 'před+Acc': 'Anl',
+ 'před+Gen': 'Anl', # annotation error
+ 'za+Acc': 'Psl',
+ 'dík_za+Acc': 'Psl', # annotation error?
+ 'dokud': 'Tan',
+ 'nežli': 'Tan',
+ 'v+Acc': 'Tem',
+ 'v+Nom': 'Tem', # annotation error
+ 'v+Gen': 'Tem', # annotation error
+ 'při_příležitost+Gen': 'Tem',
+ 'současně_s+Ins': 'Tem',
+ 'u_příležitost+Gen': 'Tem',
+ 'v_období+Gen': 'Tpx',
+ 'počátkem+Gen': 'Din',
+ 'počátek+Gen': 'Din',
+ 'počínat+Ins': 'Din',
+ 'počínat+': 'Din',
+ 'začátkem+Gen': 'Din',
+ 'začátek+Gen': 'Din',
+ 'během+Gen': 'Dur',
+ 'postupem+Gen': 'Dur',
+ 'postup+Gen': 'Dur',
+ 'při+Loc': 'Dur',
+ 'v_průběh+Gen': 'Dur',
+ 'za+Gen': 'Der',
+ 'koncem+Gen': 'Dtr',
+ 'konec+Gen': 'Dtr',
+ 'k_konec+Gen': 'Dtr',
+ 'končit+Ins': 'Dtr',
+ 'závěrem+Gen': 'Dtr',
+ 'závěr+Gen': 'Dtr',
+ 'na_závěr+Gen': 'Dtr',
+ 'v_závěr+Gen': 'Dtr',
+ 'jakmile': 'Tps',
+ 'jen_co': 'Tps',
+ 'před_po+Loc': 'Tps',
+ 'počínaje+Ins': 'Teg',
+ 'jménem+Nom': 'Atr',
+ 'jméno+Nom': 'Atr',
+ 'zdali': 'Atr',
+ 'že': 'Atr',
+ 'z_řada+Gen': 'Gen',
+ 's+Ins': 'Com',
+ 's+Nom': 'Com', # annotation error
+ 'spolu_s+Ins': 'Com',
+ 'spolu_s+': 'Com',
+ 'společně_s+Ins': 'Com',
+ 'společně_s+': 'Com',
+ 'v_čelo_s+Ins': 'Com',
+ 'v_spolupráce_s+Ins': 'Com',
+ 'bez+Gen': 'Abe',
+ 'včetně+Gen': 'Inc',
+ 'nad_rámec+Gen': 'Add',
+ 'kromě+Gen': 'Exc',
+ 'krom+Gen': 'Exc',
+ 'mimo+Acc': 'Exc',
+ 'mimo+Gen': 'Exc',
+ 'vyjma+Gen': 'Exc',
+ 'až_na+Acc': 'Exc',
+ 's_výjimka+Gen': 'Exc',
+ 's_výjimka+': 'Exc',
+ 'místo+Gen': 'Sbs',
+ 'místo+Ins': 'Sbs', # něčím místo něčím jiným
+ 'místo+Loc': 'Sbs', # annotation error
+ 'místo_do+Gen': 'Sbs',
+ 'místo_k+Dat': 'Sbs',
+ 'místo_na+Acc': 'Sbs',
+ 'místo_na+': 'Sbs',
+ 'místo_po+Loc': 'Sbs',
+ 'místo_v+Acc': 'Sbs',
+ 'místo_v+': 'Sbs',
+ 'místo_za+Acc': 'Sbs',
+ 'namísto+Gen': 'Sbs',
+ 'namísto_do+Gen': 'Sbs',
+ 'v_zastoupení+Gen': 'Sbs',
+ 'výměna_za+Acc': 'Sbs',
+ 'jako': 'Ess',
+ 'jako+': 'Ess',
+ 'jako+Nom': 'Ess',
+ 'jako+Acc': 'Ess',
+ 'jako+Dat': 'Ess',
+ 'jako_u+Gen': 'Ess',
+ 'jako_v+Loc': 'Ess',
+ 'formou+Gen': 'Ess',
+ 'forma+Gen': 'Ess',
+ 'v_forma+Gen': 'Ess',
+ 'v_podoba+Gen': 'Ess',
+ 'v_podoba+': 'Ess',
+ 'shoda+Gen': 'Equ',
+ 'v_shoda_s+Ins': 'Equ',
+ 'do_soulad_s+Ins': 'Sem',
+ 'na_způsob+Gen': 'Sem',
+ 'po_vzor+Gen': 'Sem',
+ 'úměrně+Dat': 'Sem',
+ 'úměrně_k+Dat': 'Sem',
+ 'úměrně_s+Ins': 'Sem',
+ 'v_analogie_s+Ins': 'Sem',
+ 'v_duch+Gen': 'Sem',
+ 'v_smysl+Gen': 'Sem',
+ 'oproti+Dat': 'Dsm',
+ 'na_rozdíl_od+Gen': 'Dsm',
+ 'na_rozdíl_od+': 'Dsm',
+ 'než': 'Cmp',
+ 'než+Nom': 'Cmp',
+ 'než+Gen': 'Cmp',
+ 'než+Acc': 'Cmp',
+ 'než_nad+Ins': 'Cmp',
+ 'než_v+Acc': 'Cmp',
+ 'než_v+Loc': 'Cmp',
+ 'v_poměr_k+Dat': 'Cmp',
+ 'v_poměr_k+': 'Cmp',
+ 'v_porovnání_k+Dat': 'Cmp',
+ 'v_porovnání_s+Ins': 'Cmp',
+ 'v_porovnání_s+': 'Cmp',
+ 'v_srovnání_s+Ins': 'Cmp',
+ 'v_srovnání_s+': 'Cmp',
+ 'o+Acc': 'Dif',
+ 'o+Nom': 'Dif', # annotation error
+ 'o+Gen': 'Dif', # annotation error
+ 'o+Dat': 'Dif', # annotation error
+ 'o_o+Acc': 'Dif', # annotation error
+ 'kdežto': 'Cmt',
+ 'přičemž': 'Cmt',
+ 'zatímco': 'Cmt',
+ 'díky+Dat': 'Cau',
+ 'dík+Dat': 'Cau',
+ 'kvůli+Dat': 'Cau',
+ 'vinou+Gen': 'Cau',
+ 'vlivem+Gen': 'Cau',
+ 'vliv+Gen': 'Cau',
+ 'vliv+': 'Cau',
+ 'vinou+Gen': 'Cau',
+ 'vina+Gen': 'Cau',
+ 'zásluhou+Gen': 'Cau',
+ 'zásluha+Gen': 'Cau',
+ 'z_důvod+Gen': 'Cau',
+ 'v_důsledek+Gen': 'Cau',
+ 'jelikož': 'Cau',
+ 'ježto': 'Cau',
+ 'poněvadž': 'Cau',
+ 'protože': 'Cau',
+ 'takže': 'Cau',
+ 'následek+Gen': 'Cau',
+ 'aby': 'Pur',
+ 'jméno+Gen': 'Pur',
+ 'pro_případ+Gen': 'Pur',
+ 'v_jméno+Gen': 'Pur',
+ 'v_zájem+Gen': 'Pur',
+ 'za_účel+Gen': 'Pur',
+ 'na_základ+Gen': 'Cns',
+ 'pod_vliv+Gen': 'Cns',
+ 's_ohled_na+Acc': 'Cns',
+ 's_přihlédnutí_k+Dat': 'Cns',
+ 's_přihlédnutí_na+Acc': 'Cns',
+ 'v_souvislost_s+Ins': 'Cns',
+ 'v_souvislost_s+': 'Cns',
+ 'v_světlo+Gen': 'Cns',
+ 'vzhledem_k+Dat': 'Cns',
+ 'v_soulad_s+Ins': 'Cns',
+ 'v_soulad_s+': 'Cns',
+ 'z_titul+Gen': 'Cns',
+ 'ať': 'Ign',
+ 'bez_ohled_na+Acc': 'Ign',
+ 'nehledě_k+Dat': 'Ign',
+ 'nehledě_na+Acc': 'Ign',
+ 'navzdory+Dat': 'Ccs',
+ 'vzdor+Dat': 'Ccs',
+ 'v_rozpor_s+Ins': 'Ccs',
+ 'ač': 'Ccs',
+ 'ačkoli': 'Ccs',
+ 'byť': 'Ccs',
+ 'přestože': 'Ccs',
+ 'třebaže': 'Ccs',
+ 'jestli': 'Cnd',
+ 'jestliže': 'Cnd',
+ 'ledaže': 'Cnd',
+ 'li': 'Cnd',
+ 'pakliže': 'Cnd',
+ 'pokud': 'Cnd',
+ 'pokud+Nom': 'Cnd',
+ 'zda': 'Cnd',
+ 'v_případ+Gen': 'Cnd',
+ 'v_případ+': 'Cnd',
+ 'v_závislost_na+Loc': 'Cnd',
+ 'v_závislost_s+Ins': 'Cnd',
+ 'o+Loc': 'The',
+ 'ohledně+Gen': 'The',
+ 'stran+Gen': 'The',
+ 'co_do+Gen': 'The',
+ 'na_téma+Gen': 'The',
+ 'na_téma+Nom': 'The',
+ 'na_téma+': 'The',
+ 'na_úsek+Gen': 'The',
+ 'po_stránka+Gen': 'The',
+ 'v_obor+Gen': 'The',
+ 'v_otázka+Gen': 'The',
+ 'v_spojení_s+Ins': 'The',
+ 'v_věc+Gen': 'The',
+ 'v_vztah_k+Dat': 'The',
+ 'v_vztah_k+': 'The',
+ 'v_záležitost+Gen': 'The',
+ 'v_znamení+Gen': 'The',
+ 'z_hledisko+Gen': 'The',
+ 'z_hledisko+': 'The',
+ 'podle+Gen': 'Quo',
+ 'dle+Gen': 'Quo',
+ 'pomocí+Gen': 'Ins',
+ 's_pomoc+Gen': 'Ins',
+ 'prostřednictvím+Gen': 'Ins',
+ 'prostřednictví+Gen': 'Ins',
+ 'prostřednictví+Ins': 'Ins', # annotation error
+ 'prostřednictví+': 'Ins',
+ 'za_pomoc+Gen': 'Ins',
+ 'pro+Acc': 'Ben',
+ 'pro+Nom': 'Ben', # annotation error
+ 'pro+Gen': 'Ben', # annotation error
+ 'pro+Ins': 'Ben', # annotation error
+ 'napospas+Dat': 'Ben',
+ 'k_prospěch+Gen': 'Ben',
+ 'na_úkor+Gen': 'Ben',
+ 'na_vrub+Gen': 'Ben',
+ 'v_prospěch+Gen': 'Ben',
+ 'v_neprospěch+Gen': 'Ben',
+ 'v_služba+Gen': 'Ben',
+ 'proti+Dat': 'Adv',
+ 'proti+Gen': 'Adv',
+ 'kontra+Nom': 'Adv',
+ 'versus+Nom': 'Adv',
+ 'vůči+Dat': 'Adv',
+ # subordinators
+ 'dokud': 'Tan',
+ 'nežli': 'Tan',
+ 'jakmile': 'Tps',
+ 'jen_co': 'Tps',
+ 'zdali': 'Atr',
+ 'že': 'Atr',
+ 'jako': 'Ess',
+ 'než': 'Cmp',
+ 'kdežto': 'Cmt',
+ 'přičemž': 'Cmt',
+ 'zatímco': 'Cmt',
+ 'jelikož': 'Cau',
+ 'ježto': 'Cau',
+ 'poněvadž': 'Cau',
+ 'protože': 'Cau',
+ 'takže': 'Cau',
+ 'aby': 'Pur',
+ 'ať': 'Ign',
+ 'ač': 'Ccs',
+ 'ačkoli': 'Ccs',
+ 'byť': 'Ccs',
+ 'přestože': 'Ccs',
+ 'třebaže': 'Ccs',
+ 'jestli': 'Cnd',
+ 'jestliže': 'Cnd',
+ 'ledaže': 'Cnd',
+ 'li': 'Cnd',
+ 'pakliže': 'Cnd',
+ 'pokud': 'Cnd',
+ 'zda': 'Cnd',
+ # coordinators
+ 'a': 'Conj',
+ 'i': 'Conj',
+ 'ani': 'Nnor',
+ 'nebo': 'Disj',
+ 'či': 'Disj',
+ 'ale': 'Advs',
+ 'avšak': 'Advs',
+ 'však': 'Advs',
+ 'nýbrž': 'Advs',
+ 'neboť': 'Reas',
+ 'tedy': 'Cnsq',
+ 'tak': 'Cnsq'
+ }
+
+ def process_node(self, node):
+ """
+ Derives a case value from preposition and morphological case. Stores it
+ as MSFCase in MISC.
+ """
+ # Do not do anything for function words.
+ # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes)
+ # because their Case is only agreement feature inherited from the head noun.
+ if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']:
+ node.misc['MSFFunc'] = 'Yes'
+ return
+ elif node.udeprel in ['det', 'amod']:
+ node.misc['MSFFunc'] = 'No'
+ return
+ else:
+ node.misc['MSFFunc'] = 'No'
+ # Get all case markers (adpositions) attached to the current node.
+ adpositions = []
+ for c in node.children:
+ if c.udeprel == 'case':
+ lemma = c.lemma
+ # If it has outgoing 'fixed' relations, it is a multiword adposition.
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed']
+ if fixedchildren:
+ lemma += '_' + '_'.join(fixedchildren)
+ adpositions.append(lemma)
+ # We assume that all features were copied from FEATS to MISC in mwe.MsfInit.
+ # They may have been further processed there, so we take the input from there.
+ msfcase = node.misc['MSFCase']
+ if adpositions:
+ adpostring = '_'.join(adpositions)
+ caseadpostring = adpostring + '+' + msfcase
+ if caseadpostring in self.adposmap:
+ msfcase = self.adposmap[caseadpostring]
+ else:
+ logging.warn(f"No Case value found for '{caseadpostring}'.")
+ msfcase = caseadpostring
+ # Omer wants to collect cases from both adpositions and subordinators
+ # but we will consider subordinators only if we do not have any case
+ # from morphology or adpositions.
+ if not msfcase:
+ subordinators = []
+ for c in node.children:
+ if c.udeprel == 'mark':
+ lemma = c.lemma
+ # If it has outgoing 'fixed' relations, it is a multiword adposition.
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed']
+ if fixedchildren:
+ lemma += '_' + '_'.join(fixedchildren)
+ subordinators.append(lemma)
+ if subordinators:
+ subordstring = '_'.join(subordinators)
+ if subordstring in self.adposmap:
+ msfcase = self.adposmap[subordstring]
+ # To lump coordinators with all the above makes even less sense but for
+ # the moment we do it.
+ if not msfcase:
+ coordinators = []
+ for c in node.children:
+ if c.udeprel == 'cc':
+ lemma = c.lemma
+ # If it has outgoing 'fixed' relations, it is a multiword adposition.
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed']
+ if fixedchildren:
+ lemma += '_' + '_'.join(fixedchildren)
+ coordinators.append(lemma)
+ if coordinators:
+ coordstring = '_'.join(coordinators)
+ if coordstring in self.adposmap:
+ msfcase = self.adposmap[coordstring]
+ node.misc['MSFCase'] = msfcase
diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py
new file mode 100644
index 00000000..fbdf73e5
--- /dev/null
+++ b/udapi/block/msf/createabstract.py
@@ -0,0 +1,45 @@
+"""
+Morphosyntactic features (UniDive):
+Create abstract nodes representing dropped arguments of predicates (if verbal
+morphology signals that the subject is third person singular, and there is no
+subject node, create an abstract node and copy the features there).
+"""
+from udapi.core.block import Block
+import re
+
+class CreateAbstract(Block):
+    """Create abstract nodes for dropped subjects of finite predicates."""
+
+    def process_node(self, node):
+        """
+        If a node has MSFVerbForm=Fin and at least one of the agreement features
+        MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these
+        features characterize the subject (this block is not suitable for languages
+        with polypersonal agreement). Check that the subject is present. If not,
+        create an abstract node to represent it.
+        """
+        if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]):
+            # Current node is a finite predicate. Does it have a subject? If not, create an abstract one.
+            if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]):
+                # There could already be an abstract subject. We have to look for it in the enhanced graph.
+                if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]):
+                    # Create an abstract subject: a personal pronoun in the nominative.
+                    subject = node.create_empty_child('nsubj')
+                    subject.upos = 'PRON'
+                    subject.feats['PronType'] = 'Prs'
+                    subject.misc['MSFPronType'] = 'Prs'
+                    subject.feats['Case'] = 'Nom'
+                    subject.misc['MSFCase'] = 'Nom'
+                    # Move the verb's agreement features onto the new subject.
+                    for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']:
+                        msf = 'MSF' + f
+                        if node.misc[msf]:
+                            subject.feats[f] = node.misc[msf]
+                            subject.misc[msf] = node.misc[msf]
+                    subject.misc['MSFFunc'] = 'No'
+            # Regardless of whether it had a subject or not, the agreement features
+            # should be removed from the verb.
+            ###!!! We also may want to check if the pre-existing subject has all the features.
+            node.misc['MSFNumber'] = ''
+            node.misc['MSFPerson'] = ''
+            node.misc['MSFGender'] = ''
+            node.misc['MSFAnimacy'] = ''
+            node.misc['MSFPolite'] = ''
diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py
new file mode 100644
index 00000000..ceca12af
--- /dev/null
+++ b/udapi/block/msf/init.py
@@ -0,0 +1,53 @@
+"""
+Morphosyntactic features (UniDive):
+Initialization. Copies features from FEATS as MSF* attributes to MISC.
+"""
+from udapi.core.block import Block
+import re
+
+class Init(Block):
+    """Copy selected features from FEATS to MISC as MSF* attributes."""
+
+
+    def process_node(self, node):
+        """
+        For every feature in FEATS, creates its MSF* counterpart in MISC.
+        """
+        for f in node.feats:
+            # Only selected features will be copied. Certain features are not
+            # interesting for the morphosyntactic annotation.
+            if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']:
+                node.misc['MSF'+f] = node.feats[f]
+        # We are particularly interested in the Case feature but some nominals
+        # lack it (e.g. acronyms or numbers). If there is a preposition, it may
+        # indicate the expected case of the nominal.
+        if not node.feats['Case']:
+            # Not any 'case' dependent is helpful. Here we really need single-word
+            # adposition.
+            adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP']
+            if len(adpositions) == 1:
+                fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed']
+                if not fixed and adpositions[0].feats['Case']:
+                    node.misc['MSFCase'] = adpositions[0].feats['Case']
+        # If we did not find a preposition to help us, we may be able to read
+        # the case off an adjectival modifier or determiner.
+        if not node.misc['MSFCase']:
+            modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']]
+            if modifiers:
+                node.misc['MSFCase'] = modifiers[0].feats['Case']
+        # Finally, if the above did not help, we may guess the case from the deprel of the node itself.
+        if not node.misc['MSFCase']:
+            if node.udeprel == 'nsubj':
+                node.misc['MSFCase'] = 'Nom'
+            elif node.udeprel == 'obj':
+                node.misc['MSFCase'] = 'Acc'
+        # If the node contains Phrase features in MISC (periphrastic verb forms
+        # detected by Lenka's code), replace the MS features with them.
+        phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)]
+        for pf in phrasefeatures:
+            msf = pf
+            if msf == 'PhraseForm':
+                msf = 'MSFVerbForm'
+            else:
+                msf = re.sub(r"Phrase", 'MSF', pf)
+            node.misc[msf] = node.misc[pf]
+            # Setting the value to '' presumably removes the Phrase* attribute — udapi MISC convention.
+            node.misc[pf] = ''
diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py
new file mode 100644
index 00000000..22f68c9d
--- /dev/null
+++ b/udapi/block/msf/numphrase.py
@@ -0,0 +1,36 @@
+"""
+Morphosyntactic features (UniDive):
+Case in Number Phrases like 'pět mužů' (five men) in Czech.
+"""
+from udapi.core.block import Block
+
+class NumPhrase(Block):
+    """Fix phrase-level Case in numeral phrases like Czech 'pět mužů' (five men)."""
+
+
+    def process_node(self, node):
+        """
+        Nouns with a 'nummod:gov' dependent are morphologically in genitive,
+        but the case of the whole phrase (number + counted noun) is different,
+        probably nominative or accusative.
+        """
+        quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']]
+        current_case = node.misc['MSFCase']
+        if (current_case == 'Gen' or current_case == '') and quantifiers:
+            quantifier_case = quantifiers[0].misc['MSFCase']
+            # The quantifier may lack the case feature (e.g. numbers expressed by digits)
+            # but we may be able to guess it from a preposition or other factors.
+            if quantifier_case == '':
+                # Not any 'case' dependent is helpful. Here we really need single-word
+                # adposition.
+                adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP']
+                if len(adpositions) == 1:
+                    fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed']
+                    if not fixed and adpositions[0].feats['Case']:
+                        quantifier_case = adpositions[0].feats['Case']
+            # Finally, if the above did not help, we may guess the case from the deprel of the node itself.
+            if quantifier_case == '':
+                if node.udeprel == 'nsubj':
+                    quantifier_case = 'Nom'
+                elif node.udeprel == 'obj':
+                    quantifier_case = 'Acc'
+            node.misc['MSFCase'] = quantifier_case
diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py
new file mode 100644
index 00000000..cf5a8f81
--- /dev/null
+++ b/udapi/block/msf/phrase.py
@@ -0,0 +1,168 @@
+"""
+Morphosyntactic features (UniDive):
+An abstract block as a base for derivation of blocks that discover periphrastic
+verb forms and save them as Phrase features in MISC. This block provides the
+methods that save the features in MISC. It is based on the Writer module by
+Lenka Krippnerová.
+"""
+from udapi.core.block import Block
+import logging
+
+class Phrase(Block):
+
+    def __init__(self, feature_prefix='CW', **kwargs):
+        """
+        Parameters:
+        feature_prefix (string) - The prefix of phrase features (e. g. 'CW', 'Phrase'), default is 'CW'
+        """
+        super().__init__(**kwargs)
+        self.feature_prefix = feature_prefix
+
+        # Maps category names (as used by write_node_info keyword arguments)
+        # to the MISC attribute names under which the values are stored.
+        self.dictionary = {
+            'person': f'{feature_prefix}Person',
+            'number': f'{feature_prefix}Number',
+            'mood': f'{feature_prefix}Mood',
+            'tense': f'{feature_prefix}Tense',
+            'voice': f'{feature_prefix}Voice',
+            'aspect':f'{feature_prefix}Aspect',
+            'form': f'{feature_prefix}Form',
+            'reflex': f'{feature_prefix}Reflex',
+            'polarity': f'{feature_prefix}Polarity',
+            'gender': f'{feature_prefix}Gender',
+            'animacy': f'{feature_prefix}Animacy',
+            'ords': feature_prefix,
+            'expl': f'{feature_prefix}Expl',
+            'analytic': 'Analytic',  # note: stored without the prefix
+        }
+
+        # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation
+        # we do not want to include these negative particles in the phrase; these are expressions like "never", etc.
+        self.negation_fixed = {
+            # Belarusian
+            'ні' : ['раз'],
+            'ня' : ['толькі'],
+
+            # Upper Sorbian
+            'nic' : ['naposledku'],
+
+            # Polish
+            'nie' : ['mało'],
+
+            # Pomak
+            'néma' : ['kak'],
+
+            # Slovenian
+            'ne' : ['le'],
+
+            # Russian and Old East Slavic
+            'не' : ['то', 'токмо'],
+            'ни' : ['в', 'раз', 'шатко'],
+            'нет' : ['нет']
+        }
+
+    def process_node(self, node):
+        """
+        Override this in a derived class!
+        """
+        # NOTE(review): logging.fatal only logs at CRITICAL level; it does not
+        # abort execution — confirm whether raising NotImplementedError was intended.
+        logging.fatal('process_node() not implemented.')
+
+
+
+ def write_node_info(self, node,
+ tense = None,
+ person = None,
+ number = None,
+ mood = None,
+ voice = None,
+ form = None,
+ reflex = None,
+ polarity = None,
+ ords = None,
+ gender = None,
+ animacy = None,
+ aspect = None,
+ expl=None,
+ analytic=None):
+ arguments = locals()
+ del arguments['self'] # delete self and node from arguments,
+ del arguments['node'] # we want only grammatical categories
+ for key,val in arguments.items():
+ if val != None:
+ node.misc[self.dictionary[key]] = val
+
+    def has_fixed_children(self, node):
+        """
+        Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed.
+
+        NOTE(review): only the first fixed child is inspected — confirm this is
+        sufficient for markers with multiple fixed children.
+        """
+        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
+
+        if fixed_children:
+            if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []):
+                return True
+        return False
+
+    def get_polarity(self, nodes):
+        """
+        Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes.
+        Returns an empty string if there are zero or more than one such nodes.
+        """
+        neg_count = 0
+        for node in nodes:
+            if node.feats['Polarity'] == 'Neg':
+                neg_count += 1
+
+        if neg_count == 1:
+            return 'Neg'
+
+        # neg_count is zero or more than one; in either case we return an empty
+        # string so that the PhrasePolarity attribute is not generated
+        else:
+            return ''
+
+    def get_negative_particles(self, nodes):
+        """
+        Returns a list of all negative particles found among the children
+        of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed.
+        """
+        neg_particles = []
+        for node in nodes:
+            # A negative particle: PART with Polarity=Neg attached as 'advmod'.
+            neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)]
+            if neg:
+                neg_particles += neg
+        return neg_particles
+
+
+    def get_is_reflex(self,node,refl):
+        """Return 'Yes' if the form is reflexive, otherwise the node's own Reflex feature."""
+        # Middle voice counts as reflexive.
+        if node.feats['Voice'] == 'Mid':
+            return 'Yes'
+        if len(refl) == 0:
+            return node.feats['Reflex']
+        return 'Yes'
+
+    def get_expl_type(self,node, refl):
+        """Return the expletive type: 'Pv', or the capitalized deprel subtype.
+
+        NOTE(review): assumes refl[0].deprel is 'expl' or 'expl:<subtype>' —
+        a deprel without ':' other than plain 'expl' would raise IndexError; confirm.
+        """
+        if node.feats['Voice'] == 'Mid':
+            return 'Pv'
+        if not refl:
+            return ''
+        if refl[0].deprel == 'expl':
+            return 'Pv'
+        return refl[0].deprel.split(':')[1].capitalize()
+
+    def is_expl_pass(self,refl):
+        """Return True if the first reflexive marker is an 'expl:pass' dependent."""
+        if len(refl) == 0:
+            return False
+        return refl[0].deprel == 'expl:pass'
+
+    def get_voice(self,node,refl):
+        """Return 'Pass' for reflexive passives, otherwise the node's Voice feature."""
+        voice = node.feats['Voice']
+        if self.is_expl_pass(refl):
+            return 'Pass'
+        return voice
+
+    def get_analytic_bool(self,node):
+        """Return 'Yes' if the node has any 'aux' children, else 'No'."""
+        auxes = [x for x in node.children if x.udeprel == 'aux']
+
+        if auxes:
+            return 'Yes'
+        else:
+            return 'No'
+
diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py
new file mode 100644
index 00000000..e169a2de
--- /dev/null
+++ b/udapi/block/msf/removefunc.py
@@ -0,0 +1,17 @@
+"""
+Morphosyntactic features (UniDive):
+Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes).
+"""
+from udapi.core.block import Block
+
+class RemoveFunc(Block):
+
+
+ def process_node(self, node):
+ """
+ Removes MSF* features if MSFFunc=Yes.
+ """
+ if node.misc['MSFFunc'] == 'Yes':
+ msfeats = [x for x in node.misc if x.startswith('MSF')]
+ for msf in msfeats:
+ node.misc[msf] = ''
diff --git a/udapi/block/msf/romance/preprocessor.py b/udapi/block/msf/romance/preprocessor.py
new file mode 100644
index 00000000..ad7aec1e
--- /dev/null
+++ b/udapi/block/msf/romance/preprocessor.py
@@ -0,0 +1,20 @@
+from udapi.core.block import Block
+
+class Preprocessor(Block):
+
+
+ def process_node(self, node):
+
+ # In Porttinari treebank, the negative adverb não is not marked with feat Polarity=Neg
+ if node.lemma == 'não' and node.upos == 'ADV':
+ node.feats['Polarity'] = 'Neg'
+
+ if node.upos == 'ADV' and node.feats['PronType'] == 'Neg':
+ node.feats['PronType'] = ''
+ node.feats['Polarity'] = 'Neg'
+
+ # In Romanian RRT treebank, there is no annotation of the voice feature
+ # Automatically assign passive voice
+ pass_auxes = [x for x in node.children if x.deprel == 'aux:pass']
+ if pass_auxes:
+ node.feats['Voice'] = 'Pass'
\ No newline at end of file
diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py
new file mode 100644
index 00000000..ed05fa89
--- /dev/null
+++ b/udapi/block/msf/romance/romance.py
@@ -0,0 +1,965 @@
+import udapi.block.msf.phrase
+from enum import Enum
+
+AUXES_HAVE = ['ter', 'haber', 'avere']
+AUXES_BE = ['estar', 'essere']
+MODALS = ['poder', 'deber', 'querer', 'saber', # Spanish + Portuguese
+ 'potere', 'dovere', 'volere', 'sapere'] # Italian
+
+class Aspect(str, Enum):
+ ANT = 'Ant'
+ IMP = 'Imp'
+ IMPPROG = 'ImpProg'
+ PERF = 'Perf'
+ PERFPROG = 'PerfProg'
+ PROG = 'Prog'
+ PQP = 'Pqp'
+ PQPPROG = 'PqpProg'
+
+class Tense(str, Enum):
+ FUT = 'Fut'
+ FUTFUT = 'FutFut'
+ PAST = 'Past'
+ PASTFUT = 'PastFut'
+ PASTPRES = 'PastPres'
+ PRES = 'Pres'
+
+class Romance(udapi.block.msf.phrase.Phrase):
+
+ def __init__(self, neg=True, **kwargs):
+ """
+ Parameters:
+ neg (bool) - If True, process negation and generate the PhrasePolarity=Neg attribute.
+            feature_prefix (string) - The prefix of phrase features (e.g. 'CG', 'Phrase'), default is 'CG'
+ """
+ super().__init__(**kwargs)
+ self.neg = neg
+
+ def process_node(self, node):
+
+ if node.misc[self.feature_prefix] != '':
+ return
+
+ cop = [x for x in node.children if x.udeprel == 'cop']
+
+ # only expl or expl:pv, no expl:impers or expl:pass
+ refl = [x for x in node.children if (x.lemma == 'se' or x.lemma == 'soi') and x.upos == 'PRON' and x.udeprel == 'expl' and x.deprel != 'expl:impers' and x.deprel != 'expl:pass']
+
+ if refl:
+ expl='Pv'
+ else:
+ expl=None
+
+ if cop:
+ # find auxiliary verbs, modal verbs, and auxiliary verbs related to modal verbs among the children of the content verb and separate them from each other
+ auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node)
+ adp = [x for x in node.children if x.upos == 'ADP']
+
+ if modals:
+ # we consider modals themselves to be separate verb forms
+ self.process_modal_verbs(modals, modal_auxes, modal_neg)
+
+ if auxes:
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in auxes] + [a.ord for a in adp] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ self.process_periphrastic_verb_forms(cop[0], auxes, expl, polarity, phrase_ords, node)
+ else:
+ # no auxiliaries, only cop
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [node.ord] + [c.ord for c in cop] + [r.ord for r in refl] + [a.ord for a in adp] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [c.ord for c in cop] + [a.ord for a in adp] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ self.process_copulas(node, cop, expl, polarity, phrase_ords)
+ return
+
+ if node.upos == 'VERB': #TODO maybe add "or node.feats['VerbForm'] == 'Part'"?
+
+ # find auxiliary verbs, modal verbs, and auxiliary verbs related to modals among the children of the content verb and separate them from each other
+ auxes, neg, modals, modal_auxes, modal_neg = self.find_auxes_and_neg(node)
+ aux_pass = [x for x in auxes if x.deprel == 'aux:pass']
+ auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass']
+
+ # infinitive with a subject is a subjunctive
+ subj = [x for x in node.children if x.udeprel == 'subj']
+ if node.feats['VerbForm'] == 'Inf' and subj:
+ self.write_node_info(node,
+ person=node.feats['Person'],
+ number=node.feats['Number'],
+ mood='Sub',
+ form='Fin',
+ tense=Tense.FUT.value,
+ gender=node.feats['Gender'],
+ voice=node.feats['Voice'],
+ expl=expl,
+ analytic=self.get_analytic_bool(node),
+ ords=[node.ord]
+ )
+ return
+
+ if modals:
+ # we consider modals themselves to be separate verb forms
+ self.process_modal_verbs(modals, modal_auxes, modal_neg)
+
+ if not auxes:
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [node.ord] + [r.ord for r in refl] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ self.process_phrases_with_ir_aller_estar(node, expl, polarity, phrase_ords, node)
+ self.process_simple_verb_forms(node, expl, polarity, phrase_ords, node)
+
+
+ else:
+ # no passive auxiliaries
+ if not aux_pass:
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ self.process_periphrastic_verb_forms(node, auxes, expl, polarity, phrase_ords, node)
+
+ # head verb has only passive auxiliary and no more other auxiliaries
+ elif not auxes_without_pass:
+ polarity = ''
+
+ if self.neg is True:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ # TODO phrase-level features are currently determined based on the first passive auxiliary, but it can happen that there are more than one passive auxiliary
+ self.process_phrases_with_ir_aller_estar(auxes[0], expl, polarity, phrase_ords, node)
+ self.process_simple_verb_forms(auxes[0], expl, polarity, phrase_ords, node)
+
+ # head verb has passive auxiliary and also other auxiliaries
+ else:
+ polarity = ''
+
+ if self.neg is True:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl] + [n.ord for n in neg]
+ if neg:
+ polarity = 'Neg'
+ else:
+ phrase_ords = [node.ord] + [a.ord for a in auxes] + [r.ord for r in refl]
+ phrase_ords.sort()
+
+ self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, expl, polarity, phrase_ords, node)
+
+ def find_auxes_and_neg(self, node):
+ """
+ Find all auxiliaries and negative adverbials among node.children and classifies them.
+
+ Parameters:
+ node (udapi.core.node.Node): head word, look for auxiliaries in its children
+
+ Returns:
+ tuple: a classification of auxiliaries consisting of:
+ - auxiliaries directly modifying the node,
+ - negative adverbs modifying the node,
+ - modal verbs,
+ - auxiliaries modifying a modal verb,
+ - negative adverbs modifying a modal verb.
+ """
+
+ node_auxes = []
+ node_neg = []
+ modals = []
+ modal_auxes = []
+ modal_neg = []
+
+ for child in node.children:
+ if child.udeprel == 'aux':
+ if child.lemma in MODALS:
+ modals.append(child)
+ modal_auxes = node_auxes # auxiliaries found so far are assumed to modify the modal verb (they come before it)
+ node_auxes = []
+
+ modal_neg = node_neg
+ node_neg = []
+
+ else:
+ node_auxes.append(child)
+
+ elif child.upos == 'ADV' and child.feats['Polarity'] == 'Neg':
+ node_neg.append(child)
+
+ return node_auxes, node_neg, modals, modal_auxes, modal_neg
+
+ def process_modal_verbs(self, modals, modal_auxes, modal_neg):
+ """
+ Annotates modal verb forms with the Phrase* attributes.
+ The modal verbs are kept as a single verb form, without including the infinitive of the content word.
+
+ Parameters:
+ modals (list): all modal verbs among the children of the head content verb (currently assumes there is only one.)
+ modal_auxes (list): auxiliaries of the modal verb(s)
+ modal_neg (list): negative adverbs of the modal verb(s)
+
+ """
+ if not modal_auxes:
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [modals[0].ord] + [n.ord for n in modal_neg]
+ phrase_ords.sort()
+
+ if modal_neg:
+ polarity='Neg'
+ else:
+ phrase_ords = [modals[0].ord]
+ self.process_phrases_with_ir_aller_estar(modals[0], '', polarity, phrase_ords, modals[0])
+ self.process_simple_verb_forms(modals[0], '', polarity, phrase_ords, modals[0])
+
+ else:
+ polarity = ''
+ if self.neg is True:
+ phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes] + [n.ord for n in modal_neg]
+ if modal_neg:
+ polarity='Neg'
+ else:
+ phrase_ords = [modals[0].ord] + [a.ord for a in modal_auxes]
+ phrase_ords.sort()
+
+ self.process_periphrastic_verb_forms(modals[0], modal_auxes, '', polarity, phrase_ords, modals[0])
+
+ def process_phrases_with_ir_aller_estar(self, node, expl, polarity, phrase_ords, head_node):
+ aspect = ''
+ tense = node.feats['Tense']
+
+ # phrase already annotated
+ if head_node.misc[self.feature_prefix] != '':
+ return
+
+ xcomps = [x for x in node.children if x.udeprel == 'xcomp']
+ if node.lemma in ['ir', 'aller', 'estar', 'ter'] and node.upos == 'VERB' and xcomps:
+ node.misc['PeriAux'] = 'Yes'
+
+ voice = node.feats['Voice']
+ auxes = [x for x in xcomps[0].children if x.udeprel == 'aux']
+ aux_pass = [x for x in auxes if x.deprel == 'aux:pass']
+ auxes_without_pass = [x for x in auxes if x.deprel != 'aux:pass']
+
+ # European Portuguese: estar + a + Inf
+ if node.lemma == 'estar':
+
+ if node.feats['Tense'] == 'Pres':
+ tense=Tense.PRES.value
+ aspect =Aspect.PROG.value
+
+ elif node.feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.IMPPROG.value
+
+ elif node.feats['Tense'] == 'Past':
+ tense=Tense.PAST.value
+ aspect=Aspect.PERFPROG.value
+
+ elif node.feats['Tense'] == 'Fut':
+ tense=Tense.FUT.value
+ aspect=Aspect.PROG.value
+
+ elif node.lemma == 'ter' and len(xcomps) > 1:
+ tense=Tense.PAST.value
+ aspect=Aspect.PROG.value
+ xcomps[0].misc['PeriAux'] = 'Yes'
+
+ elif node.feats['Tense'] == 'Pres':
+ tense=Tense.FUT.value
+
+ elif node.feats['Tense'] == 'Imp':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.IMP.value
+
+ elif node.feats['Tense'] == 'Fut':
+ tense=Tense.FUTFUT.value
+
+ elif node.feats['Tense'] == 'Past':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.PERF.value
+
+
+ if auxes_without_pass:
+ if auxes[0].lemma == 'estar':
+ aspect += 'Prog'
+ if auxes[0].lemma == 'haber':
+ aspect += 'Perf'
+
+
+
+ adp_a = [x for x in xcomps[-1].children if x.lemma == 'a' and x.udeprel == 'mark']
+ cop = [x for x in xcomps[0].children if x.udeprel == 'cop']
+ phrase_ords = [node.ord] + [x.ord for x in xcomps] + [x.ord for x in auxes] + [x.ord for x in cop]
+ if adp_a:
+ phrase_ords += [x.ord for x in adp_a]
+
+ if aux_pass:
+ voice='Pass'
+
+ phrase_ords.sort()
+
+ self.write_node_info(xcomps[-1],
+ tense = tense,
+ number = node.feats['Number'],
+ person = node.feats['Person'],
+ aspect = aspect,
+ mood = node.feats['Mood'],
+ form = 'Fin',
+ voice=voice,
+ expl = expl,
+ polarity = polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+ return
+
+ def process_simple_verb_forms(self, node, expl, polarity, phrase_ords, head_node):
+ """
+ Annotate simple verb forms or passive verb forms that contain only a passive auxiliary.
+
+ Parameters
+ node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary.
+ expl (str): The value of the PhraseExpl attribute.
+ polarity (str): The value of the PhrasePolarity attribute.
+ phrase_ords (list[int]): The ord values of all member words of the verb form.
+ head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase.
+ """
+
+ if node.misc['PeriAux'] != '':
+ return
+
+ # Portuguese
+ # presente -> PhraseTense=Pres, PhraseAspect=''
+ # Futuro do presente -> PhraseTense=Fut, PhraseAspect=''
+
+ # Spanish
+ # presente -> PhraseTense=Pres, PhraseAspect=''
+ # futuro simple -> PhraseTense=Fut, PhraseAspect=''
+
+ # Italian
+ # presente -> PhraseTense=Pres, PhraseAspect=''
+ # futuro semplice -> PhraseTense=Fut, PhraseAspect=''
+
+ aspect = ''
+ tense = node.feats['Tense']
+ form = node.feats['VerbForm']
+
+ if node.feats['Mood'] == 'Ind':
+
+ # Portuguese
+ # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
+
+ # Spanish
+ # pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
+
+ # Italian
+ # imperfetto -> PhraseTense=Past, PhraseAspect=Imp
+ if node.feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.IMP.value
+
+ # Portuguese
+ # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf
+
+ # Spanish
+ # pretérito perfecto -> PhraseTense=Past, PhraseAspect=Perf
+
+ # Italian
+            # passato remoto -> PhraseTense=Past, PhraseAspect=Perf
+ elif node.feats['Tense'] == 'Past':
+ aspect=Aspect.PERF.value
+
+ # Portuguese
+ # pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp
+ elif node.feats['Tense'] == 'Pqp':
+ tense=Tense.PAST.value
+ aspect=Aspect.PQP.value
+
+ else:
+ # viitorul popular/colocvial (intentional future) -> PhraseTense=Fut, PhraseAspect=''
+ o = [x for x in node.children if x.lemma == 'o' and x.upos == 'PART']
+ sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART']
+
+
+ if o and sa:
+ tense = Tense.FUT.value
+ phrase_ords.append(o[0].ord)
+ phrase_ords.append(sa[0].ord)
+
+ phrase_ords.sort()
+
+
+
+ # Portuguese
+ # subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
+ # subjunctive futuro -> PhraseTense=Fut, PhraseAspect=''
+
+ # Spanish
+ # subjunctive presente -> PhraseTense=Pres, PhraseAspect=''
+ # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' TODO not annotated in treebanks?
+
+ # Italian
+ # Congiuntivo presente -> PhraseTense=Pres, PhraseAspect=''
+ if node.feats['Mood'] == 'Sub':
+
+ if node.feats['Tense'] == 'Past':
+ aspect=Aspect.IMP.value
+
+ # Portuguese
+ # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp
+
+ # Spanish
+ # Pretérito imperfecto -> PhraseTense=Past, PhraseAspect=Imp
+
+ # Italian
+ # Congiuntivo imperfetto -> PhraseTense=Past, PhraseAspect=Imp
+ if node.feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.IMP.value
+
+ # Portuguese
+ # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
+
+ # Spanish
+ # pospretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
+
+ # Italian
+ # Condizionale presente -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd
+ if node.feats['Mood'] == 'Cnd':
+ aspect=''
+ tense=Tense.PRES.value
+
+ adp_en = [x for x in head_node.children if x.upos == 'ADP' and x.lemma == 'en' and x.udeprel == 'mark']
+ if node.feats['VerbForm'] == 'Part' and adp_en:
+ phrase_ords.append(adp_en[0].ord)
+ phrase_ords.sort()
+ form = 'Ger'
+
+
+ self.write_node_info(head_node,
+ person=node.feats['Person'],
+ aspect=aspect,
+ number=node.feats['Number'],
+ mood=node.feats['Mood'],
+ form=form,
+ tense=tense,
+ gender=head_node.feats['Gender'],
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic=self.get_analytic_bool(head_node),
+ ords=phrase_ords
+ )
+
+ def process_periphrastic_verb_forms(self, node, auxes, expl, polarity, phrase_ords, head_node):
+ """
+ Annotate periphrastic verb forms with the Phrase* attributes.
+
+ Parameters
+ node (udapi.core.node.Node): The relevant node. If there is no passive construction, this is the head verb. If the head verb is passive, this is the passive auxiliary.
+ auxes (list[udapi.core.node.Node]): All auxiliaries except the passive auxiliaries.
+ expl (str): The value of the PhraseExpl attribute.
+ polarity (str): The value of the PhrasePolarity attribute.
+ phrase_ords (list[int]): The ord values of all member words in the verb form.
+ head_node (udapi.core.node.Node): The node that should receive the Phrase* attributes, i.e., the head of the phrase.
+ """
+
+ # phrase already annotated
+ if head_node.misc[self.feature_prefix] != '':
+ return
+
+ if len(auxes) == 1:
+ # Cnd
+ if auxes[0].feats['Mood'] == 'Cnd' and (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'Ger'):
+
+ # Portuguese
+ # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd
+ if auxes[0].lemma == 'estar':
+ tense=Tense.PRES.value
+ aspect=Aspect.PROG.value
+
+ # Portuguese
+ # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd
+
+ # Spanish
+ # Antepospretérito -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd
+
+ # Italian
+ # Condizionale passato -> PhraseTense=Past, PhraseAspect='', PhraseMood=Cnd
+ else:
+ tense=Tense.PAST.value
+ aspect=''
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ aspect=aspect,
+ mood='Cnd',
+ form='Fin',
+ expl=expl,
+ polarity=polarity,
+ voice=head_node.feats['Voice'],
+ analytic='Yes',
+ ords=phrase_ords)
+ return
+
+ if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger':
+
+                # aux Pres (vir) + gerund -> PhraseTense=PastPres, PhraseAspect=Prog
+ if auxes[0].feats['Tense'] == 'Pres':
+ tense=Tense.PASTPRES.value
+
+
+ elif auxes[0].feats['Tense'] in ['Imp', 'Past']:
+ tense=Tense.PAST.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ aspect=Aspect.PROG.value,
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+ return
+
+ if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger':
+
+ # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog
+ tense = auxes[0].feats['Tense']
+ aspect = Aspect.PROG.value
+
+ # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog
+ if auxes[0].feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.PROG.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ aspect=aspect,
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+ return
+
+ # Auxiliary 'estar' followed by a gerund
+ if node.feats['VerbForm'] == 'Ger':
+
+ # Portuguese + Spanish
+ # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg
+ # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub
+ if auxes[0].feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.IMPPROG.value
+
+ # Portuguese + Spanish
+ # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg
+ elif auxes[0].feats['Tense'] == 'Past':
+ tense=Tense.PAST.value
+ aspect=Aspect.PERFPROG.value
+
+ # Portuguese + Spanish
+ # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog
+ # futuro do presente (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog
+ # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub
+ # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub
+ else:
+ tense=auxes[0].feats['Tense']
+ aspect=Aspect.PROG.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ aspect=aspect,
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ # Auxiliary 'ter' / 'haber' / 'avere' / 'essere' followed by a participle
+ if node.feats['VerbForm'] == 'Part':
+
+ # Portuguese
+ # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf
+
+ # Spanish
+ # Futuro compuesto antefuturo -> PhraseTense=Fut, PhraseAspect=Perf
+
+ # Italian
+ # Futuro anteriore -> PhraseTense=Fut, PhraseAspect=Perf
+ aspect=Aspect.PERF.value
+ tense=auxes[0].feats['Tense']
+ form='Fin'
+ mood=auxes[0].feats['Mood']
+
+ adp_en = [x for x in node.children if x.lemma == 'en' and x.upos == 'ADP' and x.udeprel == 'mark']
+ if auxes[0].feats['VerbForm'] == 'Part' and adp_en:
+ tense=Tense.PAST.value
+ aspect=''
+ phrase_ords.append(adp_en[0].ord)
+ phrase_ords.sort()
+ form='Ger'
+
+
+ # Romanian
+ # Perfect compus -> PhraseTense=Past, PhraseAspect=Perf
+ elif auxes[0].lemma == 'avea':
+ tense = Tense.PAST.value
+ aspect = Aspect.PERF.value
+ form = 'Fin'
+
+ # Spanish
+ # Pretérito perfecto compuesto ante presente -> PhraseTense=Past, PhraseAspect=Perf
+
+ # Italian
+ # Passato prossimo (aux avere/essere) -> PhraseTense=Past, PhraseAspect=Perf
+ elif auxes[0].feats['Tense'] == 'Pres':
+
+ # Portuguese
+ # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf
+                    # subjunctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub
+ if auxes[0].lemma == 'fi' or auxes[0].feats['Mood'] == 'Sub':
+ tense = Tense.PASTPRES.value
+
+                        # subjunctive mood not annotated in Romanian data
+ if auxes[0].lemma == 'fi':
+ mood='Sub'
+ else:
+ tense=Tense.PAST.value
+
+ # Portuguese
+ # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp
+                # subjunctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub
+
+ # Spanish
+ # pretérito pluscuamperfecto -> PhraseTense=Past, PhraseAspect=Pqp
+
+ # Italian
+ # Trapassato prossimo -> PhraseTense=Past, PhraseAspect=Pqp
+ elif auxes[0].feats['Tense'] == 'Imp':
+ tense=Tense.PAST.value
+ aspect=Aspect.PQP.value
+
+ # Spanish
+ # pretérito anterior ante pretérito -> PhraseTense=Past, PhraseAspect=Ant
+
+ # Italian
+ # trapassato remoto -> PhraseTense=Past, PhraseAspect=Ant
+
+ # French
+ # passé antérieur -> PhraseTense=Past, PhraseAspect=Ant
+ elif auxes[0].feats['Tense'] == 'Past':
+ tense=Tense.PAST.value
+ aspect = Aspect.ANT.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=mood,
+ aspect=aspect,
+ form=form,
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+ return
+
+ # auxiliary 'ir' or 'vrea' followed by infinitive
+ if auxes[0].lemma in ['ir', 'vrea'] and node.feats['VerbForm'] == 'Inf':
+
+ tense=node.feats['Tense']
+ aspect=''
+
+ # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=''
+ if auxes[0].feats['Tense'] == 'Pres':
+ tense=Tense.FUT.value
+ aspect=''
+
+ # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp
+ elif auxes[0].feats['Tense'] == 'Imp':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.IMP.value
+
+ # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=''
+ elif auxes[0].feats['Tense'] == 'Fut':
+ tense=Tense.FUTFUT.value
+ aspect=''
+
+ # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf
+ elif auxes[0].feats['Tense'] == 'Past':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.PERF.value
+
+ # Viitorul standard/literar/simplu -> PhraseTense=Fut, PhraseAspect=''
+ if auxes[0].lemma == 'vrea':
+ tense = Tense.FUT.value
+ aspect = ''
+
+ self.write_node_info(head_node,
+ tense=tense,
+ aspect=aspect,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ # condițional-optativ prezent -> PhraseTense=Pres, PhraseAspect=''
+ if auxes[0].lemma == 'avea' and node.feats['VerbForm'] == 'Inf':
+ tense=Tense.PRES.value
+ aspect=''
+ self.write_node_info(head_node,
+ tense=tense,
+ aspect=aspect,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood='Cnd',
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ # viitor popular/colloquial (obligative future) -> PhraseTense=Fut, PhraseAspect=''
+ # viitor popular (potential future - contracted form) -> PhraseTense=Fut, PhraseAspect=''
+ if node.feats['VerbForm'] == 'Fin':
+ sa = [x for x in node.children if x.lemma == 'să' and x.upos == 'PART']
+
+ if sa:
+ phrase_ords.append(sa[0].ord)
+ phrase_ords.sort()
+
+ tense=Tense.FUT.value
+ aspect=''
+
+ self.write_node_info(head_node,
+ tense=tense,
+ aspect=aspect,
+ number=head_node.feats['Number'],
+ person=head_node.feats['Person'],
+ mood=head_node.feats['Mood'],
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ elif len(auxes) == 2:
+ # Romanian
+            # viitor anterior -> PhraseTense=Fut, PhraseAspect=Perf
+ if auxes[0].lemma == 'vrea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part':
+
+ self.write_node_info(head_node,
+ tense=Tense.PAST.value,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ aspect=Aspect.PERF.value,
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ # condițional-optativ perfect -> PhraseTense=Past
+ if auxes[0].lemma == 'avea' and auxes[1].lemma == 'fi' and node.feats['VerbForm'] == 'Part':
+
+ self.write_node_info(head_node,
+ tense=Tense.PAST.value,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood='Cnd',
+ form='Fin',
+ aspect='',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ return
+
+ # Portuguese
+            # auxiliary 'ir' followed by auxiliary 'estar' in infinitive and a gerund
+ if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger':
+
+ # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog
+ if auxes[0].feats['Tense'] == 'Pres':
+ tense=Tense.FUT.value
+ aspect=Aspect.PROG.value
+
+ # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg
+ if auxes[0].feats['Tense'] == 'Imp':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.IMPPROG.value
+
+ # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog
+ if auxes[0].feats['Tense'] == 'Fut':
+ tense=Tense.FUTFUT.value
+ aspect=Aspect.PROG.value
+
+ if auxes[0].feats['Tense'] == 'Past':
+ tense=Tense.PASTFUT.value
+ aspect=Aspect.PERFPROG.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ aspect=aspect,
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+            # auxiliary 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle
+ if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part':
+
+ # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf
+ if auxes[0].feats['Tense'] == 'Fut':
+ tense=Tense.FUTFUT.value
+ aspect=Aspect.PERF.value
+
+ # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf
+ if auxes[0].feats['Tense'] == 'Pres':
+ tense=Tense.FUT.value
+ aspect=Aspect.PERF.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ aspect=aspect,
+ form='Fin',
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords)
+
+ # Cnd (only ter/haber), Sub and Past,Pres,Fut tenses: 2 auxes - ter/haber + estar
+ if auxes[0].lemma in AUXES_HAVE and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger':
+
+ tense = auxes[0].feats['Tense']
+ aspect = Aspect.PERFPROG.value
+
+ # aux ter cond + estar pp + gerund -> PhraseTense=Past, PhraseAspect=Prog, PhraseMood=Cnd
+ if auxes[0].feats['Mood'] == 'Cnd':
+ tense=Tense.PAST.value
+ aspect=Aspect.PROG.value
+
+ # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg
+                # subjunctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub
+ elif auxes[0].feats['Tense'] == 'Pres':
+ tense=Tense.PASTPRES.value
+
+ # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg
+                # subjunctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub
+ elif auxes[0].feats['Tense'] in ['Imp', 'Past']:
+ tense=Tense.PAST.value
+ aspect=Aspect.PQPPROG.value
+
+ # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg
+ elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter':
+ tense=Tense.FUT.value
+
+ self.write_node_info(head_node,
+ tense=tense,
+ number=auxes[0].feats['Number'],
+ person=auxes[0].feats['Person'],
+ mood=auxes[0].feats['Mood'],
+ form='Fin',
+ aspect=aspect,
+ voice=head_node.feats['Voice'],
+ expl=expl,
+ polarity=polarity,
+ analytic='Yes',
+ ords=phrase_ords,
+ )
+ return
+
+ def process_copulas(self, node, cop, expl, polarity, phrase_ords):
+ """
+ Annotate non-verbal predicates with copula using the Phrase* attributes.
+
+ This method is specialized for non-periphrastic copulas.
+ If any auxiliaries are present, process_periphrastic_verb_forms() is called instead.
+
+ Parameters
+ node (udapi.core.node.Node): The non-verbal predicate that should receive the Phrase* attributes, i.e., the head of the phrase.
+ cop (list[udapi.core.node.Node]): The copula nodes.
+ expl (str): The value of the PhraseExpl attribute.
+ polarity (str): The value of the PhrasePolarity attribute.
+ phrase_ords (list[int]): The ord values of all member words in the verb form.
+ """
+
+ # classify the morphological features of the copula node and propagate them to the entire phrase (treating the copula as the content verb)
+ self.process_phrases_with_ir_aller_estar(cop[0], expl, polarity, phrase_ords, node)
+ self.process_simple_verb_forms(cop[0], expl, polarity, phrase_ords, node)
+
+ # adjust PhraseAspect based on the lemma of the copula
+ if cop[0].feats['Tense'] in ['Pres', 'Fut']:
+ if cop[0].lemma == 'ser':
+ node.misc['PeriAspect'] = Aspect.PERF.value
+ elif cop[0].lemma == 'estar':
+ node.misc['PeriAspect'] = Aspect.IMP.value
\ No newline at end of file
diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py
new file mode 100644
index 00000000..9d15418f
--- /dev/null
+++ b/udapi/block/msf/slavic/conditional.py
@@ -0,0 +1,97 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects conditional verb forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Conditional(udapi.block.msf.phrase.Phrase):
    """Detect conditional verb forms in Slavic languages.

    The features of the whole periphrastic verb form are saved as Phrase*
    attributes in MISC of its head word.
    """

    def process_node(self, node):
        # In most Slavic languages the l-participle has VerbForm=Part (or PartRes),
        # but in Polish it has VerbForm=Fin.
        if node.feats['VerbForm'] in ('Part', 'PartRes', 'Fin'):
            # In Polish the conditional auxiliaries have deprel == 'aux:cnd';
            # in the other languages they have feats['Mood'] == 'Cnd'.
            # The conditional mood can be formed by an auxiliary verb or by some
            # conjunctions (such as 'aby', 'kdyby' in Czech), so we cannot require
            # x.udeprel == 'aux' because that would not match the conjunctions.
            aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd']
            # A copula may follow the noun; such cases are handled in the copular
            # branch below, not here.
            cop = [x for x in node.children if x.udeprel == 'cop']

            if aux_cnd and not cop:
                # All auxiliary verbs plus conjunctions with Mood=Cnd.
                aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd']
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + aux + refl

                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                # Default to 3rd person; overridden by any auxiliary that expresses person.
                person = '3'  # TODO there is a problem in Russian etc. (same as in past tense)
                for aux_verb in aux:
                    if aux_verb.feats['Person'] != '':
                        person = aux_verb.feats['Person']

                self.write_node_info(node,
                    person=person,
                    number=node.feats['Number'],
                    mood='Cnd',
                    form='Fin',
                    aspect=node.feats['Aspect'],
                    expl=self.get_expl_type(node, refl),
                    polarity=self.get_polarity(phrase_nodes),
                    voice=self.get_voice(node, refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
                return

        # Copular conditional: non-verbal predicate + copula + conditional auxiliary.
        cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')]
        aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd']

        if cop and aux_cnd:
            # There can be a copula with Mood=Cnd (e.g. in Old East Slavonic); we do not
            # want to count these copulas in phrase_ords twice, hence x.udeprel != 'cop'.
            aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop']
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + aux + prep + refl + cop
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            copVerb = cop[0]

            # Default to 3rd person; prefer person expressed on an auxiliary or the copula.
            person = '3'
            for aux_verb in aux:
                if aux_verb.feats['Person'] != '':
                    person = aux_verb.feats['Person']
            for cop_verb in cop:
                if cop_verb.feats['Person'] != '':
                    # Fixed: the original read from aux_verb here (copy-paste error),
                    # which reported the wrong person and raised NameError when aux was empty.
                    person = cop_verb.feats['Person']

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()
            self.write_node_info(node,
                aspect=copVerb.feats['Aspect'],
                person=person,
                number=copVerb.feats['Number'],
                mood='Cnd',
                form='Fin',
                voice=self.get_voice(copVerb, refl),
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                ords=phrase_ords,
                gender=copVerb.feats['Gender'],
                animacy=copVerb.feats['Animacy'],
                analytic=self.get_analytic_bool(node)
            )
\ No newline at end of file
diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py
new file mode 100644
index 00000000..32714630
--- /dev/null
+++ b/udapi/block/msf/slavic/converb.py
@@ -0,0 +1,94 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects converb (transgressive) forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Converb(udapi.block.msf.phrase.Phrase):
    """Detect converb (transgressive) forms in Slavic languages.

    The features of the whole verb form are saved as Phrase* attributes
    in MISC of its head word.
    """

    def process_node(self, node):
        # The condition node.upos == 'VERB' prevents copulas from entering this branch.
        if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB':
            # Reflexive clitics attached to the converb.
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                person=node.feats['Person'],
                number=node.feats['Number'],
                form='Conv',
                tense=node.feats['Tense'],
                aspect=node.feats['Aspect'],
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                ords=phrase_ords,
                gender=node.feats['Gender'],
                animacy=node.feats['Animacy'],
                voice=self.get_voice(node, refl),
                analytic=self.get_analytic_bool(node)
            )

        # Passive voice: the participle is tagged ADJ and the converb is an auxiliary.
        elif node.upos == 'ADJ':
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv']

            if aux:
                auxVerb = aux[0]

                phrase_nodes = [node] + aux
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg
                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                # Agreement features come from the converb auxiliary;
                # aspect comes from the participle itself.
                self.write_node_info(node,
                    person=auxVerb.feats['Person'],
                    number=auxVerb.feats['Number'],
                    form='Conv',
                    tense=auxVerb.feats['Tense'],
                    aspect=node.feats['Aspect'],
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    gender=auxVerb.feats['Gender'],
                    animacy=auxVerb.feats['Animacy'],
                    voice='Pass',
                    analytic=self.get_analytic_bool(node)
                )

        # Copular phrases headed by a non-verbal predicate with a converb copula.
        else:
            cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv']

            if cop:
                prep = [x for x in node.children if x.upos == 'ADP']
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                copVerb = cop[0]

                phrase_nodes = [node] + cop + prep + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg
                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()


                # Morphological features are taken from the copula.
                self.write_node_info(node,
                    aspect=copVerb.feats['Aspect'],
                    person=copVerb.feats['Person'],
                    number=copVerb.feats['Number'],
                    tense=copVerb.feats['Tense'],
                    gender=copVerb.feats['Gender'],
                    animacy=copVerb.feats['Animacy'],
                    form='Conv',
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    voice=self.get_voice(copVerb, refl),
                    analytic=self.get_analytic_bool(node)
                )
diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py
new file mode 100644
index 00000000..9cc17717
--- /dev/null
+++ b/udapi/block/msf/slavic/future.py
@@ -0,0 +1,207 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects future tense forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Future(udapi.block.msf.phrase.Phrase):
    """Detect future tense forms in Slavic languages.

    The features of the whole (possibly periphrastic) verb form are saved
    as Phrase* attributes in MISC of its head word.
    """

    def process_node(self, node):
        # Future tense for Serbian and Croatian: present form of 'hteti'/'htjeti' + main verb.
        aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')]
        if node.upos != 'AUX' and aux:
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']
            aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice
            cop = [x for x in node.children if x.deprel == 'cop']

            phrase_nodes = [node] + refl + aux_other + cop
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()


            if not cop:
                self.write_node_info(node,
                    tense='Fut',
                    person=aux[0].feats['Person'],
                    number=aux[0].feats['Number'],
                    mood='Ind',
                    voice=node.feats['Voice'],
                    aspect=node.feats['Aspect'], # Serbian and Croatian do not annotate aspect
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
            else:
                # Non-verbal predicate with copula: include adpositions in the phrase.
                prep = [x for x in node.children if x.upos == 'ADP']
                phrase_nodes += prep
                phrase_ords += [x.ord for x in prep]
                phrase_ords.sort()

                self.write_node_info(node,
                    tense='Fut',
                    person=aux[0].feats['Person'],
                    number=aux[0].feats['Number'],
                    mood='Ind',
                    voice=node.feats['Voice'],
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )

            return

        # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense;
        # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense.
        aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще']

        if node.feats['Tense'] == 'Pres' and aux:
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl + aux
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense='Fut',
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood='Ind',
                voice=node.feats['Voice'],
                aspect=node.feats['Aspect'],
                form='Fin',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
            return

        # Future tense of perfective verbs.
        # Upper Sorbian forms the future tense this way, but feats['Aspect'] is not annotated in the data.
        # In some languages (e.g. Russian) these verbs have Tense=Fut, in others (e.g. Czech) they have Tense=Pres.
        if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense='Fut',
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood='Ind',
                voice=self.get_voice(node,refl),
                form='Fin',
                aspect='Perf',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
            return


        # Future tense of imperfective verbs and passive voice.
        # In some languages the content verb is an infinitive, in others an l-participle.
        # The condition node.upos == 'ADJ' covers the passive voice: the n-participle is
        # tagged ADJ, but its auxiliary is attached as aux, not cop.
        if node.upos == 'VERB' or node.upos == 'ADJ':

            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut']

            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + aux + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            if aux:
                # Periphrastic future: agreement features come from the future auxiliary.
                auxVerb = aux[0]
                self.write_node_info(node,
                    tense='Fut',
                    person=auxVerb.feats['Person'],
                    number=auxVerb.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
                return

            # Simple future tense - e.g. in Serbian the future tense can merge the content
            # verb and the auxiliary into one word, i.e. without a separate auxiliary;
            # or verbs like 'pojede', 'půjdeme'... in Czech.

            if not aux and node.feats['Tense'] == 'Fut':

                self.write_node_info(node,
                    tense='Fut',
                    person=node.feats['Person'],
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
                return


        # Copular future: non-verbal predicate with a future-tense copula.
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut']
        if cop:
            copVerb = cop[0]
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind']
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + cop + aux + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=copVerb.feats['Aspect'],
                tense='Fut',
                person=copVerb.feats['Person'],
                number=copVerb.feats['Number'],
                mood='Ind',
                form='Fin',
                voice=self.get_voice(copVerb, refl),
                polarity=self.get_polarity(phrase_nodes),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
+
diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py
new file mode 100644
index 00000000..5a30d05e
--- /dev/null
+++ b/udapi/block/msf/slavic/imperative.py
@@ -0,0 +1,89 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects imperative verb forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Imperative(udapi.block.msf.phrase.Phrase):
    """Detect imperative verb forms in Slavic languages.

    The features of the whole verb form are saved as Phrase* attributes
    in MISC of its head word.
    """

    def process_node(self, node):
        # The condition node.upos == 'VERB' ensures that copulas do not enter this branch.
        if node.feats['Mood'] == 'Imp' and node.upos == 'VERB':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                person=node.feats['Person'],
                number=node.feats['Number'],
                aspect=node.feats['Aspect'],
                mood='Imp',
                form='Fin',
                voice='Act',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
            return

        # Verbs in passive forms are tagged ADJ; the imperative is on the auxiliary.
        if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass':
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp']
            if aux:
                phrase_nodes = [node] + aux
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                # Agreement features come from the imperative auxiliary.
                self.write_node_info(node,
                    person=aux[0].feats['Person'],
                    number=aux[0].feats['Number'],
                    mood='Imp',
                    voice='Pass',
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
            return


        # Copular imperative: non-verbal predicate with an imperative copula.
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp']
        if cop:
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            copVerb = cop[0]

            phrase_nodes = [node] + cop + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            # Morphological features are taken from the copula.
            self.write_node_info(node,
                aspect=copVerb.feats['Aspect'],
                person=copVerb.feats['Person'],
                number=copVerb.feats['Number'],
                mood='Imp',
                form='Fin',
                voice=self.get_voice(copVerb, refl),
                expl=self.get_expl_type(node, refl),
                polarity=self.get_polarity(phrase_nodes),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py
new file mode 100644
index 00000000..83bc0766
--- /dev/null
+++ b/udapi/block/msf/slavic/infinitive.py
@@ -0,0 +1,107 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects infinitive verb forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Infinitive(udapi.block.msf.phrase.Phrase):
    """Detect bare infinitive (and Slovenian supine) forms in Slavic languages.

    The features of the verb form are saved as Phrase* attributes in MISC
    of its head word.
    """

    def process_node(self,node):
        if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB':
            aux = [x for x in node.children if x.udeprel == 'aux']
            # The list of auxiliaries must be empty - we don't want to mark infinitives
            # which are part of any other phrase (for example the infinitive that forms
            # the periphrastic future tense in Czech).
            if not aux:
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                # Fixed: the original had 'phrase_nodes == neg' (a no-op comparison),
                # so negative particles were never included in the phrase, yielding
                # wrong PhrasePolarity and incomplete ords.
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()


                self.write_node_info(node,
                    aspect=node.feats['Aspect'],
                    voice=self.get_voice(node,refl),
                    form='Inf',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
            return

        # Passive infinitive: the n-participle is tagged ADJ with an infinitive auxiliary.
        if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass':
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf']
            aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 'Inf']
            # Any non-infinitive auxiliary means this participle belongs to another phrase.
            if aux and not aux_forb:
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + aux + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    aspect=node.feats['Aspect'],
                    voice='Pass',
                    form='Inf',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node, refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    number=node.feats['Number'],
                    analytic=self.get_analytic_bool(node)
                )
            return



        # Copular infinitive: non-verbal predicate with an infinitive copula.
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf']
        aux_forb = [x for x in node.children if x.udeprel == 'aux']
        if cop and not aux_forb:
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + cop + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=cop[0].feats['Aspect'],
                voice=self.get_voice(cop[0], refl),
                form='Inf',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )

        # There is a rare verb form called supine in Slovenian; it is used instead of
        # the infinitive as the argument of motion verbs.
        if node.feats['VerbForm'] == 'Sup':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=node.feats['Aspect'],
                voice='Act',
                form='Sup',
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node, refl),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py
new file mode 100644
index 00000000..130d972d
--- /dev/null
+++ b/udapi/block/msf/slavic/past.py
@@ -0,0 +1,212 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects past tense forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Past(udapi.block.msf.phrase.Phrase):
    """Detect past tense forms in Slavic languages.

    The features of the whole (possibly periphrastic) verb form are saved
    as Phrase* attributes in MISC of its head word.
    """

    def get_person_for_langs_with_simple_past(self, node, person):
        """Return the person taken from the subject, if it expresses one.

        Languages with a simple past tense (e.g. Russian) do not express person
        in these verb forms. If the person were not taken from the subject, the
        third person would be filled in automatically, because languages with a
        compound past use bare forms for the third person (e.g. Czech).
        """
        subj = [x for x in node.children if x.udeprel == 'nsubj']
        if subj:
            subj = subj[0]
            if subj.feats['Person'] != '':
                person = subj.feats['Person']
        return person

    def process_node(self, node):

        past_tenses = ['Past', 'Imp', 'Pqp']
        cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)]

        # There is person 0 in Polish and Ukrainian, used for impersonal statements.
        # In Polish, verbs with Person=0 also have Tense=Past; in Ukrainian the tense is not specified.
        # NOTE(review): this branch does not return, so the same node may also match
        # a branch below and be annotated twice - confirm this is intended.
        if node.feats['Person'] == '0':
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                tense=node.feats['Tense'],
                person=node.feats['Person'],
                number=node.feats['Number'],
                mood=node.feats['Mood'],
                voice='Act', #In Polish, impersonal statements are annotated with Voice=Act. In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act
                aspect=node.feats['Aspect'],
                form=node.feats['VerbForm'],
                polarity=self.get_polarity(phrase_nodes),
                expl=self.get_expl_type(node,refl),
                ords=phrase_ords,
                gender=node.feats['Gender'],
                animacy=node.feats['Animacy'],
                analytic=self.get_analytic_bool(node)
            )

        # Compound past tense: l-participle + (optional) present-tense auxiliary.
        if (node.feats['VerbForm'] in ['Part', 'PartRes', 'Fin']) and node.upos == 'VERB' and node.feats['Voice'] != 'Pass':
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in ['Pres', '']]
            aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses]
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + aux + refl + aux_pqp
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            # We don't want to mark l-participles in the conditional as past tense.
            aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj']
            if not aux_cnd:
                # Person comes from the auxiliary; bare 3rd-person forms have no auxiliary.
                if aux:
                    person = aux[0].feats['Person']

                elif not aux:
                    person = '3'

                # A past-tense auxiliary signals pluperfect; prefer its person.
                if aux_pqp:
                    person = aux_pqp[0].feats['Person']

                # In Slovenian, the participles are not annotated with Tense=Past (the Tense feature is missing),
                # but in Bulgarian there are cases where the participles are annotated with Tense=Imp.
                tense = 'Past'
                if node.feats['Tense'] == 'Imp':
                    tense = 'Imp'
                if node.feats['Tense'] == 'Pqp':
                    tense = 'Pqp'

                self.write_node_info(node,
                    tense=tense,
                    person=person,
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )


        # The past tense of some Slavic languages is formed only by a verb without an auxiliary (e.g. Polish),
        # or imperfect (a special case of the past tense), e.g. in Bulgarian or Croatian.
        elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv':

            # The past tense must be formed only by a content verb, not with an auxiliary.
            aux_forb = [x for x in node.children if x.udeprel == 'aux']

            if not aux_forb:

                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    tense=node.feats['Tense'],
                    person=node.feats['Person'],
                    number=node.feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(node,refl),
                    aspect=node.feats['Aspect'],
                    form=node.feats['VerbForm'],
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )



        # Passive: the n-participle is tagged ADJ and the past tense is on the auxiliary.
        elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop:
            aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)]
            aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense
            if not aux_cnd:
                if aux_past_tense:
                    aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. the auxiliary 'jsem' in the phrase 'byl jsem přinucen'

                    phrase_nodes = [node] + aux_past_tense + aux_pres_tense
                    neg = self.get_negative_particles(phrase_nodes)
                    phrase_nodes += neg

                    phrase_ords = [x.ord for x in phrase_nodes]
                    phrase_ords.sort()

                    # Default 3rd person; prefer the present-tense auxiliary, then the subject.
                    person = '3'
                    if aux_pres_tense:
                        person = aux_pres_tense[0].feats['Person']
                    person = self.get_person_for_langs_with_simple_past(node, person)

                    self.write_node_info(node,
                        tense=aux_past_tense[0].feats['Tense'],
                        person=person,
                        number=aux_past_tense[0].feats['Number'],
                        mood='Ind',
                        voice='Pass',
                        form='Fin',
                        aspect=node.feats['Aspect'],
                        polarity=self.get_polarity(phrase_nodes),
                        ords=phrase_ords,
                        gender=node.feats['Gender'],
                        animacy=node.feats['Animacy'],
                        analytic=self.get_analytic_bool(node)
                    )

        # Copular past: non-verbal predicate with a past-tense copula.
        else:
            aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense
            if cop and not aux_cnd:
                aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres']
                prep = [x for x in node.children if x.upos == 'ADP']
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + aux_past_tense + cop + prep + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                person = '3'
                if aux_past_tense:
                    person = aux_past_tense[0].feats['Person']

                # In ru, be, uk, the person is not expressed in the past tense and the VerbForm is Fin, not Part.
                if cop[0].feats['VerbForm'] == 'Fin':
                    person = ''

                self.write_node_info(node,
                    aspect=cop[0].feats['Aspect'],
                    tense=cop[0].feats['Tense'],
                    person=person,
                    number=cop[0].feats['Number'],
                    mood='Ind',
                    voice=self.get_voice(cop[0], refl),
                    form='Fin',
                    expl=self.get_expl_type(node,refl),
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    gender=cop[0].feats['Gender'],
                    animacy=cop[0].feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py
new file mode 100644
index 00000000..0672812b
--- /dev/null
+++ b/udapi/block/msf/slavic/preprocessor.py
@@ -0,0 +1,83 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block serves as a preprocessor for Slavic languages before the other blocks
+are applied to detect periphrastic verb forms. It improves harmonization of
+annotations across the treebanks by addressing some known divergences.
+"""
+
+from udapi.core.block import Block
+
class Preprocessor(Block):
    """Harmonize Slavic treebank annotation before the msf.slavic.* blocks run.

    Addresses known annotation divergences across treebanks (missing Voice,
    Polish aux:cnd, polarity conventions, reflexives, etc.).
    """

    def process_node(self,node):

        # In Ukrainian the active verb forms are not marked with Voice=Act.
        if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '':
            node.feats['Voice'] = 'Act'

        # In some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ.
        # We change the UPOS to ADJ when a participle expresses case.
        #if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '':
        #    node.upos = 'ADJ'

        # In Polish, the conditional mood on auxiliaries is marked with deprel == 'aux:cnd'
        # and not, as in the other Slavic languages, with feats['Mood'] == 'Cnd'.
        if node.deprel == 'aux:cnd':
            node.feats['Mood'] = 'Cnd'

        # Unify polarities - some languages mark only Neg (Russian), some mark both Neg and Pos (Czech).
        if node.feats['Polarity'] == 'Pos':
            node.feats['Polarity'] = ''

        # In Ukrainian, there is no explicit annotation of reflexive verbs.
        # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly.
        # We add the feature Voice=Mid to reflexive verbs.
        if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')):
            node.feats['Voice'] = 'Mid'

        # Macedonian forms the future tense with the auxiliary word ќе, on which nothing
        # indicates that it takes part in forming the future tense; likewise Bulgarian
        # with the auxiliary word ще. Mark such auxiliaries with Tense=Fut.
        if node.feats['Tense'] == 'Pres':
            aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще']
            if len(aux) == 1:
                aux[0].feats['Tense'] = 'Fut'

        # In Czech and in Old Church Slavonic, the participles are sometimes marked with a plural (combined) gender.
        if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc':
            subj = [x for x in node.children if x.udeprel == 'nsubj']

            # For relative pronouns, only one gender is indicated.
            if len(subj) == 1:
                conj = [x for x in subj[0].children if x.deprel == 'conj']
                if len(conj) == 0:
                    node.feats['Gender'] = subj[0].feats['Gender']
                    node.feats['Number'] = subj[0].feats['Number']

        # Participles in the passive are sometimes annotated as VERB, sometimes as ADJ.
        #if node.upos == 'VERB' and node.feats['Voice'] == 'Pass':
        #    node.upos = 'ADJ'

        # There are cases where the node has deprel == 'expl:pv', 'expl:pass' or 'expl:impers' and Reflex is not Yes (e.g. the Macedonian treebank).
        # We add the Reflex=Yes feature.
        if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers':
            node.feats['Reflex'] = 'Yes'

        # Fixing a mistake in the Macedonian treebank (mk_mtb-ud-test.conllu): in sent_id=other0010
        # the personal pronoun 'ми' is marked as expl:pv; it should be iobj.
        if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs':
            node.deprel = ''
            # NOTE(review): deprel is cleared first, then udeprel is set - presumably the
            # udeprel setter rewrites deprel; confirm against udapi.core.node.
            node.udeprel = 'iobj'

        # In Old Church Slavonic there is the feature Mood=Sub, but it is a notation for the conditional mood.
        if node.feats['Mood'] == 'Sub':
            node.feats['Mood'] = 'Cnd'

        # Although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation.
        if node.feats['VerbForm'] == 'Inf':
            node.feats['Tense'] = ''

        # In the Russian SynTagRus corpus, the negative particles have no Polarity=Neg feature.
        if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod':
            node.feats['Polarity'] = 'Neg'

        # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech,
        # but there are many verbs with no Aspect value, so the problem is still there.
diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py
new file mode 100644
index 00000000..7521a08d
--- /dev/null
+++ b/udapi/block/msf/slavic/present.py
@@ -0,0 +1,132 @@
+"""
+Morphosyntactic features (UniDive, Lenka Krippnerová):
+This block detects present tense forms in Slavic languages and saves their
+features as Phrase* attributes in MISC of their head word.
+"""
+
+import udapi.block.msf.phrase
+
class Present(udapi.block.msf.phrase.Phrase):
    """Detect present-tense verb phrases in Slavic languages.

    For every matched phrase the morphosyntactic features are saved as
    Phrase* attributes in MISC of the phrase's head word (via the inherited
    ``write_node_info``).  Four phrase shapes are handled, each branch ending
    with an early return:

    1. synthetic present tense of a finite VERB,
    2. present passive (passive ADJ participle + present-tense AUX),
    3. a bare participle used attributively (no aux, no copula),
    4. a copular clause with a present-tense copula.
    """

    def process_node(self,node):
        """Inspect one node and, if it heads a present-tense phrase, annotate it."""
        # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs
        # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified
        if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin' and node.feats['Aspect'] !='Perf':

            aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood)

            if not aux_forb:
                # reflexive clitics with deprel expl belong to the phrase
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    tense='Pres',
                    person=node.feats['Person'],
                    number=node.feats['Number'],
                    mood='Ind',
                    aspect=node.feats['Aspect'],
                    voice=self.get_voice(node,refl),
                    form='Fin',
                    polarity=self.get_polarity(phrase_nodes),
                    expl=self.get_expl_type(node,refl),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
            return

        # passive voice
        if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass':
            # 'hteti'/'htjeti' are excluded because they form the Serbian/Croatian future tense
            aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti']
            aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 'byl jsem poučen' in Czech)

            if aux and not aux_forb:
                phrase_nodes = [node] + aux
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                # person/number come from the auxiliary; gender/animacy from the participle
                auxVerb = aux[0]

                self.write_node_info(node,
                    tense='Pres',
                    person=auxVerb.feats['Person'],
                    number=auxVerb.feats['Number'],
                    mood='Ind',
                    aspect=node.feats['Aspect'],
                    form='Fin',
                    voice='Pass',
                    polarity=self.get_polarity(phrase_nodes),
                    ords=phrase_ords,
                    gender=node.feats['Gender'],
                    animacy=node.feats['Animacy'],
                    analytic=self.get_analytic_bool(node)
                )
            return

        # participles
        # in some languages, participles are used as attributes (they express case and degree)
        if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part':
            aux_forb = [x for x in node.children if x.udeprel == 'aux']
            cop = [x for x in node.children if x.udeprel == 'cop']

            # only a bare attributive participle; clauses with aux/cop are handled elsewhere
            if not aux_forb and not cop:
                refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

                phrase_nodes = [node] + refl
                neg = self.get_negative_particles(phrase_nodes)
                phrase_nodes += neg

                phrase_ords = [x.ord for x in phrase_nodes]
                phrase_ords.sort()

                self.write_node_info(node,
                    aspect=node.feats['Aspect'],
                    tense=node.feats['Tense'],
                    number=node.feats['Number'],
                    form='Part',
                    voice=self.get_voice(node, refl),
                    expl=self.get_expl_type(node, refl),
                    polarity=self.get_polarity(phrase_nodes),
                    analytic=self.get_analytic_bool(node),
                    ords=phrase_ords
                )
            return

        # copular clauses: the nominal/adjectival predicate is the head
        cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres']
        aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense

        if cop and not aux_forb:
            aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres']
            prep = [x for x in node.children if x.upos == 'ADP']
            refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl']

            phrase_nodes = [node] + cop + aux + prep + refl
            neg = self.get_negative_particles(phrase_nodes)
            phrase_nodes += neg

            # person/number/aspect are read from the copula, not from the predicate
            copVerb = cop[0]

            phrase_ords = [x.ord for x in phrase_nodes]
            phrase_ords.sort()

            self.write_node_info(node,
                aspect=copVerb.feats['Aspect'],
                tense='Pres',
                person=copVerb.feats['Person'],
                number=copVerb.feats['Number'],
                mood='Ind',
                form='Fin',
                voice=self.get_voice(copVerb, refl),
                expl=self.get_expl_type(node, refl),
                polarity=self.get_polarity(phrase_nodes),
                analytic=self.get_analytic_bool(node),
                ords=phrase_ords
            )
diff --git a/udapi/block/mwe/normalize.py b/udapi/block/mwe/normalize.py
new file mode 100644
index 00000000..e7ebf24f
--- /dev/null
+++ b/udapi/block/mwe/normalize.py
@@ -0,0 +1,68 @@
+"""Block that takes PARSEME-like annotation of multiword expressions from MISC
+ and normalizes it so that the type is always annotated at the first word of
+ the expression."""
+from udapi.core.block import Block
+import logging
+import re
+
class Normalize(Block):
    """Normalize PARSEME-style MWE annotation stored in MISC.

    The type of each multiword expression is re-attached so that it is always
    annotated at the first word (in word order) of the expression, regardless
    of where it was annotated in the input (syntactic head or any other word).
    """

    def collect_mwes(self, root):
        """Collect annotations of multiword expressions from MISC of the nodes.

        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns:
            A tuple (mwes, mwes_by_nodes): `mwes` maps each MWE id to a dict
            with its 'type' and the ordered list of member node ords ('nodes');
            `mwes_by_nodes` maps every node ord to a list of MWE ids.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions (';'-separated).
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s" % m)
                    continue
                # `mwe_type` instead of `type` to avoid shadowing the builtin.
                number, mwe_type = match.group(1), match.group(2)
                mwe = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    mwe['type'] = mwe_type
                mwe['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """Re-serialize the MWE annotation with the type at the first word.

        Collects annotations of multiword expressions from MISC of the nodes,
        then saves them back, making sure the NUMBER:TYPE form appears at the
        first word of each expression and bare NUMBER everywhere else.
        """
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        for n in root.descendants:
            # Erase the previous MWE annotations so we can start from scratch.
            n.misc['Mwe'] = ''
            # There may be multiple MWEs this node is a member of.
            annotations = []
            for m in mwes_by_nodes[n.ord]:
                if n.ord == mwes[m]['nodes'][0]:
                    annotations.append("%s:%s" % (m, mwes[m]['type']))
                else:
                    annotations.append(m)
            if annotations:
                n.misc['Mwe'] = ';'.join(annotations)
diff --git a/udapi/block/mwe/possessives.py b/udapi/block/mwe/possessives.py
new file mode 100644
index 00000000..0849a210
--- /dev/null
+++ b/udapi/block/mwe/possessives.py
@@ -0,0 +1,74 @@
+"""Block that takes PARSEME-like annotation of multiword expressions from MISC,
+ looks for dependent possessive pronouns and reports how they are treated."""
+from udapi.core.block import Block
+import logging
+import re
+
class Possessives(Block):
    """Survey possessive pronouns around PARSEME-style multiword expressions.

    For every MWE, prints whether the possessive pronouns found in the
    dependency subtree of the MWE are annotated as part of the MWE ('in'),
    outside it ('out'), or both ('both'), together with the expression
    (out-of-MWE words shown in parentheses).
    """

    def collect_mwes(self, root):
        """Collect annotations of multiword expressions from MISC of the nodes.

        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns:
            A tuple (mwes, mwes_by_nodes): `mwes` maps each MWE id to a dict
            with its 'type' and the ordered list of member node ords ('nodes');
            `mwes_by_nodes` maps every node ord to a list of MWE ids.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions (';'-separated).
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s" % m)
                    continue
                # `mwe_type` instead of `type` to avoid shadowing the builtin.
                number, mwe_type = match.group(1), match.group(2)
                mwe = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    mwe['type'] = mwe_type
                mwe['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """Collect the MWE annotations, then survey the possessive pronouns."""
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        nodes = root.descendants
        for m in mwes:
            mwenodes = [x for x in nodes if m in mwes_by_nodes[x.ord]]
            # MWE-internal roots: members whose parent lies outside the MWE.
            mweheads = [x for x in mwenodes if x.parent not in mwenodes]
            mwedescendantset = set()
            for x in mweheads:
                mwedescendantset = mwedescendantset.union(set(x.descendants))
            # sorted() already returns a list, so no extra list() is needed.
            mwedescendants = sorted(mwedescendantset)
            # Is there a possessive pronoun?
            possprons = [x for x in mwedescendants if x.upos == 'PRON' and x.feats['Poss'] == 'Yes']
            inpp = [x for x in possprons if m in mwes_by_nodes[x.ord]]
            outpp = [x for x in possprons if m not in mwes_by_nodes[x.ord]]
            observation = ''
            if inpp and outpp:
                observation = 'both'
            elif inpp:
                observation = 'in'
            elif outpp:
                observation = 'out'
            if observation:
                expression = ' '.join([x.form if m in mwes_by_nodes[x.ord] else '('+x.form+')' for x in mwedescendants])
                print(observation + ': ' + expression)
diff --git a/udapi/block/mwe/tosubdeprels.py b/udapi/block/mwe/tosubdeprels.py
new file mode 100644
index 00000000..3682c0c7
--- /dev/null
+++ b/udapi/block/mwe/tosubdeprels.py
@@ -0,0 +1,62 @@
+"""Block that takes PARSEME-like annotation of multiword expressions from MISC
+ and projects it to subtypes of dependency relation labels. The motivation is
+ that a parser could learn to predict the multiword expressions."""
+from udapi.core.block import Block
+import logging
+import re
+
class ToSubDeprels(Block):
    """Project PARSEME-style MWE annotation from MISC to deprel subtypes.

    The motivation is that a parser could learn to predict the multiword
    expressions: within each MWE, the lowercased type (dots removed) is added
    as a subtype to the deprel of every member whose parent is in the same MWE.
    """

    def collect_mwes(self, root):
        """Collect annotations of multiword expressions from MISC of the nodes.

        The expected annotation is in the style of Parseme (see
        https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
        the data from http://hdl.handle.net/11372/LRT-5124), except that there
        are only ten columns and the annotation from the eleventh column is
        copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

        Returns:
            A tuple (mwes, mwes_by_nodes): `mwes` maps each MWE id to a dict
            with its 'type' and the ordered list of member node ords ('nodes');
            `mwes_by_nodes` maps every node ord to a list of MWE ids.
        """
        mwes = {}  # for each mwe id, its type and list of node ids
        mwes_by_nodes = {}  # for each node id, a list of mwe ids
        for n in root.descendants:
            mwes_by_nodes[n.ord] = []
            miscmwe = n.misc['Mwe']
            if not miscmwe:
                continue
            # A node may belong to multiple multiword expressions (';'-separated).
            for m in miscmwe.split(';'):
                # Either it is NUMBER:TYPE, or just NUMBER.
                # Number identifies this MWE among all MWEs in the sentence.
                # Type is a main uppercase string (VID, LVC etc.), optionally
                # followed by a subtype ('LVC.cause').
                # See https://gitlab.com/parseme/corpora/-/wikis/home
                match = re.match(r"^([0-9]+)(?::([A-Za-z\.]+))?$", m)
                if not match:
                    logging.warning("Cannot parse Mwe=%s" % m)
                    continue
                # `mwe_type` instead of `type` to avoid shadowing the builtin.
                number, mwe_type = match.group(1), match.group(2)
                mwe = mwes.setdefault(number, {'nodes': [], 'type': ''})
                if mwe_type:
                    mwe['type'] = mwe_type
                mwe['nodes'].append(n.ord)
                mwes_by_nodes[n.ord].append(number)
        return (mwes, mwes_by_nodes)

    def process_tree(self, root):
        """Save the type of each MWE as a subtype of the deprels inside it."""
        (mwes, mwes_by_nodes) = self.collect_mwes(root)
        # Now we hopefully know the type of every multiword expression in the sentence.
        for n in root.descendants:
            for m in mwes_by_nodes[n.ord]:
                # Lowercase the type and drop the subtype dot, e.g. 'LVC.cause' -> 'lvccause'.
                # str.replace is simpler (and faster) than re.sub for a literal removal.
                mwe_type = mwes[m]['type'].lower().replace('.', '')
                # Add the MWE type to the DEPREL if the parent is also in the same MWE.
                if n.parent.ord > 0 and m in mwes_by_nodes[n.parent.ord]:
                    n.deprel += ':' + mwe_type
diff --git a/udapi/block/read/addbratann.py b/udapi/block/read/addbratann.py
new file mode 100644
index 00000000..4f5fc877
--- /dev/null
+++ b/udapi/block/read/addbratann.py
@@ -0,0 +1,230 @@
+"""Add Brat coreference annotation from *.ann files.
+
+So far, tested on French LitBank data only.
+
+T12 HIST 362 366 qui
+T13 HIST 349 362 une aventure
+R1431 Coreference Arg1:T12 Arg2:T13
+
+"""
+
+from udapi.core.block import Block
+from udapi.core.files import Files
+import logging
+from bisect import bisect_left
+import networkx as nx
+
+def _m(range_s, range_e, offset):
+ return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}"
+
class AddBratAnn(Block):
    """Merge Brat standoff coreference annotation (*.ann files) into the document.

    Mention lines (``T…``) carry a type, a character range and the surface form;
    relation lines (``R… Coreference Arg1:Tx Arg2:Ty``) link pairs of mentions.
    Mentions are mapped to Udapi words via the TokenRange MISC attribute
    (as pre-filled e.g. by UDPipe) and the link-based coreference is converted
    to CorefUD entities (clusters) by taking connected components of the links.
    So far, tested on French LitBank data only.
    """

    def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True,
                 coref_attr="R", no_type_value='_Unsorted_',
                 **kwargs):
        """Args:
        files: file names with the coreference annotations (*.ann)
        zone: zone of the trees where the annotation should be stored
        offset: what number to subtract from the character indices in the ann files
        detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset
        keep_mention_id: store the original Brat mention id in mention.other['mention_id'] (default=True)
        coref_attr: prefix of the lines with coreference relations (default="R")
        no_type_value: mention-type value meaning "no type annotated" (default='_Unsorted_')
        """
        super().__init__(**kwargs)
        self.zone = zone
        self.files = Files(filenames=files)
        self.offset = offset
        self.detect_bom = detect_bom
        self.keep_mention_id = keep_mention_id
        self.coref_attr = coref_attr
        self.no_type_value = no_type_value

    def process_document(self, document):

        # Read all the important info from the *.ann file.
        mentions, attrs, split_ante, clusters = {}, [], [], []
        ann_filehandle = self.files.next_filehandle()
        offset = self.offset
        if self.detect_bom:
            txt_filename = self.files.filename.replace("ann", "txt")
            with open(txt_filename, 'rb') as txt_fh:
                raw_bytes = txt_fh.read(3)
                if raw_bytes == b'\xef\xbb\xbf':
                    offset += 1

        for line in ann_filehandle:
            line = line.rstrip('\n')
            if "\t" not in line:
                logging.warning(f"Unexpected line without tabs: {line}")
            elif line.startswith("T"):
                # T13 HIST 349 362 une aventure
                try:
                    mention_id, type_and_range, form = line.split("\t")
                    # Usually range are two numbers, but can be more, e.g. type_and_range="Abstract 605 653;654 703"
                    # Let's take the first and last number only.
                    parts = type_and_range.split()
                    ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1])

                    # If form ends with spaces, remove them and adjust range_e.
                    stripped_form = form.rstrip(" ")
                    if form != stripped_form:
                        num_spaces = len(form) - len(stripped_form)
                        logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})")
                        form = stripped_form
                        range_e = range_e - num_spaces

                    mentions[mention_id] = [ne_type, range_s, range_e, form]
                    if self.keep_mention_id:
                        attrs.append(["mention_id", mention_id, mention_id])
                except Exception as e:
                    logging.warning(f"Unexpected mention line: {line}\n{e}")
            elif line.startswith(self.coref_attr):
                # R1431 Coreference Arg1:T12 Arg2:T13
                try:
                    cor_attr, mention_ids = line.rstrip().split("\t")
                    parts = mention_ids.split()
                    # Explicit check instead of assert (asserts are stripped under -O).
                    if parts[0] != "Coreference":
                        raise ValueError(f"expected 'Coreference', got '{parts[0]}'")
                except Exception as e:
                    logging.warning(f"Unexpected coref line: '{line}'\n{e}")
                    # BUGFIX: skip malformed lines. Previously the append below ran even
                    # after the warning, using `parts` from a previous line (bogus cluster)
                    # or crashing with NameError when the very first coref line was bad.
                    continue
                clusters.append([p.split(":")[1] for p in parts[1:]])
            elif line.startswith("#"):
                pass  # Let's ignore annotators' comments
            else:
                logging.warning(f"Unexpected line in {self.files.filename}:\n{line}")

        # Some Brat ann files use link-based representation, e.g.
        # R123 Coreference Arg1:T11 Arg2:T13
        # R124 Coreference Arg1:T12 Arg2:T14
        # R125 Coreference Arg1:T13 Arg2:T14
        # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity).
        # However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]]
        # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]]
        # Note that if creating entities for link, in their original order,
        # R123 and R125 would result in creating two entities and when hitting R125
        # we would need to merge them, i.e. delete one of them and move their mentions to the other.
        # This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components.
        coref_graph = nx.Graph()
        for mention_ids in clusters:
            coref_graph.add_node(mention_ids[0])
            for mention_id in mention_ids[1:]:
                coref_graph.add_node(mention_id)
                coref_graph.add_edge(mention_id, mention_ids[0])
        clusters = [list(component) for component in nx.connected_components(coref_graph)]

        # Create entity objects for non-singletons.
        entity_map = {}
        for mention_ids in clusters:
            # Pick the entity type: the first annotated (non-placeholder) mention type wins.
            etype, etype_index = None, 0
            for index, m_id in enumerate(mention_ids):
                if mentions[m_id][0] == self.no_type_value:
                    pass
                elif etype is None:
                    etype, etype_index = mentions[m_id][0], index
                elif etype != mentions[m_id][0]:
                    logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.")
            if etype is None:
                etype = "other"
            entity = document.create_coref_entity(etype=etype)
            for m_id in mention_ids:
                if m_id in entity_map:
                    logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}")
                else:
                    entity_map[m_id] = entity

        # Collect TokenRange (as pre-filled by UDPipe) for each token.
        # `starts` and `ends` are sorted in document order, enabling bisect lookups below.
        tokens, starts, ends = [], [], []
        for tree in document.trees:
            for token in tree.token_descendants:
                tokens.append(token)
                range_s, range_e = token.misc["TokenRange"].split(":")
                starts.append(int(range_s))
                ends.append(int(range_e))

        # Create mention objects.
        mention_map = {}
        for mention_id, mention_values in mentions.items():

            # Find Udapi tokens for each mention.
            ne_type, range_s, range_e, form = mention_values
            index_s = bisect_left(starts, range_s - offset)
            if starts[index_s] != range_s - offset and index_s > 0:
                index_s -= 1
            index_e = bisect_left(ends, range_e - offset)
            mtokens = tokens[index_s : index_e+1]
            token_s, token_e = tokens[index_s], tokens[index_e]

            # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries.
            # If the start token is a multi-word token (MWT),
            # we can still try to find the proper word within the MWT.
            ok_s, ok_e = True, True
            if starts[index_s] != range_s - offset:
                ok_s = False
                if token_s.is_mwt():
                    mtokens.pop(0)
                    first_form = form.split()[0]
                    new_start = ends[index_s]
                    # Walk the MWT words right-to-left until the mention start is covered.
                    for w in reversed(token_s.words):
                        mtokens = [w] + mtokens
                        new_start -= len(w.form)
                        if w.form == first_form or new_start < range_s - offset:
                            ok_s = True
                            break

            # similarly for the end token
            if ends[index_e] != range_e - offset:
                ok_e = False
                if token_e.is_mwt():
                    mtokens.pop()
                    last_form = form.split()[-1]
                    new_end = starts[index_e]
                    for w in token_e.words:
                        mtokens.append(w)
                        new_end += len(w.form)
                        if w.form == last_form or new_end > range_e - offset:
                            ok_e = True
                            break

            if not ok_s or not ok_e:
                logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})"
                                f" crosses token boundaries: {token_s.misc} ({token_s.form}) "
                                f".. {token_e.misc} ({token_e.form})")

            # Project tokens (including MWTs) to words and check forms match.
            words, udapi_form = [], ""
            for token in mtokens:
                words += token.words
                udapi_form += token.form
                if not token.no_space_after:
                    udapi_form += " "
            udapi_form = udapi_form.rstrip()
            if form != udapi_form:
                logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'")

            # Make sure all words of the mention are in the same sentence.
            root = words[0].root
            mwords = [words[0]]
            for word in words[1:]:
                if word.root is root:
                    mwords.append(word)
                else:
                    logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.")

            # Create entities for singletons.
            if mention_id not in entity_map:
                entity_map[mention_id] = document.create_coref_entity(etype=ne_type)

            # Create the Udapi mention object.
            mention = entity_map[mention_id].create_mention(words=mwords)
            mention_map[mention_id] = mention

        # Fill-in the additional mention attributes.
        for attr_name, mention_id, attr_value in attrs:
            if mention_id in mention_map:
                mention_map[mention_id].other[attr_name] = attr_value

        # Fill-in split antecedents.
        # NOTE(review): split_ante is never populated in this implementation,
        # so this loop is currently a no-op kept for future extensions.
        for arg1, arg2 in split_ante:
            if arg1 in entity_map and arg2 in entity_map:
                if entity_map[arg1] in entity_map[arg2].split_ante:
                    logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})")
                else:
                    entity_map[arg2].split_ante.append(entity_map[arg1])
            else:
                logging.warning(f"{arg1} or {arg2} not indexed in entity_map")
diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py
index 75c4ac7d..f676fbe7 100644
--- a/udapi/block/read/addsentences.py
+++ b/udapi/block/read/addsentences.py
@@ -2,7 +2,9 @@
from udapi.core.basereader import BaseReader
# pylint: disable=abstract-method
-# read_tree() does not need to be installed here
+# read_tree() does not need to be implemented here
+
+
class AddSentences(BaseReader):
"""A reader for adding plain-text sentences (one sentence per line) files.
@@ -12,8 +14,17 @@ class AddSentences(BaseReader):
`cat in.conllu | udapy -s read.Conllu read.AddSentences files=in.txt > merged.conllu`
"""
- def __init__(self, zone='', **kwargs):
+ def __init__(self, zone='', into='text', **kwargs):
+ """Args:
+ into: name of the comment-attribute where the sentence should be stored. Default = text.
+ That is the sentence is stored in `root.text` and in CoNLL-U it will look like e.g.
+ `# text = John loves Mary.`
+ Any other name than "text" is stored to `root.comment`, so e.g. `into=english_text`
+ will result in a CoNLL-U with a comment line:
+ `# english_text = John loves Mary.`
+ """
super().__init__(zone=zone, **kwargs)
+ self.into = into
@staticmethod
def is_multizone_reader():
@@ -34,7 +45,10 @@ def process_document(self, document):
for bundle in document.bundles:
line = self.filehandle.readline()
if line == '':
- raise IOError('File does not have enoush lines')
+ raise IOError('File does not have enough lines')
root = bundle.get_tree(zone=self.zone)
- root.text = line.rstrip()
+ if self.into == 'text':
+ root.text = line.rstrip()
+ else:
+ root.comment += ' ' + self.into + " = " + line.rstrip() + "\n"
self.finished = not self.files.has_next_file()
diff --git a/udapi/block/read/addtext.py b/udapi/block/read/addtext.py
new file mode 100644
index 00000000..4d0b7771
--- /dev/null
+++ b/udapi/block/read/addtext.py
@@ -0,0 +1,59 @@
+"""read.AddText is a reader for adding word-wrapped plain-text to existing trees."""
+from udapi.core.basereader import BaseReader
+from udapi.core.root import Root
+import logging
+
class AddText(BaseReader):
    r"""A reader for plain-text files to be stored to existing trees.

    For example LitBank conll files are segmented to sentences and tokenized,
    but the SpacesAfter attributes are missing. We need to load the original
    (raw) texts, which are not tokenized and not segmented, only word-wrapped
    (to 70 characters per line).

    The reader walks a character cursor through the raw text, matching each
    token form in order; whitespace between tokens determines SpaceAfter
    and (optionally) paragraph breaks.

    Args:
        add_newpar: add newpar CoNLL-U annotations on empty lines (and the beginning of file)
    """
    def __init__(self, zone='', add_newpar=True, **kwargs):
        super().__init__(zone=zone, **kwargs)
        self.add_newpar = add_newpar

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?.

        This implementation returns always False.
        """
        return False

    def process_document(self, document):
        filehandle = self.next_filehandle()
        if filehandle is None:
            self.finished = True
            return
        # NOTE(review): the local `filehandle` is only used for the None check;
        # the text is read from self.filehandle — presumably next_filehandle()
        # also stores the handle there. Confirm against BaseReader.
        text = ''.join(self.filehandle.readlines())
        # i: character cursor into text; end: index of the last character;
        # was_newpar: whether a paragraph break precedes the next token
        # (the beginning of the file counts as one).
        i, end, was_newpar = 0, len(text)-1, True
        # Skip leading whitespace before the first token.
        while i <= end and text[i].isspace():
            i += 1

        for bundle in document.bundles:
            root = bundle.get_tree(zone=self.zone)
            if self.add_newpar and was_newpar:
                root.newpar = True
            was_newpar = False
            for node in root.token_descendants:
                # The raw text at the cursor must match the token form exactly.
                if text[i:i+len(node.form)] == node.form:
                    i += len(node.form)
                    if i > end or text[i].isspace():
                        # Whitespace follows: default SpaceAfter, check for a blank
                        # line (two consecutive newlines) marking a new paragraph.
                        del node.misc['SpaceAfter']
                        was_newpar = i+1 < end and text[i+1] == '\n' and text[i] == '\n'
                        while i <= end and text[i].isspace():
                            i += 1
                    else:
                        node.misc['SpaceAfter'] = 'No'
                        was_newpar = False
                else:
                    # Give up on the whole document on the first mismatch.
                    logging.warning('Node %s does not match text "%s"', node, text[i:i+20])
                    return
            # Re-derive the sentence text from the now-known spacing.
            root.text = root.compute_text()
        self.finished = not self.files.has_next_file()
diff --git a/udapi/block/read/ccv.py b/udapi/block/read/ccv.py
new file mode 100644
index 00000000..eb449362
--- /dev/null
+++ b/udapi/block/read/ccv.py
@@ -0,0 +1,78 @@
+"""Ccv class is a reader for Corpus of Czech Verse json files."""
+from udapi.core.basereader import BaseReader
+from udapi.core.root import Root
+from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText
+import json
+
class Ccv(BaseReader):
    r"""A reader for Corpus of Czech Verse json files.

    See https://github.com/versotym/corpusCzechVerse
    Each verse (line) is stored as one tree (although it is quite often not a whole sentence).
    Start of each stanza is marked with `newpar`.
    Start of each poem is marked with `newdoc = [poem_id]`.

    Args:
        tokenize: create nodes (from the "words" and "punct" annotations; default=True)
    """
    def __init__(self, tokenize=True, **kwargs):
        self.tokenize = tokenize
        # Trees already parsed from the current file but not yet handed out
        # by read_tree (stored in reversed order so pop() yields them in order).
        self._cache = None
        super().__init__(**kwargs)

    @staticmethod
    def is_multizone_reader():
        """Can this reader read bundles which contain more zones?.

        This implementation returns always False.
        """
        return False

    def read_tree(self):
        """Return the next tree, parsing the whole json file on first call."""
        if self._cache:
            return self._cache.pop()
        else:
            trees = self.read_trees()
            if not trees:
                return None
            # Keep all but the first tree cached; pop() returns them in document order.
            self._cache = list(reversed(trees[1:]))
            return trees[0]

    def read_trees(self):
        """Parse the whole json file and return a list of trees (one per verse)."""
        if self.filehandle is None:
            return None
        poems = json.load(self.filehandle)
        all_trees = []
        for poem in poems:
            poem_trees = []
            for stanza in poem["body"]:
                stanza_trees = []
                for line in stanza:
                    root = Root()
                    root.text = line["text"]
                    # Verse-level annotations are kept in the tree's json attribute.
                    root.json["rhyme"] = line["rhyme"]
                    root.json["metre"] = line["metre"]
                    root.json["stress"] = line["stress"]
                    stanza_trees.append(root)
                    if self.tokenize:
                        # "punct" maps a 0-based word index to the punctuation tokens
                        # preceding the word at that index; slot 0 holds leading punctuation.
                        words = [[]] + [[w] for w in line["words"]]
                        for index, puncts in line["punct"].items():
                            for punct in puncts:
                                words[int(index)].append({"token": punct, "lemma": punct})
                        for word in words:
                            for w in word:
                                node = root.create_child(form=w["token"], lemma=w["lemma"])
                                # Punctuation entries have no "morph" annotation.
                                if "morph" in w:
                                    node.xpos = w["morph"]
                                    node.misc["xsampa"] = w["xsampa"]
                                    node.misc["phoebe"] = w["phoebe"]
                        # NOTE(review): called unbound with None as self — works because
                        # process_tree does not use self; confirm it stays that way.
                        SetSpaceAfterFromText.process_tree(None, root)
                stanza_trees[0].newpar = True
                poem_trees.extend(stanza_trees)
            root = poem_trees[0]
            root.newdoc = poem["poem_id"]
            root.json["p_author"] = poem["p_author"]
            root.json["b_author"] = poem["b_author"]
            root.json["biblio"] = poem["biblio"]
            all_trees.extend(poem_trees)
        return all_trees
diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py
new file mode 100644
index 00000000..d0aef1ee
--- /dev/null
+++ b/udapi/block/read/conll.py
@@ -0,0 +1,162 @@
+""""Conll is a reader block for CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009)."""
+import json
+import logging
+import re
+
+import udapi.block.read.conllu
+from udapi.core.root import Root
+from udapi.core.node import Node
+
+
class Conll(udapi.block.read.conllu.Conllu):
    """A reader of CoNLL-like files (CoNLL-U, CoNLL-X, CoNLL-2009, ...)."""

    def __init__(self, separator='tab',
                 attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs):
        """Create the Conll reader object.

        This is a subclass of udapi.block.read.conllu.Conllu,
        which adds a support for arbitrary column names and thus supporting not only CoNLL-U,
        but also CoNLL-X, CoNLL-2009 and many other CoNLL-like formats.

        Args:
            separator: How are the columns separated?
                Default='tab' is the only possibility in valid CoNLL-U files.
                'space' means one or more whitespaces (this does not allow forms with space).
                'doublespace' means two or more spaces.
            attributes: comma-separated list of column names in the input files
                (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc')
                Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U).
                For ignoring a column, use "_" as its name.
                Column "ord" marks the column with 1-based word-order number/index (usually called ID).
                Column "head" marks the column with dependency parent index (word-order number).

                For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use
                `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_`
                but note that attributes upos, feats and deprel will contain language-specific values,
                not valid according to UD guidelines and a further conversion will be needed.
                You will lose the projective_HEAD and projective_DEPREL attributes.

                For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`.
                You will lose the predicted_* attributes and semantic/predicate annotation.

        TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]`
        for feats which do not use the name1=value1|name2=value2 format.
        """
        super().__init__(**kwargs)
        self.node_attributes = attributes.split(',')
        self.separator = separator

    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    # Maybe the code could be refactored, but it is speed-critical,
    # so benchmarking is needed because calling extra methods may result in slowdown.

    def parse_node_line(self, line, root, nodes, parents, mwts):
        """Parse one node line and append the created node to `nodes` (parents to `parents`).

        Multi-word token lines are collected into `mwts` for later processing.
        """
        if self.separator == 'tab':
            fields = line.split('\t')
        elif self.separator == 'space':
            fields = line.split()
        elif self.separator == 'doublespace':
            fields = re.split('  +', line)
        else:
            raise ValueError('separator=%s is not valid' % self.separator)
        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad missing columns with '_'; extra columns are silently ignored.
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
        # multi-word tokens will be processed later
        if '-' in fields[0]:
            mwts.append(fields)
            return
        if '.' in fields[0]:
            # Empty (abstract) node, e.g. ID 3.1.
            # NOTE(review): the column indices 1-5 and 8-9 are hard-coded here,
            # i.e. empty nodes assume the default CoNLL-U column layout
            # regardless of the `attributes` parameter — confirm intended.
            empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                            xpos=fields[4], feats=fields[5], misc=fields[9])
            empty.ord = fields[0]
            empty.raw_deps = fields[8]  # TODO
            return

        # This implementation is slower than in read.Conllu,
        # but it allows for arbitrary columns
        node = root.create_child()
        for (n_attribute, attribute_name) in enumerate(self.node_attributes):
            value = fields[n_attribute]
            if attribute_name == 'head':
                try:
                    parents.append(int(value))
                except ValueError as exception:
                    if not self.strict and value == '_':
                        if self.empty_parent == 'warn':
                            logging.warning("Empty parent/head index in '%s'", line)
                        parents.append(0)
                    else:
                        raise exception
            elif attribute_name == 'ord':
                if int(value) != node._ord:
                    raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}")
            elif attribute_name == 'deps':
                setattr(node, 'raw_deps', value)
            elif attribute_name != '_' and value != '_':
                setattr(node, attribute_name, value)

        nodes.append(node)

    # Acknowledged code duplication with read.Conllu
    def read_tree_from_lines(self, lines):
        """Build one tree from the lines of a single sentence block."""
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes, parents, mwts)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        for node_ord, node in enumerate(nodes[1:], 1):
            try:
                parent = nodes[parents[node_ord]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
            if node is parent:
                if self.fix_cycles:
                    logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node)
                    parent = root
                else:
                    raise ValueError(f"Detected a cycle: {node} attached to itself")
            elif node._children:
                # A longer cycle is only possible when `node` already has children;
                # climb from the candidate parent towards the root to detect it.
                climbing = parent._parent
                while climbing:
                    if climbing is node:
                        if self.fix_cycles:
                            logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent)
                            parent = root
                            break
                        else:
                            raise ValueError(f"Detected a cycle: {node}")
                    climbing = climbing._parent
            node._parent = parent
            parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root
diff --git a/udapi/block/read/conll2012.py b/udapi/block/read/conll2012.py
new file mode 100644
index 00000000..2adbd00f
--- /dev/null
+++ b/udapi/block/read/conll2012.py
@@ -0,0 +1,153 @@
+"""Conll2012 is a reader block for the coreference in CoNLL-2012 format.
+
+This implementation was tested on the LitBank files only
+(and quickly on Portuguese Corref-PT and Summ-it++v2), so far.
+LitBank does not use most of the columns, so the implementation
+should be improved to handle other types of CoNLL-2012 files.
+"""
+import json
+import logging
+import re
+
+import udapi.block.read.conllu
+from udapi.core.root import Root
+from udapi.core.node import Node
+
+RE_BEGIN = re.compile(r'^#begin document ([^ ]+)')
+
+class Conll2012(udapi.block.read.conllu.Conllu):
+ """A reader of the Conll2012 files."""
+
+ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs):
+ """Create the Conll2012 reader object.
+
+ Args:
+ attributes: comma-separated list of column names in the input files
+ (default='docname,_,ord,form,_,_,_,_,_,_,_,_,coref' suitable for LitBank)
+ For ignoring a column, use "_" as its name.
+ Column "ord" marks the column with 0-based (unlike in CoNLL-U, which uses 1-based)
+            word-order number/index (usually called ID).
+ For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'.
+ For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'.
+ For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'.
+ emptyval: a symbol that represents an empty value, especially in the coref column
+ (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2)
+ For FantasyCoref, use emptyval='-'.
+ """
+ super().__init__(**kwargs)
+ self.node_attributes = attributes.split(',')
+ self._docname = 'd'
+ self.emptyval = emptyval
+
+ def parse_comment_line(self, line, root):
+ if line.startswith("#end document"):
+ return
+ match = RE_BEGIN.match(line)
+ if match:
+ docname = match.group(1)
+ # LitBank and FantasyCoref use e.g.
+ # #begin document (1023_bleak_house_brat); part 0
+ if docname.startswith('(') and docname.endswith(');'):
+ docname = docname[1:-2]
+ # Summ-it++v2 uses e.g.
+ # #begin document /home/andre/Recursos-fontes/Summit/Summ-it_v3.0/corpusAnotado_CCR/CIENCIA_2002_22010/CIENCIA_2002_22010.txt
+ elif docname.startswith('/home/'):
+ docname = docname.split('/')[-1]
+ # Corref-PT-SemEval uses e.g.
+ # #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml
+ docname = docname.replace('.txt', '').replace('.xml', '')
+ # FantasyCoref may use parentheses within the document ID e.g.
+ # #begin document (051_Fundevogel_(Bird-foundling)); part 000
+ docname = docname.replace('(', '').replace(')', '')
+
+ root.newdoc = docname
+ self._global_entity = 'eid-etype-head-other'
+ root.comment += '$GLOBAL.ENTITY\n'
+ self._docname = docname
+ else:
+ logging.warning(f"Unexpected comment line: {line}")
+
+ def parse_node_line(self, line, root, nodes):
+ fields = line.split('\t')
+ if len(fields) != len(self.node_attributes):
+ if self.strict:
+ raise RuntimeError('Wrong number of columns in %r' % line)
+ fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
+
+ # This implementation is slower than in read.Conllu,
+ # but it allows for arbitrary columns
+ node = root.create_child()
+ for (n_attribute, attribute_name) in enumerate(self.node_attributes):
+ value = fields[n_attribute]
+ if attribute_name == 'docname':
+ # FantasyCoref may use parentheses within the document ID
+ value = value.replace('(', '').replace(')', '')
+ if value != self._docname:
+ logging.warning(f"Document name mismatch {value} != {self._docname}")
+
+ # convert the zero-based index to one-based
+ # but Corref-PT uses a mix of one-based and zero-based
+ elif attribute_name == 'ord':
+ #setattr(node, 'ord', int(value) + 1)
+ if node.ord not in(int(value) + 1, int(value)):
+ logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}")
+
+ elif attribute_name == 'coref':
+ if value and value != self.emptyval:
+ # LitBank always separates chunks by a vertical bar, e.g. (13)|10)
+ # Summ-it++v2 does not, e.g. (13)10)
+ if '|' in value:
+ chunks = value.split("|")
+ else:
+ chunks = [x for x in re.split(r'(\([^()]+\)?|[^()]+\))', value) if x]
+ modified_entities = []
+ escaped_docname = self._docname.replace("-", "")
+ for entity in chunks:
+ entity_num = entity.replace("(", "").replace(")","")
+ modified_entity = f"{escaped_docname}_e{entity_num}--1"
+ if entity.startswith("(") and entity.endswith(")"):
+ modified_entity = "(" + modified_entity + ")"
+ elif entity.startswith("("):
+ modified_entity = "(" + modified_entity
+ elif entity.endswith(")"):
+ modified_entity = f"{escaped_docname}_e{entity_num}" + ")"
+
+ # to avoid parentheses clashes, put the entities with ")" first
+ if modified_entity.startswith("("):
+ modified_entities.append(modified_entity)
+ else:
+ modified_entities.insert(0, modified_entity)
+ node.misc['Entity'] = ''.join(modified_entities)
+
+ elif attribute_name == 'form' or (attribute_name != '_' and value != '_'):
+ setattr(node, attribute_name, value)
+ nodes.append(node)
+
+ def read_tree_from_lines(self, lines):
+ root = Root()
+ nodes = [root]
+ for line in lines:
+ if line == '':
+ pass
+ elif line[0] == '#':
+ self.parse_comment_line(line, root)
+ else:
+ self.parse_node_line(line, root, nodes)
+
+ # If no nodes were read from the filehandle (so only root remained in nodes),
+ # we return None as a sign of failure (end of file or more than one empty line).
+ if len(nodes) == 1:
+ return None
+
+ return root
+
+ def read_trees(self):
+ if self.max_docs:
+ raise NotImplementedError("TODO implement max_docs in read.Conll2012")
+ # Corref-PT does not put an empty line before #end document,
+ # so we need to split both on #end document and empty lines.
+ return [self.read_tree_from_lines(s.split('\n')) for s in
+ re.split(r'\n\n+|\n#end document\n', self.filehandle.read()) if s]
+
+ def read_tree(self):
+ raise NotImplementedError("TODO implement read_tree in read.Conll2012")
diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py
index 8c80a779..e19cd676 100644
--- a/udapi/block/read/conllu.py
+++ b/udapi/block/read/conllu.py
@@ -1,66 +1,51 @@
""""Conllu is a reader block for the CoNLL-U files."""
+import json
import logging
import re
from udapi.core.basereader import BaseReader
from udapi.core.root import Root
+from udapi.core.node import Node
# Compile a set of regular expressions that will be searched over the lines.
# The equal sign after sent_id was added to the specification in UD v2.0.
# This reader accepts also older-style sent_id (until UD v2.0 treebanks are released).
RE_SENT_ID = re.compile(r'^# sent_id\s*=?\s*(\S+)')
-RE_TEXT = re.compile(r'^# text\s*=\s*(.+)')
-RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?')
+RE_TEXT = re.compile(r'^# text\s*=\s*(.*)')
+RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc)(?:\s+id\s*=\s*(.+))?$')
+RE_JSON = re.compile(r'^# (doc_)?json_([^ =]+)\s*=\s*(.+)')
+RE_GLOBAL_ENTITY = re.compile(r'^# global.Entity\s*=\s*(\S+)')
+
class Conllu(BaseReader):
"""A reader of the CoNLL-U files."""
- def __init__(self, strict=False, separator='tab',
- attributes='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc', **kwargs):
+ def __init__(self, strict=False, empty_parent='warn', fix_cycles=False, **kwargs):
"""Create the Conllu reader object.
Args:
strict: raise an exception if errors found (default=False, i.e. a robust mode)
- separator: How are the columns separated?
- Default='tab' is the only possibility in valid CoNLL-U files.
- 'space' means one or more whitespaces (this does not allow forms with space).
- 'doublespace' means two or more spaces.
- attributes: comma-separated list of column names in the input files
- (default='ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc')
- Changing the default can be used for loading CoNLL-like formats (not valid CoNLL-U).
- For ignoring a column, use "_" as its name.
- Column "ord" marks the column with 1-based word-order number/index (usualy called ID).
- Column "head" marks the column with dependency parent index (word-order number).
-
- For example, for CoNLL-X which uses name1=value1|name2=value2 format of FEATS, use
- `attributes=ord,form,lemma,upos,xpos,feats,head,deprel,_,_`
- but note attributes that upos, feats and deprel will contain language-specific values,
- not valid according to UD guidelines and a further conversion will be needed.
- You will loose the projective_HEAD and projective_DEPREL attributes.
-
- For CoNLL-2009 you can use `attributes=ord,form,lemma,_,upos,_,feats,_,head,_,deprel`.
- You will loose the predicted_* attributes and semantic/predicate annotation.
-
- TODO: allow storing the rest of columns in misc, e.g. `node.misc[feats]`
- for feats which do not use the name1=value1|name2=value2 format.
+ empty_parent: What to do if HEAD is _? Default=warn: issue a warning and attach to the root
+ or if strict=1 issue an exception. With `empty_parent=ignore` no warning is issued.
+ fix_cycles: fix cycles by attaching a node in the cycle to the root; fix also HEAD index out of range
"""
super().__init__(**kwargs)
- self.node_attributes = attributes.split(',')
self.strict = strict
- self.separator = separator
-
+ self.empty_parent = empty_parent
+ self.fix_cycles = fix_cycles
- @staticmethod
- def parse_comment_line(line, root):
+ def parse_comment_line(self, line, root):
"""Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root."""
sent_id_match = RE_SENT_ID.match(line)
if sent_id_match is not None:
root.sent_id = sent_id_match.group(1)
+ root.comment += '$SENT_ID\n'
return
text_match = RE_TEXT.match(line)
if text_match is not None:
root.text = text_match.group(1)
+ root.comment += '$TEXT\n'
return
pardoc_match = RE_NEWPARDOC.match(line)
@@ -68,42 +53,89 @@ def parse_comment_line(line, root):
value = True if pardoc_match.group(2) is None else pardoc_match.group(2)
if pardoc_match.group(1) == 'newpar':
root.newpar = value
+ root.comment += '$NEWPAR\n'
else:
root.newdoc = value
+ root.comment += '$NEWDOC\n'
return
- root.comment = root.comment + line[1:] + "\n"
+ json_match = RE_JSON.match(line)
+ if json_match is not None:
+ container = root.json
+ if json_match.group(1) == 'doc_':
+ if '__doc__' not in root.json:
+ root.json['__doc__'] = {}
+ container = root.json['__doc__']
+ container[json_match.group(2)] = json.loads(json_match.group(3))
+ return
- # pylint: disable=too-many-locals,too-many-branches,too-many-statements
- # Maybe the code could be refactored, but it is speed-critical,
- # so benchmarking is needed because calling extra methods may result in slowdown.
- def read_tree(self, document=None):
+ entity_match = RE_GLOBAL_ENTITY.match(line)
+ if entity_match is not None:
+ global_entity = entity_match.group(1)
+ if self._global_entity and self._global_entity != global_entity:
+ logging.warning(f"Mismatch in global.Entity: {self._global_entity} != {global_entity}")
+ self._global_entity = global_entity
+ root.comment += '$GLOBAL.ENTITY\n'
+ return
+
+ root.comment += line[1:] + "\n"
+
+ def read_trees(self):
+ if not self.max_docs:
+ # Valid CoNLL-U files must have sentences separated by a single empty line.
+ # However, some users have to work with invalid files e.g. ending with two empty lines.
+ # It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow
+        # as s.split('\n\n') and this time is negligible
+ # relative to the main CoNLL-U parsing in read_tree_from_lines().
+ return [self.read_tree_from_lines(s.split('\n')) for s in
+ re.split(r'\n\n+', self.filehandle.read()) if s]
+ # udapi.core.basereader takes care about the max_docs parameter.
+ # However, we can make the loading much faster by not reading
+ # the whole file if the user wants just first N documents.
+ trees, lines, loaded_docs = [], [], 0
+ for line in self.filehandle:
+ line = line.rstrip()
+ if line == '':
+ tree = self.read_tree_from_lines(lines)
+ lines = []
+ if tree.newdoc:
+ if loaded_docs == self.max_docs:
+ return trees
+ loaded_docs += 1
+ if tree:
+ trees.append(tree)
+ else:
+ lines.append(line)
+ return trees
+
+ def read_tree(self):
if self.filehandle is None:
return None
+ lines = []
+ for line in self.filehandle:
+ line = line.rstrip()
+ if line == '':
+ break
+ lines.append(line)
+ return self.read_tree_from_lines(lines)
+ # pylint: disable=too-many-locals,too-many-branches,too-many-statements
+ # Maybe the code could be refactored, but it is speed-critical,
+ # so benchmarking is needed because calling extra methods may result in slowdown.
+ def read_tree_from_lines(self, lines):
root = Root()
nodes = [root]
parents = [0]
mwts = []
- for line in self.filehandle:
- line = line.rstrip()
- if line == '':
- break
+ for line in lines:
if line[0] == '#':
self.parse_comment_line(line, root)
else:
- if self.separator == 'tab':
- fields = line.split('\t')
- elif self.separator == 'space':
- fields = line.split()
- elif self.separator == 'doublespace':
- fields = re.split(' +', line)
- else:
- raise ValueError('separator=%s is not valid' % self.separator)
- if len(fields) != len(self.node_attributes):
+ fields = line.split('\t')
+ if len(fields) != 10:
if self.strict:
raise RuntimeError('Wrong number of columns in %r' % line)
- fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
+ fields.extend(['_'] * (10 - len(fields)))
# multi-word tokens will be processed later
if '-' in fields[0]:
mwts.append(fields)
@@ -112,27 +144,32 @@ def read_tree(self, document=None):
empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
xpos=fields[4], feats=fields[5], misc=fields[9])
empty.ord = fields[0]
- empty.raw_deps = fields[8] # TODO
+ empty.raw_deps = fields[8] # TODO
continue
- node = root.create_child()
-
- # TODO slow implementation of speed-critical loading
- for (n_attribute, attribute_name) in enumerate(self.node_attributes):
- if attribute_name == 'head':
- try:
- parents.append(int(fields[n_attribute]))
- except ValueError as exception:
- if not self.strict and fields[n_attribute] == '_':
- logging.warning("Empty parent/head index in '%s'", line)
- else:
- raise exception
- elif attribute_name == 'ord':
- setattr(node, 'ord', int(fields[n_attribute]))
- elif attribute_name == 'deps':
- setattr(node, 'raw_deps', fields[n_attribute])
- elif attribute_name != '_':
- setattr(node, attribute_name, fields[n_attribute])
+ if fields[3] == '_':
+ fields[3] = None
+ if fields[4] == '_':
+ fields[4] = None
+ if fields[7] == '_':
+ fields[7] = None
+
+ # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc
+ node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3],
+ xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9])
+ root._descendants.append(node)
+ node._ord = int(fields[0])
+ if fields[8] != '_':
+ node.raw_deps = fields[8]
+ try:
+ parents.append(int(fields[6]))
+ except ValueError as exception:
+ if not self.strict and fields[6] == '_':
+ if self.empty_parent == 'warn':
+ logging.warning("Empty parent/head index in '%s'", line)
+ parents.append(0)
+ else:
+ raise exception
nodes.append(node)
@@ -146,24 +183,49 @@ def read_tree(self, document=None):
# they need to create one artificial node and mark it with Empty=Yes.
# In that case, we will delete this node, so the tree will have just the (technical) root.
# See also udapi.block.write.Conllu, which is compatible with this trick.
- if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
+ if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
nodes.pop()
+ root._children = []
+ root._descendants = []
# Set dependency parents (now, all nodes of the tree are created).
- # TODO: parent setter checks for cycles, but this is something like O(n*log n)
- # if done for each node. It could be done faster if the whole tree is checked at once.
- # Also parent setter removes the node from its old parent's list of children,
- # this could be skipped here by not using `node = root.create_child()`.
for node_ord, node in enumerate(nodes[1:], 1):
try:
- node.parent = nodes[parents[node_ord]]
+ parent = nodes[parents[node_ord]]
except IndexError:
- raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
+ if self.fix_cycles:
+ logging.warning(f"Ignoring out-of-range HEAD (attaching to the root instead): {node} HEAD={parents[node_ord]}")
+ parent = root
+ else:
+ raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
+ if node is parent:
+ if self.fix_cycles:
+ logging.warning("Ignoring a self-cycle (attaching to the root instead):\n%s", node)
+ parent = root
+ else:
+ raise ValueError(f"Detected a cycle: {node} attached to itself")
+ elif node._children:
+ climbing = parent._parent
+ while climbing:
+ if climbing is node:
+ if self.fix_cycles:
+ logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent)
+ parent = root
+ break
+ else:
+ raise ValueError(f"Detected a cycle: {node}")
+ climbing = climbing._parent
+ node._parent = parent
+ parent._children.append(node)
# Create multi-word tokens.
for fields in mwts:
- range_start, range_end = fields[0].split('-')
- words = nodes[int(range_start):int(range_end)+1]
- root.create_multiword_token(words, form=fields[1], misc=fields[-1])
+ try:
+ range_start, range_end = fields[0].split('-')
+ except ValueError:
+ logging.warning(f"Wrong MWT range in\n{fields[0]}\n\n{lines}")
+ raise
+ words = nodes[int(range_start):int(range_end) + 1]
+ root.create_multiword_token(words, form=fields[1], feats=fields[5], misc=fields[9])
return root
diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py
new file mode 100644
index 00000000..16d83d07
--- /dev/null
+++ b/udapi/block/read/conllup.py
@@ -0,0 +1,107 @@
+"""Conllup is a reader block for the CoNLL-UPlus format.
+
+Columns which don't have standardized attributes in Udapi/CoNLL-U
+are stored in MISC (as key=value pairs).
+
+This code has been only tested on Hungarian KorKor files for CorefUD so far.
+However, in the end, it is not used there (xtsv files are used instead of conllup).
+"""
+import logging
+import re
+
+import udapi.block.read.conll
+from udapi.core.root import Root
+from udapi.core.node import Node
+
+RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)')
+COLUMN_MAP = {
+ 'ID': 'ord',
+}
+NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split()
+
+class Conllup(udapi.block.read.conll.Conll):
+ """A reader of the CoNLL-UPlus files."""
+
+ def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs):
+ """Create the Conllup reader object.
+
+ Args:
+ attributes: comma-separated list of column names in the input files
+            (can be used if the global.columns header is missing or needs to be overridden).
+ Default='autodetect' which means the column names will be loaded from the global.columns header.
+ For ignoring a column, use "_" as its name.
+ save_global_columns: keep the "global.columns" header in root.comments. Default=False.
+ Note that when saving the output to CoNLL-U, the comment is not needed
+ and it may be even misleading. It could be helpful only once write.Conllup is implemented
+ (with the possibility to use the same columns as in the input file).
+ """
+ super().__init__(**kwargs)
+ self.save_global_columns = save_global_columns
+ if attributes == 'autodetect':
+ self.node_attributes = None
+ else:
+ self.node_attributes = attributes.split(',')
+
+ def parse_comment_line(self, line, root):
+ if self.node_attributes is None:
+ global_columns_match = RE_GLOBAL_COLUMNS.match(line)
+ if global_columns_match is None:
+ return super().parse_comment_line(line, root)
+ global_columns = global_columns_match.group(1)
+ self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")]
+ if self.save_global_columns:
+ root.comment += line[1:] + '\n'
+ return
+ return super().parse_comment_line(line, root)
+
+ def parse_node_line(self, line, root, nodes, parents, mwts):
+ fields = line.split('\t')
+ if len(fields) != len(self.node_attributes):
+ if self.strict:
+ raise RuntimeError('Wrong number of columns in %r' % line)
+ fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
+
+ # multi-word tokens will be processed later
+ if '-' in fields[0]:
+ mwts.append(fields)
+ return
+ if '.' in fields[0]:
+ raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup")
+
+ # This implementation is slower than in read.Conllu,
+ # but it allows for arbitrary columns
+ node = root.create_child()
+ nonstandard_attrs = []
+ for (n_attribute, attribute_name) in enumerate(self.node_attributes):
+ value = fields[n_attribute]
+ if attribute_name == 'head':
+ if value == '???':
+ value = 0
+ try:
+ parents.append(int(value))
+ except ValueError as exception:
+ if not self.strict and value == '_':
+ if self.empty_parent == 'warn':
+ logging.warning("Empty parent/head index in '%s'", line)
+ parents.append(0)
+ else:
+ raise exception
+ elif attribute_name == 'ord':
+ if int(value) != node._ord:
+ raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}")
+ elif attribute_name == 'deps':
+ setattr(node, 'raw_deps', value)
+ elif value == '_' and attribute_name != 'form':
+ pass
+ elif attribute_name == '_':
+ pass
+ elif attribute_name in NORMAL_ATTRS:
+ setattr(node, attribute_name, value)
+ else:
+ nonstandard_attrs.append([attribute_name, value])
+
+ # This needs to be done after node.misc is created (if "misc" in node.attributes)
+ for attribute_name, value in nonstandard_attrs:
+ node.misc[attribute_name.capitalize()] = value
+
+ nodes.append(node)
diff --git a/udapi/block/read/oldcorefud.py b/udapi/block/read/oldcorefud.py
new file mode 100644
index 00000000..73e05f3b
--- /dev/null
+++ b/udapi/block/read/oldcorefud.py
@@ -0,0 +1,119 @@
+"""Reader for CoNLL-U files with the old CorefUD 0.1 style of coreference annotation."""
+import re
+import logging
+import udapi.block.read.conllu
+from udapi.core.coref import CorefEntity, CorefMention, BridgingLinks
+
+class OldCorefUD(udapi.block.read.conllu.Conllu):
+
+ def __init__(self, replace_hyphen_in_id_with='', **kwargs):
+ """Create the read.OldCorefUD reader object.
+
+ Args:
+            replace_hyphen_in_id_with: string to use as a replacement for hyphens in ClusterId
+                The new format does not allow hyphens in eid (IDs of coreference entities),
+ so we need to replace them.
+ """
+ super().__init__(**kwargs)
+ self.replace_hyphen_in_id_with = replace_hyphen_in_id_with
+ self.orig2new = {}
+ self.new2orig = {}
+
+ def _fix_id(self, cid):
+ if not cid or '-' not in cid:
+ return cid
+ new_cid = self.orig2new.get(cid)
+ if new_cid is None:
+ new_cid = cid.replace('-', self.replace_hyphen_in_id_with)
+ base, counter = new_cid, 1
+ while new_cid in self.new2orig:
+ counter += 1
+ new_cid = f"{base}{counter}"
+ self.new2orig[new_cid] = cid
+ self.orig2new[cid] = new_cid
+ return new_cid
+
+ def process_document(self, doc, strict=True):
+ super().process_document(doc)
+
+ eid_to_entity = {}
+ for node in doc.nodes_and_empty:
+ index, index_str = 0, ""
+ eid = node.misc["ClusterId"]
+ if not eid:
+ index, index_str = 1, "[1]"
+ eid = node.misc["ClusterId[1]"]
+ eid = self._fix_id(eid)
+ while eid:
+ entity = eid_to_entity.get(eid)
+ if entity is None:
+ entity = CorefEntity(eid)
+ eid_to_entity[eid] = entity
+ mention = CorefMention(words=[node], entity=entity)
+ if node.misc["MentionSpan" + index_str]:
+ mention.span = node.misc["MentionSpan" + index_str]
+ etype = node.misc["ClusterType" + index_str]
+ if etype:
+ if entity.etype is not None and etype != entity.etype:
+ logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}")
+ entity.etype = etype
+
+ bridging_str = node.misc["Bridging" + index_str]
+ if bridging_str:
+ mention._bridging = BridgingLinks(mention)
+ for link_str in bridging_str.split(','):
+ target, relation = link_str.split(':')
+ target = self._fix_id(target)
+ if target == eid:
+ _error("Bridging cannot self-reference the same entity: " + target, strict)
+ if target not in eid_to_entity:
+ eid_to_entity[target] = CorefEntity(target)
+ mention._bridging.append((eid_to_entity[target], relation))
+
+ split_ante_str = node.misc["SplitAnte" + index_str]
+ if split_ante_str:
+ split_antes = []
+ # TODO in CorefUD draft "+" was used as the separator, but it was changed to comma.
+ # We can delete `.replace('+', ',')` once there are no more data with the legacy plus separator.
+ for ante_str in split_ante_str.replace('+', ',').split(','):
+ ante_str = self._fix_id(ante_str)
+ if ante_str in eid_to_entity:
+ if ante_str == eid:
+ _error("SplitAnte cannot self-reference the same entity: " + eid, strict)
+ split_antes.append(eid_to_entity[ante_str])
+ else:
+ # split cataphora, e.g. "We, that is you and me..."
+ ante_cl = CorefEntity(ante_str)
+ eid_to_entity[ante_str] = ante_cl
+ split_antes.append(ante_cl)
+ entity.split_ante = sorted(split_antes)
+
+ # Some CorefUD 0.2 datasets (e.g. ARRAU) separate key-value pairs with spaces instead of commas.
+ # We also need to escape forbidden characters.
+ mmisc = node.misc["MentionMisc" + index_str].replace(' ', ',')
+ mention.other = mmisc.replace('-', '%2D').replace('(', '%28').replace(')', '%29')
+ index += 1
+ index_str = f"[{index}]"
+ eid = self._fix_id(node.misc["ClusterId" + index_str])
+ # c=doc.coref_entities should be sorted, so that c[0] < c[1] etc.
+ # In other words, the dict should be sorted by the values (according to CorefEntity.__lt__),
+ # not by the keys (eid).
+    # In Python 3.7+ (3.6+ in CPython), dicts are guaranteed to preserve insertion order.
+ for entity in eid_to_entity.values():
+ if not entity._mentions:
+ _error(f"Entity {entity.eid} referenced in SplitAnte or Bridging, but not defined with ClusterId", strict)
+ entity._mentions.sort()
+ doc._eid_to_entity = {c._eid: c for c in sorted(eid_to_entity.values())}
+
+ # Delete all old-style attributes from MISC (so when converting old to new style, the old attributes are deleted).
+ attrs = "ClusterId MentionSpan ClusterType Bridging SplitAnte MentionMisc".split()
+ for node in doc.nodes_and_empty:
+ for key in list(node.misc):
+ if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs):
+ del node.misc[key]
+
+
+def _error(msg, strict):
+ if strict:
+ raise ValueError(msg)
+ logging.error(msg)
diff --git a/udapi/block/read/sentences.py b/udapi/block/read/sentences.py
index 14840a50..7487d580 100644
--- a/udapi/block/read/sentences.py
+++ b/udapi/block/read/sentences.py
@@ -4,7 +4,28 @@
class Sentences(BaseReader):
- """A reader for plain-text sentences (one sentence per line) files."""
+ r"""A reader for plain-text sentences (one sentence per line) files.
+
+ Args:
+ ignore_empty_lines: if True, delete empty lines from the input.
+ Default=False.
+ newdoc_if_empty_line: if True, empty lines mark document boundaries,
+ which are marked with `root.newdoc`. Default=False.
+ rstrip: a set of characters to be stripped from the end of each line.
+ Default='\r\n '. You can use rstrip='\n' if you want to preserve
+ any space or '\r' (Carriage Return) at end of line,
+ so that `udpipe.Base` keeps these characters in `SpacesAfter`.
+ As most blocks do not expect whitespace other than a space to appear
+ in the processed text, using this feature is at your own risk.
+ """
+ def __init__(self, ignore_empty_lines=False, newdoc_if_empty_line=False,
+ rstrip='\r\n ', **kwargs):
+ if ignore_empty_lines and newdoc_if_empty_line:
+ raise ValueError("ignore_empty_lines is not compatible with newdoc_if_empty_line")
+ self.ignore_empty_lines = ignore_empty_lines
+ self.newdoc_if_empty_line = newdoc_if_empty_line
+ self.rstrip = rstrip
+ super().__init__(**kwargs)
@staticmethod
def is_multizone_reader():
@@ -18,8 +39,25 @@ def read_tree(self, document=None):
if self.filehandle is None:
return None
line = self.filehandle.readline()
+ # if readline() returns an empty string, the end of the file has been
+ # reached, while a blank line is represented by '\n'
+ # (or '\r\n' if reading a Windows file on Unix machine).
if line == '':
return None
+ preceded_by_empty_line = False
+ if self.ignore_empty_lines or self.newdoc_if_empty_line:
+ while line in {'\n', '\r\n'}:
+ preceded_by_empty_line = True
+ line = self.filehandle.readline()
+ if line == '':
+ return None
root = Root()
- root.text = line.rstrip()
+ root.text = line.rstrip(self.rstrip)
+ if self.newdoc_if_empty_line and preceded_by_empty_line:
+ root.newdoc = True
return root
+
+ # The first line in a file also marks a start of new document
+ def after_process_document(self, document):
+ if self.newdoc_if_empty_line:
+ document.bundles[0].trees[0].newdoc = True
diff --git a/udapi/block/read/text.py b/udapi/block/read/text.py
new file mode 100644
index 00000000..161b6b6e
--- /dev/null
+++ b/udapi/block/read/text.py
@@ -0,0 +1,74 @@
+"""Text class is a reader for word-wrapped plain-text files."""
+from udapi.core.basereader import BaseReader
+from udapi.core.root import Root
+
+
+class Text(BaseReader):
+ r"""A reader for plain-text files with sentences on one or more lines.
+
+ Sentences are separated by one or more empty lines.
+ Newlines within sentences are substituted by a space.
+
+ Args:
+ rstrip: a set of characters to be stripped from the end of each line.
+ Default='\r\n '. You can use rstrip='\n' if you want to preserve
+ any space or '\r' (Carriage Return) at end of line,
+ so that `udpipe.Base` keeps these characters in `SpacesAfter`.
+ As most blocks do not expect whitespace other than a space to appear
+ in the processed text, using this feature is at your own risk.
+ empty_line: how empty lines are handled. Default 'new_sentence' preserves
+ the current behaviour (empty lines mark sentence boundaries). Use
+ 'keep' to read the entire file content into a single sentence (tree), including
+ empty lines. Use 'newpar' to behave like 'new_sentence' but also set
+ `root.newpar = True` on each sentence.
+ """
+ def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs):
+ if empty_line not in {'new_sentence', 'keep', 'newpar'}:
+ raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'")
+ self.rstrip = rstrip
+ self.empty_line = empty_line
+ super().__init__(**kwargs)
+
+ @staticmethod
+ def is_multizone_reader():
+ """Can this reader read bundles which contain more zones?.
+
+ This implementation returns always False.
+ """
+ return False
+
+ def read_tree(self, document=None):
+ if self.filehandle is None:
+ return None
+ if self.empty_line == 'keep':
+ content = self.filehandle.read()
+ if content == '':
+ return None
+ root = Root()
+ root.text = content
+ return root
+ lines = []
+ line = None
+ while True:
+ line = self.filehandle.readline()
+ # if readline() returns an empty string, the end of the file has been
+ # reached, while a blank line is represented by '\n'
+ # (or '\r\n' if reading a Windows file on Unix machine).
+ if line == '':
+ if not lines:
+ return None
+ else:
+ break
+ elif line in {'\n', '\r\n'}:
+ if not lines:
+ continue
+ else:
+ break
+ else:
+ lines.append(line.rstrip(self.rstrip))
+
+ root = Root()
+ root.text = " ".join(lines)
+ if self.empty_line == 'newpar':
+ root.newpar = True
+ return root
diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py
index 3c5852d7..4c5a87ab 100644
--- a/udapi/block/read/vislcg.py
+++ b/udapi/block/read/vislcg.py
@@ -1,15 +1,14 @@
"""Vislcg is a reader block the VISL-cg format."""
-import shlex
-
from udapi.core.basereader import BaseReader
from udapi.core.root import Root
+
class Vislcg(BaseReader):
"""A reader of the VISL-cg format, suitable for VISL Constraint Grammer Parser."""
# TODO check validity and raise helpful exceptions if not valid
# pylint: disable=too-many-branches
- def read_tree(self, document=None):
+ def read_tree(self):
if self.filehandle is None:
return None
@@ -22,28 +21,28 @@ def read_tree(self, document=None):
if line == '':
break
if line[0] == '#':
- # Are comments allowed in VISL-cg?
+ root.comment += line[1:] + "\n"
continue
if line[0].isspace():
- line.lstrip(line)
- node, parent_ord = self._node(line, root)
+ node, parent_ord = self._node(line.lstrip(), root)
words.append(node)
parents.append(parent_ord)
- else:
- if words:
- words[0].form = form
- if len(words) > 1:
- split_forms = form.split()
- if len(words) == len(split_forms):
- for word, split_form in zip(words, split_forms):
- word.form = split_form
- else:
- for word in words[1:]:
- word.form = '_'
- root.create_multiword_token(words, form=form)
- words = []
- form = line[2:-2]
+ continue
+
+ if words:
+ words[0].form = form
+ if len(words) > 1:
+ split_forms = form.split()
+ if len(words) == len(split_forms):
+ for word, split_form in zip(words, split_forms):
+ word.form = split_form
+ else:
+ for word in words[1:]:
+ word.form = '_'
+ root.create_multiword_token(words, form=form)
+ words = []
+ form = line[2:-2]
if words:
words[0].form = form
@@ -63,10 +62,15 @@ def read_tree(self, document=None):
@staticmethod
def _node(line, root):
- fields = shlex.split(line)
- lemma = fields[0]
- xpos = fields[1]
- feats_list = fields[2:-2]
+ # line contains "lemma" xpos feat1 feat2 .. featN @deprel #ord->parent.ord
+ # Lemma can contain spaces, but quotes within lemma are not escaped,
+ # so we cannot use fields = shlex.split(line)
+ # Let's hope that xpos, feats and deprel do not contain any quotes.
+ end_quote_pos = line.rfind('"')
+ lemma = line[1:end_quote_pos]
+ fields = line[end_quote_pos + 1:].split()
+ xpos = fields[0]
+ feats_list = fields[3:-2]
feats = '|'.join(feats_list) if feats_list else '_'
deprel = fields[-2][1:]
parent_ord = int(fields[-1].split('->')[1])
diff --git a/udapi/block/segment/merge.py b/udapi/block/segment/merge.py
new file mode 100644
index 00000000..9ada45f1
--- /dev/null
+++ b/udapi/block/segment/merge.py
@@ -0,0 +1,46 @@
+"""Block segment.Merge"""
+from udapi.core.block import Block
+
+class Merge(Block):
+ """"Re-segmenter merging selected sentences (trees).
+
+ This class merges sentences ending with semicolons,
+ but it can be used as a base class for merging based on different criteria
+ by overriding one of the `should_*` methods.
+ """
+
+ @staticmethod
+ def should_merge_tokens(first, second):
+ """Is there actually a sentence boundary between the first and second node?"""
+ if first.form[-1] == ';':
+ return True
+ return False
+
+ def should_merge_bundles(self, first_bundle, second_bundle):
+ """Is there actually a sentence boundary between the first and second bundle?"""
+ first_tree = self._get_our_tree(first_bundle)
+ second_tree = self._get_our_tree(second_bundle)
+ return self.should_merge_tokens(first_tree.descendants[-1], second_tree.descendants[0])
+
+
+ def _get_our_tree(self, bundle):
+ for tree in bundle:
+ if self._should_process_tree(tree):
+ return tree
+ raise ValueError("Bundle %s contains no tree to process." % bundle.address())
+
+
+ def process_document(self, doc):
+ old_bundles = doc.bundles
+ prev_bundle = old_bundles[0]
+ new_bundles = [prev_bundle]
+ for bundle in old_bundles[1:]:
+ if self.should_merge_bundles(prev_bundle, bundle):
+ for tree in bundle:
+ prev_tree = prev_bundle.get_tree(tree.zone)
+ prev_tree.steal_nodes(tree.descendants)
+ prev_tree.text = prev_tree.compute_text()
+ else:
+ new_bundles.append(bundle)
+ prev_bundle = bundle
+ doc.bundles = new_bundles
\ No newline at end of file
diff --git a/udapi/block/segment/simple.py b/udapi/block/segment/simple.py
new file mode 100644
index 00000000..58be9b6d
--- /dev/null
+++ b/udapi/block/segment/simple.py
@@ -0,0 +1,91 @@
+"""Block segment.Simple"""
+from udapi.core.block import Block
+from udapi.core.bundle import Bundle
+import re
+
+class Simple(Block):
+ """"Heuristic segmenter, splits on sentence-final segmentation followed by uppercase.
+ The exceptions are:
+ 1) abbreviations of names, e.g. "A. Merkel"
+ 2) predefined list of nonfinal abbreviations, e.g. "e.g."
+
+ Parameters
+ ----------
+ keep_spaces : bool
+ do not strip whitespaces from the `text` attribute of the sentences created by segmentation
+ """
+
+ def __init__(self, keep_spaces=False, **kwargs):
+ super().__init__(**kwargs)
+ self.keep_spaces = keep_spaces
+
+ @staticmethod
+ def is_nonfinal_abbrev(token):
+ """Is a given token an abbreviation (without the final period) which cannot end a sentence?"""
+ if re.search('(např|e.g.)$', token):
+ return True
+ return False
+
+
+ def is_boundary(self, first, second):
+ """Is there a sentence boundary between the first and second token?"""
+ if not first or not second:
+ return False
+ if first[-1] in '"“»›)':
+ first = first[:-1]
+ if not first:
+ return False
+ if second[0] in '"„«¿¡‹(':
+ second = second[1:]
+ if not second:
+ return False
+ if not second[0].isupper() or second[0].isdigit():
+ return False
+ if not first[-1] in '.!?':
+ return False
+ if first[-1] == '.':
+ # correctly count length in "„A. Merkel"
+ if first[0] in '"„«¿¡‹(':
+ first = first[1:]
+ if len(first) == 2 and first[0].isupper():
+ return False
+ if self.is_nonfinal_abbrev(first[:-1]):
+ return False
+ return True
+
+
+ def segment_string(self, string):
+ """Return a list of sentences in a given string."""
+ tokens = string.split(' ')
+ previous = tokens[0]
+ segments = [previous]
+ for token in tokens[1:]:
+ if self.is_boundary(previous, token):
+ if self.keep_spaces:
+ segments[-1] += ' '
+ segments.append(token)
+ else:
+ segments[-1] += ' ' + token
+ previous = token
+ return segments
+
+
+ def process_document(self, doc):
+ old_bundles = doc.bundles
+ new_bundles = []
+ for bundle in old_bundles:
+ new_bundles.append(bundle)
+ for tree in bundle:
+ if self._should_process_tree(tree):
+ if tree.children:
+ raise ValueError("Segmenting already tokenized text is not supported.")
+ sentences = self.segment_string(tree.text)
+ orig_bundle_id = bundle.bundle_id
+ bundle.bundle_id = orig_bundle_id + '-1'
+ if len(sentences) > 1:
+ tree.text = sentences[0]
+ for i, sentence in enumerate(sentences[1:], 2):
+ new_bundle = Bundle(document=doc, bundle_id=orig_bundle_id + '-' + str(i))
+ new_bundle.create_tree(tree.zone).text = sentence
+ new_bundles.append(new_bundle)
+ doc.bundles = new_bundles
diff --git a/udapi/block/tokenize/__init__.py b/udapi/block/tokenize/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py
new file mode 100644
index 00000000..913dae61
--- /dev/null
+++ b/udapi/block/tokenize/onwhitespace.py
@@ -0,0 +1,97 @@
+"""Block tokenize.OnWhitespace"""
+import re
+from udapi.core.block import Block
+
+
+class OnWhitespace(Block):
+ """Base tokenizer, splits on whitespaces, fills SpaceAfter=No.
+
+ Use the parameter `keep_spaces=True` to preserve all whitespaces in the sentence
+ in the UDPipe way, i.e. using the `SpacesAfter` and `SpacesBefore` features in the MISC field.
+ It is backward compatible with CoNLL-U v2 `SpaceAfter=No` feature. That is, no following
+ whitespace is marked by `SpaceAfter=No` and a single following space results in no
+ whitespace-related markup.
+ If loading the text using `read.Sentences` and all whitespaces need to be preserved
+ (in order to be able to reconstruct the original document), the `read.Sentences` block
+ must be called with `rstrip=''`, `rstrip=\n` or `rstrip=\r\n` to prevent stripping the
+ trailing whitespace, e.g.::
+ $> echo -e "Hello \t world " | udapy read.Sentences $'rstrip=\r\n' tokenize.OnWhitespace keep_spaces=1 write.Conllu
+
+ # sent_id = 1
+ # text = Hello world
+ 1 Hello _ _ _ _ 0 _ _ SpacesAfter=\s\t\s
+ 2 world _ _ _ _ 0 _ _ _
+ Note that the attribute `SpaceAfter=No` is missing for the token `world`, since it is
+ followed by a single space.
+
+ Parameters
+ ----------
+ keep_spaces : bool
+ preserve whitespaces by filling MISC attributes `SpacesAfter` and `SpacesBefore` (by default False)
+ """
+
+ escape_whitespace_table = str.maketrans({' ':r'\s', '\t':r'\t', '\r':r'\r', '\n':r'\n'})
+
+ def __init__(self, keep_spaces=False, **kwargs):
+ super().__init__(**kwargs)
+ self.keep_spaces = keep_spaces
+
+ @staticmethod
+ def tokenize_sentence(string):
+ """A method to be overriden in subclasses."""
+ return string.split()
+
+ def process_tree(self, root):
+ if root.children:
+ raise ValueError('Tree %s is already tokenized.' % root)
+ #sentence = ' '.join(root.text.split())
+ sentence = root.text
+ tokens = self.tokenize_sentence(sentence)
+
+ # Check if there are any spaces before the first token
+ spaces_before = ""
+ m = re.match(r'\s+', sentence)
+ if m:
+ spaces_before = m.group(0)
+ sentence = sentence[len(spaces_before):]
+
+ for i, token in enumerate(tokens, 1):
+ spaces_after = ""
+
+ # The token (returned from tokenization) does not match the start of sentence.
+ # E.g. '. . . word' is tokenized as '... word'.
+ if not sentence.startswith(token):
+ # Let's delete the start of sentence anyway,
+ # using a non-greedy regex and the expected next token
+ # returned from the tokenization.
+ # my $next_token = $tokens[$i+1];
+ # my ($first, $rest) = ($sentence =~ /^(.*?)(\Q$next_token\E.*)$/);
+ # $no_space_after = 1 if (defined $first && $first !~ /\s$/);
+ # $sentence = $rest if (defined $rest);
+ raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence))
+
+ # Delete the token from the begining of the sentence.
+ sentence = sentence[len(token):]
+
+ # Set the SpaceAfter and SpacesAfter properly
+ m = re.match(r'\s+', sentence)
+ if m is not None:
+ spaces_after = m.group(0)
+ sentence = sentence[len(spaces_after):]
+
+ # normalize whitespace
+ if not self.keep_spaces:
+ spaces_before = ""
+ # spaces_after = "" <=> SpaceAfter=No is never set for the last token <=> len(sentence) = 0
+ spaces_after = "" if not len(spaces_after) and len(sentence) else " "
+
+ # create a new node
+ node = root.create_child(form=token)
+ node.ord = i
+
+ if i == 1 and spaces_before:
+ node.misc["SpacesBefore"] = spaces_before.translate(self.escape_whitespace_table)
+ if not spaces_after:
+ node.misc["SpaceAfter"] = 'No'
+ elif spaces_after != " ":
+ node.misc["SpacesAfter"] = spaces_after.translate(self.escape_whitespace_table)
diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py
new file mode 100644
index 00000000..f7010d13
--- /dev/null
+++ b/udapi/block/tokenize/simple.py
@@ -0,0 +1,13 @@
+"""Block tokenize.Simple"""
+import re
+
+from udapi.block.tokenize.onwhitespace import OnWhitespace
+
+
+class Simple(OnWhitespace):
+ """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No."""
+
+ @staticmethod
+ def tokenize_sentence(string):
+ """A method to be overriden in subclasses."""
+ return re.findall(r'\w+|[^\w\s]', string)
diff --git a/udapi/block/transform/__init__.py b/udapi/block/transform/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py
new file mode 100644
index 00000000..581f5a6b
--- /dev/null
+++ b/udapi/block/transform/deproj.py
@@ -0,0 +1,43 @@
+"""Block Deproj for deprojectivization of pseudo-projective trees à la Nivre & Nilsson (2005).
+
+See ud.transform.Proj for details.
+TODO: implement also path and head+path strategies.
+"""
+from udapi.core.block import Block
+
+
+class Deproj(Block):
+ """De-projectivize the trees à la Nivre & Nilsson (2005)."""
+
+ def __init__(self, strategy='head', label='misc', **kwargs):
+ """Create the Deproj block object."""
+ super().__init__(**kwargs)
+ self.strategy = strategy
+ self.label = label
+
+ def process_node(self, node):
+ if self.label == 'misc':
+ label = node.misc['pproj']
+ elif self.label == 'deprel':
+ parts = node.sdeprel.split('+', 1)
+ if len(parts) == 2:
+ label = parts[1]
+ node.deprel = node.udeprel + (':' + parts[0] if parts[0] else '')
+ else:
+ label = ''
+ else:
+ raise(ValueError('Unknown parameter label=%s' % self.label))
+ if label == '':
+ return
+ reconstructed_parent = self.head_strategy(node, label)
+ if reconstructed_parent:
+ node.parent = reconstructed_parent
+
+ def head_strategy(self, node, label):
+ queue = [n for n in node.parent.children if n != node] # TODO deque
+ while queue:
+ adept = queue.pop(0)
+ if adept.udeprel == label:
+ return adept
+ queue.extend(adept.children)
+ return None
diff --git a/udapi/block/transform/flatten.py b/udapi/block/transform/flatten.py
new file mode 100644
index 00000000..d218ad27
--- /dev/null
+++ b/udapi/block/transform/flatten.py
@@ -0,0 +1,25 @@
+"""transform.Flatten block for flattening trees."""
+from udapi.core.block import Block
+
+class Flatten(Block):
+ """Apply `node.parent = node.root; node.deprel = 'root'` on all nodes."""
+
+ def __init__(self, oneroot=False, **kwargs):
+ """Args:
+ oneroot: only the first node will have deprel 'root'.
+ All other nodes will depend on the first node with deprel 'dep'.
+ This option makes the trees valid according to the validator.
+ (default=False)
+ """
+ super().__init__(**kwargs)
+ self.oneroot = oneroot
+
+ def process_tree(self, tree):
+ for node in tree.descendants:
+ node.parent = node.root
+ node.deprel = 'root'
+ if self.oneroot:
+ first = tree.descendants[0]
+ for node in tree.descendants[1:]:
+ node.parent = first
+ node.deprel = 'dep'
diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py
new file mode 100644
index 00000000..6e284b4c
--- /dev/null
+++ b/udapi/block/transform/proj.py
@@ -0,0 +1,64 @@
+"""Block Proj for (pseudo-)projectivization of trees à la Nivre & Nilsson (2005).
+
+See http://www.aclweb.org/anthology/P/P05/P05-1013.pdf.
+This block tries to replicate Malt parser's projectivization:
+http://www.maltparser.org/userguide.html#singlemalt_proj
+http://www.maltparser.org/optiondesc.html#pproj-marking_strategy
+
+TODO: implement also path and head+path strategies.
+
+TODO: Sometimes it would be better (intuitively)
+to lower the gap-node (if its whole subtree is in the gap
+and if this does not cause more non-projectivities)
+rather than to lift several nodes whose parent-edge crosses this gap.
+We would need another label value (usually the lowering is of depth 1),
+but the advantage is that reconstruction of lowered edges
+during deprojectivization is simple and needs no heuristics.
+"""
+from udapi.core.block import Block
+
+
+class Proj(Block):
+ """Projectivize the trees à la Nivre & Nilsson (2005)."""
+
+ def __init__(self, strategy='head', lifting_order='deepest', label='misc', **kwargs):
+ """Create the Proj block object."""
+ super().__init__(**kwargs)
+ self.lifting_order = lifting_order
+ self.strategy = strategy
+ self.label = label
+
+ def process_tree(self, tree):
+ nonprojs = [self.nonproj_info(n) for n in tree.descendants if n.is_nonprojective()]
+ for nonproj in sorted(nonprojs, key=lambda info: info[0]):
+ self.lift(nonproj[1])
+
+ def nonproj_info(self, node):
+ if self.lifting_order == 'shortest':
+ return (abs(node.ord - node.parent.ord), node)
+ orig_parent = node.parent
+ node.parent = node.parent.parent
+ depth = 1
+ while node.is_nonprojective():
+ node.parent = node.parent.parent
+ depth += 1
+ node.parent = orig_parent
+ return (-depth, node)
+
+ def lift(self, node):
+ orig_parent = node.parent
+ depth = 0
+ while node.is_nonprojective():
+ node.parent = node.parent.parent
+ depth += 1
+ if depth == 0:
+ return
+ self.mark(node, orig_parent.udeprel)
+
+ def mark(self, node, label):
+ if self.label == 'misc':
+ node.misc['pproj'] = label
+ elif self.label == 'deprel':
+ node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label)
+ else:
+ raise ValueError('Unknown parameter label=%s' % self.label)
diff --git a/udapi/block/tutorial/__init__.py b/udapi/block/tutorial/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/tutorial/addarticles.py b/udapi/block/tutorial/addarticles.py
new file mode 100644
index 00000000..96f0ba2f
--- /dev/null
+++ b/udapi/block/tutorial/addarticles.py
@@ -0,0 +1,14 @@
+"""tutorial.AddArticles block template."""
+# nickname = xy123
+# TODO: make up a unique nickname and edit the previous line
+# if you want your results to be listed on the NPFL070 web (under that nickname).
+# Delete the line if you don't want to be listed on the web.
+from udapi.core.block import Block
+
+class AddArticles(Block):
+ """Heuristically insert English articles."""
+
+ def process_node(self, node):
+ if node.upos == "NOUN":
+ the = node.create_child(form="the", lemma="the", upos="DET", deprel="det")
+ the.shift_before_subtree(node)
diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py
new file mode 100644
index 00000000..97677d89
--- /dev/null
+++ b/udapi/block/tutorial/addcommas.py
@@ -0,0 +1,31 @@
+"""tutorial.AddCommas block template."""
+from udapi.core.block import Block
+
+# nickname = xy123
+# TODO: make up a unique nickname and edit the previous line
+# if you want your results to be listed on the NPFL070 web (under that nickname).
+# Delete the line if you don't want to be listed on the web.
+
+class AddCommas(Block):
+ """Heuristically insert nodes for missing commas."""
+
+ def __init__(self, language='en', **kwargs):
+ super().__init__(**kwargs)
+ self.language = language
+
+ def process_node(self, node):
+ # TODO: Your task: implement some heuristics
+ if self.should_add_comma_before(node):
+ comma = node.create_child(form=',', deprel='punct', upos='PUNCT')
+ comma.shift_before_node(node)
+
+ def should_add_comma_before(self, node):
+ prev_node = node.prev_node
+ if prev_node is None:
+ return False
+ if self.language == 'en' and prev_node.lemma == 'however':
+ return True
+ if any(n.deprel == 'appos' for n in prev_node.children):
+ return True
+
+ return False
diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py
new file mode 100644
index 00000000..9c4e131b
--- /dev/null
+++ b/udapi/block/tutorial/adpositions.py
@@ -0,0 +1,35 @@
+"""tutorial.Adpositions block template.
+
+Example usage::
+
+ for a in */sample.conllu; do
+ printf '%50s ' $a;
+ udapy tutorial.Adpositions < $a;
+ done | tee results.txt
+
+ # What are the English postpositions?
+ cat UD_English/sample.conllu | udapy -TM util.Mark \
+ node='node.upos == "ADP" and node.parent.precedes(node)' | less -R
+"""
+from udapi.core.block import Block
+
+
+class Adpositions(Block):
+ """Compute the number of prepositions and postpositions."""
+
+ def __init__(self, **kwargs):
+ """Create the Adpositions block object."""
+ super().__init__(**kwargs)
+ self.prepositions = 0
+ self.postpositions = 0
+
+ def process_node(self, node):
+ # TODO: Your task: distinguish prepositions and postpositions
+ if node.upos == "ADP":
+ self.prepositions += 1
+
+ def process_end(self):
+ total = self.prepositions + self.postpositions or 1
+ prep = 100 * self.prepositions / total
+ post = 100 * self.postpositions / total
+ print("prepositions %5.1f%%, postpositions %5.1f%%" % (prep, post))
diff --git a/udapi/block/tutorial/parse.py b/udapi/block/tutorial/parse.py
new file mode 100644
index 00000000..db732a12
--- /dev/null
+++ b/udapi/block/tutorial/parse.py
@@ -0,0 +1,30 @@
+"""tutorial.Parse block template.
+
+Usage:
+udapy read.Conllu zone=gold files=sample.conllu \
+ read.Conllu zone=pred files=sample.conllu \
+ transform.Flatten zones=pred \
+ tutorial.Parse zones=pred \
+ eval.Parsing gold_zone=gold \
+ util.MarkDiff gold_zone=gold \
+ write.TextModeTreesHtml marked_only=1 files=parse-diff.html
+"""
+# nickname = xy123
+# TODO: make up a unique nickname and edit the previous line
+# if you want your results to be listed on the NPFL070 web (under that nickname).
+# Delete the line if you don't want to be listed on the web.
+from udapi.core.block import Block
+
+class Parse(Block):
+ """Dependency parsing."""
+
+ def __init__(self, language='en', **kwargs):
+ super().__init__(**kwargs)
+ self.language = language
+
+ def process_tree(self, root):
+ # TODO: Your task: implement a better heuristics than "right chain"
+ for node in root.descendants:
+ if node.next_node:
+ node.parent = node.next_node
+ node.deprel = 'root'
diff --git a/udapi/block/tutorial/removecommas.py b/udapi/block/tutorial/removecommas.py
new file mode 100644
index 00000000..a07e2bba
--- /dev/null
+++ b/udapi/block/tutorial/removecommas.py
@@ -0,0 +1,13 @@
+"""tutorial.RemoveCommas helper block."""
+from udapi.core.block import Block
+
+
+class RemoveCommas(Block):
+ """Delete all comma nodes and edit SpaceAfter and text accordingly."""
+
+ def process_tree(self, root):
+ for node in root.descendants:
+ if node.form == ",":
+ node.remove(children="rehang")
+ del node.prev_node.misc['SpaceAfter']
+ root.text = root.compute_text()
diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py
new file mode 100644
index 00000000..e7eb3989
--- /dev/null
+++ b/udapi/block/ud/addmwt.py
@@ -0,0 +1,113 @@
+"""Abstract base class ud.AddMwt for heuristic detection of multi-word tokens."""
+from udapi.core.block import Block
+import logging
+
+
+class AddMwt(Block):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def process_node(self, node):
+ analysis = self.multiword_analysis(node)
+ if analysis is None:
+ return
+ orig_attr = {}
+ for attr in 'form lemma upos xpos deprel'.split():
+ orig_attr[attr] = getattr(node, attr)
+ orig_attr['feats'] = node.feats.copy()
+ orig_attr['misc'] = node.misc.copy()
+ # Defaults for the newly created MWT
+ mwt_misc = node.misc.copy()
+ mwt_form = node.form
+
+ forms = analysis['form'].split()
+ main = analysis.get('main', 0)
+ parent = node if analysis.get('shape', '') == 'subtree' else node.parent
+ nodes = []
+ for form in forms[0:main]:
+ new_node = parent.create_child(form=form)
+ new_node.shift_before_node(node)
+ nodes.append(new_node)
+ node.form = forms[main]
+ nodes.append(node)
+ for form in forms[main + 1:]:
+ new_node = parent.create_child(form=form)
+ new_node.shift_after_node(nodes[-1])
+ nodes.append(new_node)
+
+ if orig_attr['form'].isupper():
+ for new_node in nodes:
+ new_node.form = new_node.form.upper()
+ elif orig_attr['form'][0].isupper():
+ nodes[0].form = nodes[0].form.title()
+
+ node.misc = None
+ for attr in 'lemma upos xpos feats deprel misc'.split():
+ if attr in analysis:
+ values = analysis[attr].split()
+ for i, new_node in enumerate(nodes):
+ if len(values) <= i:
+ logging.warning("Attribute '%s' not supplied for word no. %d" % (attr, i))
+ for attr in 'form lemma upos xpos feats deprel misc'.split():
+ logging.warning("%s = %s" % (attr, analysis.get(attr, '')))
+ if values[i] == '*':
+ setattr(new_node, attr, orig_attr[attr])
+ # No MISC attribute should be duplicated on the word level and token level,
+ # so if copying MISC to a new_node, delete mwt_misc.
+ # However, SpaceAfter should be annotated only on the token level,
+ # so make sure it is not accidentally copied on the word level.
+ if attr == 'misc':
+ orig_attr['misc'].clear()
+ for a in 'SpaceAfter SpacesAfter SpacesBefore'.split():
+ if new_node.misc[a]:
+ orig_attr['misc'][a] = new_node.misc[a]
+ del new_node.misc[a]
+
+ elif attr == 'feats' and '*' in values[i]:
+ new_node.feats = values[i]
+ for feat_name, feat_value in list(new_node.feats.items()):
+ if feat_value == '*':
+ new_node.feats[feat_name] = orig_attr['feats'][feat_name]
+ else:
+ setattr(new_node, attr, values[i])
+
+ # Entity (coreference) annotation should be only on the word level,
+ # so make sure it does not stay on the token level.
+ if mwt_misc['Entity']:
+ nodes[0].misc['Entity'] = mwt_misc['Entity']
+ del mwt_misc['Entity']
+
+ # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
+ if node.multiword_token:
+ mwt_words = node.multiword_token.words
+ mwt_form = node.multiword_token.form
+ if node.multiword_token.misc:
+ mwt_misc.update(node.multiword_token.misc)
+ node.multiword_token.remove()
+ mwt_words[mwt_words.index(node):mwt_words.index(node)+1] = nodes
+ nodes = mwt_words
+
+ mwt = node.root.create_multiword_token(words=nodes, form=mwt_form, misc=mwt_misc)
+ self.postprocess_mwt(mwt)
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token.
+
+ An example return value is::
+
+ {
+ 'form': 'aby bych',
+ 'lemma': 'aby být',
+ 'upos': 'SCONJ AUX',
+ 'xpos': 'J,------------- Vc-S---1-------',
+ 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin', # _ means empty FEATS
+ 'deprel': '* aux', # * means keep the original deprel
+ 'main': 0, # which of the two words will inherit the original children (if any)
+ 'shape': 'siblings', # the newly created nodes will be siblings or alternatively
+ #'shape': 'subtree', # the main-indexed node will be the head
+ }
+ """
+ raise NotImplementedError('multiword_analysis must be overriden in subclasses')
+
+ def postprocess_mwt(self, mwt):
+ """Optional postprocessing of newly created MWTs."""
+ pass
diff --git a/udapi/block/ud/addpuncttype.py b/udapi/block/ud/addpuncttype.py
new file mode 100644
index 00000000..f5f20e06
--- /dev/null
+++ b/udapi/block/ud/addpuncttype.py
@@ -0,0 +1,91 @@
+"""
+Some UD treebanks use features PunctType and PunctSide that classify
+punctuation symbols. This block can be used to add such features to data where
+they are missing – the classification is mostly deterministic. If the input
+data already contains such features, their values will be overwritten.
+"""
+from udapi.core.block import Block
+
+# TODO We need to know the language, there are many other quotation styles,
+# e.g. Finnish and Swedish use the same symbol for opening and closing: ”X”.
+# Danish uses the French quotes, but switched: »X«.
+
+PUNCT_TYPES = {
+ '(': 'Brck',
+ ')': 'Brck',
+ '[': 'Brck',
+ ']': 'Brck',
+ '{': 'Brck',
+ '}': 'Brck',
+ '.': 'Peri',
+ '...': 'Elip',
+ '…': 'Elip',
+ ',': 'Comm',
+ ';': 'Semi',
+ ':': 'Colo',
+ '!': 'Excl',
+ '¡': 'Excl', # Spanish initial exclamation mark
+ '?': 'Qest',
+ '¿': 'Qest', # Spanish initial question mark
+ '/': 'Colo', # it is used this way in AnCora
+ '-': 'Dash',
+ '–': 'Dash',
+ '—': 'Dash',
+ '"': 'Quot',
+ "'": 'Quot',
+ '`': 'Quot',
+ '“': 'Quot', # opening English, closing Czech
+ '”': 'Quot', # closing English
+ '„': 'Quot', # opening Czech
+ '‘': 'Quot', # opening English, closing Czech
+ '’': 'Quot', # closing English
+ '‚': 'Quot', # opening Czech
+ '«': 'Quot', # opening French, closing Danish
+ '»': 'Quot', # closing French, opening Danish
+ '‹': 'Quot',
+ '›': 'Quot',
+ '《': 'Quot', # Korean, Chinese
+ '》': 'Quot',
+ '「': 'Quot', # Chinese, Japanese
+ '」': 'Quot',
+ '『': 'Quot',
+ '』': 'Quot'
+}
+
+PUNCT_SIDES = {
+ '(': 'Ini',
+ ')': 'Fin',
+ '[': 'Ini',
+ ']': 'Fin',
+ '{': 'Ini',
+ '}': 'Fin',
+ '¡': 'Ini', # Spanish initial exclamation mark
+ '!': 'Fin', # but outside Spanish people may expect empty value
+ '¿': 'Ini', # Spanish initial question mark
+ '?': 'Fin',
+ '《': 'Ini', # Korean, Chinese
+ '》': 'Fin',
+ '「': 'Ini', # Chinese, Japanese
+ '」': 'Fin',
+ '『': 'Ini',
+ '』': 'Fin'
+}
+
+
+class AddPunctType(Block):
+ """Add features PunctType and PunctSide where applicable."""
+
+ def process_node(self, node):
+ # The two features apply only to PUNCT. If they already occur elsewhere, erase them.
+ if node.upos != 'PUNCT':
+ node.feats['PunctType'] = ''
+ node.feats['PunctSide'] = ''
+ else:
+ if node.form in PUNCT_TYPES:
+ node.feats['PunctType'] = PUNCT_TYPES[node.form]
+ else:
+ node.feats['PunctType'] = ''
+ if node.form in PUNCT_SIDES:
+ node.feats['PunctSide'] = PUNCT_SIDES[node.form]
+ else:
+ node.feats['PunctSide'] = ''
diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py
new file mode 100644
index 00000000..a4b359ff
--- /dev/null
+++ b/udapi/block/ud/ar/fixedeprels.py
@@ -0,0 +1,699 @@
+"""Block to fix case-enhanced dependency relations in Arabic."""
+from udapi.core.block import Block
+import re
+
+class FixEdeprels(Block):
+
+ # Sometimes there are multiple layers of case marking and only the outermost
+    # layer should be reflected in the relation. For example (in the Czech data
+    # this block was adapted from), the semblative 'jako' is used with the same
+    # case (preposition + morphology) as the nominal being compared ('jako_v:loc' etc.)
+ # by all the inner cases.
+ # The list in the value contains exceptions that should be left intact.
+ outermost = {
+ 'أَنَّ': [],
+ 'أَن': [],
+ 'إِنَّ': [],
+ 'إِذَا': [],
+ 'لَو': [],
+ 'حَيثُ': [],
+ 'مِثلَ': [],
+ 'لِأَنَّ': [],
+ 'كَمَا': [],
+# 'فِي_حِينَ': [],
+ 'فَ': []
+ }
+
+ # Reduction and normalization of prepositions and conjunctions, including
+ # the derived and compound ones. The Latin transliterations are not really
+ # needed in the process. We include them here as documentation, but also
+ # to help the poor editor with rendering the lines. Ideally, each line
+ # should have left-to-right text at both the beginning and end.
+ substitution = [
+ {'target': ('min:gen', 'مِن:gen'),
+ 'sources':
+ [('ibtida min', 'اِبتِدَاء_مِن')]
+ },
+ {'target': ('ʾiṯra:gen', 'إِثرَ:gen'), # ʾiṯra = right after
+ 'sources':
+ [('ʾiṯra', 'إِثرَ')]
+ },
+ {'target': ('ʾaṯnāʾa:gen', 'أَثنَاءَ:gen'), # ʾaṯnāʾa = during
+ 'sources':
+ [('ʾaṯnāʾa', 'أَثنَاءَ')]
+ },
+ {'target': ('ʾiḏ', 'إِذ'), # ʾiḏ = because
+ 'sources':
+ [('ʾiḏ', 'إِذ'),
+ ('ʾiḏ ʾanna', 'إِذ_أَنَّ')]
+ },
+ {'target': ('ʾiḏā', 'إِذَا'), # ʾiḏā = if
+ 'sources':
+ [('ʾiḏā', 'إِذَا'),
+ ('ʾiḏā', 'إِذًا')]
+ },
+ ]
+
+ # Secondary prepositions sometimes have the lemma of the original part of
+ # speech. We want the grammaticalized form instead. List even those that
+ # will have the same lexical form, as we also want to check the morphological
+ # case. And include all other prepositions that have unambiguous morphological
+ # case, even if they are not secondary.
+ unambiguous = {
+ 'اِبتِدَاء_مِن': 'مِن:gen',
+ 'إِثرَ': 'إِثرَ:gen', # ʾiṯra = right after
+ 'أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during
+ 'إِذ': 'إِذ', # ʾiḏ = because
+ 'إِذ_أَنَّ': 'إِذ', # ʾiḏ ʾanna
+ 'إِذًا': 'إِذَا',
+ 'إِذَا': 'إِذَا', # remove morphological case; ʾiḏā = if
+ 'إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards
+ 'أَلَّا': 'إِلَّا',
+ 'إِلَّا': 'إِلَّا', # ʾillā = except, unless
+ 'إِلَّا_إِذَا': 'إِلَّا', # ʾillā = except, unless
+ 'إِلَّا_أَن': 'إِلَّا', # ʾillā = except, unless
+ 'إِلَّا_أَنَّ': 'إِلَّا', # ʾillā = except, unless
+ 'إِلَّا_أَنَّ_هُوَ': 'إِلَّا', # ʾillā = except, unless
+ 'إِلَى': 'إِلَى:gen', # ʾilā = to
+ 'إِلَى_أَن': 'إِلَى:gen',
+ 'إِلَى_أَنَّ': 'إِلَى_أَنَّ', # until? that?
+ 'إِلَى_أَنَّ_لَدَى': 'إِلَى_أَنَّ',
+ 'إِلَى_أَنَّ_مِن': 'إِلَى_أَنَّ',
+ 'إِلَى_أَنَّ_هُوَ': 'إِلَى_أَنَّ',
+ 'إِلَى_أَنَّ_هُوَ_مِن': 'إِلَى_أَنَّ',
+ 'إِلَى_أَنَّ_هُوَ_مِن_بَينَ': 'إِلَى_أَنَّ',
+ 'إِلَى_بَعدَ': 'إِلَى:gen',
+ 'إِلَى_بَينَ': 'إِلَى_بَينِ:gen', # ʾilā bayni = to between
+ 'إِلَى_جَانِب': 'إِلَى_جَانِبِ:gen', # ʾilā ǧānibi = beside
+ 'إِلَى_حَوَالَى': 'إِلَى:gen', # ila hawala = to around X
+ 'إِلَى_حَوَالَى_مِن': 'إِلَى:gen', # ila hawala min
+ 'إِلَى_حَيثُ': 'إِلَى:gen',
+ 'إِلَى_حِينَ': 'فِي_حِينِ', # during
+ 'إِلَى_خَارِجَ': 'إِلَى_خَارِجِ:gen', # ʾilā ḫāriǧi = out
+ 'إِلَى_فِي': 'إِلَى:gen',
+ 'إِلَى_قَبلَ': 'إِلَى_قَبلِ:gen', # ʾilā qabli = until before X (e.g. until one year ago)
+ 'إِلَى_مِثلَ': 'مِثلَ', # miṯla = like
+ 'إِلَى_نَحوَ': 'إِلَى:gen', # to about N
+ 'أَمَّا': 'أَمَامَ:gen',
+ 'إِمَّا_لِ': 'لِ:gen',
+ 'أَمَامَ': 'أَمَامَ:gen', # ʾamāma = in front of
+ 'أَمَامَ_مِن': 'أَمَامَ:gen',
+ 'أَن': 'أَنَّ', # remove morphological case; ʾanna = that
+ 'أَنَّ': 'أَنَّ', # remove morphological case; ʾanna = that
+ 'إِن': 'إِنَّ', # remove morphological case; ʾinna = that
+ 'إِنَّ': 'إِنَّ', # remove morphological case; ʾinna = that
+ 'إِنَّمَا': 'إِنَّ',
+ 'إِيَّا': 'إِلَّا',
+ 'بِ': 'بِ:gen', # bi = for, with
+ 'بِ_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards
+ 'بِ_إِزَاءَ': 'إِزَاءَ:gen', # ʾizāʾa = regarding, facing, towards
+ 'بِ_اِستِثنَاء': 'بِاِستِثنَاءِ:gen', # biistiṯnāʾi = with exception of
+ 'بِ_اِسم': 'بِاِسمِ:gen', # biismi = in name of
+ 'بِ_إِضَافَة_إِلَى': 'بِاَلإِضَافَةِ_إِلَى:gen', # bi-al-ʾiḍāfati ʾilā = in addition to
+ 'بِ_إِضَافَة_إِلَى_أَنَّ': 'إِلَى_أَنَّ',
+ 'بِ_إِضَافَة_لِ': 'بِاَلإِضَافَةِ_إِلَى:gen', # in addition to
+ 'بِ_اِعتِبَار': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to
+ 'بِ_اِعتِبَار_أَنَّ': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to
+ 'بِ_اِعتِبَار_مِن': 'بِاِعتِبَارِ:gen', # bi-iʿtibāri = with regard to
+ 'بِ_اِعتِمَاد_عَلَى': 'بِاَلِاعتِمَادِ_عَلَى:gen', # bi-al-i-ʼʿtimādi ʿalā = depending on
+ 'بِ_إِلَى': 'بِ:gen',
+ 'بِ_أَنَّ': 'أَنَّ', # that
+ 'بِ_أَن': 'بِ:gen',
+ 'بِ_إِنَّ': 'بِ:gen',
+ 'بِ_أَنَّ_أَمَامَ': 'أَنَّ', # that
+ 'بِ_أَنَّ_لَا': 'أَنَّ', # that
+ 'بِ_أَنَّ_مِن': 'أَنَّ', # that
+ 'بِ_أَنَّ_هما_مِن': 'أَنَّ', # that
+ 'بِ_أَنَّ_هُوَ': 'أَنَّ', # that
+ 'بِ_أَنَّ_هُوَ_عَلَى': 'أَنَّ', # that
+ 'بِ_اِنطِلَاق': 'بِ:gen',
+ 'بِ_تَالِي_إِنَّ': 'بِ:gen',
+ 'بِ_تَعَاوُن_مَعَ': 'بِاَلتَّعَاوُنِ_مَعَ:gen', # bi-at-taʿāwuni maʿa = in cooperation with
+ 'بِ_تُهمَة': 'بِتُهمَةِ:gen', # bituhmati = on charges of
+ 'بِ_تَوَازِي_مَعَ': 'بِاَلتَّوَازِي_مَعَ:gen', # bi-at-tawāzī maʿa = in parallel with
+ 'بِ_ثُمَّ': 'بِ:gen',
+ 'بِ_جَانِب': 'بِجَانِبِ:gen', # biǧānibi = next to
+ 'بِ_جِهَة': 'بِ:gen',
+ 'بِ_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case
+ 'بِ_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on
+ 'بِ_حُضُور': 'فِي_حُضُورِ:gen', # together with
+ 'بِ_حَقّ': 'بِ:gen',
+ 'بِ_حُكم': 'بِ:gen',
+ 'بِ_حُلُول': 'بِ:gen',
+ 'بِ_حَوَالَى': 'بِ:gen', # bi hawala = with around X
+ 'بِ_حَيثُ': 'بِ:gen',
+ 'بِ_خُصُوص': 'بِخُصُوصِ:gen', # biḫuṣūṣi = with regard
+ 'بِ_خِلَاف': 'بِخِلَافِ:gen', # biḫilāfi = in addition to
+ 'بِ_دَاخِلَ': 'دَاخِلَ:gen',
+ 'بِ_دَعوَى': 'بِ:gen',
+ 'بِ_دَور': 'بِ:gen', # bidawri = with role, in turn?
+ 'بِ_دُون': 'دُونَ:gen',
+ 'بِ_دُونَ': 'دُونَ:gen', # bi dūni = without
+ 'بِ_دُونَ_أَن': 'دُونَ:gen', # bi dūni ʾan = without
+ 'بِ_رِعَايَة': 'بِ:gen',
+ 'بِ_رَغم': 'رَغمَ:gen', # despite
+ 'بِ_رَغم_أَنَّ': 'رَغمَ:gen', # despite
+ 'بِ_رَغم_مِن': 'رَغمَ:gen', # despite
+ 'بِ_رَغم_مِن_أَن': 'بِ:gen',
+ 'بِ_رَغم_مِن_أَنَّ': 'رَغمَ:gen', # despite
+ 'بِ_رَغم_مِن_أَنَّ_هُوَ': 'بِ:gen',
+ 'بِ_رِفقَة': 'بِرِفقَةٍ:gen', # birifqatin = in company of
+ 'بِ_رِئَاسَة': 'بِ:gen',
+ 'بِ_سَبّ': 'بِ:gen',
+ 'بِ_سَبَب': 'بِسَبَبِ:gen', # bisababi = because of
+ 'بِ_شَأن': 'بِشَأنِ:gen', # bišaʾni = about, regarding (lit. with + matter)
+ 'بِ_شَرط_أَن': 'بِ:gen',
+ 'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to
+ 'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of
+ 'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as
+ 'بِ_صُورَة': 'بِ:gen',
+ 'بِ_عَكس': 'بِ:gen',
+ 'بِ_عَلَى': 'بِ:gen',
+ 'بِ_عَن': 'بِ:gen',
+ 'بِ_عَين': 'بِ:gen',
+ 'بِ_غَضّ_نَظَر_عَن': 'بِغَضِّ_اَلنَّظَرِ_عَن:gen', # biġaḍḍi an-naẓari ʿan = regardless of
+ 'بِ_فَضل': 'بِفَضلِ:gen', # bifaḍli = thanks to
+ 'بِ_فِي': 'بِ:gen',
+ 'بِ_قَدر': 'بِ:gen',
+ 'بِ_قُرب_مِن': 'بِاَلقُربِ_مِن:gen', # bi-al-qurbi min = near (with proximity to)
+ 'بِ_قَصد': 'بِقَصدِ:gen', # biqaṣdi = with intention
+ 'بِ_كَ': 'بِ:gen',
+ 'بِ_لِ': 'بِ:gen',
+ 'بِ_لَا': 'بِ:gen',
+ 'بِ_مَا_أَنَّ': 'بِ:gen',
+ 'بِ_مَثَابَة': 'بِ:gen',
+ 'بِ_مِثلَ': 'مِثلَ', # miṯla = like
+ 'بِ_مُجَرَّد': 'بِ:gen',
+ 'بِ_مُسَاعَدَة': 'بِ:gen',
+ 'بِ_مُشَارَكَة': 'بِمُشَارَكَةِ:gen', # bimušārakati = with participation of
+ 'بِ_مُقَارَنَة_بِ': 'بِاَلمُقَارَنَةِ_بِ:gen', # bi-al-muqāranati bi = in comparison to
+ 'بِ_مُقتَضَى': 'بِمُقتَضَى:gen', # bimuqtaḍā = with requirement of
+ 'بِ_مِقدَار': 'بِ:gen',
+ 'بِ_مِن': 'بِ:gen',
+ 'بِ_مُنَاسَبَة': 'بِمُنَاسَبَةِ:gen', # bimunāsabati = on the occasion of
+ 'بِ_مُوجِب': 'بِمُوجِبِ:gen', # bimūǧibi = with motive
+ 'بِ_نَتِيجَة': 'بِ:gen',
+ 'بِ_نَحوَ': 'بِ:gen', # by about N
+ 'بِ_نِسبَة': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati (bin-nisbati) = in proportion/relation to
+ 'بِ_نِسبَة_إِلَى': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati ʾilā (bin-nisbati ʾilā) = in proportion/relation to
+ 'بِ_نِسبَة_لِ': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to
+ 'بِ_نِسبَة_لِ_مِن': 'بِاَلنِّسبَةِ_لِ:gen', # bi an-nisbati li (bin-nisbati li) = in proportion/relation to
+ 'بِ_نَظَر_إِلَى': 'بِ:gen',
+ 'بِ_نِيَابَة_عَن': 'بِاَلنِّيَابَةِ_عَن:gen', # bi-an-niyābati ʿan = on behalf of
+ 'بِ_هَدَف': 'بِهَدَفِ:gen', # bihadafi = with goal
+ 'بِ_وَ_لِ': 'بِ:gen',
+ 'بِ_وَاسِطَة': 'بِوَاسِطَةِ:gen', # biwāsiṭati = by means of
+ 'بِ_وَاقِع': 'بِ:gen',
+ 'بِ_وَسَط': 'بِوَسَطِ:gen', # biwasaṭi = in the middle of
+ 'بِ_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle
+ 'بِ_وَصف': 'بِ:gen',
+ 'بازاء': 'بِ:gen',
+ 'بالتسخين': 'بِ:gen',
+ 'بَدَلًا_مِن': 'بَدَلًا_مِن:gen', # badalan min = instead of
+ 'بدون': 'دُونَ:gen', # without
+ 'بشان': 'بِشَأنِ:gen',
+ 'بَعدَ': 'بَعدَ:gen', # baʿda = after
+ 'بَعدَ_أَن': 'بَعدَ:gen', # baʿda ʾan = after + clause
+ 'بَعدَ_حَوَالَى': 'بَعدَ:gen', # baada hawala
+ 'بَعدَ_نَحوَ': 'بَعدَ:gen', # after about N
+ 'بَعدَمَا': 'بَعدَ:gen', # baʿdamā = after
+ 'بُعَيدَ': 'بُعَيدَ:gen', # buʿayda = shortly after
+ 'بَل': 'قَبلَ:gen',
+ 'بِنَاء_عَلَى': 'بناء_عَلَى:gen',
+ 'بناء_عَلَى': 'بناء_عَلَى:gen', # bnāʾ ʿalā = based on
+ 'بناء_لِ': 'لِ:gen',
+ 'بَيدَ': 'بِ:gen',
+ 'بَيدَ_أَنَّ': 'بِ:gen',
+ 'بَينَ': 'بَينَ:gen', # bayna = between
+ 'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala
+ 'بينا': 'بَينَ:gen', # bayna = between
+ 'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between
+ 'بَينَمَا': 'بَينَ:gen',
+ 'بَينَمَا_لَم': 'بَينَ:gen',
+ 'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing
+ 'تَحتَ': 'تَحتَ:gen', # tahta = under
+ 'ثَمَّ': 'بِ:gen',
+ 'ثُمَّ': 'بِ:gen',
+ 'جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of
+ 'حَتَّى': 'حَتَّى:gen', # ḥattā = until
+ 'حَتَّى_أَنَّ': 'حَتَّى:gen', # before
+ 'حَتَّى_إِنَّ': 'حَتَّى:gen', # before
+ 'حَتَّى_بِ': 'حَتَّى:gen', # before
+ 'حَتَّى_لَو': 'لَو', # even if
+ 'حَتَّى_وَ_لَو': 'لَو', # even if
+ 'حَتَّى_وإن': 'إِنَّ',
+ 'حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on
+ 'حَسَبَمَا': 'حَسَبَ:gen', # ḥasaba = according to, depending on
+ 'حَوَالَى': 'حَوَالَى', # ḥawālā = around, about
+ 'حَوَالَى_مِن': 'مِن:gen', # hawala min = from around X
+ 'حَولَ': 'حَولَ:gen', # ḥawla = about
+ 'حولما_إِذَا': 'إِذَا',
+ 'حَولَ_مَا_إِذَا': 'إِذَا',
+ 'حِيَالَ': 'حِيَالَ:gen', # ḥiyāla = concerning
+ 'حَيثُ': 'حَيثُ', # remove morphological case; ḥayṯu = where (SCONJ, not ADV)
+ 'حِينَمَا': 'فِي_حِينِ', # during
+ 'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside
+ 'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during
+ 'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind
+ 'دَاخِل':
+ 'دَاخِلَ:gen', # dāḫila = inside of
+ 'دَاخِلَ':
+ 'دَاخِلَ:gen', # dāḫila = inside of
+ 'دُونَ': 'دُونَ:gen', # dūna = without
+ 'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without
+ 'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without
+ 'دونما': 'دُونَ:gen',
+ 'ذٰلِكَ_بَعدَمَا': 'بَعدَ:gen',
+ 'ذٰلِكَ_عِندَمَا': 'بِ:gen',
+ 'ذٰلِكَ_لِأَنَّ': 'لِأَنَّ', # because
+ 'ذٰلِكَ_لِكَي': 'لِكَي', # li-kay = in order to
+ 'ذٰلِكَ_نَظَر_لِ': 'بِ:gen',
+ 'رَغمَ': 'رَغمَ:gen', # raġma = despite
+ 'رَغمَ_أَنَّ': 'رَغمَ:gen', # raġma ʾanna = despite + clause
+ 'رَغمَ_أَنَّ_مِن': 'رَغمَ:gen', # raġma ʾanna min = despite
+ 'رَهنَ': 'رَهنَ:gen', # rahna = depending on
+ 'رَيثَمَا': 'رَهنَ:gen', # rahna = depending on
+ 'سِوَى': 'سِوَى:gen', # siwā = except for
+ 'سِوَى_أَنَّ_هُوَ': 'سِوَى:gen', # siwā = except for
+ 'سِوَى_بِ': 'سِوَى:gen', # siwā = except for
+ 'سِوَى_عَلَى': 'سِوَى:gen', # siwā = except for
+ 'سِوَى_لِ': 'سِوَى:gen', # siwā = except for
+ 'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against
+ 'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among
+ 'طَالَمَا':
+ 'طَالَمَا', # ṭālamā = as long as
+ 'طالَما':
+ 'طَالَمَا', # ṭālamā = as long as
+ 'طَالَمَا_أَنَّ':
+ 'طَالَمَا', # ṭālamā = as long as
+ 'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout
+ 'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during
+ 'عبر': 'عَبرَ:gen',
+ 'عَبرَ': 'عَبرَ:gen', # ʿabra = via
+ 'عَدَا': 'عَدَا:gen', # ʿadā = except for
+ 'عَقِبَ': 'عَقِبَ:gen', # ʿaqiba = following
+ 'عَقِبَ_أَن': 'عَقِبَ:gen', # ʿaqiba = following
+ 'عَقِبَ_مِن': 'عَقِبَ:gen', # ʿaqiba = following
+ 'عَلَى': 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أبواب': 'عَلَى:gen',
+ 'عَلَى_إِثرَ': 'إِثرَ:gen', # ʿalā ʾiṯri = right after
+ 'عَلَى_أَثَر': 'عَلَى:gen',
+ 'عَلَى_اِختِلَاف': 'عَلَى:gen',
+ 'عَلَى_أَسَاس': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on
+ 'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on
+ 'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that
+ 'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless
+ 'عَلَى_الفور':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_إِلَى':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَن':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَنَّ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَن_بِ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَنَّ_عَلَى':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَنَّ_مِن_شَأن':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَنَّ_هُوَ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_أَنَّ_هُوَ_لَدَى':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_بِ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_بِ_فِي':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_بَينَ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_حَدّ':
+ 'عَلَى:gen', # ʿalā = on
+ 'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of
+ 'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on
+ 'عَلَى_حَولَ': 'عَلَى:gen',
+ 'عَلَى_رَأس': 'عَلَى_رَأسِ:gen', # ʿalā raʾsi = on top of
+ 'عَلَى_رَغم': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite
+ 'عَلَى_رَغمَ_أَنَّ': 'رَغمَ:gen', # ʿalā raġma ʾanna = despite + clause
+ 'عَلَى_رَغم_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite
+ 'عَلَى_رَغم_مِن': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite
+ 'عَلَى_رَغم_مِن_أَنَّ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite
+ 'عَلَى_رَغم_مِن_أَنَّ_هُوَ': 'عَلَى_رَغمِ:gen', # ʿalā raġmi = despite
+ 'عَلَى_طَرِيقَة': 'عَلَى_طَرِيقَةِ:gen', # ʿalā ṭarīqati = on the way
+ 'عَلَى_عَكس': 'عَلَى:gen',
+ 'عَلَى_غِرَار': 'عَلَى_غِرَارِ:gen', # ʿalā ġirāri = similar to
+ 'عَلَى_قَيد': 'عَلَى:gen',
+ 'عَلَى_لِسَان': 'عَلَى:gen',
+ 'عَلَى_مِثلَ': 'مِثلَ', # miṯla = like
+ 'عَلَى_مدى': 'عَلَى:gen',
+ 'عَلَى_مَدَى': 'عَلَى_مَدَى:gen', # ʿalā madā = on period
+ 'عَلَى_مَقرَبَة_مِن': 'عَلَى_مَقرَبَةٍ_مِن:gen', # ʿalā maqrabatin min = in the vicinity of
+ 'عَلَى_مِن': 'عَلَى:gen',
+ 'عَلَى_نَحوَ': 'عَلَى:gen', # to about N
+ 'عَلَى_يَد': 'عَلَى:gen',
+ 'عَن': 'عَن:gen', # ʿan = about, from
+ 'عَن_أَن': 'عَن:gen',
+ 'عَن_أَنَّ': 'عَن:gen',
+ 'عَن_أَنَّ_وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond
+ 'عَن_بِ': 'عَن:gen',
+ 'عَن_طَرِيق': 'عَن_طَرِيقِ:gen', # ʿan ṭarīqi = via
+ 'عَن_فِي_أَن': 'عَن:gen',
+ 'عَن_قُربَ': 'قُربَ:gen', # qurba = near
+ 'عَن_مِثلَ': 'مِثلَ', # miṯla = like
+ 'عَن_مِن': 'عَن:gen',
+ 'عِندَ': 'عِندَمَا', # ʿinda = when
+ 'عِندَمَا': 'عِندَمَا', # ʿindamā = when
+ 'غَيرَ': 'إِلَّا',
+ 'فَ': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_إِذَا': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_بَدَل_مِن_أَن': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_بَينَ': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_عَلَى': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_فِي': 'فَ', # fa = so (advcl or coordination)
+ 'فَ_مِن': 'فَ', # fa = so (advcl or coordination)
+ 'فَورَ': 'فَورَ:gen', # fawra = as soon as
+ 'فَوقَ': 'فَوقَ:gen', # fawqa = above, over
+ 'فِي': 'فِي:gen', # fī = in
+ 'فِي_اِتِّجَاه': 'بِاِتِّجَاهِ:gen', # bi-ittiǧāhi = towards
+ 'فِي_أَثنَاءَ': 'أَثنَاءَ:gen', # ʾaṯnāʾa = during
+ 'فِي_إِطَار': 'فِي_إِطَار:gen', # fī ʾiṭār = in frame
+ 'فِي_اعقاب': 'فِي_أَعقَابِ:gen',
+ 'فِي_إِلَى': 'فِي:gen',
+ 'فِي_أَن': 'فِي:gen',
+ 'فِي_أَنَّ': 'فِي:gen',
+ 'فِي_أَنَّ_عَلَى': 'فِي:gen',
+ 'فِي_أَنَّ_لَدَى': 'فِي:gen',
+ 'فِي_أَنَّ_مِن': 'فِي:gen',
+ 'فِي_بِ': 'فِي:gen',
+ 'فِي_بِ_فِي': 'فِي:gen',
+ 'فِي_بَاطِن': 'فِي:gen',
+ 'فِي_بَعدَ': 'فِي:gen',
+ 'فِي_بَينَ': 'بَينَ:gen',
+ 'فِي_حَال': 'فِي_حَالِ:gen', # fī ḥāli = in case
+ 'فِي_حَالَة': 'فِي_حَالِ:gen', # fī ḥāli = in case
+ 'فِي_حَدّ': 'فِي:gen',
+ 'فِي_حُضُور': 'فِي_حُضُورِ:gen', # fī ḥuḍūri = in presence of
+ 'فِي_حَقّ': 'فِي:gen',
+ 'فِي_حُكم': 'فِي:gen',
+ 'فِي_حَوَالَى': 'فِي:gen', # fi hawala = in around X
+ 'فِي_حِين':
+ 'فِي_حِينِ', # fī ḥīni = while
+ 'فِي_حِينَ':
+ 'فِي_حِينِ', # fī ḥīni = while
+ 'فِي_حِين_أَنَّ':
+ 'فِي_حِينِ', # fī ḥīni = while
+ 'فِي_حِينَ_أَنَّ_هُوَ':
+ 'فِي_حِينِ', # fī ḥīni = while
+ 'فِي_خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside
+ 'فِي_خِتَام': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion
+ 'فِي_خِتَامِ': 'فِي_خِتَامِ:gen', # fī ḫitāmi = in conclusion
+ 'فِي_خِلَالَ': 'فِي:gen',
+ 'فِي_دَاخِل':
+ 'دَاخِلَ:gen',
+ 'فِي_دَاخِلَ': 'فِي:gen',
+ 'فِي_سَبِيل': 'فِي_سَبِيلِ:gen', # fī sabīli = in order to
+ 'فِي_سِيَاق': 'فِي:gen',
+ 'فِي_شَأن': 'فِي_شَأنِ:gen', # fī šaʾni = in regard of
+ 'فِي_شَكل': 'فِي:gen',
+ 'فِي_صَفّ': 'فِي:gen',
+ 'فِي_صُورَة': 'فِي:gen',
+ 'فِي_ضَوء': 'فِي_ضَوءِ:gen', # fī ḍawʾi = in light of
+ 'فِي_ظِلّ': 'فِي_ظِلِّ:gen', # fī ẓilli = in light of
+ 'فِي_عُقب': 'فِي_أَعقَابِ:gen', # fī ʾaʿqābi = in the aftermath of
+ 'فِي_غَضن': 'فِي:gen',
+ 'فِي_غُضُون': 'فِي:gen',
+ 'فِي_مَا': 'فِي:gen',
+ 'فِي_مِثلَ': 'مِثلَ', # miṯla = like
+ 'فِي_مَجَال': 'فِي_مَجَالِ:gen', # fī maǧāli = in the area of
+ 'فِي_مستشفى': 'فِي:gen',
+ 'فِي_مَعَ': 'فِي:gen',
+ 'فِي_مُقَابِلَ': 'مُقَابِلَ:gen',
+ 'فِي_مَقدَم': 'فِي:gen',
+ 'فِي_مِن': 'فِي:gen',
+ 'فِي_مُنَاسَبَة': 'فِي_مُنَاسَبَةِ:gen', # fī munāsabati = on the occasion of
+ 'فِي_مُوَاجَهَة': 'فِي:gen',
+ 'فِي_نَحوَ': 'فِي:gen', # in about N
+ 'فِي_نِطَاق': 'فِي:gen',
+ 'فِي_وَجه': 'فِي:gen',
+ 'فِي_وَسط': 'وَسطَ:gen',
+ 'فِي_وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle
+ 'فِيمَا': 'فِيمَا', # fīmā = while
+ 'قُبَالَةَ': 'قُبَالَةَ:gen', # qubālata = in front of, facing
+ 'قَبلَ': 'قَبلَ:gen', # qabla = before
+ 'قَبلَ_أَن': 'قَبلَ:gen', # qabla = before
+ 'قَبلَ_حَوَالَى': 'قَبلَ:gen', # qabla hawala
+ 'قَبلَ_نَحوَ': 'قَبلَ:gen', # before about N
+ 'قُبَيلَ': 'قُبَيلَ:gen', # qubayla = before
+ 'قُربَ': 'قُربَ:gen', # qurba = near
+ 'قَيدَ': 'فِي:gen',
+        'كَ': 'كَ:gen', # ka = as, like
+ 'كَ_أَنَّ': 'كَ:gen',
+ 'كَ_لِ': 'كَ:gen',
+ 'كَ_وَ_وَ': 'كَ:gen',
+ 'كَأَنَّمَا': 'كَأَنَّمَا', # ka-ʾannamā = as if
+ 'كُلَّمَا': 'كُلَّمَا', # kullamā = whenever
+ 'كَمَا': 'كَمَا', # remove morphological case; kamā = as
+ 'كَي': 'لِكَي', # kay = in order to
+ 'لَ': 'لِ:gen',
+ 'لَ_عَلَّ': 'لِ:gen',
+ 'لِ': 'لِ:gen', # li = to
+ 'لِ_أَجَلّ': 'لِ:gen',
+ 'لِ_إِلَى': 'لِ:gen',
+ 'لِ_أَمَامَ_وَ': 'لِ:gen',
+ 'لِ_أَن': 'لِ:gen',
+ 'لِ_بِ': 'لِ:gen',
+ 'لِ_جِهَة': 'لِ:gen',
+ 'لِ_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of
+ 'لِ_حَوَالَى': 'لِ:gen', # li hawala = for around X
+ 'لِ_خَارِجَ': 'لِخَارِجِ:gen', # liḫāriǧi = out
+ 'لِ_دُخُول': 'لِ:gen',
+ 'لِ_دَرَجَة_أَنَّ': 'لِ:gen',
+ 'لِ_سَبَب': 'لِ:gen',
+ 'لِ_صَالِح': 'لِصَالِحِ:gen', # liṣāliḥi = in interest of
+ 'لِ_عَلَى': 'لِ:gen',
+ 'لِ_عَن': 'لِ:gen',
+ 'لِ_عِندَ': 'لِ:gen',
+ 'لِ_فِي': 'لِ:gen',
+ 'لِ_فِي_بَينَ': 'لِ:gen',
+ 'لِ_كَون': 'لِكَونِ', # likawni = because
+ 'لِ_لِئَلّا': 'لِ:gen',
+ 'لِ_مِثلَ': 'مِثلَ', # miṯla = like
+ 'لِ_مَعَ': 'لِ:gen',
+ 'لِ_مِن': 'لِ:gen',
+ 'لِ_نَحوَ': 'لِ:gen', # to/for about N
+ 'لِ_وَ': 'لِ:gen',
+ 'لِ_وَ_فِي': 'لِ:gen',
+ 'لَا': 'إِلَّا',
+ 'لَا_سِيَّمَا_بَعدَ': 'بَعدَ:gen',
+ 'لَا_سِيَّمَا_وَ_أَنَّ': 'أَنَّ',
+ 'لَا_سِيَّمَا_وَ_أَنَّ_هُوَ': 'أَنَّ',
+ 'لِأَنَّ': 'لِأَنَّ', # remove morphological case; li-ʾanna = because
+ 'لدى': 'لَدَى:gen',
+ 'لَدَى': 'لَدَى:gen', # ladā = with, by, of, for
+ 'لِذَا': 'لِذَا', # liḏā = so, therefore
+ 'لِذَا_فَ': 'لِ:gen',
+ 'لِذٰلِكَ': 'لِذَا', # liḏā = so, therefore
+ 'لٰكِنَّ': 'مَعَ:gen',
+ 'لكن_إِذَا': 'إِذَا',
+ 'لكن_بِ': 'بِ:gen',
+ 'لٰكِن_بَعدَ': 'بَعدَ:gen',
+ 'لكن_دَاخِلَ': 'دَاخِلَ:gen',
+ 'لكن_لَدَى': 'لَدَى:gen',
+ 'لٰكِن_مَعَ': 'مَعَ:gen',
+ 'لِكَي': 'لِكَي', # li-kay = in order to
+ 'لَمَّا': 'كُلَّمَا',
+ 'لَمَّا_لِ': 'كُلَّمَا',
+ 'لَو': 'لَو', # law = if
+ 'لَو_أَنَّ': 'لَو', # if
+ 'لَو_مِن': 'لَو', # if
+ 'ما': 'مِمَّا',
+ 'مَا': 'مِمَّا',
+ 'ما_دَام': 'مِمَّا',
+ 'مادامت': 'مِمَّا',
+ 'مَالَم': 'مَالَم', # mālam = unless
+ 'مَا_إِذَا': 'إِذَا',
+ 'مِثلَ': 'مِثلَ', # remove morphological case; miṯla = like
+ 'مِثلَمَا': 'مِثلَ', # miṯla = like
+ 'مَعَ': 'مَعَ:gen', # maʿa = with
+ 'مَعَ_أَنَّ': 'مَعَ:gen',
+ 'مَعَ_بِ': 'مَعَ:gen',
+ 'مَعَ_فِي': 'مَعَ:gen',
+ 'مَعَ_مِن_بَينَ': 'بَينَ:gen',
+ 'مقابل': 'مُقَابِلَ:gen',
+ 'مُقَابِلَ': 'مُقَابِلَ:gen', # muqābila = in exchange for, opposite to, corresponding to
+ 'مُقَابِلَ_حَوَالَى': 'مُقَابِلَ:gen', # muqabila hawala
+ 'مُقَارَن_بِ': 'بِ:gen',
+ 'مِمَّا': 'مِمَّا', # mimmā = that, which
+ 'مِمَّا_لَدَى': 'مِمَّا', # mimmā = that, which
+ 'مِن': 'مِن:gen', # min = from
+ 'مِن_اجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of
+ 'مِن_أَجل': 'مِن_أَجلِ:gen', # min ʾaǧli = for the sake of
+ 'مِن_أَجل_أَن': 'مِن:gen',
+ 'مِن_إِلَى': 'مِن:gen',
+ 'مِن_أَن': 'مِن:gen',
+ 'مِن_أَنَّ': 'مِن:gen',
+ 'مِن_بِ': 'مِن:gen',
+ 'مِن_بَعدَ': 'مِن:gen',
+ 'مِن_بَينَ': 'بَينَ:gen',
+ 'مِن_تَحتَ': 'مِن:gen',
+ 'مِن_ثَمَّ': 'مِن:gen',
+ 'مِن_ثُمَّ': 'مِن:gen',
+ 'مِن_جَانِب': 'إِلَى_جَانِبِ:gen', # min ǧānibi = beside
+ 'مِن_جَرَّاء': 'جَرَّاء:gen', # ǧarrāʾ = because of
+ 'مِن_حَوَالَى': 'مِن:gen', # min hawala = from around X
+ 'مِن_حَولَ': 'مِن:gen',
+ 'مِن_حَيثُ': 'مِن:gen',
+ 'مِن_خَارِج': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside
+ 'مِن_خَارِجَ': 'مِن_خَارِجِ:gen', # min ḫāriǧi = from outside
+ 'مِن_خِلَالَ': 'مِن_خِلَالِ:gen', # min ḫilāli = through, during
+ 'مِن_دَاخِلَ': 'مِن_دَاخِلِ:gen', # min dāḫili = from inside
+ 'مِن_دُون': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath
+ 'مِن_دُونَ': 'مِن_دُونِ:gen', # min dūni = without, beneath, underneath
+ 'مِن_دُون_أَن': 'مِن_دُونِ:gen',
+ 'مِن_دُونَ_أَن': 'مِن_دُونِ:gen', # min dūni ʾan = without, beneath, underneath + clause
+ 'مِن_زَاوِيَة': 'مِن:gen',
+ 'مِن_شَأن': 'مِن_شَأنِ:gen', # min šaʾni = from matter
+ 'مِن_ضِمنَ': 'مِن_ضِمنِ:gen', # min ḍimni = from within = including
+ 'مِن_طَرَف': 'مِن:gen',
+ 'مِن_عَلَى': 'مِن:gen',
+ 'مِن_عِندَ': 'مِن:gen',
+ 'مِن_غَير_أَن': 'مِن:gen',
+ 'مِن_فَوقَ': 'مِن_فَوقِ:gen', # min fawqi = from above
+ 'مِن_فِي': 'مِن:gen',
+ 'مِن_قَبلَ': 'مِن_قِبَلِ:gen',
+ 'مِن_قِبَل': 'مِن_قِبَلِ:gen', # min qibali = by
+ 'مِن_قِبَل_بِ_فِي': 'مِن_قِبَلِ:gen', # min qibali = by
+ 'مِن_مِثلَ': 'مِثلَ', # miṯla = like
+ 'مِن_مِن': 'مِن:gen',
+ 'مِن_مِن_بَينَ': 'بَينَ:gen',
+ 'مِن_مَوقِع': 'مِن:gen',
+ 'مِن_نَاحِيَة': 'مِن:gen',
+ 'مِن_وَرَاءَ': 'مِن_وَرَاءِ:gen', # min warāʾi = from behind
+ 'مُنذُ': 'مُنذُ:gen', # munḏu = since
+ 'مُنذُ_أَن': 'مُنذُ:gen',
+ 'مُنذُ_نَحوَ': 'مُنذُ:gen', # since about N
+ 'مُنذُ_وَ_فِي': 'مُنذُ:gen',
+ 'مَهمَا': 'مَهمَا', # mahmā = regardless
+ 'نَاهِيك_بِ': 'بِ:gen',
+ 'نَتِيجَة_لِ': 'لِ:gen',
+ 'نَحوَ': 'نَحوَ', # naḥwa = about, approximately
+ 'نَحوَ_بِ': 'بِ:gen', # about by N
+ 'هذا_بالأضافة': 'بِ:gen',
+ 'وان': 'أَنَّ',
+ 'وإن': 'إِنَّ',
+ 'وبشان': 'بِشَأنِ:gen',
+ 'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond
+ 'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle
+ 'وِفقَ': 'وِفقَ:gen', # wifqa = according to
+ 'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to
+ 'ولو': 'إِذَا', # walaw = even if
+ 'ولو_أَنَّ': 'إِذَا' # walaw = even if
+ }
+
+ def copy_case_from_adposition(self, node, adposition):
+ """
+ In some treebanks, adpositions have the Case feature and it denotes the
+ valency case that the preposition's nominal must be in.
+ """
+ # The following is only partial solution. We will not see
+ # some children because they may be shared children of coordination.
+ prepchildren = [x for x in node.children if x.lemma == adposition]
+ if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
+ return adposition+':'+prepchildren[0].feats['Case'].lower()
+ else:
+ return None
+
+ @staticmethod
+ def compose_edeprel(bdeprel, cdeprel):
+ """
+ Composes enhanced deprel from the basic part and optional case
+ enhancement.
+
+ Parameters
+ ----------
+ bdeprel : str
+ Basic deprel (can include subtype, e.g., 'acl:relcl').
+        cdeprel : str
+ Case enhancement (can be composed of adposition and morphological
+ case, e.g., 'k:dat'). It is optional and it can be None or empty
+ string if there is no case enhancement.
+
+ Returns
+ -------
+ Full enhanced deprel (str).
+ """
+ assert(bdeprel[-1] != ':')
+ edeprel = bdeprel
+ if cdeprel:
+ assert(cdeprel[0] != ':')
+ edeprel += ':'+cdeprel
+ return edeprel
+
+ def process_tree(self, tree):
+ """
+        Occasionally the edeprels automatically derived from the Arabic basic
+        trees do not match the whitelist. For example, the noun is an
+ abbreviation and its morphological case is unknown.
+
+ We cannot use the process_node() method because it ignores empty nodes.
+ """
+ for node in tree.descendants_and_empty:
+ for edep in node.deps:
+ if edep['deprel'] == 'advcl:pred:إِذَن' or edep['deprel'] == 'advcl:pred:كدا' or edep['deprel'] == 'advcl:pred:لكن':
+ edep['deprel'] = 'advcl:pred'
+ continue
+ if edep['deprel'] == 'nmod:بِأَسْرِ:gen':
+ edep['deprel'] = 'nmod'
+ continue
+ m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel'])
+ if m:
+ bdeprel = m.group(1)
+ cdeprel = m.group(2)
+ solved = False
+ # Arabic clauses often start with وَ wa "and", which does not add
+ # much to the meaning but sometimes gets included in the enhanced
+ # case label. Remove it if there are more informative subsequent
+ # morphs.
+ cdeprel = re.sub(r'^وَ_', r'', cdeprel)
+ cdeprel = re.sub(r'^وَ:', r'', cdeprel)
+ cdeprel = re.sub(r'^وَ$', r'', cdeprel)
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ # If one of the following expressions occurs followed by another preposition
+ # or by morphological case, remove the additional case marking.
+ for x in self.outermost:
+ exceptions = self.outermost[x]
+ m = re.fullmatch(x+r'([_:].+)?', cdeprel)
+ if m and m.group(1) and not x+m.group(1) in exceptions:
+ cdeprel = x
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ solved = True
+ break
+ if solved:
+ continue
+ # Split preposition from morphological case (if any), normalize
+ # the preposition and add the fixed morphological case where
+ # applicable.
+ m = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel)
+ adposition = m.group(1) if m else cdeprel
+ if adposition in self.unambiguous:
+ cdeprel = self.unambiguous[adposition]
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ continue
+
+ def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+ '''
+ Modifies the incoming relation of a node both in the basic tree and in
+ the enhanced graph. If the node does not yet depend in the enhanced
+ graph on the current basic parent, the new relation will be added without
+ removing any old one. If the node already depends multiple times on the
+ current basic parent in the enhanced graph, all such enhanced relations
+ will be removed before adding the new one.
+ '''
+ old_parent = node.parent
+ node.parent = parent
+ node.deprel = deprel
+ node.deps = [x for x in node.deps if x['parent'] != old_parent]
+ new_edep = {}
+ new_edep['parent'] = parent
+ new_edep['deprel'] = edeprel
+ node.deps.append(new_edep)
diff --git a/udapi/block/ud/basic2enhanced.py b/udapi/block/ud/basic2enhanced.py
new file mode 100644
index 00000000..bc5c8b25
--- /dev/null
+++ b/udapi/block/ud/basic2enhanced.py
@@ -0,0 +1,23 @@
+"""Block ud.Basic2Enhanced for copying basic dependencies to enhanced where missing.
+
+UD treebanks are not required to have enhanced dependencies (https://universaldependencies.org/u/overview/enhanced-syntax.html).
+However, if such annotation is present (in the DEPS column of CoNLL-U),
+it must be present in all nodes and all nodes must be reachable from the root
+in the enhanced-deps graph (as checked by the validator).
+There may be use cases where enhanced deps are annotated only in some kinds of nodes (e.g. empty nodes)
+and the rest of the nodes are expected to be the same as in the basic dependencies.
+To make such file valid, one can use this block.
+
+This block should not be used on a file with no enhanced dependencies:
+It makes no sense to just duplicate the HEAD+DEPREL information also in the DEPS column.
+"""
+from udapi.core.block import Block
+
+
+class Basic2Enhanced(Block):
+ """Make sure DEPS column is always filled."""
+
+ def process_tree(self, tree):
+ for node in tree.descendants_and_empty:
+ if node.raw_deps == "_":
+ node.raw_deps = f"{node.parent.ord}:{node.deprel}"
diff --git a/udapi/block/ud/bg/removedotafterabbr.py b/udapi/block/ud/bg/removedotafterabbr.py
index d1d94628..a132dad1 100644
--- a/udapi/block/ud/bg/removedotafterabbr.py
+++ b/udapi/block/ud/bg/removedotafterabbr.py
@@ -7,6 +7,7 @@
"""
from udapi.core.block import Block
+
class RemoveDotAfterAbbr(Block):
"""Block for deleting extra PUNCT nodes after abbreviations.
diff --git a/udapi/block/ud/ca/__init__.py b/udapi/block/ud/ca/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/ca/addmwt.py b/udapi/block/ud/ca/addmwt.py
new file mode 100644
index 00000000..49b79da1
--- /dev/null
+++ b/udapi/block/ud/ca/addmwt.py
@@ -0,0 +1,194 @@
+"""Block ud.ca.AddMwt for heuristic detection of Catalan contractions.
+
+According to the UD guidelines, contractions such as "del" = "de el"
+should be annotated using multi-word tokens.
+
+Note that this block should be used only for converting legacy conllu files.
+Ideally a tokenizer should have already split the MWTs.
+"""
+import re
+import udapi.block.ud.addmwt
+
# Catalan contractions: surface token -> the two syntactic words it consists of.
# 'form' lists the word forms, 'lemma' the corresponding lemmas (note that the
# plural articles 'els' lemmatize to the singular 'el', per UD conventions).
MWTS = {
    'al': {'form': 'a el', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'als': {'form': 'a els', 'lemma': 'a el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
    'del': {'form': 'de el', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'dels': {'form': 'de els', 'lemma': 'de el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
    'pel': {'form': 'per el', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
    'pels': {'form': 'per els', 'lemma': 'per el', 'feats': '_ Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
}

# shared values for all entries in MWTS
for v in MWTS.values():
    # BUG FIX: the per-entry 'lemma' values above must NOT be overwritten with
    # v['form'] — that clobbered e.g. 'dels' -> lemma 'de els' instead of the
    # intended 'de el' (the determiner lemma is singular) and made the explicit
    # lemma entries dead data.
    v['upos'] = 'ADP DET'
    v['deprel'] = '* det'
    # The following are the default values
    # v['main'] = 0 # which of the two words will inherit the original children (if any)
    # v['shape'] = 'siblings', # the newly created nodes will be siblings
+
+
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    # Personal-pronoun normalization rules: (form regex, lemma, FEATS string).
    # The rules are applied in order and every matching rule fires, so a later
    # match overrides an earlier one — do not reorder.
    _PRON_RULES = (
        ("^jo$", 'jo', 'Case=Nom|Number=Sing|Person=1|PronType=Prs'),
        ("^(em|m'|-me|'m|me|m)$", 'jo', 'Case=Acc,Dat|Number=Sing|Person=1|PrepCase=Npr|PronType=Prs'),
        ("^mi$", 'jo', 'Case=Acc|Number=Sing|Person=1|PrepCase=Pre|PronType=Prs'),
        ("^tu$", 'tu', 'Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs'),
        ("^(et|t'|-te|'t|te|t)$", 'tu', 'Case=Acc,Dat|Number=Sing|Person=2|Polite=Infm|PrepCase=Npr|PronType=Prs'),
        ("^ti$", 'tu', 'Case=Acc|Number=Sing|Person=2|Polite=Infm|PrepCase=Pre|PronType=Prs'),
        # Strong forms of third person pronouns can be used as subjects or after
        # preposition. Do not mark them as nominative (because of the prepositions).
        ("^ell$", 'ell', 'Gender=Masc|Number=Sing|Person=3|PronType=Prs'),
        ("^ella$", 'ell', 'Gender=Fem|Number=Sing|Person=3|PronType=Prs'),
        ("^(el|-lo|'l|lo)$", 'ell', 'Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs'),
        ("^(la|-la)$", 'ell', 'Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs'),
        ("^(l')$", 'ell', 'Case=Acc|Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs'),
        ("^(ho|-ho)$", 'ell', 'Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs'),
        ("^(li|-li)$", 'ell', 'Case=Dat|Number=Sing|Person=3|PronType=Prs'),
        ("^(es|s'|-se|'s|se|s)$", 'ell', 'Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes'),
        ("^si$", 'ell', 'Case=Acc|Person=3|PrepCase=Pre|PronType=Prs|Reflex=Yes'),
        # If nosaltres can be used after a preposition, we should not tag it as nominative.
        ("^nosaltres$", 'jo', 'Number=Plur|Person=1|PronType=Prs'),
        # Nós is the majestic first person singular.
        # In accusative and dative, it is identical to first person plural.
        ("^nós$", 'jo', 'Number=Sing|Person=1|Polite=Form|PronType=Prs'),
        ("^(ens|-nos|'ns|nos|ns)$", 'jo', 'Case=Acc,Dat|Number=Plur|Person=1|PronType=Prs'),
        ("^vosaltres$", 'tu', 'Number=Plur|Person=2|PronType=Prs'),
        # Vós is the formal second person singular; in accusative and dative it is
        # identical to second person plural. Vostè is even more formal than vós;
        # in accusative and dative it is identical to third person singular.
        ("^(vós|vostè)$", 'tu', 'Number=Sing|Person=2|Polite=Form|PronType=Prs'),
        ("^vostès$", 'tu', 'Number=Plur|Person=2|Polite=Form|PronType=Prs'),
        ("^(us|-vos|-us|vos)$", 'tu', 'Case=Acc,Dat|Number=Plur|Person=2|PronType=Prs'),
        # Strong third-person plural forms (see the note on strong forms above).
        ("^ells$", 'ell', 'Gender=Masc|Number=Plur|Person=3|PronType=Prs'),
        ("^elles$", 'ell', 'Gender=Fem|Number=Plur|Person=3|PronType=Prs'),
        # Els is masculine accusative, or dative in any gender.
        ("^(els|-los|'ls|los|ls)$", 'ell', 'Case=Acc,Dat|Number=Plur|Person=3|PronType=Prs'),
        ("^(les|-les)$", 'ell', 'Case=Acc|Gender=Fem|Number=Plur|Person=3|PronType=Prs'),
        # "Adverbial" pronominal clitics that can occur at direct object positions.
        ("^(en|n'|'n|-ne|n|ne)$", 'en', 'Case=Gen|Person=3|PronType=Prs'),
        ("^(hi|-hi)$", 'hi', 'Case=Loc|Person=3|PronType=Prs'),
    )

    # Legitimate (lemma, form-regex) pairs where form and lemma start with
    # different letters; re.match is a prefix match here, as in the original rules.
    _LEMMA_EXCEPTIONS = (
        ('jo', "(em|ens|m'|me|mi|nos|nosaltres|'ns)"),
        ('tu', "(et|'t|us|vosaltres|vostè)"),
        ('el', "(la|l|l'|les)"),
        ('ell', "(hi|ho|'l|l'|la|-la|les|li|lo|-lo|los|'ls|'s|s'|se|-se|si)"),
        ('es', "(s|se)"),
        ('em', "('m|m|m')"),
        ('en', "('n|n'|ne|-ne)"),
        ('anar', "(va|van|vàrem)"),
        ('ser', "(és|era|eren|eres|érem|essent|estat|ets|foren|fos|fossin|fou)"),
        ('estar', "(sigut)"),
        ('caure', "(queia|queies|quèiem|quèieu|queien)"),
        ('ampli', "(àmplia|àmplies)"),
        ('indi', "(índies)"),
        ('obvi', "(òbvia)"),
        ('ossi', "(òssies)"),
        ('ús', "(usos)"),
    )

    def __init__(self, verbpron=False, **kwargs):
        """Create the block.

        verbpron: stored on the instance.
        NOTE(review): verbpron is not read anywhere in this block — confirm
        whether a caller or subclass uses it or whether it can be dropped.
        """
        super().__init__(**kwargs)
        self.verbpron = verbpron

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token."""
        analysis = MWTS.get(node.form.lower())
        if analysis is None:
            return None
        # Modify the default attachment of the new syntactic words in special situations.
        if node.udeprel in ('root', 'conj', 'reparandum'):
            # Build a copy so the shared MWTS entry stays untouched for later usages.
            analysis = dict(analysis, shape='subtree')
        return analysis

    def fix_personal_pronoun(self, node):
        """Normalize lemma and FEATS of personal pronouns (the annotation is messy)."""
        if node.upos != 'PRON':
            return
        for pattern, lemma, feats in self._PRON_RULES:
            if re.match(pattern, node.form, re.IGNORECASE):
                node.lemma = lemma
                node.feats = feats

    def report_suspicious_lemmas(self, node):
        """Print forms whose lemma starts with a different letter.

        Offset issues when splitting multi-word expressions sometimes assign
        a word the lemma of its neighbor; this reports the suspicious cases.
        """
        if node.form.lower()[:1] == node.lemma.lower()[:1]:
            return
        # Exclude legitimate cases where the lemma starts with a different letter.
        for lemma, pattern in self._LEMMA_EXCEPTIONS:
            if node.lemma == lemma and re.match(pattern, node.form, re.IGNORECASE):
                return
        # Form = '2001/37/CE', lemma = 'CE'
        # Form = 'nº5', lemma = '5'
        # Form = 'kg.', lemma = 'quilogram'
        # Form = 'un', lemma = '1'
        if (node.lemma == 'CE' or re.match("nº", node.form, re.IGNORECASE)
                or re.match("^quil[oò]", node.lemma, re.IGNORECASE)
                or re.match("^[0-9]+$", node.lemma)):
            return
        print("Form = '%s', lemma = '%s', address = %s" % (node.form, node.lemma, node.address()))
diff --git a/udapi/block/ud/ca/elque.py b/udapi/block/ud/ca/elque.py
new file mode 100644
index 00000000..6b3ad22b
--- /dev/null
+++ b/udapi/block/ud/ca/elque.py
@@ -0,0 +1,116 @@
+"""
+This block searches for relative clauses modifying a determiner ('el que...').
+It is written for Catalan but a similar block should work for Spanish and other
+Romance languages.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
class ElQue(Block):
    """Find (and optionally rectify) relative clauses modifying a determiner ('el que ...')."""

    def __init__(self, fix=False, **kwargs):
        """
        Default: Print the annotation patterns but do not fix anything.
        fix=1: Do not print the patterns but fix them.
        """
        super().__init__(**kwargs)
        self.fix = fix

    def process_node(self, node):
        """Treat `node` as a potential 'que' pivot of the construction."""
        # 'que' must be a PRON whose parent lies to its right.
        if node.lemma != 'que' or node.upos != 'PRON' or node.parent.ord <= node.ord:
            return
        # The parent of 'que' is called a verb here, although it can also be
        # a non-verbal predicate.
        que, verb = node, node.parent
        # Check the lemma of the determiner; the form may vary for gender and number.
        el = que.prev_node
        if not (el and el.lemma == 'el'):
            return
        adp = None
        before_el = el.prev_node
        if before_el and before_el.upos == 'ADP':
            adp = before_el
            if adp.udeprel == 'fixed':
                adp = adp.parent
        if self.fix:
            self.fix_pattern(adp, el, que, verb)
        else:
            self.print_pattern(adp, el, que, verb)

    def print_pattern(self, adp, el, que, verb):
        """Print the dependency pattern in a one-line Stanford-like notation."""
        def label(parent, candidates):
            # Symbolic name of `parent` within the pattern, or 'OTHER'.
            for candidate, name in candidates:
                if parent == candidate:
                    return name
            return 'OTHER'

        stanford = []
        if adp:
            stanford.append('%s(%s, ADP)' % (
                adp.deprel, label(adp.parent, ((el, 'el'), (que, 'que'), (verb, 'VERB')))))
        stanford.append('%s(%s, el)' % (
            el.deprel, label(el.parent, ((adp, 'ADP'), (que, 'que'), (verb, 'VERB')))))
        # The verb was found as the parent of 'que', so no need to check que's parent.
        stanford.append('%s(VERB, que)' % que.deprel)
        stanford.append('%s(%s, VERB)' % (
            verb.deprel, label(verb.parent, ((adp, 'ADP'), (el, 'el')))))
        print('; '.join(stanford))

    def fix_pattern(self, adp, el, que, verb):
        """Rehang the nodes so that 'el' heads the relative clause."""
        if adp and (adp.parent == que or adp.parent == verb):
            attach(adp, el, 'case')
        if el.parent == que:
            ###!!! Just a temporary change. In the end it will be attached elsewhere.
            attach(el, verb)
            el.parent = verb
            if len(el.deps) == 1:
                el.deps[0]['parent'] = verb
        if verb.parent != adp and verb.parent != el and verb.parent != que:
            udeprel = verb.udeprel
            eldeprel = None
            if udeprel in ('nsubj', 'csubj'):
                eldeprel = 'nsubj'
            elif udeprel == 'ccomp':
                eldeprel = 'obj'
            elif udeprel == 'advcl':
                eldeprel = 'obl'
            elif udeprel == 'acl':
                eldeprel = 'nmod'
            elif udeprel in ('xcomp', 'conj', 'appos', 'root'):
                eldeprel = verb.deprel
            if eldeprel:
                attach(el, verb.parent, eldeprel)
                attach(verb, el, 'acl:relcl')
            # Whatever stands before 'el' but depends on the verb ('cc', 'mark',
            # 'punct' etc.) is re-attached to 'el'.
            for child in verb.children:
                if child.ord < el.ord and child.udeprel in ('cc', 'mark', 'case', 'punct'):
                    attach(child, el)
+
def attach(node, parent, deprel=None):
    """Re-attach `node` under `parent` in the basic tree, optionally with a new deprel.

    In addition, if there are enhanced dependencies and exactly one incoming
    enhanced relation (which is the case in AnCora), mirror the change there.
    """
    node.parent = parent
    if deprel:
        node.deprel = deprel
    if len(node.deps) != 1:
        return
    enhanced = node.deps[0]
    enhanced['parent'] = parent
    if deprel:
        enhanced['deprel'] = deprel
diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py
new file mode 100644
index 00000000..b36b2512
--- /dev/null
+++ b/udapi/block/ud/complywithtext.py
@@ -0,0 +1,342 @@
+r"""Block ComplyWithText for adapting the nodes to comply with the text.
+
+Implementation design details:
+Usually, most of the inconsistencies between tree tokens and the raw text are simple to solve.
+However, there may be also rare cases when it is not clear how to align the tokens
+(nodes in the tree) with the raw text (stored in ``root.text``).
+This block tries to solve the general case using several heuristics.
+
+It starts with running a LCS-like algorithm (LCS = longest common subsequence)
+``difflib.SequenceMatcher`` on the raw text and concatenation of tokens' forms,
+i.e. on sequences of characters (as opposed to running LCS on sequences of tokens).
+
+To prevent mis-alignment problems, we keep the spaces present in the raw text
+and we insert spaces into the concatenated forms (``tree_chars``) according to ``SpaceAfter=No``.
+An example of a mis-alignment problem:
+text "énfase na necesidade" with 4 nodes "énfase en a necesidade"
+should be solved by adding multiword token "na" over the nodes "en" and "a".
+However, running LCS (or difflib) over the character sequences
+"énfaseenanecesidade"
+"énfasenanecesidade"
+may result in énfase -> énfas.
+
+Author: Martin Popel
+"""
+import difflib
+import logging
+import regex
+
+from udapi.core.block import Block
+from udapi.core.mwt import MWT
+
+
class ComplyWithText(Block):
    """Adapt the nodes to comply with the text."""

    def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4,
                 allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True,
                 previous_form_label='CorrectForm', previous_text_label='OrigText',
                 added_label='Added', **kwargs):
        """Args:
        fix_text: After all heuristics are applied, the token forms may still not match the text.
            Should we edit the text to match the token forms (as a last resort)? Default=True.
        prefer_mwt - What to do if multiple subsequent nodes correspond to a text written
            without spaces and non-word characters (punctuation)?
            E.g. if "3pm doesn't" is annotated with four nodes "3 pm does n't".
            We can use either SpaceAfter=No, or create a multi-word token (MWT).
            Note that if there is space or punctuation, SpaceAfter=No will be used always
            (e.g. "3 p.m." annotated with three nodes "3 p. m.").
            If the character sequence does not match exactly, MWT will be used always
            (e.g. "3pm doesn't" annotated with four nodes "3 p.m. does not").
            Thus this parameter influences only the "unclear" cases.
            Default=True (i.e. prefer multi-word tokens over SpaceAfter=No).
        allow_goeswith - If a node corresponds to multiple space-separated strings in text,
            which are not allowed as tokens with space, we can either leave this diff
            unresolved or create new nodes and join them with the `goeswith` deprel.
            Default=True (i.e. add the goeswith nodes if applicable).
        max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words).
            Default=4.
        allow_add_punct - allow creating punctuation-only nodes
        allow_delete_punct - allow deleting extra punctuation-only nodes,
            which are not represented in root.text
        allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to
            "mother in law" in root.text, convert it to three nodes:
            node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law")
            node2(form="in", deprel="goeswith", upos="X", parent=node1)
            node3(form="law", deprel="goeswith", upos="X", parent=node1).
        previous_form_label - when changing node.form, we store the previous value
            in node.misc[previous_form_label] (so no information is lost).
            Default="CorrectForm" because we expect that the previous value
            (i.e. the value of node.form before applying this block)
            contained the corrected spelling, while root.text contains
            the original spelling with typos as found in the raw text.
            CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html
            When setting this parameter to an empty string, no values will be stored to node.misc.
            When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well.
        previous_text_label - when we are not able to adapt the annotation to match root.text
            and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label.
            Default="OrigText". When setting this parameter to an empty string,
            no values will be stored to root.comment.
        added_label - when creating new nodes because allow_add_punct=True, we mark these nodes
            as new_node.misc[added_label] = 1. Default="Added".
        """
        super().__init__(**kwargs)
        self.fix_text = fix_text
        self.prefer_mwt = prefer_mwt
        self.allow_goeswith = allow_goeswith
        self.max_mwt_length = max_mwt_length
        self.allow_add_punct = allow_add_punct
        self.allow_delete_punct = allow_delete_punct
        self.allow_hyphen_goeswith = allow_hyphen_goeswith
        self.previous_form_label = previous_form_label
        self.previous_text_label = previous_text_label
        self.added_label = added_label

    @staticmethod
    def allow_space(form):
        """Is space allowed within this token form?"""
        # Only digit groups (optionally with one decimal part) may contain spaces.
        return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form)

    def store_previous_form(self, node):
        """Store the previous form of this node into MISC, unless the change is common&expected."""
        if node.form not in ("''", "``") and self.previous_form_label:
            node.misc[self.previous_form_label] = node.form
            if self.previous_form_label == 'CorrectForm':
                node.feats['Typo'] = 'Yes'

    def process_tree(self, root):
        """Align the tokens of one tree with root.text and fix the discrepancies."""
        text = root.text
        if text is None:
            raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root)

        # Normalize the stored text (e.g. double space or no-break space -> single space)
        # and skip sentences which are already ok.
        text = ' '.join(text.split())
        if root.text != text and self.fix_text:
            if self.previous_text_label:
                root.add_comment(f'{self.previous_text_label} = {root.text}')
            root.text = text
        if text == root.compute_text():
            return

        tree_chars, char_nodes = _nodes_to_chars(root.token_descendants)

        # Align. difflib may not give LCS, but usually it is good enough.
        matcher = difflib.SequenceMatcher(None, tree_chars, text, autojunk=False)
        diffs = list(matcher.get_opcodes())
        _log_diffs(diffs, tree_chars, text, 'matcher')

        diffs = self.unspace_diffs(diffs, tree_chars, text)
        _log_diffs(diffs, tree_chars, text, 'unspace')

        diffs = self.merge_diffs(diffs, char_nodes)
        _log_diffs(diffs, tree_chars, text, 'merge')

        # Solve diffs.
        self.solve_diffs(diffs, tree_chars, char_nodes, text)

        # Fill SpaceAfter=No.
        tmp_text = text
        for node in root.token_descendants:
            if tmp_text.startswith(node.form):
                tmp_text = tmp_text[len(node.form):]
                if not tmp_text or tmp_text[0].isspace():
                    del node.misc['SpaceAfter']
                    tmp_text = tmp_text.lstrip()
                else:
                    node.misc['SpaceAfter'] = 'No'
            else:
                logging.warning('Node %s does not match text "%s"', node, tmp_text[:20])
                break

        # Edit root.text if needed.
        if self.fix_text:
            computed_text = root.compute_text()
            if text != computed_text:
                if self.previous_text_label:
                    root.add_comment(f'{self.previous_text_label} = {root.text}')
                root.text = computed_text

    def unspace_diffs(self, orig_diffs, tree_chars, text):
        """Trim boundary spaces from the diff spans and re-classify the edits."""
        # NOTE(review): assumes the trimmed spans stay within range for all
        # opcodes produced by SequenceMatcher — confirm for degenerate diffs.
        diffs = []
        for diff in orig_diffs:
            edit, tree_lo, tree_hi, text_lo, text_hi = diff
            if edit != 'insert':
                if tree_chars[tree_lo] == ' ':
                    tree_lo += 1
                if tree_chars[tree_hi - 1] == ' ':
                    tree_hi -= 1
            if text[text_lo] == ' ':
                text_lo += 1
            if text[text_hi - 1] == ' ':
                text_hi -= 1
            old = tree_chars[tree_lo:tree_hi]
            new = text[text_lo:text_hi]
            if old == '' and new == '':
                continue
            elif old == new:
                edit = 'equal'
            elif old == '':
                edit = 'insert'
            diffs.append((edit, tree_lo, tree_hi, text_lo, text_hi))
        return diffs

    def merge_diffs(self, orig_diffs, char_nodes):
        """Make sure each diff starts on original token boundary.

        If not, merge the diff with the previous diff.
        E.g. (equal, "5", "5"), (replace, "-6", "–7")
        is changed into (replace, "5-6", "5–7")
        """
        diffs = []
        for diff in orig_diffs:
            edit, tree_lo, tree_hi, text_lo, text_hi = diff
            if edit != 'insert' and char_nodes[tree_lo] is not None:
                diffs.append(diff)
            elif edit == 'equal':
                while tree_lo < tree_hi and char_nodes[tree_lo] is None:
                    tree_lo += 1
                    text_lo += 1
                diffs[-1] = ('replace', diffs[-1][1], tree_lo, diffs[-1][3], text_lo)
                if tree_lo < tree_hi:
                    diffs.append(('equal', tree_lo, tree_hi, text_lo, text_hi))
            else:
                if not diffs:
                    diffs = [diff]
                elif diffs[-1][0] != 'equal':
                    diffs[-1] = ('replace', diffs[-1][1], tree_hi, diffs[-1][3], text_hi)
                else:
                    p_tree_hi = diffs[-1][2] - 1
                    p_text_hi = diffs[-1][4] - 1
                    while char_nodes[p_tree_hi] is None:
                        p_tree_hi -= 1
                        p_text_hi -= 1
                    assert p_tree_hi >= diffs[-1][1]
                    assert p_text_hi >= diffs[-1][3]
                    diffs[-1] = ('equal', diffs[-1][1], p_tree_hi, diffs[-1][3], p_text_hi)
                    diffs.append(('replace', p_tree_hi, tree_hi, p_text_hi, text_hi))
        return diffs

    def solve_diffs(self, diffs, tree_chars, char_nodes, text):
        """Dispatch each diff to the insert/delete/replace solution."""
        # Raw strings below fix invalid escape sequences: '\p' in a plain string
        # literal is deprecated and scheduled to become a syntax error.
        for diff in diffs:
            edit, tree_lo, tree_hi, text_lo, text_hi = diff

            if edit == 'equal':
                pass
            elif edit == 'insert':
                forms = text[text_lo:text_hi].split(' ')
                if all(regex.fullmatch(r'\p{P}+', f) for f in forms) and self.allow_add_punct:
                    next_node = char_nodes[tree_lo]
                    for f in reversed(forms):
                        new = next_node.create_child(form=f, deprel='punct', upos='PUNCT')
                        new.shift_before_node(next_node)
                        new.misc[self.added_label] = 1
                else:
                    logging.warning('Unable to insert nodes\n%s',
                                    _diff2str(diff, tree_chars, text))
            elif edit == 'delete':
                nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None]
                if all(regex.fullmatch(r'\p{P}+', n.form) for n in nodes):
                    if self.allow_delete_punct:
                        for node in nodes:
                            node.remove(children='rehang')
                    else:
                        logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s',
                                        _diff2str(diff, tree_chars, text))
                else:
                    logging.warning('Unable to delete non-punctuation nodes\n%s',
                                    _diff2str(diff, tree_chars, text))
            else:
                assert edit == 'replace'
                # Revert the splitting and solve the diff.
                nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None]
                form = text[text_lo:text_hi]
                self.solve_diff(nodes, form.strip())

    def solve_diff(self, nodes, form):
        """Fix a given (minimal) tokens-vs-text inconsistency."""
        nodes_str = ' '.join([n.form for n in nodes])  # just for debugging
        node = nodes[0]

        # First, solve the cases when the text contains a space.
        if ' ' in form:
            node_form = node.form
            if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form:
                node_form = node_form.replace('-', '')
            if len(nodes) == 1:
                if node_form == form.replace(' ', ''):
                    if self.allow_space(form):
                        self.store_previous_form(node)
                        node.form = form
                    elif self.allow_goeswith:
                        self.store_previous_form(node)
                        forms = form.split()
                        node.form = forms[0]
                        node.feats['Typo'] = 'Yes'
                        for split_form in reversed(forms[1:]):
                            new = node.create_child(form=split_form, deprel='goeswith', upos='X')
                            new.shift_after_node(node)
                    else:
                        logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form)
                elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch(r'[ \p{P}]+', form[len(node.form):]):
                    for punct_form in reversed(form[len(node.form):].split()):
                        new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT')
                        new.shift_after_node(node)
                        new.misc[self.added_label] = 1
                else:
                    logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form)
            else:
                logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}')

        # Second, solve the cases when multiple nodes match one form (without any spaces).
        elif len(nodes) > 1:
            # If the match is exact, we can choose between MWT and SpaceAfter solutions.
            if not self.prefer_mwt and ''.join([n.form for n in nodes]) == form:
                pass  # SpaceAfter=No will be added later on.
            # If one of the nodes is already a MWT, we cannot have nested MWTs.
            # TODO: enlarge the MWT instead of failing.
            elif any(isinstance(n, MWT) for n in nodes):
                logging.warning('Unable to solve partial-MWT diff:\n%s -> %s', nodes_str, form)
            # MWT with too many words are suspicious.
            elif len(nodes) > self.max_mwt_length:
                logging.warning('Not creating too long (%d>%d) MWT:\n%s -> %s',
                                len(nodes), self.max_mwt_length, nodes_str, form)
            # Otherwise, create a new MWT.
            else:
                node.root.create_multiword_token(nodes, form)

        # Third, solve the 1-1 cases.
        else:
            if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch(r'\p{P}+', form[len(node.form):]):
                punct_form = form[len(node.form):]
                new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT')
                new.shift_after_node(node)
                new.misc[self.added_label] = 1
            else:
                self.store_previous_form(node)
                node.form = form
+
+
+def _nodes_to_chars(nodes):
+ chars, char_nodes = [], []
+ for node in nodes:
+ form = node.form
+ if node.misc['SpaceAfter'] != 'No' and node != nodes[-1]:
+ form += ' '
+ chars.extend(form)
+ char_nodes.append(node)
+ char_nodes.extend([None] * (len(form) - 1))
+ return ''.join(chars), char_nodes
+
+
+def _log_diffs(diffs, tree_chars, text, msg):
+ if logging.getLogger().isEnabledFor(logging.DEBUG):
+ logging.warning('=== After %s:', msg)
+ for diff in diffs:
+ logging.warning(_diff2str(diff, tree_chars, text))
+
+
+def _diff2str(diff, tree, text):
+ old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|'
+ new = '|' + ''.join(text[diff[3]:diff[4]]) + '|'
+ return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new)
diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py
index 72d08ab8..d76c50b9 100644
--- a/udapi/block/ud/convert1to2.py
+++ b/udapi/block/ud/convert1to2.py
@@ -23,9 +23,10 @@
"csubjpass": "csubj:pass",
"auxpass": "aux:pass",
"name": "flat:name",
- "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS
+ "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS
}
+
class Convert1to2(Block):
"""Block for converting UD v1 to UD v2."""
@@ -45,7 +46,7 @@ def __init__(self, skip='', save_stats=True, **kwargs):
self.skip = {k for k in skip.split(',')}
self.save_stats = save_stats
- def process_tree(self, tree): # pylint: disable=too-many-branches
+ def process_tree(self, tree): # pylint: disable=too-many-branches
"""Apply all the changes on the current tree.
This method is automatically called on each tree by Udapi.
@@ -114,15 +115,21 @@ def change_upos_copula(node):
if node.deprel == 'cop' and node.upos not in ("AUX", "PRON"):
node.upos = "AUX"
- @staticmethod
- def change_deprel_simple(node):
+ def change_deprel_simple(self, node):
"""mwe→fixed, dobj→obj, *pass→*:pass, name→flat, foreign→flat+Foreign=Yes."""
- if node.deprel == 'foreign':
+ if node.udeprel == 'foreign':
node.feats['Foreign'] = 'Yes'
+ udeprel, sdeprel = node.udeprel, node.sdeprel
try:
- node.deprel = DEPREL_CHANGE[node.deprel]
+ node.deprel = DEPREL_CHANGE[udeprel]
except KeyError:
- pass
+ return
+ if sdeprel:
+ if ':' in node.deprel:
+ self.log(node, 'deprel', 'deprel=%s:%s new_deprel=%s but %s is lost' %
+ (udeprel, sdeprel, node.deprel, sdeprel))
+ else:
+ node.deprel += ':' + sdeprel
def change_neg(self, node):
"""neg→advmod/det/ToDo + Polarity=Neg.
@@ -139,7 +146,7 @@ def change_neg(self, node):
if 'Neg' not in node.feats['PronType']:
node.feats['Polarity'] = 'Neg'
- if node.upos in ['ADV', 'PART']:
+ if node.upos in ['ADV', 'PART', 'AUX']:
node.deprel = 'advmod'
elif node.upos == 'DET':
node.deprel = 'det'
@@ -162,24 +169,30 @@ def is_nominal(node):
"""Returns 'no' (for predicates), 'yes' (sure nominals) or 'maybe'.
Used in `change_nmod`."""
- if node.upos in ["VERB", "AUX", "ADJ", "ADV"]:
+ if node.upos in ["VERB", "AUX", "ADV"]:
return 'no'
+ # check whether the node is a predicate
+ # (either has a nsubj/csubj dependendent or a copula dependent)
+ has_cop = any("subj" in child.deprel or child.deprel == 'cop' for child in node.children)
+ # Adjectives are very likely complements of copula verbs.
+ if node.upos == "ADJ":
+ return "no" if has_cop else "maybe"
# Include NUM for examples such as "one of the guys"
# and DET for examples such as "some/all of them"
if node.upos in ["NOUN", "PRON", "PROPN", "NUM", "DET"]:
- # check whether the node is a predicate
- # (either has a nsubj/csubj dependendent or a copula dependent)
- if any(["subj" in child.deprel or child.deprel == 'cop' for child in node.children]):
- return 'maybe'
- return 'yes'
+ return "maybe" if has_cop else "yes"
return 'maybe'
def change_nmod(self, node):
"""nmod→obl if parent is not nominal, but predicate."""
- if node.deprel == 'nmod':
+ if node.udeprel == 'nmod':
parent_is_nominal = self.is_nominal(node.parent)
if parent_is_nominal == 'no':
- node.deprel = 'obl'
+ node.udeprel = 'obl'
+ elif node.deprel == 'nmod:tmod':
+ node.deprel = 'obl:tmod'
+ elif node.deprel == 'nmod:poss':
+ node.deprel = 'nmod:poss'
elif parent_is_nominal == 'maybe':
self.log(node, 'nmod', 'deprel=nmod, but parent is ambiguous nominal/predicate')
@@ -269,13 +282,16 @@ def fix_remnants_in_tree(self, root):
Remnant's parent is always the correlate (same-role) node.
Usually, correlate's parent is the head of the whole ellipsis subtree,
i.e. the first conjunct. However, sometimes remnants are deeper, e.g.
- 'Over 300 Iraqis are reported dead and 500 wounded.' with edges:
- nsubjpass(reported, Iraqis)
- nummod(Iraqis, 300)
- remnant(300, 500)
+ 'Over 300 Iraqis are reported dead and 500 wounded.' with edges::
+
+ nsubjpass(reported, Iraqis)
+ nummod(Iraqis, 300)
+ remnant(300, 500)
+
Let's expect all remnants in one tree are part of the same ellipsis structure.
+
TODO: theoretically, there may be more ellipsis structures with remnants in one tree,
- but I have no idea how to distinguish them from the deeper-remnants cases.
+ but I have no idea how to distinguish them from the deeper-remnants cases.
"""
remnants = [n for n in root.descendants if n.deprel == 'remnant']
if not remnants:
diff --git a/udapi/block/ud/cs/__init__.py b/udapi/block/ud/cs/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py
new file mode 100644
index 00000000..a690c95b
--- /dev/null
+++ b/udapi/block/ud/cs/addmwt.py
@@ -0,0 +1,245 @@
+"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens."""
+import udapi.block.ud.addmwt
+import re
+import logging
+
+# Define static rules for 'aby', 'kdyby' and similar forms.
+MWTS = {
+ 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
+ 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'},
+ 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
+ 'abysi': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
+ 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
+ 'kdybysi': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'},
+ 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'},
+ 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'},
+ 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
+ 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
+ # Old Czech 'abychme' == Modern Czech 'abychom'
+ 'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
+ 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'},
+ 'abyste': {'form': 'aby byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
+ 'abyšte': {'form': 'aby byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
+ 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
+ 'kdybyšte': {'form': 'když byšte', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'},
+ # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd.
+ 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'},
+ 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'},
+}
+for v in MWTS.values():
+ v['upos'] = 'SCONJ AUX'
+ number = '-'
+ if 'Sing' in v['feats']:
+ number = 'S'
+ elif 'Plur' in v['feats']:
+ number = 'P'
+ person = '-'
+ if 'Person=1' in v['feats']:
+ person = '1'
+ elif 'Person=2' in v['feats']:
+ person = '2'
+ v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person)
+ v['deprel'] = '* aux'
+ v['lemma'] = v['form'].split()[0] + ' být'
+ v['main'] = 0
+ v['shape'] = 'siblings'
+
+# Define static rules for 'nač', 'oč', 'zač' (but not 'proč').
+# Add them to the already existing dictionary MWTS.
+# nač -> na + co
+for prep in 'na o za'.split():
+ MWTS[prep + 'č'] = {
+ 'form': prep + ' co',
+ 'lemma': prep + ' co',
+ 'upos': 'ADP PRON',
+ 'xpos': 'RR--4---------- PQ--4----------',
+ 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
+ 'deprel': 'case *',
+ 'main': 1,
+ 'shape': 'subtree',
+ }
+# In 19th century texts (Hičkok etalon), one instance of 'seč' was also split (and annotated as ADP + accusative!)
+# A few additional instances were found in older texts, too (e.g. 16th century).
+# We must do it separately, as the preposition is vocalized.
+MWTS['seč'] = {
+ 'form': 'se' + ' co',
+ 'lemma': 's' + ' co',
+ 'upos': 'ADP PRON',
+ 'xpos': 'RV--4---------- PQ--4----------',
+ 'feats': 'AdpType=Voc|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel',
+ 'deprel': 'case *',
+ 'main': 1,
+ 'shape': 'subtree',
+}
+
+# Old Czech 'toliť' (special case with 3 subtokens; general -ť will be solved dynamically below).
+MWTS['toliť'] = {
+ 'form': 'to li ť',
+ 'lemma': 'ten li ť',
+ 'upos': 'DET SCONJ PART',
+ 'xpos': '* J,------------- TT-------------',
+ 'feats': '* _ _',
+ 'deprel': '* mark discourse',
+ 'main': 0,
+ 'shape': 'siblings'
+}
+
+
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+ # Avoid adding a MWT if the current node already is part of an MWT.
+ if node.multiword_token:
+ return None
+ analysis = MWTS.get(node.form.lower(), None)
+ if analysis is not None:
+ return analysis
+ # If the node did not match any of the static rules defined in MWTS,
+ # check it against the "dynamic" rules below. The enclitic 'ť' will be
+ # separated from its host but only if it has been marked by an annotator
+ # in MISC. (These are annotation conventions used for Old Czech in the
+ # Hičkok project.)
+ if node.misc['AddMwt'] != '':
+ subtokens = node.misc['AddMwt'].split()
+ if len(subtokens) != 2:
+ logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt'])
+ return None
+ token_from_subtokens = ''.join(subtokens)
+ if subtokens[1] == 'jsi':
+ node.misc['AddMwt'] = ''
+ return {
+ 'form': subtokens[0] + ' jsi',
+ 'lemma': '* být',
+ 'upos': '* AUX',
+ 'xpos': '* VB-S---2P-AAI--',
+ 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act',
+ 'deprel': '* aux',
+ 'main': 0,
+ 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings',
+ }
+ if subtokens[1] == 'jest':
+ node.misc['AddMwt'] = ''
+ return {
+ 'form': subtokens[0] + ' jest',
+ 'lemma': '* být',
+ 'upos': '* AUX',
+ 'xpos': '* VB-S---3P-AAI-2',
+ 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act',
+ 'deprel': '* aux',
+ 'main': 0,
+ 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings',
+ }
+ if subtokens[1] == 'i':
+ node.misc['AddMwt'] = ''
+ return {
+ 'form': subtokens[0] + ' i',
+ 'lemma': '* i',
+ 'upos': '* CCONJ',
+ 'xpos': '* J^-------------',
+ 'feats': '* _',
+ 'deprel': '* cc',
+ 'main': 0,
+ 'shape': 'subtree',
+ }
+ if subtokens[1] in ['ť', 'tě', 'ti']:
+ if token_from_subtokens != node.form:
+ logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form))
+ return None
+ node.misc['AddMwt'] = ''
+ return {
+ 'form': subtokens[0] + ' ' + subtokens[1],
+ 'lemma': '* ť',
+ 'upos': '* PART',
+ 'xpos': '* TT-------------',
+ 'feats': '* _',
+ 'deprel': '* discourse',
+ 'main': 0,
+ 'shape': 'subtree',
+ }
+ # dajžto = dajž + to
+ if subtokens[1] == 'to':
+ if token_from_subtokens != node.form:
+ logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." % (node.misc['AddMwt'], node.form))
+ return None
+ node.misc['AddMwt'] = ''
+ return {
+ 'form': subtokens[0] + ' ' + subtokens[1],
+ 'lemma': '* ten',
+ 'upos': '* DET',
+ 'xpos': '* PDNS4----------',
+ 'feats': '* Case=Acc|Gender=Neut|Number=Sing|PronType=Dem',
+ 'deprel': '* obj',
+ 'main': 0,
+ 'shape': 'subtree',
+ }
+ # Contractions of prepositions and pronouns almost could be processed
+ # regardless of AddMwt instructions by the annotator, but we still
+ # require it to be on the safe side. For example, both 'přědeň' and
+ # 'přěden' are attested in Old Czech but then we do not want to catch
+    # 'on' (besides the wanted 'oň'). Another reason is that the pronoun
+ # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim
+ # by default, unless the original token was annotated as Animacy=Inan
+ # or Gender=Neut.
+ m = re.match(r"^(na|nade|o|po|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower())
+ if m:
+ node.misc['AddMwt'] = ''
+ # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze'
+ # (skrze něj).
+ if m.group(1) == 'přěde':
+ pform = 'přěd'
+ plemma = 'před'
+ adptype = 'Voc'
+ at = 'V'
+ elif re.match(r"^ski?rz[eě]$", m.group(1).lower()):
+ pform = m.group(1)
+ plemma = 'skrz'
+ adptype = 'Voc'
+ at = 'V'
+ else:
+ pform = m.group(1)
+ plemma = m.group(1)
+ adptype = 'Prep'
+ at = 'R'
+ # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---.
+ if node.feats['Gender'] == 'Neut':
+ gender = 'Neut'
+ animacy = ''
+ g = 'N'
+ elif node.feats['Animacy'] == 'Inan':
+ gender = 'Masc'
+ animacy = 'Animacy=Inan|'
+ g = 'I'
+ else:
+ gender = 'Masc'
+ animacy = 'Animacy=Anim|'
+ g = 'M'
+ if m.group(2).lower() == 'ž':
+ return {
+ 'form': pform + ' nějž',
+ 'lemma': plemma + ' jenž',
+ 'upos': 'ADP PRON',
+ 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2',
+ 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel',
+ 'deprel': 'case *',
+ 'main': 1,
+ 'shape': 'subtree',
+ }
+ else:
+ return {
+ 'form': pform + ' něj',
+ 'lemma': plemma + ' on',
+ 'upos': 'ADP PRON',
+ 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------',
+ 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs',
+ 'deprel': 'case *',
+ 'main': 1,
+ 'shape': 'subtree',
+ }
+ return None
+
+ def postprocess_mwt(self, mwt):
+ if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB':
+ mwt.words[1].parent = mwt.words[0].parent.parent
diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py
new file mode 100644
index 00000000..4e2be633
--- /dev/null
+++ b/udapi/block/ud/cs/fixedeprels.py
@@ -0,0 +1,685 @@
+"""Block to fix case-enhanced dependency relations in Czech."""
+from udapi.core.block import Block
+import re
+
+class FixEdeprels(Block):
+
+ # Sometimes there are multiple layers of case marking and only the outermost
+ # layer should be reflected in the relation. For example, the semblative 'jako'
+ # is used with the same case (preposition + morphology) as the nominal that
+ # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations
+ # by all the inner cases.
+ # The list in the value contains exceptions that should be left intact.
+ outermost = {
+ 'aby': [],
+ 'ač': [],
+ 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole
+ 'ačkoliv': [], # ... ale možná ne když je doprovázeno předložkou
+ 'ať': [],
+ 'byť': [],
+ 'i_když': [],
+ 'jak': [],
+ 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole
+ 'jako': [],
+ 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by'
+ 'když': [],
+ 'než': ['než_aby'],
+ 'nežli': [],
+ 'pokud': [],
+ 'protože': [],
+ 'takže': [],
+ 'třebaže': [],
+ 'že': []
+ }
+
+ # Secondary prepositions sometimes have the lemma of the original part of
+ # speech. We want the grammaticalized form instead. List even those that
+ # will have the same lexical form, as we also want to check the morphological
+ # case. And include all other prepositions that have unambiguous morphological
+ # case, even if they are not secondary.
+ unambiguous = {
+ 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto")
+ 'abi': 'aby',
+ 'aby_na': 'na:loc',
+ 'ačkoliv': 'ačkoli',
+ 'ať': 'ať', # remove morphological case
+ 'ať_forma': 'formou:gen',
+ 'ať_jako': 'jako',
+ 'ať_na': 'na:loc',
+ 'ať_s': 's:ins',
+ 'ať_v': 'v:loc',
+ 'ať_v_oblast': 'v_oblasti:gen',
+ 'ať_z': 'z:gen',
+ 'ať_z_hledisko': 'z_hlediska:gen',
+ 'ať_z_strana': 'ze_strany:gen',
+ 'až_do': 'do:gen',
+ 'až_o': 'o:acc',
+ 'během': 'během:gen',
+ 'bez': 'bez:gen',
+ 'bez_ohled_na': 'bez_ohledu_na:acc',
+ 'bez_na': 'bez_ohledu_na:acc', ###!!! a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703)
+ 'bez_zřetel_k': 'bez_zřetele_k:dat',
+ 'bez_zřetel_na': 'bez_zřetele_na:acc',
+ 'blízko': 'blízko:dat',
+ 'blízko_k': 'blízko:dat',
+ 'blíž': 'blízko:dat',
+ 'blíže': 'blízko:dat',
+ 'bok_po_bok_s': 'bok_po_boku_s:ins',
+ 'cesta': 'cestou:gen',
+ 'co_jako': 'jako',
+ 'coby': 'coby', # remove morphological case
+ 'daleko': 'nedaleko:gen',
+ 'daleko_od': 'od:gen',
+ 'dík': 'díky:dat',
+ 'díky': 'díky:dat',
+ 'dle': 'dle:gen',
+ 'do': 'do:gen',
+ 'do_čelo': 'do_čela:gen',
+ 'do_k': 'k:dat',
+ 'do_oblast': 'do_oblasti:gen',
+ 'do_rozpor_s': 'do_rozporu_s:ins',
+ 'do_ruka': 'do_rukou:gen',
+ 'do_soulad_s': 'do_souladu_s:ins',
+ 'důsledkem': 'v_důsledku:gen',
+ 'forma': 'formou:gen',
+ 'formou': 'formou:gen',
+ 'hledět_na': 'nehledě_na:acc',
+ 'i_když': 'i_když', # remove morphological case
+ 'i_pro': 'pro:acc',
+ 'jak_aby': 'jak',
+ 'jak_ad': 'jak',
+ 'jakkoliv': 'jakkoli',
+ 'jako': 'jako', # remove morphological case
+ 'jako_kupříkladu': 'jako',
+ 'jakoby': 'jako',
+ 'jakoby_pod': 'pod:ins',
+ 'jakožto': 'jako',
+ 'jelikož_do': 'jelikož',
+ 'jenom': 'jen',
+ 'jesli': 'jestli',
+ 'jestli_že': 'jestliže',
+ 'jménem': 'jménem:gen',
+ 'k': 'k:dat',
+ 'k_konec': 'ke_konci:gen',
+ 'k_prospěch': 'ku_prospěchu:gen',
+ 'kdykoliv': 'kdykoli',
+ 'kol': 'kolem:gen',
+ 'kolem': 'kolem:gen',
+ 'kolem_dokola': 'kolem:gen',
+ 'koncem': 'koncem:gen',
+ 'konec': 'koncem:gen',
+ 'krom': 'kromě:gen',
+ 'kromě': 'kromě:gen',
+ 'kvůli': 'kvůli:dat',
+ 'leda_když': 'ledaže',
+ 'li_jako': 'li',
+ 'liž': 'li',
+ 'mezi_uvnitř': 'uvnitř:gen',
+ 'na:ins': 'na:acc',
+ 'na_báze': 'na_bázi:gen',
+ 'na_čelo': 'na_čele:gen',
+ 'na_mimo': 'na:loc', # na kurtě i mimo něj
+ 'na_než': 'na:acc', # na víc než čtyři a půl kilometru
+ 'na_od': 'na_rozdíl_od:gen',
+ 'na_počátek': 'na_počátku:gen',
+ 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is same
+ 'na_podklad': 'na_podkladě:gen',
+ 'na_rozdíl_od': 'na_rozdíl_od:gen',
+ 'na_strana': 'na_straně:gen',
+ 'na_účet': 'na_účet:gen',
+ 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier
+ 'na_úroveň': 'na_úrovni:gen',
+ 'na_úroveň_okolo': 'na_úrovni:gen',
+ 'na_úsek': 'na_úseku:gen',
+ 'na_začátek': 'na_začátku:gen',
+ 'na_základ': 'na_základě:gen',
+ 'na_základna': 'na_základně:gen',
+ 'na_závěr': 'na_závěr:gen',
+ 'na_zda': 'na:loc', # na tom, zda a v jaké formě...
+ 'namísto': 'namísto:gen',
+ 'namísto_do': 'do:gen',
+ 'napospas': 'napospas:dat',
+ 'narozdíl_od': 'na_rozdíl_od:gen',
+ 'následek': 'následkem:gen',
+ 'navzdory': 'navzdory:dat',
+ 'nedaleko': 'nedaleko:gen',
+ 'než': 'než', # remove morphological case
+ 'nežli': 'nežli', # remove morphological case
+ 'o_jako': 'jako',
+ 'o_o': 'o:acc',
+ 'od': 'od:gen',
+ 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče
+ 'ohledně': 'ohledně:gen',
+ 'okolo': 'okolo:gen',
+ 'oproti': 'oproti:dat',
+ 'po_v': 'po:loc',
+ 'po_bok': 'po_boku:gen',
+ 'po_doba': 'po_dobu:gen',
+ 'po_stránka': 'po_stránce:gen',
+ 'po_vzor': 'po_vzoru:gen',
+ 'poblíž': 'poblíž:gen',
+ 'počátek': 'počátkem:gen',
+ 'počátkem': 'počátkem:gen',
+ 'počínaje': 'počínaje:ins',
+ 'počínat': 'počínaje:ins',
+ 'počínat_od': 'počínaje:ins',
+ 'pod_dojem': 'pod_dojmem:gen',
+ 'pod_tlak': 'pod_tlakem:gen',
+ 'pod_vliv': 'pod_vlivem:gen',
+ 'pod_záminka': 'pod_záminkou:gen',
+ 'pod_záminka_že': 'pod_záminkou_že',
+ 'podél': 'podél:gen',
+ 'podle': 'podle:gen',
+ 'pomoc': 'pomocí:gen',
+ 'pomocí': 'pomocí:gen',
+ 'postup': 'postupem:gen',
+ 'pouze_v': 'v:loc',
+ 'pro': 'pro:acc',
+ 'pro_aby': 'pro:acc',
+ 'prostřednictví': 'prostřednictvím:gen',
+ 'prostřednictvím': 'prostřednictvím:gen',
+ 'proti': 'proti:dat',
+ 'proto_aby': 'aby',
+ 'protože': 'protože', # remove morphological case
+ 'před_během': 'během:gen', # před a během utkání
+ 'před_po': 'po:loc', # před a po vyloučení Schindlera
+ 'přes': 'přes:acc',
+ 'přes_přes': 'přes:acc', # annotation error
+ 'přestože': 'přestože', # remove morphological case
+ 'při': 'při:loc',
+ 'při_pro': 'při:loc',
+ 'při_příležitost': 'při_příležitosti:gen',
+ 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins',
+ 's_cíl': 's_cílem', # s cílem projednat X
+ 's_ohled_k': 's_ohledem_k:dat',
+ 's_ohled_na': 's_ohledem_na:acc',
+ 's_pomoc': 's_pomocí:gen',
+ 's_postup': 'postupem:gen',
+ 's_přihlédnutí_k': 's_přihlédnutím_k:dat',
+ 's_přihlédnutí_na': 's_přihlédnutím_na:acc',
+ 's_výjimka': 's_výjimkou:gen',
+ 's_výjimka_z': 's_výjimkou:gen',
+ 's_výjimka_že': 's_výjimkou_že',
+ 's_vyloučení': 's_vyloučením:gen',
+ 's_zřetel_k': 'se_zřetelem_k:dat',
+ 's_zřetel_na': 'se_zřetelem_na:acc',
+ 'severně_od': 'od:gen',
+ 'skrz': 'skrz:acc',
+ 'směr_do': 'směrem_do:gen',
+ 'směr_k': 'směrem_k:dat',
+ 'směr_na': 'směrem_na:acc',
+ 'směr_od': 'směrem_od:gen',
+ 'směr_přes': 'směrem_přes:acc',
+ 'směr_z': 'směrem_z:gen',
+ 'společně_s': 'společně_s:ins',
+ 'spolu': 'spolu_s:ins',
+ 'spolu_s': 'spolu_s:ins',
+ 'spolu_se': 'spolu_s:ins',
+ 'stranou': 'stranou:gen',
+ 'stranou_od': 'stranou:gen',
+ 'takže': 'takže', # remove morphological case
+ 'takže_a': 'takže',
+ 'třebaže': 'třebaže', # remove morphological case
+ 'tvář_v_tvář': 'tváří_v_tvář:dat',
+ 'u': 'u:gen',
+ 'u_příležitost': 'u_příležitosti:gen',
+ 'uprostřed': 'uprostřed:gen',
+ 'uvnitř': 'uvnitř:gen',
+ 'v:ins': 'v:loc', # ve skutečností (překlep)
+ 'v_analogie_s': 'v_analogii_s:ins',
+ 'v_blízkost': 'v_blízkosti:gen',
+ 'v_čas': 'v_čase:gen',
+ 'v_čelo': 'v_čele:gen',
+ 'v_čelo_s': 'v_čele_s:ins',
+ 'v_doba': 'v_době:gen',
+ 'v_dohoda_s': 'v_dohodě_s:ins',
+ 'v_duch': 'v_duchu:gen',
+ 'v_důsledek': 'v_důsledku:gen',
+ 'v_forma': 've_formě:gen',
+ 'v_jméno': 've_jménu:gen',
+ 'v_k': 'k:dat',
+ 'v_kombinace_s': 'v_kombinaci_s:ins',
+ 'v_konfrontace_s': 'v_konfrontaci_s:ins',
+ 'v_kontext_s': 'v_kontextu_s:ins',
+ 'v_na': 'na:loc',
+ 'v_neprospěch': 'v_neprospěch:gen',
+ 'v_oblast': 'v_oblasti:gen',
+ 'v_oblast_s': 's:ins',
+ 'v_obor': 'v_oboru:gen',
+ 'v_otázka': 'v_otázce:gen',
+ 'v_podoba': 'v_podobě:gen',
+ 'v_poměr_k': 'v_poměru_k:dat',
+ 'v_porovnání_s': 'v_porovnání_s:ins',
+ 'v_proces': 'v_procesu:gen',
+ 'v_prospěch': 've_prospěch:gen',
+ 'v_protiklad_k': 'v_protikladu_k:dat',
+ 'v_průběh': 'v_průběhu:gen',
+ 'v_případ': 'v_případě:gen',
+ 'v_případ_že': 'v_případě_že',
+ 'v_rámec': 'v_rámci:gen',
+ 'v_reakce_na': 'v_reakci_na:acc',
+ 'v_rozpor_s': 'v_rozporu_s:ins',
+ 'v_řada': 'v_řadě:gen',
+ 'v_shoda_s': 've_shodě_s:ins',
+ 'v_služba': 've_službách:gen',
+ 'v_směr': 've_směru:gen',
+ 'v_směr_k': 've_směru_k:dat',
+ 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc
+ 'v_smysl': 've_smyslu:gen',
+ 'v_součinnost_s': 'v_součinnosti_s:ins',
+ 'v_souhlas_s': 'v_souhlasu_s:ins',
+ 'v_soulad_s': 'v_souladu_s:ins',
+ 'v_souvislost_s': 'v_souvislosti_s:ins',
+ 'v_spojení_s': 've_spojení_s:ins',
+ 'v_spojení_se': 've_spojení_s:ins',
+ 'v_spojený_s': 've_spojení_s:ins',
+ 'v_spojitost_s': 've_spojitosti_s:ins',
+ 'v_spolupráce_s': 've_spolupráci_s:ins',
+ 'v_s_spolupráce': 've_spolupráci_s:ins',
+ 'v_srovnání_s': 've_srovnání_s:ins',
+ 'v_srovnání_se': 've_srovnání_s:ins',
+ 'v_stav': 've_stavu:gen',
+ 'v_stín': 've_stínu:gen',
+ 'v_světlo': 've_světle:gen',
+ 'v_úroveň': 'v_úrovni:gen',
+ 'v_věc': 've_věci:gen',
+ 'v_vztah_k': 've_vztahu_k:dat',
+ 'v_vztah_s': 've_vztahu_k:dat',
+ 'v_zájem': 'v_zájmu:gen',
+ 'v_záležitost': 'v_záležitosti:gen',
+ 'v_závěr': 'v_závěru:gen',
+ 'v_závislost_na': 'v_závislosti_na:loc',
+ 'v_závislost_s': 'v_závislosti_s:ins',
+ 'v_znamení': 've_znamení:gen',
+ 'včetně': 'včetně:gen',
+ 'vedle': 'vedle:gen',
+ 'versus': 'versus:nom',
+ 'vina': 'vinou:gen',
+ 'vliv': 'vlivem:gen',
+ 'vlivem': 'vlivem:gen',
+ 'vůči': 'vůči:dat',
+ 'výměna_za': 'výměnou_za:acc',
+ 'vzhledem': 'vzhledem_k:dat',
+ 'vzhledem_k': 'vzhledem_k:dat',
+ 'z': 'z:gen',
+ 'z_důvod': 'z_důvodu:gen',
+ 'z_hledisko': 'z_hlediska:gen',
+ 'z_oblast': 'z_oblasti:gen',
+ 'z_řada': 'z_řad:gen',
+ 'z_strana': 'ze_strany:gen',
+ 'z_nedostatek': 'z_nedostatku:gen',
+ 'z_titul': 'z_titulu:gen',
+ 'z_začátek': 'ze_začátku:gen',
+ 'za_pomoc': 'za_pomoci:gen',
+ 'za_účast': 'za_účasti:gen',
+ 'za_účel': 'za_účelem:gen',
+ 'začátek': 'začátkem:gen',
+ 'zásluha': 'zásluhou:gen',
+ 'zatím_co': 'zatímco',
+ 'závěr': 'závěrem:gen',
+ 'závisle_na': 'nezávisle_na:loc',
+ 'že': 'že', # remove morphological case
+ 'že_ať': 'ať',
+ 'že_jako': 'že',
+ 'že_jakoby': 'že',
+ 'že_za': 'za:gen'
+ }
+
+ def copy_case_from_adposition(self, node, adposition):
+ """
+ In some treebanks, adpositions have the Case feature and it denotes the
+ valency case that the preposition's nominal must be in.
+ """
+        # The following is only a partial solution. We will not see
+ # some children because they may be shared children of coordination.
+ prepchildren = [x for x in node.children if x.lemma == adposition]
+ if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
+ return adposition+':'+prepchildren[0].feats['Case'].lower()
+ else:
+ return None
+
+ @staticmethod
+ def compose_edeprel(bdeprel, cdeprel):
+ """
+ Composes enhanced deprel from the basic part and optional case
+ enhancement.
+
+ Parameters
+ ----------
+ bdeprel : str
+ Basic deprel (can include subtype, e.g., 'acl:relcl').
+        cdeprel : str
+ Case enhancement (can be composed of adposition and morphological
+ case, e.g., 'k:dat'). It is optional and it can be None or empty
+ string if there is no case enhancement.
+
+ Returns
+ -------
+ Full enhanced deprel (str).
+ """
+ edeprel = bdeprel
+ if cdeprel:
+ edeprel += ':'+cdeprel
+ return edeprel
+
+ def process_tree(self, tree):
+ """
+ Occasionally the edeprels automatically derived from the Czech basic
+ trees do not match the whitelist. For example, the noun is an
+ abbreviation and its morphological case is unknown.
+
+ We cannot use the process_node() method because it ignores empty nodes.
+ """
+ for node in tree.descendants_and_empty:
+ for edep in node.deps:
+ m = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', edep['deprel'])
+ if m:
+ bdeprel = m.group(1)
+ cdeprel = m.group(2)
+ solved = False
+ # Issues caused by errors in the original annotation must be fixed early.
+ # Especially if acl|advcl occurs with a preposition that unambiguously
+ # receives a morphological case in the subsequent steps, and then gets
+ # flagged as solved.
+ if re.match(r'advcl', bdeprel):
+ # The following advcl should in fact be obl.
+ if re.fullmatch(r'do(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
+ bdeprel = 'obl'
+ cdeprel = 'do:gen'
+ elif re.fullmatch(r'k(?::dat)?', cdeprel): ###!!! Ale měli bychom opravit i závislost v základním stromu!
+ bdeprel = 'obl'
+ cdeprel = 'k:dat'
+ elif re.fullmatch(r'místo(?::gen)?', cdeprel): # 'v poslední době se množí bysem místo bych'
+ bdeprel = 'obl'
+ cdeprel = 'místo:gen'
+ elif re.fullmatch(r'od(?::gen)?', cdeprel): # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu!
+ bdeprel = 'obl'
+ cdeprel = 'od:gen'
+ elif re.fullmatch(r'podle(?::gen)?', cdeprel):
+ bdeprel = 'obl'
+ cdeprel = 'podle:gen'
+ elif re.fullmatch(r's(?::ins)?', cdeprel): ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu.
+ bdeprel = 'obl'
+ cdeprel = 's:ins'
+ elif re.fullmatch(r'v_duchu?(?::gen)?', cdeprel):
+ bdeprel = 'obl'
+ cdeprel = 'v_duchu:gen'
+ elif re.fullmatch(r'v', cdeprel):
+ bdeprel = 'obl'
+ cdeprel = 'v:loc'
+ # byl by pro, abychom... ###!!! Opravit i konverzi stromu.
+ elif re.fullmatch(r'pro(?::acc)?', cdeprel):
+ cdeprel = 'aby'
+ elif re.match(r'acl', bdeprel):
+ # The following acl should in fact be nmod.
+ if re.fullmatch(r'k(?::dat)?', cdeprel):
+ bdeprel = 'nmod'
+ cdeprel = 'k:dat'
+ elif re.fullmatch(r'na_způsob(?::gen)?', cdeprel): # 'střídmost na způsob Masarykova "jez dopolosyta"'
+ bdeprel = 'nmod'
+ cdeprel = 'na_způsob:gen'
+ elif re.fullmatch(r'od(?::gen)?', cdeprel):
+ bdeprel = 'nmod'
+ cdeprel = 'od:gen'
+ elif re.fullmatch(r'v', cdeprel):
+ bdeprel = 'nmod'
+ cdeprel = 'v:loc'
+ else: # bdeprel is 'obl' or 'nmod'
+ # The following subordinators should be removed if they occur with nominals.
+ if re.match(r'(ačkoli|když)', cdeprel): # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here!
+ cdeprel = ''
+ # Removing 'až' must be done early. The remainder may be 'počátek'
+ # and we will want to convert it to 'počátkem:gen'.
+ elif re.match(r'až_(.+):(gen|dat|acc|loc|ins)', cdeprel):
+ cdeprel = re.sub(r'až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2', cdeprel)
+ elif re.fullmatch(r'jestli(?::gen)?', cdeprel): # nevím, jestli osmého nebo devátého září
+ cdeprel = 'gen'
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ # If one of the following expressions occurs followed by another preposition
+ # or by morphological case, remove the additional case marking. For example,
+ # 'jako_v' becomes just 'jako'.
+ for x in self.outermost:
+ exceptions = self.outermost[x]
+ m = re.fullmatch(x+r'([_:].+)?', cdeprel)
+ if m and m.group(1) and not x+m.group(1) in exceptions:
+ cdeprel = x
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ solved = True
+ break
+ if solved:
+ continue
+ for x in self.unambiguous:
+ # All secondary prepositions have only one fixed morphological case
+ # they appear with, so we can replace whatever case we encounter with the correct one.
+ m = re.fullmatch(x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?', cdeprel)
+ if m:
+ cdeprel = self.unambiguous[x]
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ solved = True
+ break
+ if solved:
+ continue
+ # The following prepositions have more than one morphological case
+ # available. Thanks to the Case feature on prepositions, we can
+ # identify the correct one.
+ if re.match(r'(obl|nmod)', bdeprel):
+ m = re.fullmatch(r'(mezi|na|nad|o|po|pod|před|v|za)(?::(?:nom|gen|dat|voc))?', cdeprel)
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(1))
+ if adpcase and not re.search(r':(nom|gen|dat|voc)$', adpcase):
+ cdeprel = adpcase
+ edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
+ continue
+ ###!!! bdeprel and cdeprel are not visible from here on but we may want to use them there as well.
+ if re.match(r'^(acl|advcl):', edep['deprel']):
+ # We do not include 'i' in the list of redundant prefixes because we want to preserve 'i když' (but we want to discard the other combinations).
+ edep['deprel'] = re.sub(r'^(acl|advcl):(?:a|alespoň|až|jen|hlavně|například|ovšem_teprve|protože|teprve|totiž|zejména)_(aby|až|jestliže|když|li|pokud|protože|že)$', r'\1:\2', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(acl|advcl):i_(aby|až|jestliže|li|pokud)$', r'\1:\2', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(acl|advcl):(aby|až|jestliže|když|li|pokud|protože|že)_(?:ale|tedy|totiž|už|však)$', r'\1:\2', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(acl|advcl):co_když$', r'\1', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(acl|advcl):kdy$', r'\1', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(advcl):neboť$', r'\1', edep['deprel']) # 'neboť' is coordinating
+ edep['deprel'] = re.sub(r'^(advcl):nechť$', r'\1', edep['deprel'])
+ if edep['deprel'] == 'acl:v' and node.form == 'patře':
+ edep['deprel'] = 'nmod:v:loc'
+ node.deprel = 'nmod'
+ node.lemma = 'patro'
+ node.upos = 'NOUN'
+ node.xpos = 'NNNS6-----A----'
+ node.feats['Aspect'] = ''
+ node.feats['Gender'] = 'Neut'
+ node.feats['Tense'] = ''
+ node.feats['VerbForm'] = ''
+ node.feats['Voice'] = ''
+ elif re.match(r'^(nmod|obl(:arg)?):', edep['deprel']):
+ if edep['deprel'] == 'nmod:loc' and (node.parent == None or node.parent.feats['Case'] == 'Loc') or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
+ # This is a same-case noun-noun modifier, which just happens to be in the locative.
+ # For example, 'v Ostravě-Porubě', 'Porubě' is attached to 'Ostravě', 'Ostravě' has
+ # nmod:v:loc, which is OK, but for 'Porubě' the case does not say anything significant.
+ edep['deprel'] = 'nmod'
+ elif edep['deprel'] == 'obl:loc':
+ # Annotation error. The first occurrence in PDT dev:
+ # 'V Rapaportu, ceníku Antverpské burzy i Diamantberichtu jsou uvedeny ceny...'
+ # The preposition 'V' should modify coordination 'Rapaportu i Diamantberichtu'.
+ # However, 'Rapaportu' is attached as 'obl' to 'Diamantberichtu'.
+ edep['deprel'] = 'obl:v:loc'
+ elif edep['deprel'] == 'obl:arg:loc':
+ # Annotation error. The first occurrence in PDT dev:
+ edep['deprel'] = 'obl:arg:na:loc'
+ elif edep['deprel'] == 'nmod:loc':
+ # 'působil v kanadském Edmontonu Oilers', 'Edmontonu' attached to 'Oilers' and not vice versa.
+ edep['deprel'] = 'nmod:nom'
+ elif edep['deprel'] == 'obl:nom' or edep['deprel'] == 'obl:voc':
+ # Possibly an annotation error, nominative should be accusative, and the nominal should be direct object?
+ # However, there seems to be a great variability in the causes, some are subjects and many are really obliques, so let's go just with 'obl' for now.
+ edep['deprel'] = 'obl'
+ elif edep['deprel'] == 'nmod:voc':
+ # 'v 8. čísle tiskoviny Ty rudá krávo'
+ edep['deprel'] = 'nmod:nom'
+ elif edep['deprel'] == 'nmod:co:nom':
+ # Annotation error: 'kompatibilní znamená tolik co slučitelný'
+ # 'co' should be relative pronoun rather than subordinating conjunction.
+ edep['deprel'] = 'acl:relcl'
+ node.deprel = 'acl:relcl'
+ elif re.match(r'^(obl(:arg)?):li$', edep['deprel']):
+ edep['deprel'] = 'advcl:li'
+ elif re.match(r'^(nmod|obl(:arg)?):mezi:voc$', edep['deprel']):
+ edep['deprel'] = re.sub(r':voc$', r':acc', edep['deprel'])
+ elif re.match(r'^(nmod|obl(:arg)?):mezi$', edep['deprel']):
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':ins'
+ elif re.match(r'^(nmod|obl(:arg)?):mimo$', edep['deprel']):
+ edep['deprel'] += ':acc'
+ elif re.match(r'^(nmod|obl(:arg)?):místo$', edep['deprel']):
+ edep['deprel'] += ':gen'
+ elif re.match(r'^obl:místo_za:acc$', edep['deprel']):
+ # 'chytají krávu místo za rohy spíše za ocas'
+ # This should be treated as coordination; 'místo' and 'spíše' are adverbs (???); 'case' for 'místo' does not seem to be the optimal solution.
+ for c in node.children:
+ if c.form == 'místo':
+ c.upos = 'ADV'
+ c.deprel = 'cc'
+ edep['deprel'] = 'obl:za:acc'
+ elif re.match(r'^(nmod|obl(:arg)?):místo[_:].+$', edep['deprel']) and not re.match(r'^(nmod|obl(:arg)?):místo_aby$', edep['deprel']):
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):místo[_:].+$', r'\1:místo:gen', edep['deprel'])
+ elif re.match(r'^(nmod|obl(:arg)?):na(:gen)?$', edep['deprel']):
+ edep['deprel'] = re.sub(r':gen$', '', edep['deprel'])
+ # The case is unknown. We need 'acc' or 'loc'.
+ # The locative is probably more frequent but it is not so likely with every noun.
+ # If there is an nummod:gov child, it must be accusative and not locative.
+ # (The case would be taken from the number but if it is expressed as digits, it does not have the case feature.)
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ elif re.match(r'^(adresát|AIDS|DEM|frank|h|ha|hodina|Honolulu|jméno|koruna|litr|metr|míle|miliarda|milión|mm|MUDr|NATO|obyvatel|OSN|počet|procento|příklad|rok|SSSR|vůz)$', node.lemma):
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':loc'
+ elif re.match(r'^obl:arg:na_konec$', edep['deprel']):
+ # Annotation error. It should have been two prepositional phrases: 'snížil na 225 tisíc koncem minulého roku'
+ edep['deprel'] = 'obl:arg:na:acc'
+ elif re.match(r'^(nmod|obl(:arg)?):nad$', edep['deprel']):
+ if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':ins'
+ elif re.match(r'^(nmod|obl(:arg)?):o$', edep['deprel']):
+ if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':loc'
+ elif re.match(r'^(nmod|obl(:arg)?):ohled_na:ins$', edep['deprel']):
+ # Annotation error.
+ if node.form == 's':
+ ohled = node.next_node
+ na = ohled.next_node
+ noun = na.next_node
+ self.set_basic_and_enhanced(noun, node.parent, 'obl', 'obl:s_ohledem_na:acc')
+ self.set_basic_and_enhanced(ohled, node, 'fixed', 'fixed')
+ self.set_basic_and_enhanced(na, node, 'fixed', 'fixed')
+ self.set_basic_and_enhanced(node, noun, 'case', 'case')
+ elif re.match(r'^nmod:pára:nom$', edep['deprel']):
+ # Annotation error: 'par excellence'.
+ edep['deprel'] = 'nmod'
+ for c in node.children:
+ if c.udeprel == 'case' and c.form.lower() == 'par':
+ c.lemma = 'par'
+ c.upos = 'ADP'
+ c.xpos = 'RR--X----------'
+ c.feats['Case'] = ''
+ c.feats['Gender'] = ''
+ c.feats['Number'] = ''
+ c.feats['Polarity'] = ''
+ c.feats['AdpType'] = 'Prep'
+ elif re.match(r'^(nmod|obl(:arg)?):po$', edep['deprel']):
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':loc'
+ elif re.match(r'^(nmod|obl(:arg)?):pod$', edep['deprel']):
+ if re.match(r'[0-9]', node.lemma) or len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':ins'
+ elif re.match(r'^(nmod|obl(:arg)?):před$', edep['deprel']):
+ # Accusative would be possible but unlikely.
+ edep['deprel'] += ':ins'
+ elif re.match(r'^(nmod|obl(:arg)?):s$', edep['deprel']):
+ # Genitive would be possible but unlikely.
+ edep['deprel'] += ':ins'
+ elif re.match(r'^(nmod|obl(:arg)?):v_s(:loc)?$', edep['deprel']) and node.form == 'spolupráci':
+ # Annotation error. 'Ve spolupráci s' should be analyzed as a multi-word preposition.
+ # Find the content nominal.
+ cnouns = [x for x in node.children if x.ord > node.ord and re.match(r'^(nmod|obl)', x.udeprel)]
+ vs = [x for x in node.children if x.ord < node.ord and x.lemma == 'v']
+ if len(cnouns) > 0 and len(vs) > 0:
+ cnoun = cnouns[0]
+ v = vs[0]
+ self.set_basic_and_enhanced(cnoun, node.parent, 'obl', 'obl:ve_spolupráci_s:ins')
+ self.set_basic_and_enhanced(v, cnoun, 'case', 'case')
+ self.set_basic_and_enhanced(node, v, 'fixed', 'fixed')
+ elif re.match(r'^(nmod|obl(:arg)?):v(:nom)?$', edep['deprel']):
+ # ':nom' occurs in 'karneval v Rio de Janeiro'
+ edep['deprel'] = re.sub(r':nom$', '', edep['deprel'])
+ if len([x for x in node.children if x.deprel == 'nummod:gov']) > 0:
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] += ':loc'
+ elif re.match(r'^obl:v_čel[eo]_s:ins$', edep['deprel']):
+ # There is just one occurrence and it is an error:
+ # 'Předloňský kůň roku Law Soziri šel již v Lahovickém oblouku v čele s Raddelliosem a tato dvojice také nakonec zahanbila ostatní soupeře...'
+ # There should be two independent oblique modifiers, 'v čele' and 's Raddelliosem'.
+ edep['deprel'] = 'obl:s:ins'
+ elif re.match(r'^(nmod|obl(:arg)?):za$', edep['deprel']):
+ # Instrumental would be possible but unlikely.
+ edep['deprel'] += ':acc'
+ else:
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a([_:].+)?$', r'\1', edep['deprel']) # ala vršovický dloubák
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):a_?l[ae]([_:].+)?$', r'\1', edep['deprel']) # a la bondovky
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):(jak_)?ad([_:].+)?$', r'\1', edep['deprel']) # ad infinitum
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):ať:.+$', r'\1:ať', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):beyond([_:].+)?$', r'\1', edep['deprel']) # Beyond the Limits
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):co(:nom)?$', r'advmod', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):de([_:].+)?$', r'\1', edep['deprel']) # de facto
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):di([_:].+)?$', r'\1', edep['deprel']) # Lido di Jesolo
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):en([_:].+)?$', r'\1', edep['deprel']) # bienvenue en France
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):in([_:].+)?$', r'\1', edep['deprel']) # made in NHL
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):into([_:].+)?$', r'\1', edep['deprel']) # made in NHL
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno:nom$', r'\1:jménem:nom', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):jméno(:gen)?$', r'\1:jménem:gen', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):mezi:(nom|dat)$', r'\1:mezi:ins', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):o:(nom|gen|dat)$', r'\1:o:acc', edep['deprel']) # 'zájem o obaly'
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):of([_:].+)?$', r'\1', edep['deprel']) # University of North Carolina
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):per([_:].+)?$', r'\1', edep['deprel']) # per rollam
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):po:(nom|gen)$', r'\1:po:acc', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):před:gen$', r'\1:před:ins', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):přestože[_:].+$', r'\1:přestože', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):se?:(nom|acc|ins)$', r'\1:s:ins', edep['deprel']) # accusative: 'být s to' should be a fixed expression and it should be the predicate!
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):shoda(:gen)?$', r'\1', edep['deprel']) # 'shodou okolností' is not a prepositional phrase
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):v:gen$', r'\1:v:loc', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):vo:acc$', r'\1:o:acc', edep['deprel']) # colloquial: vo všecko
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):von([_:].+)?$', r'\1', edep['deprel']) # von Neumannem
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):voor([_:].+)?$', r'\1', edep['deprel']) # Hoge Raad voor Diamant
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:nom$', r'\1:z:gen', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):z:ins$', r'\1:s:ins', edep['deprel'])
+ edep['deprel'] = re.sub(r'^(nmod|obl(:arg)?):za:nom$', r'\1:za:acc', edep['deprel'])
+ edep['deprel'] = re.sub(r'^nmod:že:gen$', 'acl:že', edep['deprel'])
+
+ def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+ '''
+ Modifies the incoming relation of a node both in the basic tree and in
+ the enhanced graph. If the node does not yet depend in the enhanced
+ graph on the current basic parent, the new relation will be added without
+ removing any old one. If the node already depends multiple times on the
+ current basic parent in the enhanced graph, all such enhanced relations
+ will be removed before adding the new one.
+ '''
+ old_parent = node.parent
+ node.parent = parent
+ node.deprel = deprel
+ node.deps = [x for x in node.deps if x['parent'] != old_parent]
+ new_edep = {}
+ new_edep['parent'] = parent
+ new_edep['deprel'] = edeprel
+ node.deps.append(new_edep)
diff --git a/udapi/block/ud/cs/fixmorpho.py b/udapi/block/ud/cs/fixmorpho.py
new file mode 100644
index 00000000..7fcb0e12
--- /dev/null
+++ b/udapi/block/ud/cs/fixmorpho.py
@@ -0,0 +1,471 @@
+"""
+A Czech-specific block to fix lemmas, UPOS and morphological features in UD.
+It should increase consistency across the Czech treebanks. It focuses on
+individual closed-class verbs (such as the auxiliary "být") or on entire classes
+of words (e.g. whether or not nouns should have the Polarity feature). It was
+created as part of the Hičkok project (while importing nineteenth-century Czech
+data) but it should be applicable on any other Czech treebank.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
class FixMorpho(Block):
    """Normalize lemmas, UPOS tags and morphological features in Czech UD data.

    Focuses on closed-class words (pronouns, determiners, auxiliaries,
    pronominal adverbs, particles) and on class-wide conventions such as
    which parts of speech carry Polarity or Degree. See the module docstring
    for the project background.
    """

    def process_node(self, node):
        """Fix the lemma, UPOS and FEATS of a single node in place."""
        # Do not touch words marked as Foreign or Typo. They may not behave the
        # way we expect in Czech data.
        if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes':
            return
        #----------------------------------------------------------------------
        # NOUNS, PROPER NOUNS, AND ADJECTIVES
        #----------------------------------------------------------------------
        # Nouns do not have polarity but the Prague-style tagsets may mark it.
        if node.upos in ['NOUN', 'PROPN']:
            if node.feats['Polarity'] == 'Pos':
                node.feats['Polarity'] = ''
            elif node.feats['Polarity'] == 'Neg':
                # logging.warn() is deprecated; use logging.warning().
                logging.warning(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).')
        # For some nouns, there is disagreement in whether to tag and lemmatize
        # them as proper nouns. We must be careful and not add too many to this
        # rule, as many of them could be used as surnames and then they should
        # be PROPN.
        if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()):
            node.lemma = node.lemma.lower()
            node.upos = 'NOUN'
        # Lemmatization.
        if node.upos == 'NOUN' and node.lemma == 'zem':
            node.lemma = 'země'
        if node.upos == 'ADJ':
            # Adjectives should be lemmatized to lowercase even if they are part of
            # a multiword name, e.g., "Malá" in "Malá Strana" should be lemmatized
            # to "malý". Exception: Possessive adjectives derived from personal
            # names, e.g., "Karlův".
            if node.feats['Poss'] != 'Yes':
                node.lemma = node.lemma.lower()
            # Short forms of adjectives are rare in Modern Czech and uninflected
            # (they are used as predicates), so they lack the Case feature. But
            # they were inflected for Case in the past, so it is better to add
            # Case=Nom for consistency.
            if node.feats['Variant'] == 'Short' and node.feats['Case'] == '':
                node.feats['Case'] = 'Nom'
        #----------------------------------------------------------------------
        # PRONOUNS AND DETERMINERS
        #----------------------------------------------------------------------
        # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form.
        if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()):
            node.feats['Variant'] = 'Short'
        # Forms of "my" should be lemmatized as "já".
        if node.upos == 'PRON' and node.lemma == 'my':
            node.lemma = 'já'
        # Forms of "vy" should be lemmatized as "ty".
        if node.upos == 'PRON' and node.lemma == 'vy':
            node.lemma = 'ty'
        # Forms of "oni" should be lemmatized as "on" and cases that allow
        # a preposition should have PrepCase.
        if node.upos == 'PRON' and node.lemma in ['on', 'oni']:
            node.lemma = 'on'
            if node.feats['Case'] not in ['Nom', 'Voc']:
                if node.form.lower().startswith('j'):
                    node.feats['PrepCase'] = 'Npr'
                elif re.match(r'[nň]', node.form.lower()):
                    node.feats['PrepCase'] = 'Pre'
        # In 19th century data, the grammaticalized usages of "se", "si" are
        # tagged as PART (rather than a reflexive PRON, which is the standard).
        # Even if it already was tagged PRON, some features may have to be added.
        if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']:
            node.lemma = 'se'
            node.upos = 'PRON'
            node.feats['PronType'] = 'Prs'
            node.feats['Reflex'] = 'Yes'
            if node.form.lower() == 'se':
                # Occasionally "se" can be genitive: "z prudkého do se dorážení".
                if not node.feats['Case'] == 'Gen':
                    node.feats['Case'] = 'Acc'
            else:
                node.feats['Case'] = 'Dat'
            node.feats['Variant'] = 'Short'
        # As the genitive/accusative form of "on", "jeho" should have PrepCase.
        if node.upos == 'PRON' and node.form.lower() == 'jeho':
            node.feats['PrepCase'] = 'Npr'
        # Possessive pronouns have Person, Gender[psor] and Number[psor].
        # Although it is questionable, plural possessors are lemmatized to singular
        # possessors in an analogy to personal pronouns: "my" --> "já", "náš" --> "můj".
        # Some source corpora lack Person and [psor] features, others do not respect
        # the lemmatization rule, so in the end we have to look at the forms; but
        # there are potentially many variants, especially in old texts.
        if node.upos == 'DET' and node.feats['Poss'] == 'Yes':
            if node.form.lower().startswith('m'):
                # můj muoj mój mého mému mém mým moje má mojí mé moji mou mí mých mými
                node.feats['Person'] = '1'
                node.feats['Number[psor]'] = 'Sing'
            elif node.form.lower().startswith('t'):
                # tvůj tvuoj tvój tvého tvému tvém tvým tvoje tvá tvojí tvé tvoji tvou tví tvých tvými
                node.feats['Person'] = '2'
                node.feats['Number[psor]'] = 'Sing'
            elif node.form.lower().startswith('n'):
                # náš našeho našemu našem naším naše naší naši našich našim našimi
                node.lemma = 'můj'
                node.feats['Person'] = '1'
                node.feats['Number[psor]'] = 'Plur'
            elif node.form.lower().startswith('v'):
                # váš vašeho vašemu vašem vaším vaše vaší vaši vašich vašim vašimi
                node.lemma = 'tvůj'
                node.feats['Person'] = '2'
                node.feats['Number[psor]'] = 'Plur'
            elif node.form.lower() == 'jeho':
                node.feats['Person'] = '3'
                node.feats['Number[psor]'] = 'Sing'
                if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']):
                    node.feats['Gender[psor]'] = 'Masc,Neut'
            elif re.fullmatch(r'jehož', node.form.lower()):
                node.lemma = 'jehož'
                node.feats['PronType'] = 'Rel'
                node.feats['Number[psor]'] = 'Sing'
                if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']):
                    node.feats['Gender[psor]'] = 'Masc,Neut'
            elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()):
                node.lemma = 'jeho'
                node.feats['Person'] = '3'
                node.feats['Number[psor]'] = 'Sing'
                node.feats['Gender[psor]'] = 'Fem'
            elif re.fullmatch(r'jejíž', node.form.lower()):
                node.lemma = 'jehož'
                node.feats['PronType'] = 'Rel'
                node.feats['Number[psor]'] = 'Sing'
                node.feats['Gender[psor]'] = 'Fem'
            elif re.fullmatch(r'jich|jejich', node.form.lower()):
                node.lemma = 'jeho'
                node.feats['Person'] = '3'
                node.feats['Number[psor]'] = 'Plur'
            elif re.fullmatch(r'jichž|jejichž', node.form.lower()):
                node.lemma = 'jehož'
                node.feats['PronType'] = 'Rel'
                node.feats['Number[psor]'] = 'Plur'
            elif re.fullmatch(r'jichžto|jejichžto', node.form.lower()):
                node.lemma = 'jehožto'
                node.feats['PronType'] = 'Rel'
                node.feats['Number[psor]'] = 'Plur'
            elif node.lemma == 'čí':
                node.feats['Poss'] = 'Yes'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
        # Reflexive possessive pronoun should not forget the Reflex=Yes feature.
        if node.upos == 'DET' and node.lemma == 'svůj':
            node.feats['Reflex'] = 'Yes'
        # Demonstrative, interrogative, relative, negative, total and indefinite
        # pronouns (or determiners, because some of them get the DET tag).
        if node.upos in ['PRON', 'DET']:
            # Relative pronoun "jenž" should be PRON, not DET
            # (it inflects for Gender but it can never be used as congruent attribute).
            if re.fullmatch(r'(jenž|jenžto)', node.lemma):
                node.upos = 'PRON'
                if node.form.lower().startswith('j'):
                    node.feats['PrepCase'] = 'Npr'
                else:
                    node.feats['PrepCase'] = 'Pre'
            # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above)
            # (it inflects for Gender but it can never be used as congruent attribute).
            elif node.form.lower() in ['ješto', 'ježto']:
                node.lemma = 'jenžto'
                node.upos = 'PRON'
                node.feats['PrepCase'] = 'Npr'
            # Relative pronoun "an" is PRON (not DET).
            elif node.lemma == 'an':
                node.upos = 'PRON'
                node.feats['PronType'] = 'Rel'
            # Pronoun "kdo" is PRON (not DET).
            elif node.lemma == 'kdo':
                node.upos = 'PRON'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
                # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
                # However, we do not annotate Number ("kdo" can be the subject of a plural verb).
                node.feats['Gender'] = 'Masc'
                node.feats['Animacy'] = 'Anim'
                node.feats['Number'] = ''
            # Pronoun "kdož" is PRON (not DET).
            elif node.lemma == 'kdož':
                node.upos = 'PRON'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Rel'
                # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
                # However, we do not annotate Number ("kdo" can be the subject of a plural verb).
                node.feats['Gender'] = 'Masc'
                node.feats['Animacy'] = 'Anim'
                node.feats['Number'] = ''
            # Pronoun "někdo", "kdosi" is PRON (not DET).
            elif re.fullmatch(r'(kdosi|někdo)', node.lemma):
                node.upos = 'PRON'
                node.feats['PronType'] = 'Ind'
                # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
                # However, we do not annotate Number ("kdo" can be the subject of a plural verb).
                node.feats['Gender'] = 'Masc'
                node.feats['Animacy'] = 'Anim'
                node.feats['Number'] = ''
            # Pronoun "nikdo" is PRON (not DET).
            elif node.lemma == 'nikdo':
                node.upos = 'PRON'
                node.feats['PronType'] = 'Neg'
                # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
                # However, we do not annotate Number ("kdo" can be the subject of a plural verb).
                node.feats['Gender'] = 'Masc'
                node.feats['Animacy'] = 'Anim'
                node.feats['Number'] = ''
            # Pronoun "co" is PRON (not DET).
            elif node.lemma == 'co':
                node.upos = 'PRON'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
                # We do not annotate Gender and Number, although it could be argued
                # to be Gender=Neut|Number=Sing.
                node.feats['Gender'] = ''
                node.feats['Animacy'] = ''
                node.feats['Number'] = ''
            # Pronoun "což" is PRON (not DET).
            elif node.lemma in ['což', 'cože']:
                node.upos = 'PRON'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Rel'
                # We do not annotate Gender and Number, although it could be argued
                # to be Gender=Neut|Number=Sing.
                node.feats['Gender'] = ''
                node.feats['Animacy'] = ''
                node.feats['Number'] = ''
            # Pronoun "něco" is PRON (not DET).
            elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma):
                node.upos = 'PRON'
                node.feats['PronType'] = 'Ind'
                # We do not annotate Gender and Number, although it could be argued
                # to be Gender=Neut|Number=Sing.
                node.feats['Gender'] = ''
                node.feats['Animacy'] = ''
                node.feats['Number'] = ''
            # Pronoun "nic" is PRON (not DET).
            elif node.lemma == 'nic':
                node.upos = 'PRON'
                node.feats['PronType'] = 'Neg'
                # We do not annotate Gender and Number, although it could be argued
                # to be Gender=Neut|Number=Sing.
                node.feats['Gender'] = ''
                node.feats['Animacy'] = ''
                node.feats['Number'] = ''
            # Pronoun "týž" is DET and PronType=Dem.
            elif re.fullmatch(r'(tentýž|týž)', node.lemma):
                node.upos = 'DET'
                node.feats['PronType'] = 'Dem'
            # Pronoun "každý" is DET and PronType=Tot.
            elif node.lemma == 'každý':
                node.upos = 'DET'
                node.feats['PronType'] = 'Tot'
            # Pronoun "vše" is lemmatized to "všechen", it is DET and PronType=Tot.
            elif node.form.lower() == 'vše':
                node.lemma = 'všechen'
                node.upos = 'DET'
                node.feats['PronType'] = 'Tot'
            elif node.lemma == 'všechen':
                node.upos = 'DET'
                node.feats['PronType'] = 'Tot'
            elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()):
                node.lemma = 'všechen'
                node.upos = 'DET'
                node.feats['PronType'] = 'Tot'
            # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp.
            elif node.lemma in ['sám', 'samý']:
                node.lemma = 'samý'
                node.upos = 'DET'
                node.feats['PronType'] = 'Emp'
                node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else ''
        #----------------------------------------------------------------------
        # PRONOMINAL NUMERALS AND ADVERBS
        #----------------------------------------------------------------------
        # The numeral "oba" should be NUM, not PRON or DET. But it should have PronType=Tot.
        if node.upos in ['NUM', 'PRON', 'DET'] and node.lemma == 'oba':
            node.upos = 'NUM'
            node.feats['NumType'] = 'Card'
            node.feats['NumForm'] = 'Word'
            node.feats['PronType'] = 'Tot'
        # Pronominal cardinal numerals should be DET, not NUM.
        if node.upos == 'NUM':
            if re.fullmatch(r'(mnoho|málo|několik)', node.lemma):
                node.upos = 'DET'
                node.feats['PronType'] = 'Ind'
                node.feats['NumForm'] = ''
                node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho?
            elif re.fullmatch(r'(toliko?)', node.lemma):
                node.lemma = 'tolik'
                node.upos = 'DET'
                node.feats['PronType'] = 'Dem'
                node.feats['NumForm'] = ''
                node.feats['Polarity'] = ''
            elif re.fullmatch(r'(kolik)', node.lemma):
                node.upos = 'DET'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
                node.feats['NumForm'] = ''
                node.feats['Polarity'] = ''
        if node.upos in ['ADV', 'NUM']:
            if re.fullmatch(r'(mnoho|málo|několi)krát', node.lemma):
                node.upos = 'ADV'
                node.feats['NumType'] = 'Mult'
                node.feats['PronType'] = 'Ind'
            elif re.fullmatch(r'(tolikrát)', node.lemma):
                node.upos = 'ADV'
                node.feats['NumType'] = 'Mult'
                node.feats['PronType'] = 'Dem'
            elif re.fullmatch(r'(kolikrát)', node.lemma):
                node.upos = 'ADV'
                node.feats['NumType'] = 'Mult'
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
        # Pronominal adverbs have PronType but most of them do not have Degree
        # and Polarity.
        if node.upos == 'ADV':
            if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma):
                node.feats['PronType'] = 'Dem'
                node.feats['Degree'] = ''
                node.feats['Polarity'] = ''
            elif re.fullmatch(r'(dokdy|dokud|jak|kam|kde|kdy|kterak|kudy|odkdy|odkud|proč)', node.lemma):
                if node.feats['PronType'] == '':
                    node.feats['PronType'] = 'Int,Rel'
                node.feats['Degree'] = ''
                node.feats['Polarity'] = ''
            elif re.fullmatch(r'(kdežto)', node.lemma):
                node.feats['PronType'] = 'Rel'
                node.feats['Degree'] = ''
                node.feats['Polarity'] = ''
            elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma):
                node.feats['PronType'] = 'Ind'
                node.feats['Degree'] = ''
                node.feats['Polarity'] = ''
            elif re.fullmatch(r'(nic|nijak|nikam|nikde|nikdy|nikudy)', node.lemma):
                node.feats['PronType'] = 'Neg'
                node.feats['Degree'] = ''
                node.feats['Polarity'] = ''
            # Total pronominals can be negated ("nevždy"). Then they get Degree, too.
            elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma):
                node.feats['PronType'] = 'Tot'
                node.feats['Degree'] = 'Pos'
                node.feats['Polarity'] = 'Pos'
        #----------------------------------------------------------------------
        # VERBS AND AUXILIARIES
        #----------------------------------------------------------------------
        # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless
        # of the fact that it can participate in purely existential constructions
        # where it no longer acts as a copula. Czech tagsets typically do not
        # distinguish AUX from VERB, which means that converted data may have to
        # be fixed.
        if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']:
            node.upos = 'AUX'
        if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()):
            node.upos = 'ADV'
            node.lemma = 'lze' # not 'nelze'
            node.feats['VerbForm'] = ''
            node.feats['Voice'] = ''
            node.feats['Aspect'] = ''
            node.feats['Mood'] = ''
            node.feats['Tense'] = ''
            node.feats['Person'] = ''
            node.feats['Number'] = ''
            node.feats['Degree'] = 'Pos'
        if node.upos in ['VERB', 'AUX']:
            # Most non-passive verb forms have Voice=Act, and infinitives should
            # have it, too. Passive infinitives are always periphrastic.
            # (This is not done in the PDT tagset, but we should add it.)
            if node.feats['VerbForm'] == 'Inf':
                node.feats['Voice'] = 'Act'
            # Same for imperatives.
            elif node.feats['Mood'] == 'Imp':
                node.feats['Voice'] = 'Act'
            # Some verbs lack the Aspect feature although they are not biaspectual.
            if node.feats['Aspect'] == '':
                if re.fullmatch(r'(cítit|čekat|činit|číst|dávat|dělat|dít|dívat|hledat|chodit|chtít|jít|kralovat|ležet|milovat|mít|mluvit|moci|mus[ei]t|mysl[ei]t|patřit|počínat|prosit|ptát|působit|sedět|snažit|vědět|vidět|vyprávět|zdát|znamenat|žít)', node.lemma):
                    node.feats['Aspect'] = 'Imp'
                elif re.fullmatch(r'(dát|dojít|dostat|nalézt|napadnout|nechat|obrátit|odpovědět|otevřít|počít|položit|pomoci|poslat|postavit|povědět|poznat|přijít|přinést|říci|učinit|udělat|ukázat|vrátit|vstát|vydat|vzít|začít|zeptat|zůstat)', node.lemma):
                    node.feats['Aspect'] = 'Perf'
                # We must look at word form to distinguish imperfective "stát" from perfective "stát se".
                elif re.fullmatch(r'(stojí(me?|š|te)?|stál(a|o|i|y)?)', node.form.lower()):
                    node.feats['Aspect'] = 'Imp'
                elif re.fullmatch(r'(stan(u|eš|e|eme?|ete|ou)|stal(a|o|i|y)?)', node.form.lower()):
                    node.feats['Aspect'] = 'Perf'
            # Present forms of perfective verbs normally have Tense=Pres despite
            # meaning future. However, a few imperfective verbs have a separate
            # future form (distinct from present form), which gets Tense=Fut
            # despite inflecting similarly to present forms.
            if node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'(ne)?((bud|půjd|pojed|polez|pones)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()):
                node.feats['Tense'] = 'Fut'
            # Passive participles (including the short forms) should be ADJ, not VERB.
            # But they keep the verbal features of VerbForm, Voice, Aspect.
            if node.feats['VerbForm'] == 'Part' and node.feats['Voice'] == 'Pass':
                node.upos = 'ADJ'
                # But now we need an adjectival lemma.
                ###!!! Unfortunately this occasionally discards manual normalization
                ###!!! introduced by the annotators: e.g., "rozhřita" had the lemma
                ###!!! "rozehřát", but instead of "rozehřátý" we now produce "rozhřitý".
                ###!!! Other examples: odepříno --> odepříný instead of odepřený;
                ###!!! dovolíno --> dovolíný instead of dovolený;
                ###!!! vyslyšána --> vyslyšaný instead of vyslyšený;
                ###!!! obmezený instead of omezený, oslyšaný instead of oslyšený.
                node.misc['LDeriv'] = node.lemma
                node.lemma = re.sub(r'([nt])[auoiy]?$', r'\1ý', node.form.lower())
                node.lemma = re.sub(r'áný$', r'aný', node.lemma) # ztroskotány --> ztroskotáný --> ztroskotaný; zachován, spořádán
                if node.feats['Polarity'] == 'Neg':
                    node.lemma = re.sub(r'^ne', '', node.lemma)
                if node.feats['Case'] == '':
                    node.feats['Case'] = 'Nom'
                if node.feats['Degree'] == '':
                    node.feats['Degree'] = 'Pos'
                node.feats['Variant'] = 'Short'
        #----------------------------------------------------------------------
        # ADVERBS
        #----------------------------------------------------------------------
        # Words that indicate the speaker's attitude are tagged ADV in UD,
        # although the Czech tagsets often treat them as particles.
        if node.upos == 'PART' and re.fullmatch(r'(ani|asi?|až|bezpochyby|bohdá|co|dokonce|jen|jistě|již|hlavně|hned|jednoduše|leda|možná|naopak|nejen|nejspíše?|opravdu|ovšem|patrně|právě|prej|prý|přece|především|rozhodně|skoro|skutečně|snad|spíše?|teda|tedy|třeba|určitě|věru|vlastně|vůbec|zajisté|zase|zrovna|zřejmě|zvlášť|zvláště)', node.lemma):
            node.upos = 'ADV'
            node.feats['Degree'] = 'Pos'
            node.feats['Polarity'] = 'Pos'
            node.misc['CzechParticle'] = 'Yes'
        # Adverb "brzo" should be lemmatized as "brzy".
        if node.upos == 'ADV' and node.form.lower() == 'brzo':
            node.lemma = 'brzy'
        if node.upos == 'ADV' and node.form.lower() == 'teprv':
            node.lemma = 'teprve'
        # All non-pronominal adverbs (and also some pronominal ones) should
        # have Degree and Polarity. At least for now we also exclude adverbial
        # numerals, e.g. "jednou" – "nejednou".
        if node.upos == 'ADV' and node.feats['PronType'] == '' and node.feats['NumType'] == '':
            if node.feats['Degree'] == '':
                node.feats['Degree'] = 'Pos'
            if node.feats['Polarity'] == '':
                node.feats['Polarity'] = 'Pos'
        #----------------------------------------------------------------------
        # PREPOSITIONS
        #----------------------------------------------------------------------
        # Preposition "u" may combine with Case=Loc|Acc in old texts, and then
        # it functions as a vocalized counterpart of "v". Nevertheless, we always
        # lemmatize it as "u" and thus AdpType is Prep, not Voc.
        if node.upos == 'ADP' and node.form.lower() == 'u':
            node.lemma = 'u'
            node.feats['AdpType'] = 'Prep'
        #----------------------------------------------------------------------
        # CONJUNCTIONS
        #----------------------------------------------------------------------
        # As a conjunction (and not particle/adverb), "ani" is coordinating and
        # not subordinating.
        if node.upos == 'SCONJ' and node.lemma == 'ani':
            node.upos = 'CCONJ'
        if node.upos == 'CCONJ' and node.lemma == 'nebť':
            node.lemma = 'neboť'
        #----------------------------------------------------------------------
        # PARTICLES (other than those already grabbed above)
        #----------------------------------------------------------------------
        # "jako" should be SCONJ but 19th century data have it as PART.
        if node.upos == 'PART':
            if node.lemma == 'jako':
                node.upos = 'SCONJ'
            elif node.lemma == 'ti':
                node.lemma = 'ť'
diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py
new file mode 100644
index 00000000..da9f5bda
--- /dev/null
+++ b/udapi/block/ud/cs/markfeatsbugs.py
@@ -0,0 +1,979 @@
+"""
+Block to identify missing or ill-valued features in Czech. Any bugs that it
+finds will be saved in the MISC column as a Bug attribute, which can be later
+used in filters and highlighted in text output.
+
+Usage: cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html
+Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
+"""
+import udapi.block.ud.markfeatsbugs
+import re
+
+class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs):
+
+ # The convention used in PDT is not consistent. Adjectives are fully disambiguated
+ # (three genders, two animacies, three numbers, seven cases), even though some
+ # forms are shared among many feature combinations. On the other hand, pronouns
+ # and determiners omit some features in the context of certain values of other
+ # features (e.g., gender and animacy are not distinguished in plural if the case
+ # is genitive, dative, locative or instrumental).
+ # In contrast, ČNK (CNC) fully disambiguates pronouns and determiners just like
+ # adjectives.
+ # Here we can trigger one of the two conventions. It should become a block parameter
+ # in the future.
+ pdt20 = False # True = like in PDT 2.0; False = like in ČNK
+
+ def process_node(self, node):
+ # Czech constraints should not be applied to foreign words.
+ if node.feats['Foreign'] == 'Yes':
+ pass
+ # NOUNS ################################################################
+ elif node.upos == 'NOUN':
+ self.check_required_features(node, ['Gender', 'Number', 'Case'])
+ if node.feats['VerbForm'] == 'Vnoun':
+ # verbal nouns: bytí, dělání, ...
+ self.check_allowed_features(node, {
+ 'VerbForm': ['Vnoun'],
+ 'Gender': ['Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']
+ })
+ elif node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Animacy'])
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ # PROPER NOUNS #########################################################
+ elif node.upos == 'PROPN':
+ self.check_required_features(node, ['Gender', 'Number', 'Case'])
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Animacy'])
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'NameType': ['Giv', 'Sur', 'Geo', 'Nat', 'Com', 'Pro', 'Oth'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ # ADJECTIVES ###########################################################
+ elif node.upos == 'ADJ':
+ if node.feats['Poss'] == 'Yes': # possessive adjectives
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Animacy', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'Poss': ['Yes'],
+ 'Gender[psor]': ['Masc', 'Fem'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'Poss': ['Yes'],
+ 'Gender[psor]': ['Masc', 'Fem'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí)
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Ord', 'Mult'],
+ 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho')
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Ord', 'Mult'],
+ 'NumForm': ['Roman'], # NumForm is normally not used with ordinals except when a Roman numeral is clearly ordinal even without context ('XXXIIho')
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives
+ self.check_required_features(node, ['VerbForm', 'Voice'])
+ if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree
+ if node.feats['Gender'] == 'Masc':
+ # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující').
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'VerbForm': ['Part'],
+ 'Aspect': ['Imp', 'Perf'],
+ 'Voice': ['Act'],
+                        'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující').
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'VerbForm': ['Part'],
+ 'Aspect': ['Imp', 'Perf'],
+ 'Voice': ['Act'],
+                        'Tense': ['Past', 'Pres', 'Fut'], # Fut only for lemma 'budoucí'
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ if node.feats['Gender'] == 'Masc':
+ # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný').
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree'])
+ self.check_allowed_features(node, {
+ 'VerbForm': ['Part'],
+ 'Aspect': ['Imp', 'Perf'],
+ 'Voice': ['Pass'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný').
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree'])
+ self.check_allowed_features(node, {
+ 'VerbForm': ['Part'],
+ 'Aspect': ['Imp', 'Perf'],
+ 'Voice': ['Pass'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else: # regular adjectives, including short forms
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ else:
+ self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Variant': ['Short'],
+ 'Emph': ['Yes'],
+ 'Foreign': ['Yes'],
+ 'Abbr': ['Yes']})
+ # PRONOUNS #############################################################
+ elif node.upos == 'PRON':
+ self.check_required_features(node, ['PronType'])
+ if node.feats['PronType'] == 'Prs':
+ if node.feats['Reflex'] == 'Yes':
+ self.check_required_features(node, ['PronType', 'Reflex', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Reflex': ['Yes'],
+ 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'],
+ 'Variant': ['Short']
+ })
+ else: # not reflexive
+ if node.feats['Person'] == '3': # on, ona, ono, oni, ony
+ if re.match(r'^(Nom|Voc)$', node.feats['Case']): # on, ona, ono, oni, ony
+ self.check_adjective_like(node, ['PronType', 'Person'], {
+ 'PronType': ['Prs'],
+ 'Person': ['3']
+ })
+ elif re.match(r"^(ho|mu)$", node.form.lower()):
+ # The short (clitic) forms do not have PrepCase in Modern Czech.
+ # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho'
+ # (besides 'jeho' and 'ho'); it should not have Variant=Short
+ # and it should have PrepCase=Npr (the next block).
+ self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], {
+ 'PronType': ['Prs'],
+ 'Person': ['3'],
+ 'Variant': ['Short']
+ })
+ else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně
+ # Mostly only two gender groups and no animacy:
+ # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím
+ # Fem ... jí, ji, ní
+ # Neut ... je
+ # No gender in dual and plural:
+ # Plur ... jich, jim, je, nich, jimi
+ # Here we require PrepCase but disallow Variant.
+ self.check_adjective_like(node, ['PronType', 'Person', 'PrepCase'], {
+ 'PronType': ['Prs'],
+ 'Person': ['3'],
+ 'PrepCase': ['Npr', 'Pre']
+ })
+ else: # 1st and 2nd person do not have gender: já, ty
+ self.check_required_features(node, ['PronType', 'Person', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Person': ['1', '2'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Variant': ['Short']
+ })
+ elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo
+ # There is no Number. Někdo and nikdo behave like singular;
+ # kdo is by default singular as well but it also occurs as subject
+ # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni").
+ # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody)
+ # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only).
+            # Old Czech data disambiguate Int from Rel (Int is used only in direct questions; indirect questions like "Ptal ses, kdo to je?" use Rel.)
+ # New Czech data, in particular PDT, use Int,Rel regardless of context.
+ self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'],
+ 'Gender': ['Masc'],
+ 'Animacy': ['Anim'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+ })
+ elif re.match(r'^(co(si?)?|což|což?koliv?|něco|lečco|lecco|ledacos?|nic|nicož)$', node.lemma):
+ # Although these pronouns behave by default as neuter singular,
+ # no Gender and Number is annotated. However, quite unusually,
+ # there is Animacy=Inan without Gender.
+ ###!!! This should probably be fixed in all Czech treebanks and
+ ###!!! in Interset. The pronoun should get Gender=Neut and no
+ ###!!! animacy. For now, let's at least make animacy an optional
+ ###!!! feature (I see that we already do not fill it in the Old
+ ###!!! Czech data).
+            # Old Czech data disambiguate Int from Rel (Int is used only in direct questions; indirect questions like "Ptal ses, co to je?" use Rel.)
+ # New Czech data, in particular PDT, use Int,Rel regardless of context.
+ self.check_required_features(node, ['PronType', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'],
+ 'Animacy': ['Inan'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins']
+ })
+ elif node.lemma == 'ješto':
+ # Unlike 'jenžto', this relative pronoun does not inflect, it
+ # always occurs in a nominative position, but the context can
+ # be any gender and number.
+ # Update from the Hičkok project: 'ješto' is lemmatized to
+ # 'jenžto' (see below), meaning that this branch should not be
+ # needed for the new data.
+ self.check_required_features(node, ['PronType', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Case': ['Nom']
+ })
+ elif re.match(r'^(jenž|jenžto)$', node.lemma):
+ # The relative pronouns 'jenž', 'jenžto' inflect for gender;
+ # while we normally take this as a sign of DET (instead of PRON),
+ # these can never act as real DET because they never modify a
+ # nominal.
+ # Similarly to the personal pronoun 'on', animacy is only
+ # annotated for masculine nominative plural, non-nominative
+ # forms are merged for masculine and neuter (jehož, jemuž), and
+ # non-singular gender is only annotated in nominative (while
+ # these cases are common for all genders: jichž, jimž, jimiž).
+ # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even
+ # in the nominative, although there is no prepositional counter-
+ # part (but similarly the locative has no prepositionless form).
+ # Update from the Hičkok project: In Old Czech, both 'jenž' and
+ # 'jenžto' (or its variant 'ješto') can be used uninflected,
+ # accompanied by a resumptive pronoun which provides the inflection.
+ # In this case, the Hičkok data will not annotate Gender, Animacy,
+ # Number and Case of the relative pronoun. Therefore, we require
+ # the full set of features if any of them is present; otherwise,
+ # we only expect PronType and PrepCase.
+ if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '':
+ self.check_adjective_like(node, ['PronType', 'PrepCase'], {
+ 'PronType': ['Rel'],
+ 'PrepCase': ['Npr', 'Pre']
+ })
+ else:
+ self.check_required_features(node, ['PronType', 'PrepCase'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'PrepCase': ['Npr']
+ })
+ else:
+ # What remains is the relative pronoun 'an'. It behaves similarly
+ # to 'jenž' but it does not have the PrepCase feature and it
+ # only occurs in the nominative.
+ if node.feats['Gender'] == 'Masc' and node.feats['Number'] == 'Plur': # ani
+ self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Gender': ['Masc'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Plur'],
+ 'Case': ['Nom']
+ })
+ else: # not Masc Plur: an, ana, ano, any
+ self.check_required_features(node, ['PronType', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom']
+ })
+ # DETERMINERS ##########################################################
+ elif node.upos == 'DET':
+ # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case.
+ # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'.
+ if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()):
+ self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Person': ['3'],
+ 'Number[psor]': ['Sing', 'Dual', 'Plur'],
+ 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'],
+                'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified gender by context
+                'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified gender by context
+                'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified number by context
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometimes indicate the case by context
+ # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. na jeho dům).
+ # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated
+ # (jeho se bojím VS. bez něho se neobejdu).
+ })
+ # Relative possessive determiners 'jehož' and 'jejichž' behave similarly
+ # to the personal possessive determiners but they do not have Person.
+ # Normally determiners do not change j->n after prepositions but we
+ # have an example in Old Czech (štěpové zlatí, na nichžto větviech...)
+ elif re.match(r'^(jeho|jejich|[jn][ií]ch)ž(e|to)?$', node.form.lower()):
+ self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Poss': ['Yes'],
+ 'Number[psor]': ['Sing', 'Dual', 'Plur'],
+ 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'],
+                'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified gender by context
+                'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified gender by context
+                'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometimes indicate the modified number by context
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometimes indicate the case by context
+ # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used
+ # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž",
+ # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme).
+ })
+ # Feminine personal possessive determiner.
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()):
+ # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'.
+ # Congruent gender:
+ # - in PDT, only in singular; masculine and neuter are merged even in nominative
+ # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural
+ # Case:
+ # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...)
+ # - in Old Czech data, distinguished always (and needed at least for 'jejiej')
+ if self.pdt20:
+ if node.feats['Number'] == 'Sing':
+ self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Person': ['3'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Gender': ['Masc,Neut', 'Fem'],
+ 'Number': ['Sing'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Person': ['3'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Number': ['Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Person': ['3'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Gender': ['Masc', 'Neut', 'Fem'],
+ 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ # Feminine relative possessive determiner.
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()):
+ # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'.
+ # Congruent gender:
+ # - in PDT, only in singular; masculine and neuter are merged even in nominative
+ # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural
+ # Case:
+ # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...)
+ # - in Old Czech data, distinguished always (and needed at least for 'jejiejž')
+ if self.pdt20:
+ if node.feats['Number'] == 'Sing':
+ self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Poss': ['Yes'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Gender': ['Masc,Neut', 'Fem'],
+ 'Number': ['Sing'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Poss': ['Yes'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Number': ['Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Rel'],
+ 'Poss': ['Yes'],
+ 'Number[psor]': ['Sing'],
+ 'Gender[psor]': ['Fem'],
+ 'Gender': ['Masc', 'Neut', 'Fem'],
+ 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ elif re.match(r'^(můj|tvůj|svůj)(ž(e|to)?)?$', node.lemma):
+ if node.feats['Reflex'] == 'Yes':
+ self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Reflex': ['Yes']
+ })
+ else:
+ self.check_adjective_like(node, ['PronType', 'Poss', 'Person', 'Number[psor]'], {
+ 'PronType': ['Prs'],
+ 'Poss': ['Yes'],
+ 'Person': ['1', '2'],
+ 'Number[psor]': ['Sing', 'Plur']
+ })
+ elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma):
+ self.check_adjective_like(node, ['PronType', 'Poss'], {
+ 'PronType': ['Int', 'Rel', 'Ind', 'Neg'],
+ 'Poss': ['Yes']
+ })
+ elif re.match(r'^(sám|samý)$', node.lemma):
+ # The above condition looks at both lemma options, although only one lemma is assumed.
+ # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám".
+ # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy.
+ self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']})
+ elif node.lemma == 'veškerý':
+ # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery.
+ self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']})
+ elif node.lemma == 'žádný':
+ # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny.
+ self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']})
+ elif node.feats['NumType'] in ['Ord', 'Mult']: # pronominal numerals 'několikátý', 'několikerý', 'několiký' etc.
+ self.check_adjective_like(node, ['PronType', 'NumType'], {
+ 'PronType': ['Ind', 'Int', 'Rel', 'Dem'],
+ 'NumType': ['Ord', 'Mult']
+ })
+ elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc.
+ if node.lemma == 'nejeden':
+ self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']})
+ else:
+ # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs:
+ # hodně, více, nejvíce; málo, méně, nejméně
+ # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo).
+ self.check_required_features(node, ['PronType', 'NumType', 'Case'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Ind', 'Int', 'Rel', 'Dem'],
+ 'NumType': ['Card'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else:
+                # Old Czech data disambiguate Int from Rel (Int is used only in direct questions; indirect questions like "Ptal ses, kde to je?" use Rel.)
+ # New Czech data, in particular PDT, use Int,Rel regardless of context.
+ self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']})
+ # NUMERALS #############################################################
+ elif node.upos == 'NUM':
+ self.check_required_features(node, ['NumType', 'NumForm'])
+ # Arabic digits and Roman numerals do not have inflection features.
+ if re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'NumForm': ['Digit', 'Roman']
+ })
+ else:
+ if node.feats['NumType'] == 'Sets':
+ # 'jedny', 'dvoje', 'oboje', 'troje', 'čtvery'
+ # Number should perhaps be only Plur because the counted noun will be Plur.
+ # Gender is not annotated in PDT but there are different forms ('jedni' vs. 'jedny',
+ # and in Old Czech also 'dvoji' vs. 'dvoje'), so we should allow Gender (and Animacy).
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Sets'],
+ 'PronType': ['Tot'], # for 'oboje'
+ 'NumForm': ['Word'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ # 'jeden' has Gender, Animacy, Number, Case: jeden, jedna, jedno, jednoho, jednomu, jednom, jedním, jedné, jednu, jednou, jedni, jedny, jedněch, jedněm, jedněmi.
+ # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma.
+ # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi.
+ # 'pět' and more have Number=Plur, Case: pět, pěti.
+ # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli').
+ # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form).
+ elif node.lemma == 'jeden':
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'NumForm': ['Word'],
+ 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ elif re.match(r'^(dva|oba)$', node.lemma):
+ self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case'])
+ if self.pdt20:
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'PronType': ['Tot'], # for 'oba'
+ 'NumForm': ['Word'],
+ 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm
+ 'Number': ['Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'PronType': ['Tot'], # for 'oba'
+ 'NumForm': ['Word'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ elif re.match(r'^(dvé|obé)$', node.lemma):
+ self.check_required_features(node, ['NumType', 'NumForm', 'Gender', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'PronType': ['Tot'], # for 'obé'
+ 'NumForm': ['Word'],
+ 'Gender': ['Neut'],
+ 'Number': ['Sing'], # when 'dvé' is subject, the verb is neuter singular
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ elif node.lemma == 'půl':
+ self.check_required_features(node, ['NumType', 'NumForm'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'NumForm': ['Word']
+ })
+ elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma):
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'NumForm': ['Word'],
+ 'Gender': ['Masc', 'Fem', 'Neut'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ else:
+ # In PDT, cardinal numerals higher than four in nominative/accusative/vocative
+ # have Number=Sing instead of Plur! It may be motivated by the default
+ # agreement they trigger on verbs (but they don't have Gender=Neut).
+ # It does not make much sense but we must allow Sing before a better
+ # approach is defined and implemented in the data.
+ # On the other hand, we may want to allow Dual for "stě".
+ self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case'])
+ self.check_allowed_features(node, {
+ 'NumType': ['Card'],
+ 'NumForm': ['Word'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+ })
+ # VERBS AND AUXILIARIES ################################################
+ elif node.upos in ['VERB', 'AUX']:
+ # There are only three lemmas recognized as AUX in Czech. This is not
+ # about features and it would be caught by the UD validator, but it
+        # is an error in morphology, so let's report it here as well.
+ if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']:
+ self.bug(node, 'NonAuxLemma')
+ # All Czech verbs (and some adjectives and nouns) must have VerbForm.
+ # Almost all verbs have lexical Aspect but we cannot require it
+ # because there are a few biaspectual verbs (e.g. 'analyzovat') that
+ # do not have the feature.
+ self.check_required_features(node, ['VerbForm'])
+ if node.feats['VerbForm'] in ['Inf', 'Sup']:
+ # There is no voice. For some reason, PDT does not annotate that
+ # the infinitive form is active (while a passive infinitive is
+ # a combination of the infinitive with a passive participle).
+ self.check_required_features(node, ['Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Inf', 'Sup'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ elif node.feats['VerbForm'] == 'Fin':
+ # Voice is optional. For some reason it is not annotated with
+ # imperatives (although passive imperatives are a combination
+ # of the active imperative and a passive participle). It is
+ # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'.
+ # Conditional "by" has no person and number (it is typically
+ # 3rd person but it could be other persons, too, as in "ty by
+ # ses bál").
+ if node.feats['Mood'] == 'Cnd':
+ if node.form.lower() == 'by':
+ self.check_required_features(node, ['Mood'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp'],
+ 'VerbForm': ['Fin'],
+ 'Mood': ['Cnd']
+ })
+ elif node.form.lower() == 'byšta':
+ self.check_required_features(node, ['Mood', 'Person', 'Number'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp'],
+ 'VerbForm': ['Fin'],
+ 'Mood': ['Cnd'],
+ 'Person': ['2', '3'],
+ 'Number': ['Dual']
+ })
+ else:
+ self.check_required_features(node, ['Mood', 'Person', 'Number'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp'],
+ 'VerbForm': ['Fin'],
+ 'Mood': ['Cnd'],
+ 'Person': ['1', '2'],
+ 'Number': ['Sing', 'Dual', 'Plur']
+ })
+ elif node.feats['Mood'] == 'Imp':
+ self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Fin'],
+ 'Mood': ['Imp'],
+ 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet)
+ 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person)
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Emph': ['Yes']
+ })
+ else: # indicative
+ self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Fin'],
+ 'Mood': ['Ind'],
+ 'Tense': ['Past', 'Imp', 'Pres', 'Fut'], # only in indicative
+ 'Voice': ['Act'],
+ 'Person': ['1', '2', '3'],
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Variant': ['Short', 'Long'], # distinguishes sigmatic (Long) and asigmatic (Short) aorist
+ 'Emph': ['Yes']
+ })
+ elif node.feats['VerbForm'] == 'Part': # only l-participle; the others are ADJ, not VERB
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Part'],
+ 'Tense': ['Past'],
+ 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Gender': ['Masc'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else:
+ self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Part'],
+ 'Tense': ['Past'],
+ 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+ 'Number': ['Sing', 'Dual', 'Plur'],
+ 'Gender': ['Fem', 'Neut'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else: # converb
+ # Old Czech data annotate converb gender by context rather than form
+ # (because the form was different than in Modern Czech) and for
+ # masculines they also include animacy. In Modern Czech animacy is
+ # currently not annotated and Masc,Neut gender is merged.
+ if node.feats['Number'] == 'Sing':
+ if node.feats['Gender'] == 'Masc':
+ self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Conv'],
+ 'Tense': ['Past', 'Pres'],
+ 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+ 'Number': ['Sing'],
+ 'Gender': ['Masc'],
+ 'Animacy': ['Anim', 'Inan'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else:
+ self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Conv'],
+ 'Tense': ['Past', 'Pres'],
+ 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB
+ 'Number': ['Sing'],
+ 'Gender': ['Fem', 'Neut'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else:
+ self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Aspect': ['Imp', 'Perf'],
+ 'VerbForm': ['Conv'],
+ 'Tense': ['Past', 'Pres'],
+ 'Voice': ['Act'],
+ 'Number': ['Dual', 'Plur'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ # ADVERBS ##############################################################
+ elif node.upos == 'ADV':
+ if node.feats['NumType'] != '':
+ # Adverbial multiplicative numerals (jednou, dvakrát, třikrát)
+ # belong here. They have also pronominal counterparts (kolikrát,
+ # tolikrát, několikrát). There are also adverbial ordinal numerals
+ # (zaprvé, poprvé, zadruhé, podruhé).
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.)
+ # New Czech data, in particular PDT, use Int,Rel regardless of context.
+ self.check_allowed_features(node, {
+ 'NumType': ['Mult', 'Ord'],
+ 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind']
+ })
+ elif self.pdt20:
+ if node.feats['PronType'] != '':
+ # Pronominal adverbs in PDT are neither compared nor negated.
+ # New Czech data, in particular PDT, use Int,Rel regardless of context.
+ self.check_allowed_features(node, {
+ 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot']
+ })
+ elif node.feats['Degree'] != '':
+ # Adverbs that are compared can also be negated.
+ self.check_required_features(node, ['Degree', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ else:
+ # The remaining adverbs are neither pronominal, nor compared or
+ # negated.
+ self.check_allowed_features(node, {})
+ else:
+ if node.feats['PronType'] == 'Tot':
+ # Total adverbs in Old Czech can be negated: vždy, nevždy.
+ # Then for consistence with other adverbs, we also require
+ # Degree, although it will be always Pos.
+ self.check_required_features(node, ['Degree', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'PronType': ['Tot'],
+ 'Degree': ['Pos'],
+ 'Polarity': ['Pos', 'Neg']
+ })
+ elif node.feats['PronType'] != '':
+ # Other pronominal adverbs are neither compared nor negated.
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.)
+ self.check_allowed_features(node, {
+ 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg']
+ })
+ else:
+ # All other adverbs should have both Degree and Polarity,
+ # although for some of them the values will always be Pos.
+ self.check_required_features(node, ['Degree', 'Polarity'])
+ self.check_allowed_features(node, {
+ 'Degree': ['Pos', 'Cmp', 'Sup'],
+ 'Polarity': ['Pos', 'Neg'],
+ 'Emph': ['Yes'],
+ 'Abbr': ['Yes']
+ })
+ # ADPOSITIONS ##########################################################
+ elif node.upos == 'ADP':
+ self.check_required_features(node, ['AdpType', 'Case'])
+ self.check_allowed_features(node, {
+ 'AdpType': ['Prep', 'Voc'],
+ 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'],
+ 'Abbr': ['Yes']
+ })
+ # SUBORDINATING CONJUNCTIONS ###########################################
+ elif node.upos == 'SCONJ':
+ self.check_allowed_features(node, {
+ 'Emph': ['Yes'],
+ 'Abbr': ['Yes']
+ })
+ # COORDINATING CONJUNCTIONS ############################################
+ elif node.upos == 'CCONJ':
+ self.check_allowed_features(node, {
+ 'Emph': ['Yes'],
+ 'Abbr': ['Yes']
+ })
+ # PARTICLES ############################################################
+ elif node.upos == 'PART':
+ # "t." = "totiž"
+ self.check_allowed_features(node, {
+ 'Abbr': ['Yes']
+ })
+ # THE REST: NO FEATURES ################################################
+ # (OR UNDEFINED UPOS) ##################################################
+ else:
+ if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']:
+ bugmsg = 'UnknownUpos'
+ if node.upos:
+ bugmsg += node.upos
+ self.bug(node, bugmsg)
+ self.check_allowed_features(node, {})
+
+    def check_adjective_like(self, node, r0, a0):
+        """
+        Check inflectional features shared by long-form adjectives, pronouns
+        and determiners, which largely follow the same declension paradigms.
+        How much is disambiguated depends on the tagging convention: PDT
+        fully disambiguates adjectives but not pronouns, while the newer
+        (non-pdt20) convention disambiguates both. Extras beyond the core
+        inflectional features (e.g. PronType for pronouns) are supplied by
+        the caller: r0 is a list of additional required features and a0 a
+        dict of additional allowed features (note: a0 is updated in place).
+        """
+        required_features = []
+        allowed_features = {}
+        full_set = node.upos == 'ADJ' or not self.pdt20  # PDT fully disambiguates only adjectives
+        if full_set:
+            # Even in the full set, animacy is only distinguished for the
+            # masculine gender.
+            if node.feats['Gender'] == 'Masc':
+                required_features = ['Gender', 'Animacy', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Masc'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                }
+            else:
+                required_features = ['Gender', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Fem', 'Neut'],
+                    'Number': ['Sing', 'Dual', 'Plur'],
+                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                }
+        else:
+            # Gender is annotated in all cases in singular (ten, ta, to)
+            # but only in nominative, accusative, and vocative in plural
+            # (Nom/Voc ti, ty, ta; Acc ty, ta). Animacy is distinguished
+            # in plural if gender is distinguished and it is masculine; in
+            # singular it is distinguished only in accusative (toho, ten).
+            # Other cases in plural are gender-less (těch, těm, těmi).
+            # Note that this is not consistent with adjectives, where we
+            # disambiguate gender in all cases in plural.
+            if node.feats['Number'] == 'Sing':
+                if node.feats['Gender'] == 'Masc' and node.feats['Case'] == 'Acc':
+                    required_features = ['Gender', 'Animacy', 'Number', 'Case']
+                    allowed_features = {
+                        'Gender': ['Masc'],
+                        'Animacy': ['Anim', 'Inan'],
+                        'Number': ['Sing'],
+                        'Case': ['Acc']
+                    }
+                else:
+                    required_features = ['Gender', 'Number', 'Case']
+                    allowed_features = {
+                        'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # non-nominative forms of Masc and Neut are merged; Fem,Neut is e.g. 'vaše' in singular
+                        'Number': ['Sing'],
+                        'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins']
+                    }
+            elif re.match(r'^(Nom|Acc|Voc)$', node.feats['Case']):
+                required_features = ['Gender', 'Number', 'Case']
+                allowed_features = {
+                    'Gender': ['Masc', 'Fem', 'Neut'],
+                    'Animacy': ['Anim', 'Inan'],
+                    'Number': ['Dual', 'Plur'],
+                    'Case': ['Nom', 'Acc', 'Voc']
+                }
+            else:
+                required_features = ['Number', 'Case']
+                allowed_features = {
+                    'Number': ['Dual', 'Plur'],
+                    'Case': ['Gen', 'Dat', 'Loc', 'Ins']
+                }
+        required_features = r0 + required_features
+        a0.update(allowed_features)  # NOTE: mutates the caller's dict
+        allowed_features = a0
+        self.check_required_features(node, required_features)
+        self.check_allowed_features(node, allowed_features)
diff --git a/udapi/block/ud/da/fixmultisubject.py b/udapi/block/ud/da/fixmultisubject.py
new file mode 100644
index 00000000..e9367d46
--- /dev/null
+++ b/udapi/block/ud/da/fixmultisubject.py
@@ -0,0 +1,123 @@
+"""
+Block ud.da.FixMultiSubject tries to fix some systemic instances of predicates
+that have more than one subject dependent.
+"""
+from udapi.core.block import Block
+import re
+
+class FixMultiSubject(Block):
+    """
+    Make sure that a predicate has at most one subject. Note that it can
+    only fix instances that follow certain pattern observed in the Danish
+    data.
+    """
+
+    def process_node(self, node):
+        subjects = [x for x in node.children if re.match(r'^[nc]subj$', x.udeprel)]
+        if len(subjects) > 1:
+            # Pattern 1: A node is attached as xcomp to the current node, and
+            # one of the subjects is closer to that xcomp than to the current
+            # node.
+            xcompchildren = [x for x in node.children if x.udeprel == 'xcomp']
+            # Pattern 2: Similar to pattern 1, but advcl instead of xcomp, and
+            # possibly not so many other mis-attached dependents.
+            advclchildren = [x for x in node.children if x.udeprel == 'advcl']
+            # Pattern 3: Instead of xcomp or advcl, there is a simple amod
+            # (under a verb!), in fact an adjective with a copula that should
+            # have been advcl. Alternatively, the nonverbal clause is headed
+            # by a noun, and the deprel is obl instead of amod.
+            amodchildren = [x for x in node.children if re.match(r'^(amod|obl)$', x.udeprel)]
+            if len(subjects) == 2 and len(xcompchildren) > 0:
+                for xcompnode in xcompchildren:
+                    dn = [dist(node, x) for x in subjects]
+                    dx = [dist(xcompnode, x) for x in subjects]
+                    # Is the first subject closer to xcomp than it is to the current node?
+                    # At the same time, is the second subject closer to the current node than it is to xcomp?
+                    if dx[0] <= dn[0] and dn[1] <= dx[1]:
+                        # The first subject should be re-attached to the xcomp node.
+                        subjects[0].parent = xcompnode
+                        # There are typically other dependents that should belong to the xcomp node.
+                        for c in node.children:
+                            if c != xcompnode and dist(xcompnode, c) < dist(node, c):
+                                c.parent = xcompnode
+                        # The xcompnode should probably be attached as something else
+                        # than xcomp, perhaps even the direction of the relation should
+                        # be reversed, but one would have to resolve this manually.
+                        xcompnode.misc['ToDo'] = 'check-xcomp'
+                        break
+                    # Is the second subject closer to xcomp than it is to the current node?
+                    # At the same time, is the first subject closer to the current node than it is to xcomp?
+                    elif dx[1] <= dn[1] and dn[0] <= dx[0]:
+                        # The second subject should be re-attached to the xcomp node.
+                        subjects[1].parent = xcompnode
+                        # There are typically other dependents that should belong to the xcomp node.
+                        for c in node.children:
+                            if c != xcompnode and dist(xcompnode, c) < dist(node, c):
+                                c.parent = xcompnode
+                        # The xcompnode should probably be attached as something else
+                        # than xcomp, perhaps even the direction of the relation should
+                        # be reversed, but one would have to resolve this manually.
+                        xcompnode.misc['ToDo'] = 'check-xcomp'
+                        break
+            elif len(subjects) == 2 and len(advclchildren) > 0:
+                for advclnode in advclchildren:
+                    dn = [dist(node, x) for x in subjects]
+                    dx = [dist(advclnode, x) for x in subjects]
+                    # Is the first subject closer to advcl than it is to the current node?
+                    # At the same time, is the second subject closer to the current node than it is to advcl?
+                    if dx[0] < dn[0] and dn[1] < dx[1]:
+                        # The first subject should be re-attached to the advcl node.
+                        subjects[0].parent = advclnode
+                        break
+                    # Is the second subject closer to advcl than it is to the current node?
+                    # At the same time, is the first subject closer to the current node than it is to advcl?
+                    elif dx[1] < dn[1] and dn[0] < dx[0]:
+                        # The second subject should be re-attached to the advcl node.
+                        subjects[1].parent = advclnode
+                        break
+            elif len(subjects) == 2 and len(amodchildren) > 0:
+                for amodnode in amodchildren:
+                    if len([x for x in amodnode.children if x.udeprel == 'cop']) > 0:
+                        dn = [dist(node, x) for x in subjects]
+                        dx = [dist(amodnode, x) for x in subjects]
+                        # Is the first subject closer to amod than it is to the current node?
+                        # At the same time, is the second subject closer to the current node than it is to amod?
+                        if dx[0] < dn[0] and dn[1] < dx[1]:
+                            # The first subject should be re-attached to the amod node (relabeled advcl below).
+                            subjects[0].parent = amodnode
+                            amodnode.deprel = 'advcl'
+                            # There are typically other dependents that should belong to the amod node.
+                            for c in node.children:
+                                if c != amodnode and dist(amodnode, c) < dist(node, c):
+                                    c.parent = amodnode
+                            break
+                        # Is the second subject closer to amod than it is to the current node?
+                        # At the same time, is the first subject closer to the current node than it is to amod?
+                        elif dx[1] < dn[1] and dn[0] < dx[0]:
+                            # The second subject should be re-attached to the amod node (relabeled advcl below).
+                            subjects[1].parent = amodnode
+                            amodnode.deprel = 'advcl'
+                            # There are typically other dependents that should belong to the amod node.
+                            for c in node.children:
+                                if c != amodnode and dist(amodnode, c) < dist(node, c):
+                                    c.parent = amodnode
+                            break
+
+def dist(x, y):  # heuristic "clause distance" between two nodes of the same tree
+    if x.ord < y.ord:  # order the nodes so that a precedes b in the sentence
+        a = x
+        b = y
+    else:
+        a = y
+        b = x
+    d = b.ord - a.ord  # base distance: difference of word-order indices
+    # Count the commas between the two nodes. A comma should be seen as increasing
+    # the distance of the nodes, that is, decreasing the probability that they
+    # are in the same clause.
+    nc = 0
+    for i in a.root.descendants:
+        if i.ord > a.ord and i.ord < b.ord:
+            if i.form == ',':
+                nc += 1
+    d += nc * 10  # each intervening comma is penalized as 10 extra words
+    return d
diff --git a/udapi/block/ud/de/__init__.py b/udapi/block/ud/de/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/de/addmwt.py b/udapi/block/ud/de/addmwt.py
new file mode 100644
index 00000000..18778a4a
--- /dev/null
+++ b/udapi/block/ud/de/addmwt.py
@@ -0,0 +1,50 @@
+"""Block ud.de.AddMwt for heuristic detection of German contractions.
+
+According to the UD guidelines, contractions such as "am" = "an dem"
+should be annotated using multi-word tokens.
+
+Notice that this should be used only for converting existing conllu files.
+Ideally a tokenizer should have already split the MWTs.
+"""
+import udapi.block.ud.addmwt
+
+MWTS = {
+ 'am': {'form': 'an dem', },
+ 'ans': {'form': 'an das', },
+ 'aufs': {'form': 'auf das', },
+ 'beim': {'form': 'bei dem', },
+ 'durchs': {'form': 'durch das', },
+ 'fürs': {'form': 'fürs das', },
+ 'hinterm': {'form': 'hinter dem', },
+ 'hinters': {'form': 'hinter das', },
+ 'im': {'form': 'in dem', },
+ 'ins': {'form': 'in das', },
+ 'übers': {'form': 'über das', },
+ 'ums': {'form': 'um das', },
+ 'unterm': {'form': 'unter dem', },
+ 'unters': {'form': 'unter das', },
+ 'vom': {'form': 'von dem', },
+ 'vorm': {'form': 'vor dem', },
+ 'vors': {'form': 'vor das', },
+ 'zum': {'form': 'zu dem', },
+ 'zur': {'form': 'zu der', },
+}
+
+# shared values for all entries in MWTS
+for v in MWTS.values():
+ v['lemma'] = v['form'].split()[0] + ' der'
+ v['upos'] = 'ADP DET'
+ v['xpos'] = 'APPR ART'
+ v['deprel'] = 'case det'
+ v['feats'] = '_ *'
+ # The following are the default values
+ # v['main'] = 0 # which of the two words will inherit the original children (if any)
+ # v['shape'] = 'siblings', # the newly created nodes will be siblings
+
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+ return MWTS.get(node.form.lower(), None)
diff --git a/udapi/block/ud/de/fixgsd.py b/udapi/block/ud/de/fixgsd.py
new file mode 100644
index 00000000..65d12681
--- /dev/null
+++ b/udapi/block/ud/de/fixgsd.py
@@ -0,0 +1,58 @@
+"""
+Block to fix annotation of UD German-GSD.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixGSD(Block):
+
+    def process_node(self, node):
+        """
+        Normalizes tokenization, lemmatization and tagging of ordinal numerals
+        that are expressed using digits followed by a period.
+        https://github.com/UniversalDependencies/UD_German-GSD/issues/24
+        """
+        # Ignore periods that terminate a sentence, although they could belong
+        # to an ordinal numeral at the same time.
+        if node.form == '.' and node.next_node:
+            # Ignore number+period combinations that have an intervening space.
+            if node.prev_node and re.match(r'^\d+$', node.prev_node.form) and node.prev_node.no_space_after:
+                # Merge the number and the period into one token.
+                number = node.prev_node
+                period = node
+                # The period should not have any children but if it does, re-attach them to the number.
+                for c in period.children:
+                    c.parent = number
+                # The period should be followed by a space but if it isn't, mark it at the number.
+                number.misc['SpaceAfter'] = 'No' if period.no_space_after else ''
+                number.form += '.'
+                number.lemma = number.form
+                number.upos = 'ADJ'
+                number.xpos = 'ADJA'
+                number.feats = '_'  # clear all features before setting NumType
+                number.feats['NumType'] = 'Ord'
+                if number.udeprel == 'nummod':
+                    number.deprel = 'amod'
+                period.remove()
+        # Even if the digits and the period are already in one token, check their annotation.
+        if re.match(r'^\d+\.$', node.form):
+            node.lemma = node.form
+            node.upos = 'ADJ'
+            node.xpos = 'ADJA'
+            node.feats = '_'  # clear all features before setting NumType
+            node.feats['NumType'] = 'Ord'
+            if node.udeprel == 'nummod':
+                node.deprel = 'amod'
+        # Finally, make sure that ordinal numerals expressed verbosely are tagged properly.
+        # Unlike for digits, do not remove the features for Gender, Number, and Case.
+        # Skip 'acht' because we cannot reliably distinguish it from the cardinal numeral and from the verb 'achten'.
+        if re.match(r'^(erst|zweit|dritt|viert|fünft|sechst|siebt|neunt|(drei|vier|fünf|sechs|sieb|acht|neun)?zehnt|elft|zwölft)(er)?$', node.lemma, re.IGNORECASE):
+            # Skip 'erst' that is used as an adverb.
+            if node.lemma != 'erst' or node.upos != 'ADV':
+                node.lemma = re.sub(r'^(.+)er$', r'\1', node.lemma)  # strip the '-er' suffix from the lemma
+                node.upos = 'ADJ'
+                node.xpos = 'ADJA'
+                node.feats['NumType'] = 'Ord'
+                if node.udeprel == 'nummod':
+                    node.deprel = 'amod'
diff --git a/udapi/block/ud/de/fixhdt.py b/udapi/block/ud/de/fixhdt.py
new file mode 100644
index 00000000..a3792a96
--- /dev/null
+++ b/udapi/block/ud/de/fixhdt.py
@@ -0,0 +1,109 @@
+"""
+Block to fix annotation of UD German-HDT.
+
+It was created independently of ud.de.AddMwt but it aims to do essentially the
+same thing. Future work: make the two blocks converge.
+
+Currently known differences:
+- This block covers a wider range of contractions.
+- This block generates morphological features for the syntactic words.
+- This block does not touch words that look like contractions but do not have PronType=Art (this is a reliable indicator in HDT).
+- This block overrides the default attachment when the original relation is root, conj, reparandum.
+- The other block takes advantage of the generic class ud.AddMwt, so it does not have to re-invent common procedures.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixHDT(Block):
+
+    def process_node(self, node):
+        # PronType=Art with ADP is wrong. Fused prepositions and articles should be decomposed in UD.
+        # The following contractions have been observed:
+        # a. am ans aufs beim durchs fürs hinterm hinters im ins übers ums unterm unters vom vorm vors z. zum zur
+        if node.upos == 'ADP' and node.feats['PronType'] == 'Art':
+            if re.match("^(a\.|am|ans|aufs|beim|durchs|fürs|hinter[ms]|im|ins|übers|ums|unter[ms]|vom|vor[ms]|z\.|zu[mr])$", node.form, re.IGNORECASE):
+                # We need two nodes instead of one. Create a node.
+                # The parent should not be the root but unfortunately it is not guaranteed.
+                node2 = node.create_child()
+                node2.shift_after_node(node)
+                if not re.match(r"^(root|conj|reparandum)$", node.udeprel):
+                    node2.parent = node.parent
+                node.deprel = 'case'
+                node2.deprel = 'det'
+                mwt = node.root.create_multiword_token(form=node.form, words=[node, node2], misc=node.misc)
+                node.misc['SpaceAfter'] = ''  # SpaceAfter now lives on the MWT (via its misc), not on the first word
+                # We want to respect the original letter case in the forms of the syntactic words.
+                # We can use the isupper() method to find out whether all letters are uppercase.
+                # However, detecting first-letter capitalization requires more work.
+                up = 2 if mwt.form.isupper() else 1 if mwt.form[:1].isupper() else 0  # 2=ALL CAPS, 1=Capitalized, 0=lowercase
+                up2 = 2 if up == 2 else 0  # the article is capitalized only if the whole token was uppercase
+                if re.match(r"^(a\.|am|ans)$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'an')
+                    node.lemma = 'an'
+                elif re.match(r"^aufs$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'auf')
+                    node.lemma = 'auf'
+                elif re.match(r"^beim$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'bei')
+                    node.lemma = 'bei'
+                elif re.match(r"^durchs$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'durch')
+                    node.lemma = 'durch'
+                elif re.match(r"^fürs$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'für')
+                    node.lemma = 'für'
+                elif re.match(r"^hinter[ms]$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'hinter')
+                    node.lemma = 'hinter'
+                elif re.match(r"^(im|ins)$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'in')
+                    node.lemma = 'in'
+                elif re.match(r"^übers$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'über')
+                    node.lemma = 'über'
+                elif re.match(r"^ums$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'um')
+                    node.lemma = 'um'
+                elif re.match(r"^unter[ms]$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'unter')
+                    node.lemma = 'unter'
+                elif re.match(r"^vom$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'von')
+                    node.lemma = 'von'
+                elif re.match(r"^vor[ms]$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'vor')
+                    node.lemma = 'vor'
+                elif re.match(r"^(z\.|zu[mr])$", mwt.form, re.IGNORECASE):
+                    node.form = mimic_case(up, 'zu')
+                    node.lemma = 'zu'
+                node.upos = 'ADP'
+                node.xpos = 'APPR'
+                node.feats = '_'  # clear all features, then set AdpType (and Case below)
+                node.feats['AdpType'] = 'Prep'
+                # We must use search() because match() only checks at the beginning of the string.
+                if re.search("[m\.]$", mwt.form, re.IGNORECASE):
+                    node2.form = mimic_case(up2, 'dem')
+                    node2.feats = 'Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art'
+                    node.feats['Case'] = 'Dat'
+                    node2.lemma = 'der'
+                elif re.search("s$", mwt.form, re.IGNORECASE):
+                    node2.form = mimic_case(up2, 'das')
+                    node2.feats = 'Case=Acc|Definite=Def|Gender=Neut|Number=Sing|PronType=Art'
+                    node.feats['Case'] = 'Acc'
+                    node2.lemma = 'der'
+                elif re.search("r$", mwt.form, re.IGNORECASE):
+                    node2.form = mimic_case(up2, 'der')
+                    node2.feats = 'Case=Dat|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'
+                    node.feats['Case'] = 'Dat'
+                    node2.lemma = 'der'
+                node2.upos = 'DET'
+                node2.xpos = 'ART'
+
+def mimic_case(up, x):
+ if up >= 2:
+ return x.upper()
+ elif up == 1:
+ return x[:1].upper() + x[1:].lower()
+ else:
+ return x.lower()
diff --git a/udapi/block/ud/el/__init__.py b/udapi/block/ud/el/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py
new file mode 100644
index 00000000..ac753ed5
--- /dev/null
+++ b/udapi/block/ud/el/addmwt.py
@@ -0,0 +1,36 @@
+"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens.
+
+Notice that this should be used only for converting existing conllu files.
+Ideally a tokenizer should have already split the MWTs.
+Also notice that this block does not deal with the relatively rare
+``PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο")`` MWTs.
+"""
+import udapi.block.ud.addmwt
+
+MWTS = {
+    'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'},
+    'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'},
+    'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Definite=Def|Gender=Neut|Number=Plur|PronType=Art'},
+    'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Plur|PronType=Art'},
+    'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Definite=Def|Gender=Fem|Number=Plur|PronType=Art'},
+    'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Definite=Def|Gender=Masc|Number=Sing|PronType=Art'},
+    'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Definite=Def|Gender=*|Number=Sing|PronType=Art'},  # '*' leaves Gender underspecified
+}
+
+# shared values for all entries in MWTS
+for v in MWTS.values():
+    v['lemma'] = 'σε ο'  # lemma of the preposition + lemma of the article
+    v['upos'] = 'ADP DET'
+    v['xpos'] = 'AsPpSp AtDf'
+    v['deprel'] = 'case det'
+    # The following are the default values
+    # v['main'] = 0 # which of the two words will inherit the original children (if any)
+    # v['shape'] = 'siblings', # the newly created nodes will be siblings
+
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+ return MWTS.get(node.form.lower(), None)
diff --git a/udapi/block/ud/en/__init__.py b/udapi/block/ud/en/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/en/setspaceafter.py b/udapi/block/ud/en/setspaceafter.py
new file mode 100644
index 00000000..1ebc3054
--- /dev/null
+++ b/udapi/block/ud/en/setspaceafter.py
@@ -0,0 +1,46 @@
+"""Block ud.en.SetSpaceAfter for heuristic setting of SpaceAfter=No in English.
+
+Usage::
+
+ udapy -s ud.en.SetSpaceAfter < in.conllu > fixed.conllu
+
+Author: Martin Popel
+"""
+import udapi.block.ud.setspaceafter
+
+
+class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter):
+    """Block for heuristic setting of the SpaceAfter=No MISC attribute in English.
+
+    """
+
+    def process_tree(self, root):
+        nodes = root.descendants
+        for i, node in enumerate(nodes[:-1]):  # looks ahead at nodes[i + 1], hence the last node is skipped
+            next_form = nodes[i + 1].form
+
+            # Contractions like "don't" and possessive suffix 's should be annotated as MWT.
+            # However, older UD_English-EWT versions did not follow this rule and even v2.7
+            # contains some forgotten occurrences, so let's handle these as well.
+            if next_form in {"n't", "'s"}:
+                self.mark_no_space(node)
+
+            # Parsers may distinguish opening and closing single quotes by XPOS.
+            elif node.form == "'" and node.xpos == "``":
+                self.mark_no_space(node)  # opening quote: no space after the quote itself
+            elif next_form == "'" and nodes[i + 1].xpos == "''":
+                self.mark_no_space(node)  # closing quote: no space after the preceding word
+
+
+            # hyphen-compounds
+            elif node.form == '-' and i:
+                if ((nodes[i - 1] is node.parent or nodes[i - 1].parent is node.parent) and
+                        (nodes[i + 1] is node.parent or nodes[i + 1].parent is node.parent)):
+                    self.mark_no_space(nodes[i - 1])
+                    self.mark_no_space(node)
+
+            # $200
+            elif node.form == '$' and nodes[i + 1].upos == 'NUM':
+                self.mark_no_space(node)
+
+        super().process_tree(root)  # also apply the generic language-independent rules
diff --git a/udapi/block/ud/es/__init__.py b/udapi/block/ud/es/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/es/addmwt.py b/udapi/block/ud/es/addmwt.py
new file mode 100644
index 00000000..92f80160
--- /dev/null
+++ b/udapi/block/ud/es/addmwt.py
@@ -0,0 +1,109 @@
+"""Block ud.es.AddMwt for heuristic detection of Spanish contractions.
+
+According to the UD guidelines, contractions such as "del" = "de el"
+should be annotated using multi-word tokens.
+
+Note that this block should be used only for converting legacy conllu files.
+Ideally a tokenizer should have already split the MWTs.
+"""
+import re
+import udapi.block.ud.addmwt
+
+MWTS = {
+ 'al': {'form': 'a el'},
+ 'del': {'form': 'de el'},
+}
+
+LEMMA = {
+ 'se': 'él',
+ 'le': 'él',
+ 'la': 'él',
+ 'lo': 'él',
+ 'te': 'tú',
+ 'me': 'yo',
+}
+
+# shared values for all entries in MWTS
+for v in MWTS.values():
+ v['lemma'] = v['form']
+ v['upos'] = 'ADP DET'
+ v['deprel'] = '* det'
+ v['feats'] = '_ Definite=Def|Gender=Masc|Number=Sing|PronType=Art'
+ # The following are the default values
+ # v['main'] = 0 # which of the two words will inherit the original children (if any)
+ # v['shape'] = 'siblings', # the newly created nodes will be siblings
+
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def __init__(self, verbpron=False, **kwargs):
+ super().__init__(**kwargs)
+ self.verbpron = verbpron
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+ analysis = MWTS.get(node.form.lower(), None)
+
+ if analysis is not None:
+ # Modify the default attachment of the new syntactic words in special situations.
+ if re.match(r'^(root|conj|reparandum)$', node.udeprel):
+ # Copy the dictionary so that we do not modify the original and do not affect subsequent usages.
+ analysis = analysis.copy()
+ analysis['shape'] = 'subtree'
+ return analysis
+
+ if not self.verbpron or node.upos not in {'VERB', 'AUX'}:
+ return None
+
+ form = node.form.lower()
+
+ if re.search('(me|la|le|lo|se|te)$', form):
+ verbform = node.feats['VerbForm']
+ # TODO there are contractions even with VerbForm=Fin
+ if verbform == 'Fin' or form == 'pese':
+ return None
+ del node.feats['VerbForm']
+ pron = form[-2:]
+ return {
+ 'form': form[:-2] + ' ' + pron,
+ 'lemma': '* ' + LEMMA[pron],
+ 'upos': '* PRON',
+ 'feats': 'VerbForm=%s *' % verbform,
+ 'deprel': '* iobj',
+ 'main': 0,
+ 'shape': 'subtree',
+ }
+
+ if re.search('l[oe]s$', form):
+ verbform = node.feats['VerbForm']
+ if verbform == 'Fin':
+ return None
+ del node.feats['VerbForm']
+ pron = form[-3:]
+ return {
+ 'form': form[:-3] + ' ' + pron,
+ 'lemma': '* él',
+ 'upos': '* PRON',
+ 'feats': 'VerbForm=%s *' % verbform,
+ 'deprel': '* iobj',
+ 'main': 0,
+ 'shape': 'subtree',
+ }
+
+ # TODO: multiple suffixes, e.g. compratelo = compra + te + lo
+ return None
+
+ # Sometimes "del" has a shape which is neither "siblings" nor "subtree".
+ # E.g. in "a partir del NOUN"
+ # "del" = "de el", but
+ # "de" is attached to "a" (as fixed), while "el" is attached to the NOUN.
+ def postprocess_mwt(self, mwt):
+ if mwt.form.lower() in {'al', 'del'} and mwt.words[1].parent.precedes(mwt.words[1]):
+ head = mwt.words[1].next_node
+ while head.upos not in {'NOUN', 'PROPN'}:
+ if head.parent.precedes(head) or head.is_root():
+ head = mwt.words[1].next_node
+ break
+ head = head.parent
+ mwt.words[1].parent = head
diff --git a/udapi/block/ud/es/elque.py b/udapi/block/ud/es/elque.py
new file mode 100644
index 00000000..4d14b98d
--- /dev/null
+++ b/udapi/block/ud/es/elque.py
@@ -0,0 +1,116 @@
+"""
+This block searches for relative clauses modifying a determiner ('el que, el cual...').
+It is written for Spanish but a similar block should work for other Romance
+languages.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
+class ElQue(Block):
+
+ def __init__(self, fix=False, **kwargs):
+ """
+ Default: Print the annotation patterns but do not fix anything.
+ fix=1: Do not print the patterns but fix them.
+ """
+ super().__init__(**kwargs)
+ self.fix = fix
+
+ def process_node(self, node):
+ # We take 'que' as the central node of the construction.
+ if re.match(r'^(que|cual)$', node.lemma) and node.upos == 'PRON' and node.parent.ord > node.ord:
+ # We will refer to the parent of 'que' as a verb, although it can be
+ # a non-verbal predicate, too.
+ que = node
+ verb = node.parent
+ # Check the lemma of the determiner. The form may vary for gender and number.
+ if que.prev_node and que.prev_node.lemma == 'el':
+ el = que.prev_node
+ adp = None
+ if el.prev_node and el.prev_node.upos == 'ADP':
+ adp = el.prev_node
+ if adp.udeprel == 'fixed':
+ adp = adp.parent
+ if self.fix:
+ self.fix_pattern(adp, el, que, verb)
+ else:
+ self.print_pattern(adp, el, que, verb)
+
+ def print_pattern(self, adp, el, que, verb):
+ stanford = []
+ if adp:
+ if adp.parent == el:
+ parentstr = 'el'
+ elif adp.parent == que:
+ parentstr = 'que'
+ elif adp.parent == verb:
+ parentstr = 'VERB'
+ else:
+ parentstr = 'OTHER'
+ stanford.append(adp.deprel + '(' + parentstr + ', ADP)')
+ if el.parent == adp:
+ parentstr = 'ADP'
+ elif el.parent == que:
+ parentstr = 'que'
+ elif el.parent == verb:
+ parentstr = 'VERB'
+ else:
+ parentstr = 'OTHER'
+ stanford.append(el.deprel + '(' + parentstr + ', el)')
+ # We found the verb as the parent of 'que', so we do not need to check the parent of 'que' now.
+ stanford.append(que.deprel + '(VERB, que)')
+ if verb.parent == adp:
+ parentstr = 'ADP'
+ elif verb.parent == el:
+ parentstr = 'el'
+ else:
+ parentstr = 'OTHER'
+ stanford.append(verb.deprel + '(' + parentstr + ', VERB)')
+ print('; '.join(stanford))
+
+ def fix_pattern(self, adp, el, que, verb):
+ if adp:
+ if adp.parent == que or adp.parent == verb:
+ attach(adp, el, 'case')
+ if el.parent == que:
+ ###!!! Just a temporary change. In the end it will be attached elsewhere.
+ attach(el, verb)
+ el.parent = verb
+ if len(el.deps) == 1:
+ el.deps[0]['parent'] = verb
+ if verb.parent != adp and verb.parent != el and verb.parent != que:
+ eldeprel = None
+ if re.match(r'^[nc]subj$', verb.udeprel):
+ eldeprel = 'nsubj'
+ elif re.match(r'^ccomp$', verb.udeprel):
+ eldeprel = 'obj'
+ elif re.match(r'^advcl$', verb.udeprel):
+ eldeprel = 'obl'
+ elif re.match(r'^acl$', verb.udeprel):
+ eldeprel = 'nmod'
+ elif re.match(r'^(xcomp|conj|appos|root)$', verb.udeprel):
+ eldeprel = verb.deprel
+ if eldeprel:
+ attach(el, verb.parent, eldeprel)
+ attach(verb, el, 'acl:relcl')
+ # If anything before 'el' depends on the verb ('cc', 'mark', 'punct' etc.),
+ # re-attach it to 'el'.
+ for c in verb.children:
+ if c.ord < el.ord and re.match(r'^(cc|mark|case|punct)$', c.udeprel):
+ attach(c, el)
+
+def attach(node, parent, deprel=None):
+ """
+ Attach a node to a new parent with a new deprel in the basic tree. In
+ addition, if there are enhanced dependencies and there is just one incoming
+ enhanced relation (this is the case in AnCora), this relation will be
+ modified accordingly.
+ """
+ node.parent = parent
+ if deprel:
+ node.deprel = deprel
+ if len(node.deps) == 1:
+ node.deps[0]['parent'] = parent
+ if deprel:
+ node.deps[0]['deprel'] = deprel
diff --git a/udapi/block/ud/es/fixexclamation.py b/udapi/block/ud/es/fixexclamation.py
new file mode 100644
index 00000000..7dea8e0d
--- /dev/null
+++ b/udapi/block/ud/es/fixexclamation.py
@@ -0,0 +1,47 @@
+"""Block to fix tokenization of exclamation marks in UD Spanish-AnCora."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixExclamation(Block):
+
+ def process_node(self, node):
+ """
+ In Spanish AnCora, there are things like '¡Hola!' as one token.
+ The punctuation should be separated. One may question whether this
+ should include names of companies (Yahoo!) or products (la revista
+ Hello!) but it should, as company and product names often have
+ multiple tokens (even multiple full words, not just punctuation)
+ and these are also separated in UD.
+ """
+ if re.search(r'^[¡!]\w', node.form):
+ # Separate the punctuation and attach it to the rest.
+ punct = node.create_child()
+ punct.shift_before_node(node)
+ punct.form = node.form[:1]
+ node.form = node.form[1:]
+ punct.lemma = punct.form
+ punct.upos = 'PUNCT'
+ punct.xpos = 'faa' if punct.form == '¡' else 'fat'
+ punct.feats['PunctType'] = 'Excl'
+ punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin'
+ punct.misc['SpaceAfter'] = 'No'
+ punct.deprel = 'punct'
+ # Mark the position for manual check.
+ node.misc['Mark'] = 'PunctSep'
+ if re.search(r'\w[¡!]$', node.form):
+ # Separate the punctuation and attach it to the rest.
+ punct = node.create_child()
+ punct.shift_after_node(node)
+ punct.form = node.form[-1:]
+ node.form = node.form[:-1]
+ punct.lemma = punct.form
+ punct.upos = 'PUNCT'
+ punct.xpos = 'faa' if punct.form == '¡' else 'fat'
+ punct.feats['PunctType'] = 'Excl'
+ punct.feats['PunctSide'] = 'Ini' if punct.form == '¡' else 'Fin'
+ punct.misc['SpaceAfter'] = node.misc['SpaceAfter']
+ node.misc['SpaceAfter'] = 'No'
+ punct.deprel = 'punct'
+ # Mark the position for manual check.
+ node.misc['Mark'] = 'PunctSep'
diff --git a/udapi/block/ud/es/fixtenerque.py b/udapi/block/ud/es/fixtenerque.py
new file mode 100644
index 00000000..62fa0f4d
--- /dev/null
+++ b/udapi/block/ud/es/fixtenerque.py
@@ -0,0 +1,47 @@
+"""Block to fix spurious auxiliary verbs in UD Spanish-AnCora."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixTenerQue(Block):
+
+ def process_node(self, node):
+ """
+ Some Spanish treebanks treat the verb 'tener' in constructions such as
+ 'tener que comer' as auxiliary. This is wrong and the validator will
+ flag it as an error. This block fixes such annotations.
+
+ EDIT: 'ir a comer' is processed the same way.
+ """
+ if re.match(r'^(tener|ir)$', node.lemma) and node.upos == 'AUX':
+ node.upos = 'VERB'
+ # In rare cases the auxiliary may have been promoted due to ellipsis.
+ # Most of the time however, it is attached as 'aux' to the main verb.
+ if node.udeprel == 'aux':
+ mainverb = node.parent
+ self.reattach(node, mainverb.parent, mainverb.deprel)
+ self.reattach(mainverb, node, 'xcomp')
+ # Some children of the former main verb should be reattached to 'tener'.
+ # Others (especially a direct object) should stay with the former main verb.
+ for c in mainverb.children:
+ if not re.match(r'^(obj|iobj|obl|ccomp|xcomp|conj|list|compound|flat|fixed|goeswith|reparandum)$', c.udeprel):
+ self.reattach(c, node, c.deprel)
+ # On the other hand, the conjunction 'que' may have been wrongly attached as 'fixed' to 'tener'.
+ for c in node.children:
+ if re.match(r'^(que|a)$', c.form.lower()) and c.ord > node.ord and c.ord < mainverb.ord:
+ self.reattach(c, mainverb, 'mark')
+
+ def reattach(self, node, parent, deprel):
+ """
+ Changes the incoming dependency relation to a node. Makes sure that the
+ same change is done in the basic tree and in the enhanced graph.
+ """
+ if node.deps:
+ # If the enhanced graph contains the current basic relation, remove it.
+ orig_n_deps = len(node.deps)
+ node.deps = [x for x in node.deps if x['parent'] != node.parent or re.sub(r':.*', '', x['deprel']) != node.udeprel]
+ # Add the new basic relation to the enhanced graph only if the original one was there.
+ if len(node.deps) < orig_n_deps:
+ node.deps.append({'parent': parent, 'deprel': deprel})
+ node.parent = parent
+ node.deprel = deprel
diff --git a/udapi/block/ud/es/fixverbfeats.py b/udapi/block/ud/es/fixverbfeats.py
new file mode 100644
index 00000000..643ecd7c
--- /dev/null
+++ b/udapi/block/ud/es/fixverbfeats.py
@@ -0,0 +1,38 @@
+"""Block to fix features (and potentially lemmas) of verbs in UD Spanish-PUD."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixVerbFeats(Block):
+
+ def process_node(self, node):
+ """
+ The features assigned to verbs in Spanish PUD are often wrong, although
+ the annotation was (reportedly) done manually. For example, infinitives
+ are tagged with VerbForm=Fin instead of VerbForm=Inf.
+ """
+ if re.match(r'^(VERB|AUX)$', node.upos):
+ if re.search(r'[aei]r$', node.form, re.IGNORECASE):
+ # The infinitive has no features other than VerbForm.
+ node.feats = {}
+ node.feats['VerbForm'] = 'Inf'
+ node.lemma = node.form.lower()
+ elif re.search(r'ndo$', node.form, re.IGNORECASE):
+ if node.form.lower() != 'entiendo':
+ # The gerund has no features other than VerbForm.
+ # The lemma is not always straightforward but we have fixed it manually.
+ node.feats = {}
+ node.feats['VerbForm'] = 'Ger'
+ elif re.search(r'([ai]d|biert|dich|fech|hech|muert|puest|vist)[oa]s?$', node.form, re.IGNORECASE):
+ # The (past) participle has always Gender and Number.
+ # It can be VERB/AUX (infinitive is the lemma) or ADJ (masculine singular is the lemma).
+ # As a verb, it also has Tense=Past. As an adjective it does not have this feature (in AnCora; but why not?)
+ gender = node.feats['Gender'] if node.feats['Gender'] else ('Masc' if re.search(r'os?$', node.form, re.IGNORECASE) else 'Fem')
+ number = node.feats['Number'] if node.feats['Number'] else ('Plur' if re.search(r's$', node.form, re.IGNORECASE) else 'Sing')
+ node.feats = {}
+ node.feats['VerbForm'] = 'Part'
+ node.feats['Tense'] = 'Past'
+ node.feats['Gender'] = gender
+ node.feats['Number'] = number
+ if re.search(r'ad[oa]s?$', node.form, re.IGNORECASE):
+ node.lemma = re.sub(r'd[oa]s?$', 'r', node.form.lower())
diff --git a/udapi/block/ud/exgoogle2ud.py b/udapi/block/ud/exgoogle2ud.py
new file mode 100644
index 00000000..f63fad74
--- /dev/null
+++ b/udapi/block/ud/exgoogle2ud.py
@@ -0,0 +1,97 @@
+"""Block ud.ExGoogle2ud converts data which were originally annotated in Google style
+then converted with an older version of ud.Google2ud to UDv2,
+then manually edited and we don't want to loose these edits,
+so we cannot simply rerun the newer version of ud.Google2ud on the original Google data.
+"""
+from udapi.block.ud.fixchain import FixChain
+from udapi.block.ud.fixpunct import FixPunct
+from udapi.block.ud.fixrightheaded import FixRightheaded
+from udapi.block.ud.complywithtext import ComplyWithText
+from udapi.block.ud.es.addmwt import AddMwt as es_AddMwt
+from udapi.block.ud.joinasmwt import JoinAsMwt
+from udapi.core.block import Block
+
+
+class ExGoogle2ud(Block):
+ """Convert former Google Universal Dependency Treebank into UD style."""
+
+ def __init__(self, lang='unk', **kwargs):
+ super().__init__(**kwargs)
+ self.lang = lang
+
+ self._fixpunct_block = None if self.lang == 'it' else FixPunct()
+ self._fixrigheaded_block = FixRightheaded()
+ self._fixchain_block = FixChain()
+ self._comply_block = None
+ if lang == 'ja':
+ self._comply_block = ComplyWithText()
+
+ self._addmwt_block = None
+ self._joinasmwt_block = None
+ if lang == 'es':
+ self._addmwt_block = es_AddMwt()
+ self._joinasmwt_block = JoinAsMwt()
+
+ def process_tree(self, root):
+ for node in root.descendants:
+ self.fix_node(node)
+
+ for block in (
+ self._addmwt_block,
+ self._joinasmwt_block,
+ self._comply_block,
+ self._fixrigheaded_block, # deprel=fixed,flat,... should be always head-initial
+ self._fixchain_block, # and form a flat structure, not a chain.
+ self._fixpunct_block): # commas should depend on the subord unit.
+ if block:
+ block.process_tree(root)
+
+ def fix_node(self, node):
+ """Various fixed taken from ud.Google2ud."""
+
+ if node.xpos == 'SYM': # These are almost always tagged as upos=X which is wrong.
+ node.upos = 'SYM'
+ if node.deprel in {'punct', 'p'}:
+ if node.form in "_-.؟”'":
+ node.upos = 'PUNCT'
+ else:
+ node.deprel = 'dep' # This is another way how to say deprel=todo.
+
+ if self.lang != 'es' and node.udeprel == 'nmod' and node.deprel != 'nmod':
+ parent_is_nominal = self.is_nominal(node.parent)
+ if parent_is_nominal == 'no':
+ node.deprel = 'obl' + ':' + node.sdeprel
+ elif node.deprel == 'nmod:tmod':
+ node.deprel = 'obl:tmod'
+
+ if node.deprel == 'obl:gmod' and self.lang == 'ar':
+ node.deprel = 'obl'
+ node.feats['Case'] = 'Gen'
+
+ if node.upos == 'CCONJ' and node.deprel == 'mark':
+ node.upos = 'SCONJ'
+
+ if self.lang == 'es':
+ if node.deprel == 'compound':
+ # most of the uppercase compounds are upos=PROPN, but not all, e.g. Hack Forums
+ if node.form[0].isupper():
+ node.deprel = 'flat:name'
+ else:
+ node.deprel = 'nmod'
+
+ @staticmethod
+ def is_nominal(node):
+ """Returns 'no' (for predicates), 'yes' (sure nominals) or 'maybe'.
+
+        Used in `fix_node`."""
+ if node.upos in ["VERB", "AUX", "ADJ", "ADV"]:
+ return 'no'
+ # Include NUM for examples such as "one of the guys"
+ # and DET for examples such as "some/all of them"
+ if node.upos in ["NOUN", "PRON", "PROPN", "NUM", "DET"]:
+ # check whether the node is a predicate
+        # (either has a nsubj/csubj dependent or a copula dependent)
+ if any(["subj" in child.deprel or child.deprel == 'cop' for child in node.children]):
+ return 'maybe'
+ return 'yes'
+ return 'maybe'
diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py
new file mode 100644
index 00000000..a2e4439c
--- /dev/null
+++ b/udapi/block/ud/fixadvmodbyupos.py
@@ -0,0 +1,103 @@
+"""
+Block ud.FixAdvmodByUpos will change the dependency relation from advmod to something else
+if the UPOS is not ADV.
+"""
+from udapi.core.block import Block
+
+
+class FixAdvmodByUpos(Block):
+ """
+ Make sure advmod is not used with UPOS it should not be used with.
+ """
+
+ def process_node(self, node):
+ if node.udeprel == 'advmod':
+ if node.upos in ['NOUN', 'PROPN', 'PRON', 'DET', 'NUM']:
+ node.deprel = 'obl'
+ elif node.upos == 'VERB':
+ node.deprel = 'advcl'
+ elif node.upos == 'AUX':
+ node.deprel = 'aux'
+ elif node.upos in ['ADP', 'SCONJ']:
+ if node.parent.upos == 'VERB':
+ node.deprel = 'mark'
+ else:
+ node.deprel = 'case'
+ elif node.upos == 'CCONJ':
+ node.deprel = 'cc'
+ elif node.upos == 'INTJ':
+ node.deprel = 'discourse'
+ else:
+ node.deprel = 'dep'
+ ###!!! The following are not advmod so they should probably have their
+ ###!!! own block or this block should have a different name.
+ elif node.udeprel == 'expl':
+ if node.upos == 'AUX':
+ node.deprel = 'aux'
+ elif node.upos == 'ADP':
+ node.deprel = 'case'
+ elif node.upos == 'ADV':
+ node.deprel = 'advmod'
+ elif node.upos == 'CCONJ':
+ node.deprel = 'cc'
+ elif node.udeprel in ['aux', 'cop']:
+ if node.upos != 'AUX':
+ node.deprel = 'dep'
+ elif node.udeprel == 'case':
+ if node.upos == 'ADJ':
+ node.deprel = 'amod'
+ elif node.upos == 'DET':
+ node.deprel = 'det'
+ elif node.upos == 'PRON':
+ node.deprel = 'nmod'
+ elif node.udeprel == 'mark':
+ if node.upos in ['PRON', 'DET']:
+ node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable
+ elif node.upos == 'NOUN':
+ node.deprel = 'obl'
+ elif node.upos == 'ADJ':
+ node.deprel = 'amod'
+ elif node.upos == 'INTJ':
+ node.deprel = 'discourse'
+ elif node.udeprel == 'cc':
+ if node.upos == 'AUX':
+ node.deprel = 'aux'
+ elif node.upos == 'DET':
+ node.deprel = 'det'
+ elif node.upos == 'INTJ':
+ node.deprel = 'discourse'
+ elif node.upos == 'NOUN':
+ node.deprel = 'dep'
+ elif node.udeprel == 'det':
+ if node.upos == 'NOUN':
+ node.deprel = 'nmod'
+ elif node.upos == 'ADJ':
+ node.deprel = 'amod'
+ elif node.upos == 'NUM':
+ node.deprel = 'nummod'
+ elif node.upos == 'ADV':
+ node.deprel = 'advmod'
+ elif node.upos == 'AUX':
+ node.deprel = 'aux'
+ elif node.upos == 'VERB':
+ node.deprel = 'dep'
+ elif node.upos == 'SCONJ':
+ node.deprel = 'mark'
+ elif node.upos == 'CCONJ':
+ node.deprel = 'cc'
+ elif node.upos == 'X':
+ node.deprel = 'dep'
+ elif node.udeprel == 'nummod':
+ if node.upos == 'ADJ':
+ node.deprel = 'amod'
+ elif node.upos == 'PRON':
+ node.deprel = 'nmod'
+ elif node.upos == 'DET':
+ node.deprel = 'det'
+ elif node.upos == 'ADP':
+ node.deprel = 'case'
+ elif node.udeprel == 'punct':
+ if node.upos != 'PUNCT':
+ node.deprel = 'dep'
+ elif node.udeprel == 'obl' and node.parent.upos in ['NOUN', 'PROPN', 'PRON'] and node.parent.udeprel in ['nsubj', 'obj', 'iobj', 'obl', 'vocative', 'dislocated', 'expl', 'nmod']:
+ node.deprel = 'nmod'
diff --git a/udapi/block/ud/fixchain.py b/udapi/block/ud/fixchain.py
new file mode 100644
index 00000000..b3a586f6
--- /dev/null
+++ b/udapi/block/ud/fixchain.py
@@ -0,0 +1,18 @@
+"""Block ud.FixChain for making sure deprel=fixed|flat|goeswith|list does not form a chain."""
+from udapi.core.block import Block
+
+
+class FixChain(Block):
+ """Make sure deprel=fixed etc. does not form a chain, but a flat structure."""
+
+ def __init__(self, deprels='fixed,flat,goeswith,list', **kwargs):
+ """Args:
+        deprels: comma-separated list of deprels to be fixed. Default = fixed,flat,goeswith,list.
+ """
+ super().__init__(**kwargs)
+ self.deprels = deprels.split(',')
+
+ def process_node(self, node):
+ for deprel in self.deprels:
+ if node.udeprel == deprel and node.parent.udeprel == deprel:
+ node.parent = node.parent.parent
diff --git a/udapi/block/ud/fixcompoundname.py b/udapi/block/ud/fixcompoundname.py
new file mode 100644
index 00000000..90596e35
--- /dev/null
+++ b/udapi/block/ud/fixcompoundname.py
@@ -0,0 +1,46 @@
+"""
+Block ud.FixCompoundName finds compound relations between PROPN nodes and converts
+them to flat:name. This is not necessarily correct in all situations. The difference
+between compound and flat is that compound allows to distinguish head and modifier.
+Multiword person names (given name and surname, or various other patterns) typically
+should be analyzed as flat but there are treebanks that incorrectly use compound
+for person names. This block can be used to fix them.
+"""
+from udapi.core.block import Block
+import regex as re
+import logging
+
+
+class FixCompoundName(Block):
+ """
+ Converts a compound relation between two PROPN nodes into a flat relation.
+ Compounds of a PROPN and a non-PROPN will be left alone, although they are
+ suspicious, too.
+ """
+
+ def process_node(self, node):
+ if node.upos == 'PROPN' and node.udeprel == 'compound' and node.parent.upos == 'PROPN':
+ origparent = node.parent
+ grandparent = origparent.parent
+ outdeprel = origparent.deprel
+ # See if there are other PROPN compound siblings.
+ # (The list node.children is automatically sorted by ord. If any new sorting is needed later, we can compare nodes directly, their default comparison value is ord.)
+ namewords = [x for x in origparent.children(add_self=True) if x.upos == 'PROPN' and (x.udeprel == 'compound' or x == origparent)]
+ # The Hindi treebank tags dates (['30', 'navaṁbara'], ['disaṁbara', '1993']) as PROPN compounds.
+ # This is wrong but it is also different from personal names we are targeting here.
+ # Hence, we will skip "names" that contain numbers.
+        if any(re.search(r"\d", x.form) for x in namewords):
+            #logging.info(str([x.misc['Translit'] for x in namewords]))
+            return
+ ###!!! We currently cannot transform enhanced dependencies.
+ ###!!! If we proceed, the basic tree would diverge from the enhanced dependencies.
+ if len(node.deps) > 0:
+ logging.fatal('There are enhanced dependencies but ud.FixCompoundName has been implemented only for basic dependencies.')
+ # The first name word will be the technical head. If it is the current parent, fine.
+ head = namewords[0]
+ rest = namewords[1:]
+ if head != origparent:
+ head.parent = grandparent
+ head.deprel = outdeprel
+ for n in rest:
+ n.parent = head
+ n.deprel = 'flat:name'
diff --git a/udapi/block/ud/fixleaf.py b/udapi/block/ud/fixleaf.py
new file mode 100644
index 00000000..9b4ce191
--- /dev/null
+++ b/udapi/block/ud/fixleaf.py
@@ -0,0 +1,42 @@
+"""
+Block ud.FixLeaf checks that function word dependents are leaves.
+Certain known exceptions are observed (e.g., fixed expressions).
+"""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixLeaf(Block):
+ """
+ Make sure that function words are leaves unless one of the known exceptions
+ applies.
+ """
+
+ def __init__(self, deprels='aux,cop,case,mark,cc', **kwargs):
+ """
+ Args:
+ deprels: comma-separated list of deprels to be fixed. Default = aux,cop,case,mark,cc.
+ """
+ super().__init__(**kwargs)
+ self.deprels = deprels.split(',')
+
+ def process_node(self, node):
+ for deprel in self.deprels:
+ if node.udeprel == deprel:
+ # Every function dependent can have a fixed child.
+ # We will also allow conj, cc, punct, goeswith, reparandum.
+ allowed = ['fixed', 'punct', 'goeswith', 'reparandum']
+ if deprel != 'cc':
+ allowed += ['conj', 'cc']
+ children = [c for c in node.children if not (c.udeprel in allowed)]
+ # Re-attach the remaining children to an acceptable ancestor.
+ ancestor = node.parent
+ while ancestor.udeprel in self.deprels:
+ ancestor = ancestor.parent
+ for c in children:
+ c.parent = ancestor
+ # If there are enhanced dependencies, check whether we want to redirect them too.
+ if c.deps:
+ for edep in c.deps:
+ if edep['parent'] == node:
+ edep['parent'] = ancestor
diff --git a/udapi/block/ud/fixmultiobjects.py b/udapi/block/ud/fixmultiobjects.py
new file mode 100644
index 00000000..485b85f0
--- /dev/null
+++ b/udapi/block/ud/fixmultiobjects.py
@@ -0,0 +1,47 @@
+"""
+Block ud.FixMultiObjects will ensure that no node has more than one (direct) object child.
+"""
+from udapi.core.block import Block
+
+
+class FixMultiObjects(Block):
+ """
+ Make sure there is at most one object.
+ """
+
+ def process_node(self, node):
+ objects = [x for x in node.children if x.udeprel == 'obj']
+ if len(objects) > 1:
+ subjects = [x for x in node.children if x.udeprel in ['nsubj', 'csubj']]
+ # Some heuristics that could work in AnCora:
+ # If all objects are after the verb, keep the one that is closest to the verb.
+ if objects[0].ord > node.ord:
+ objects = objects[1:]
+ for o in objects:
+ o.deprel = 'obl:arg'
+ o.deps[0]['deprel'] = 'obl:arg'
+ elif objects[-1].ord < node.ord:
+ objects = objects[:-1]
+ for o in objects:
+ o.deprel = 'dislocated'
+ o.deps[0]['deprel'] = 'dislocated'
+ # ho experimenta tot
+ elif objects[-1].lemma in ['tot', 'todo']:
+ objects[-1].parent = objects[0]
+ objects[-1].deprel = 'nmod'
+ objects[-1].deps[0]['parent'] = objects[0]
+ objects[-1].deps[0]['deprel'] = 'nmod'
+ # X se llama Y
+ elif node.lemma in ['llamar', 'considerar', 'decir', 'denunciar', 'causar', 'escribir', 'hacer', 'rubricar']:
+ objects[-1].deprel = 'xcomp'
+ objects[-1].deps[0]['deprel'] = 'xcomp'
+ elif len(subjects) == 0:
+ objects[0].deprel = 'nsubj'
+ objects[0].deps[0]['deprel'] = 'nsubj'
+ else:
+ objects[0].deprel = 'dislocated'
+ objects[0].deps[0]['deprel'] = 'dislocated'
+        # For the moment, we take the simplest approach possible: The first object survives and all others are forced to a different deprel.
+ #objects = objects[1:]
+ #for o in objects:
+ # o.deprel = 'iobj'
diff --git a/udapi/block/ud/fixmultisubjects.py b/udapi/block/ud/fixmultisubjects.py
new file mode 100644
index 00000000..f8aeca06
--- /dev/null
+++ b/udapi/block/ud/fixmultisubjects.py
@@ -0,0 +1,23 @@
+"""
+Block ud.FixMultiSubjects will ensure that no node has more than one subject child (except those
+marked as :outer).
+"""
+import re
+from udapi.core.block import Block
+
+
+class FixMultiSubjects(Block):
+ """
+ Make sure there is at most one subject that is not marked as :outer.
+ """
+
+ def process_node(self, node):
+ subjects = [x for x in node.children if re.match(r"^[nc]subj(:|$)", x.deprel) and not re.search(r":outer$", x.deprel)]
+        # For the moment, we take the simplest approach possible: The first subject survives and all others are forced to a different deprel.
+ if len(subjects) > 1:
+ subjects = subjects[1:]
+ for s in subjects:
+ if re.match(r"^n", s.deprel):
+ s.deprel = 'obl'
+ else:
+ s.deprel = 'advcl'
diff --git a/udapi/block/ud/fixmwtspace.py b/udapi/block/ud/fixmwtspace.py
new file mode 100644
index 00000000..a2b7b875
--- /dev/null
+++ b/udapi/block/ud/fixmwtspace.py
@@ -0,0 +1,22 @@
+"""
+Block ud.FixMwtSpace looks for multiword tokens whose form contains a space,
+which should be avoided. If found, the block checks whether it can remove
+the multiword token seamlessly, that is, whether the syntactic words correspond
+to the space-delimited parts of the multiword token. If possible, the MWT
+line will be removed.
+"""
+from udapi.core.block import Block
+import re
+
+
+class FixMwtSpace(Block):
+ """Try to remove multiword tokens with spaces."""
+
+ def process_node(self, node):
+ if node.multiword_token:
+ mwt = node.multiword_token
+ if re.search(r' ', mwt.form):
+ if node == mwt.words[0]:
+ wordforms = [x.form for x in mwt.words]
+ if ' '.join(wordforms) == mwt.form:
+ mwt.remove()
diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py
new file mode 100644
index 00000000..f4d9a1ec
--- /dev/null
+++ b/udapi/block/ud/fixpseudocop.py
@@ -0,0 +1,45 @@
+"""Block to fix annotation of verbs that are currently treated as copulas
+ but they should be treated as normal verbs (with secondary predication)
+ instead."""
+from udapi.core.block import Block
+import re
+
+class FixPseudoCop(Block):
+
+ def __init__(self, lemmas, noncopaux=False, **kwargs):
+ """Create the ud.FixPseudoCop block instance.
+
+ Args:
+ lemmas: comma-separated list of lemmas of the pseudocopulas that should be fixed
+ noncopaux: do the same for non-copula auxiliaries with the given lemma
+ """
+ super().__init__(**kwargs)
+ self.lemmas = lemmas.split(',')
+ self.noncopaux = noncopaux
+
+ def process_node(self, node):
+ pseudocop = self.lemmas
+ if node.lemma in pseudocop:
+ # Besides spurious copulas, this block can be optionally used to fix spurious auxiliaries (if noncopaux is set).
+ if node.udeprel == 'cop' or self.noncopaux and node.udeprel == 'aux':
+ secpred = node.parent
+ grandparent = secpred.parent
+ node.parent = grandparent
+ node.deprel = secpred.deprel
+ secpred.parent = node
+ secpred.deprel = "xcomp"
+ ###!!! We should also take care of DEPS if they exist.
+ # As a copula, the word was tagged AUX. Now it should be VERB.
+ node.upos = "VERB"
+ # Examine the children of the original parent.
+ # Those that modify the clause should be re-attached to me.
+ # Those that modify the word (noun, adjective) should stay there.
+ for c in secpred.children:
+ # obl is borderline. It could modify an adjective rather than a clause.
+ # obj and iobj should not occur in copular clauses but it sometimes
+ # occurs with pseudocopulas: "I declare him handsome."
+ if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel):
+ c.parent = node
+ # Another possible error is that the word is tagged AUX without being attached as "cop" or "aux".
+ elif self.noncopaux and node.upos == 'AUX':
+ node.upos = 'VERB'
diff --git a/udapi/block/ud/fixpunct.py b/udapi/block/ud/fixpunct.py
new file mode 100644
index 00000000..854a24a8
--- /dev/null
+++ b/udapi/block/ud/fixpunct.py
@@ -0,0 +1,302 @@
+"""Block ud.FixPunct for making sure punctuation is attached projectively.
+
+Punctuation in Universal Dependencies has the tag PUNCT, dependency relation punct,
+and is always attached projectively, usually to the head of a neighboring subtree
+to its left or right (see https://universaldependencies.org/u/dep/punct.html).
+Punctuation normally does not have children. If it does, we will fix it first.
+
+This block tries to re-attach punctuation projectively and according to the guidelines.
+It should help in cases where punctuation is attached randomly, always to the root
+or always to the neighboring word. However, there are limits to what it can do;
+for example it cannot always recognize whether a comma is introduced to separate
+the block to its left or to its right. Hence if the punctuation before running
+this block is almost good, the block may actually do more harm than good.
+
+Since the punctuation should not have children, we should not create a non-projectivity
+if we check the root edges going to the right.
+"""
+from udapi.core.block import Block
+# pylint: disable=no-self-use
+
+# TODO We need to know the language, there are many other quotation styles,
+# e.g. Finnish and Swedish uses the same symbol for opening and closing: ”X”.
+# Danish uses the French quotes, but switched: »X«.
+# Mapping of opening punctuation symbols to their expected closing counterparts.
+PAIRED_PUNCT = {
+    '(': ')',
+    '[': ']',
+    '{': '}',
+    '"': '"',   # ASCII double quotes
+    "'": "'",   # ASCII single quotes
+    '“': '”',   # quotation marks used in English,...
+    '„': '“',   # Czech, German, Russian,...
+    '«': '»',   # French, Russian, Spanish,...
+    '‹': '›',   # ditto
+    '《': '》',  # Korean, Chinese
+    '「': '」',  # Chinese, Japanese
+    '『': '』',  # ditto
+    '¿': '?',   # Spanish paired question marks
+    '¡': '!',   # Spanish paired exclamation marks
+    }
+
+# Sentence-final punctuation: never attached to a following node (see _fix_subord_punct).
+FINAL_PUNCT = '.?!'
+
+
+class FixPunct(Block):
+    """Make sure punctuation nodes are attached projectively."""
+
+    def __init__(self, check_paired_punct_upos=False, copy_to_enhanced=False, **kwargs):
+        """Create the ud.FixPunct block instance.
+
+        Args:
+        check_paired_punct_upos: fix paired punctuation tokens only if their UPOS=PUNCT.
+            The default is false, which means that fixed punctuation is detected only
+            based on the form with the exception of single & double quote character,
+            which is frequently ambiguous*, so UPOS=PUNCT is checked always.
+            *) Single quote can be an apostrophe. Double quote as a NOUN can be the inch symbol.
+        copy_to_enhanced: for all upos=PUNCT, let the enhanced dependencies
+            be the same as the basic dependencies.
+        """
+        super().__init__(**kwargs)
+        # Per-tree scratch list, indexed by node.ord: None / 'opening' / 'closing'.
+        self._punct_type = None
+        self.check_paired_punct_upos = check_paired_punct_upos
+        self.copy_to_enhanced = copy_to_enhanced
+
+    def _is_punct(self, node):
+        """Should this node be treated as punctuation by this block?"""
+        if node.upos == 'PUNCT':
+            return True
+        if self.check_paired_punct_upos:
+            return False
+        # Quotes are ambiguous (apostrophe, inch symbol), so for them UPOS=PUNCT is required.
+        if node.form in "'\"":
+            return False
+        if node.form in PAIRED_PUNCT or node.form in PAIRED_PUNCT.values():
+            return True
+        return False
+
+    def process_tree(self, root):
+        # First, make sure no PUNCT has children.
+        # This may introduce multiple subroots, which will be fixed later on
+        # (preventing to temporarily create multiple subroots here would prevent fixing some errors).
+        for node in root.descendants:
+            while self._is_punct(node.parent):
+                node.parent = node.parent.parent
+
+        # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type.
+        # This should be done before handling the subordinate punctuation,
+        # in order to prevent non-projectivities e.g. in dot-before-closing-quote style sentences:
+        # I call him "Bob."
+        # Here both quotes and the sentence-final dot should be attached to "Bob".
+        # (As you can see on the previous line, I don't like this American typographic rule.)
+        self._punct_type = [None] * (1 + len(root.descendants))
+        for node in root.descendants:
+            if self._punct_type[node.ord] != 'closing':
+                closing_punct = PAIRED_PUNCT.get(node.form)
+                if closing_punct is not None:
+                    self._fix_paired_punct(root, node, closing_punct)
+
+        # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type).
+        for node in root.descendants:
+            if node.upos == 'PUNCT' and not self._punct_type[node.ord]:
+                self._fix_subord_punct(node)
+
+        # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot".
+        # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator.
+        # So let's prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children).
+        if len(root.children) > 1:
+            selected_subroot = next((n for n in root.children if n.udeprel == 'root'), root.children[0])
+            for a_subroot in root.children:
+                if a_subroot != selected_subroot:
+                    a_subroot.parent = selected_subroot
+
+        # Check if the subroot is still marked with deprel=root.
+        # This may not hold if the original subroot was a paired punctuation, which was rehanged.
+        if root.children[0].udeprel != 'root':
+            root.children[0].udeprel = 'root'
+            if self.copy_to_enhanced:
+                root.children[0].deps = [{'parent': root, 'deprel': 'root'}]
+            for another_node in root.children[0].descendants:
+                if another_node.udeprel == 'root':
+                    another_node.udeprel = 'punct'
+
+        # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well.
+        if self.copy_to_enhanced:
+            for node in root.descendants:
+                if node.upos == 'PUNCT':
+                    node.deps = [{'parent': node.parent, 'deprel': node.deprel}]
+
+    def _fix_subord_punct(self, node):
+        """Find and set a suitable projective parent for a non-paired punctuation node."""
+        # Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
+        # TODO: detect these cases somehow
+        # Numbers can be detected with `node.parent.form.isdigit()`,
+        # but abbreviations are more tricky because the Abbr=Yes feature is not always used.
+        if node.form == '.' and node.parent == node.prev_node:
+            return
+
+        # Even non-paired punctuation like commas and dashes may work as paired.
+        # Detect such cases and try to preserve, but only if projective.
+        p_desc = node.parent.descendants(add_self=1)
+        if node in (p_desc[0], p_desc[-1]) and len(p_desc) == p_desc[-1].ord - p_desc[0].ord + 1:
+            if (p_desc[0].upos == 'PUNCT' and p_desc[-1].upos == 'PUNCT'
+                    and p_desc[0].parent == node.parent and p_desc[-1].parent == node.parent):
+                return
+
+        # Initialize the candidates (left and right) with the nearest nodes excluding punctuation.
+        # Final punctuation should not be attached to any following, so exclude r_cand there.
+        l_cand, r_cand = node.prev_node, node.next_node
+        if node.form in FINAL_PUNCT:
+            r_cand = None
+        while l_cand.ord > 0 and l_cand.upos == 'PUNCT':
+            if self._punct_type[l_cand.ord] == 'opening' and l_cand.parent != node:
+                l_cand = None
+                break
+            l_cand = l_cand.prev_node
+        while r_cand is not None and r_cand.upos == 'PUNCT':
+            if self._punct_type[r_cand.ord] == 'closing' and r_cand.parent != node:
+                r_cand = None
+                break
+            r_cand = r_cand.next_node
+
+        # Climb up from the candidates, until we would reach the root or "cross" the punctuation.
+        # If the candidates' descendants span across the punctuation, we also stop
+        # because climbing higher would cause a non-projectivity (the punct would be the gap).
+        l_path, r_path = [l_cand], [r_cand]
+        if l_cand is None or l_cand.is_root():
+            l_cand, l_path = None, []
+        else:
+            while (not l_cand.parent.is_root() and l_cand.parent < node
+                   and not node < l_cand.descendants(add_self=1)[-1]):
+                l_cand = l_cand.parent
+                l_path.append(l_cand)
+        if r_cand is not None:
+            while (not r_cand.parent.is_root() and node < r_cand.parent
+                   and not r_cand.descendants(add_self=1)[0] < node):
+                r_cand = r_cand.parent
+                r_path.append(r_cand)
+
+        # Filter out candidates which would lead to non-projectivities, i.e. bugs
+        # punct-nonproj and punct-nonproj-gap as checked by the UD validator and ud.MarkBugs.
+        orig_parent = node.parent
+        l_path = [n for n in l_path if n and self._will_be_projective(node, n)]
+        r_path = [n for n in r_path if n and self._will_be_projective(node, n)]
+        l_cand = l_path[-1] if l_path else None
+        r_cand = r_path[-1] if r_path else None
+        node.parent = orig_parent
+
+        # Now select between l_cand and r_cand -- which will be the new parent?
+        # The lower one. Note that if neither is descendant of the other and neither is None
+        # (which can happen in rare non-projective cases), we arbitrarily prefer l_cand,
+        # but if the original parent is either on l_path or r_path, we keep it as acceptable.
+        if l_cand is not None and l_cand.is_descendant_of(r_cand):
+            cand, path = l_cand, l_path
+        elif r_cand is not None and r_cand.is_descendant_of(l_cand):
+            cand, path = r_cand, r_path
+        elif l_cand is not None:
+            cand, path = l_cand, l_path + r_path
+        elif r_cand is not None:
+            cand, path = r_cand, l_path + r_path
+        else:
+            return
+
+        # The guidelines say:
+        #    Within the relevant unit, a punctuation mark is attached
+        #    at the highest possible node that preserves projectivity.
+        # However, sometimes it is difficult to detect the unit (and its head).
+        # E.g. in "Der Mann, den Sie gestern kennengelernt haben, kam wieder."
+        # the second comma should depend on "kennengelernt", not on "Mann"
+        # because the unit is just the relative clause.
+        # We try to be conservative and keep the parent, unless we are sure it is wrong.
+        if node.parent not in path:
+            node.parent = cand
+        node.deprel = 'punct'
+
+    def _will_be_projective(self, node, cand):
+        """Would attaching `node` to `cand` be projective and gap-free?
+        Note: leaves node.parent changed to cand; the caller restores the original parent."""
+        node.parent = cand
+        return not node.is_nonprojective() and not self._causes_gap(node)
+
+    def _causes_gap(self, node):
+        """Is `node` in a non-projectivity gap, while its parent is not in the same gap?"""
+        return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap()
+
+    def _fix_paired_punct(self, root, opening_node, closing_punct):
+        """Find the matching closing symbol (skipping nested pairs) and fix the pair."""
+        if (self.check_paired_punct_upos
+                or opening_node.form in "'\"") and opening_node.upos != 'PUNCT':
+            return
+        nested_level = 0
+        for node in root.descendants[opening_node.ord:]:
+            if node.form == closing_punct:
+                if nested_level > 0:
+                    nested_level -= 1
+                else:
+                    self._fix_pair(root, opening_node, node)
+                    return
+            elif node.form == opening_node.form:
+                nested_level += 1
+
+    def _fix_pair(self, root, opening_node, closing_node):
+        """Attach an opening+closing pair projectively to head(s) of the span inside."""
+        # Ideally, paired punctuation symbols should be attached to the single
+        # head of the subtree inside. Provided the inside segment is a single
+        # subtree.
+        heads = []
+        punct_heads = []
+        for node in root.descendants:
+            if node == opening_node or node == closing_node:
+                continue
+            # If this is a node inside of the pair, is its parent outside?
+            if node > opening_node and node < closing_node:
+                if node.parent < opening_node or node.parent > closing_node:
+                    if node.upos == 'PUNCT':
+                        punct_heads.append(node)
+                    else:
+                        heads.append(node)
+            # Not only the punctuation symbols must not be attached non-projectively,
+            # they also must not cause non-projectivity of other relations. This could
+            # happen if an outside node is attached to an inside node. To account for
+            # this, mark the inside parent as a head, too.
+            elif node.parent > opening_node and node.parent < closing_node:
+                if node.parent.upos == 'PUNCT':
+                    punct_heads.append(node.parent)
+                else:
+                    heads.append(node.parent)
+
+        # Punctuation should not have children, but if there is no other head candidate,
+        # let's break this rule.
+        if len(heads) == 0:
+            heads = punct_heads
+        # If there are no nodes between the opening and closing mark (),
+        # let's treat the marks as any other (non-pair) punctuation.
+        if len(heads) == 0:
+            return
+        else:
+            # Ideally, there should be only a single head.
+            # If not, we could try e.g. to choose the "widests-span head":
+            # opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0]
+            # closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0]
+            # which often leads to selecting the same head for the opening and closing punctuation
+            # ignoring single words inside the paired punct which are non-projectively attached outside.
+            # However, this means that the paired punctuation will be attached non-projectively,
+            # which is forbidden by the UD guidelines.
+            # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities.
+            # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of
+            # inside heads and inside parents of outside nodes).
+            heads.sort(key=lambda x: x.ord)
+            opening_node.parent = heads[0]
+            closing_node.parent = heads[-1]
+
+        self._punct_type[opening_node.ord] = 'opening'
+        self._punct_type[closing_node.ord] = 'closing'
+
+        # In rare cases, non-projective gaps may remain. Let's dirty fix these!
+        # E.g. in "the (lack of) reproducibility", the closing parenthesis
+        # should be attached to "of" rather than to "lack"
+        # -- breaking the paired-marks-have-same-parent rule
+        # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py).
+        if self._causes_gap(opening_node):
+            opening_node.parent = opening_node.next_node
+            while (opening_node.parent.ord < closing_node.ord - 1
+                   and (opening_node.parent.upos == 'PUNCT' or opening_node.is_nonprojective()
+                        or self._causes_gap(opening_node))):
+                opening_node.parent = opening_node.parent.next_node
+        if self._causes_gap(closing_node):
+            closing_node.parent = closing_node.prev_node
+            while (closing_node.parent.ord > opening_node.ord + 1
+                   and (closing_node.parent.upos == 'PUNCT' or closing_node.is_nonprojective()
+                        or self._causes_gap(closing_node))):
+                closing_node.parent = closing_node.parent.prev_node
diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py
new file mode 100644
index 00000000..07ef3eb3
--- /dev/null
+++ b/udapi/block/ud/fixpunctchild.py
@@ -0,0 +1,10 @@
+"""Block ud.FixPunctChild for making sure punctuation nodes have no children."""
+from udapi.core.block import Block
+
+
+class FixPunctChild(Block):
+ """Make sure punct nodes have no children by rehanging the children upwards."""
+
+ def process_node(self, node):
+ while node.parent.deprel == 'punct':
+ node.parent = node.parent.parent
diff --git a/udapi/block/ud/fixrightheaded.py b/udapi/block/ud/fixrightheaded.py
new file mode 100644
index 00000000..045278dd
--- /dev/null
+++ b/udapi/block/ud/fixrightheaded.py
@@ -0,0 +1,33 @@
+"""Block ud.FixRightheaded for making sure flat,fixed,appos,goeswith,list is head initial.
+
+Note that deprel=conj should also be left-headed,
+but it is not included in this fix-block by default
+because coordinations are more difficult to convert
+and one should use a specialized block instead.
+"""
+from udapi.core.block import Block
+
+
+class FixRightheaded(Block):
+    """Make sure deprel=flat,fixed,... form a head-initial (i.e. left-headed) structure."""
+
+    def __init__(self, deprels='flat,fixed,appos,goeswith,list', **kwargs):
+        """Args:
+        deprels: comma-separated list of deprels to be fixed.
+        Default = flat,fixed,appos,goeswith,list.
+        """
+        super().__init__(**kwargs)
+        self.deprels = deprels.split(',')
+
+    def process_node(self, node):
+        for deprel in self.deprels:
+            # Right-headed relation: the dependent precedes its parent,
+            # so the dependency direction must be reversed.
+            if node.udeprel == deprel and node.precedes(node.parent):
+                orig_parent = node.parent
+                # The former dependent takes over the parent's attachment point...
+                node.parent = orig_parent.parent
+                if deprel != 'conj':
+                    # ...and (except for conj) inherits the parent's children,
+                    # flattening the structure under the new head.
+                    for child in orig_parent.children:
+                        child.parent = node
+                orig_parent.parent = node
+                # Swap the deprels: the new head carries the relation to the outside.
+                head_deprel = orig_parent.deprel
+                orig_parent.deprel = node.deprel
+                node.deprel = head_deprel
diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py
new file mode 100644
index 00000000..be972d8b
--- /dev/null
+++ b/udapi/block/ud/fixroot.py
@@ -0,0 +1,37 @@
+"""
+Block ud.FixRoot will ensure that the tree is free of common root-related errors.
+Simple heuristics are used; it is likely that human inspection would lead to
+a different solution. Nevertheless, if a quick fix is needed to pass the
+validation, this block can be helpful.
+
+WARNING: The block currently ignores enhanced dependencies.
+"""
+import re
+from udapi.core.block import Block
+
+
+class FixRoot(Block):
+ """
+ Fixes the following validation errors:
+ - Only one node must be attached directly to the artificial root node.
+ => If the root has multiple children, keep the first one. Attach the other
+ ones to the first one. Change their deprel to 'parataxis'.
+ - The node attached as a child of the artificial root node must have the
+ 'root' relation (or its subtype).
+ => If the root child has another deprel, change it to 'root'.
+ - The node attached as a child of the artificial root node is the only one
+ allowed to have the 'root' relation (or its subtype).
+ => If another node has that deprel, change it to 'parataxis'.
+ """
+
+ def process_tree(self, root):
+ rchildren = root.children
+ if len(rchildren) > 1:
+ for i in range(len(rchildren)-1):
+ rchildren[i+1].parent = rchildren[0]
+ rchildren[i+1].deprel = 'parataxis'
+ if rchildren[0].udeprel != 'root':
+ rchildren[0].deprel = 'root'
+ for n in root.descendants:
+ if not n.parent == root and n.udeprel == 'root':
+ n.deprel = 'parataxis'
diff --git a/udapi/block/ud/fr/__init__.py b/udapi/block/ud/fr/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/fr/addmwt.py b/udapi/block/ud/fr/addmwt.py
new file mode 100644
index 00000000..948a927a
--- /dev/null
+++ b/udapi/block/ud/fr/addmwt.py
@@ -0,0 +1,82 @@
+"""Block ud.fr.AddMwt for heuristic detection of French contractions.
+
+According to the UD guidelines, contractions such as "des" = "de les"
+should be annotated using multi-word tokens.
+
+Note that this block should be used only for converting legacy conllu files.
+Ideally a tokenizer should have already split the MWTs.
+"""
+import udapi.block.ud.addmwt
+
+MWTS = {
+ 'au': {'form': 'à le', 'lemma': 'à le'},
+ 'aux': {'form': 'à les', 'lemma': 'à le'},
+ 'des': {'form': 'de les', 'lemma': 'de le'},
+ 'du': {'form': 'de le', 'lemma': 'de le'},
+
+ 'auquel': {'form': 'à lequel', 'upos': 'ADP PRON', 'lemma': 'à lequel'},
+ 'auxquels': {'form': 'à lesquels', 'upos': 'ADP PRON', 'lemma': 'à lequel'},
+ 'auxquelles': {'form': 'à lesquelles', 'upos': 'ADP PRON', 'lemma': 'à lequel'},
+ 'desquels': {'form': 'de lesquels', 'upos': 'ADP PRON', 'lemma': 'de lequel'},
+ 'desquelles': {'form': 'de lesquelles', 'upos': 'ADP PRON', 'lemma': 'de lequel'},
+ 'duquel': {'form': 'de lequel', 'upos': 'ADP PRON', 'lemma': 'de lequel'},
+}
+# TODO https://fr.wiktionary.org/wiki/des#Vocabulaire_apparent.C3.A9_par_le_sens_2
+# lists more contractions, e.g. "dudit", "audit"
+
+# shared values for all entries in MWTS
+for v in MWTS.values():
+ if not v.get('upos'):
+ v['upos'] = 'ADP DET'
+ if not v.get('shape'):
+ v['shape'] = 'subtree'
+ if not v.get('deprel'):
+ v['deprel'] = 'case det' if v['upos'] == 'ADP DET' else 'case *'
+ if not v.get('main'):
+ v['main'] = 1 if v['upos'] == 'ADP PRON' else 0
+ v['feats'] = '_ *'
+
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+    """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+    def multiword_analysis(self, node):
+        """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+
+        # "du" can be
+        # - "du + le" (tagged ADP)
+        # - the partitive article "du" (tagged DET)
+        # - past participle of devoir (correctly dû, tagged VERB)
+        # Only the ADP case should be split.
+        # Similarly with "des" -> "de les".
+        if node.upos != 'ADP':
+            return None
+
+        return MWTS.get(node.form.lower(), None)
+
+    # "du" has a shape which is neither "siblings" nor "subtree"
+    # E.g. in "À partir du XXIe siècle"
+    # "du" = "de le", but
+    # "de" is attached to "À", while "le" is attached to "siècle".
+    def postprocess_mwt(self, mwt):
+        """Fix the attachment of the article/pronoun word after the MWT was split."""
+        if mwt.form.lower() in {'du', 'des', 'au', 'aux'}:
+            # If the preposition's subtree extends beyond the article,
+            # the default attachment is kept unchanged.
+            if mwt.words[0].descendants[-1] != mwt.words[1]:
+                pass
+            elif mwt.words[0].precedes(mwt.words[0].parent):
+                mwt.words[1].parent = mwt.words[0].parent
+            else:
+                # Search to the right for a nominal head for the article.
+                head = mwt.words[1].next_node
+                while head.upos not in {'NOUN', 'PROPN'} and not head.is_root():
+                    if head.parent.precedes(head):
+                        # NOTE(review): when climbing would go leftwards, this falls back
+                        # to the immediately following node -- confirm this is intended.
+                        head = mwt.words[1].next_node
+                        break
+                    head = head.parent
+                if head.is_root():
+                    head = mwt.words[1].next_node
+                mwt.words[1].parent = head
+
+        # Propagate deprel=fixed within multiword expressions touching the split words.
+        if mwt.words[1].parent == mwt.words[0] and mwt.words[0].descendants[-1].deprel == 'fixed':
+            mwt.words[1].deprel = 'fixed'
+        if (mwt.words[0].parent.precedes(mwt.words[0])
+            and mwt.words[0].prev_node.udeprel in {'case', 'fixed'}):
+            mwt.words[0].deprel = 'fixed'
diff --git a/udapi/block/ud/ga/to2.py b/udapi/block/ud/ga/to2.py
index 4d8506e1..dbf093a9 100644
--- a/udapi/block/ud/ga/to2.py
+++ b/udapi/block/ud/ga/to2.py
@@ -4,6 +4,7 @@
"""
from udapi.core.block import Block
+
class To2(Block):
"""Block for fixing the remaining cases (after ud.Convert1to2) in UD_Irish."""
diff --git a/udapi/block/ud/gl/__init__.py b/udapi/block/ud/gl/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py
new file mode 100644
index 00000000..81a17c64
--- /dev/null
+++ b/udapi/block/ud/gl/to2.py
@@ -0,0 +1,60 @@
+"""Block ud.gl.To2 UD_Galician-specific conversion of UDv1 to UDv2
+
+Author: Martin Popel
+"""
+from udapi.core.block import Block
+
+ADP_HEAD_PREFERENCES = {
+ 'NOUN': 10,
+ 'PRON': 9,
+ 'ADJ': 8,
+ 'VERB': 8,
+ 'PUNCT': -10,
+}
+
+
+class To2(Block):
+ """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician."""
+
+ def process_node(self, node):
+
+ # UD_Galician v1.4 uses incorrectly deprel=cop not for the copula verb,
+ # but for its complement (typically ADJ) and also copula is the head.
+ if node.deprel == 'cop':
+ copula = node.parent
+ # In UDv2 discussions it has been decided that only a limited set of verbs
+ # can be annotated as copula. For Spanish, "estar" was questionable, but accepted.
+ # I guess in Galician it is the same. The rest (considerar, resultar, quedar,...)
+ # should not be annotated as copulas. Luckily, in UD_Galician v1.4 they are
+ # governing the clause, so no change of topology is needed, just deprel=xcomp.
+ if copula.lemma in ('ser', 'estar'):
+ node.parent = copula.parent
+ for cop_child in copula.children:
+ cop_child.parent = node
+ copula.parent = node
+ node.deprel = copula.deprel
+ copula.deprel = 'cop'
+ else:
+ node.deprel = 'xcomp'
+
+ # Prepositions should depend on the noun, not vice versa.
+ # This is easy to fix, but unfortunatelly, there are many nodes with deprel=case
+ # which are not actually prepostions or case markes, but standard NOUNs, VERBs etc.
+ # These are left as ToDo.
+ if node.deprel == 'case' and node.children:
+ if node.upos not in ('ADP', 'CONJ', 'PART'):
+ node.misc['ToDo'] = 'case-upos'
+ else:
+ children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0))
+ children[0].parent = node.parent
+ node.parent = children[0]
+ for child in children[1:]:
+ child.parent = children[0]
+
+ # Punctuation should have no children.
+ if node.deprel == 'punct' and node.children and node.upos == 'PUNCT':
+ children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0))
+ children[0].parent = node.parent
+ node.parent = children[0]
+ for child in children[1:]:
+ child.parent = children[0]
diff --git a/udapi/block/ud/goeswithfromtext.py b/udapi/block/ud/goeswithfromtext.py
index 64e1d99f..fe419fa2 100644
--- a/udapi/block/ud/goeswithfromtext.py
+++ b/udapi/block/ud/goeswithfromtext.py
@@ -9,6 +9,7 @@
from udapi.core.block import Block
+
class GoeswithFromText(Block):
"""Block for splitting nodes and attaching via goeswith according to the the sentence text.
@@ -96,6 +97,6 @@ def process_tree(self, root):
else:
last_node.misc['SpaceAfter'] = 'No'
else:
- assert False # we have checked the whole sentence already
+ assert False # we have checked the whole sentence already
if text:
logging.warning('Extra text "%s" in tree %s', text, root)
diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py
new file mode 100644
index 00000000..3ba20c5c
--- /dev/null
+++ b/udapi/block/ud/google2ud.py
@@ -0,0 +1,528 @@
+"""Block ud.Google2ud for converting Google Universal Dependency Treebank into UD.
+
+Usage:
+udapy -s ud.Google2ud < google.conllu > ud2.conllu
+"""
+import re
+from udapi.block.ud.convert1to2 import Convert1to2
+from udapi.block.ud.complywithtext import ComplyWithText
+from udapi.block.ud.fixchain import FixChain
+from udapi.block.ud.fixrightheaded import FixRightheaded
+from udapi.block.ud.fixpunct import FixPunct
+from udapi.block.ud.de.addmwt import AddMwt as de_AddMwt
+from udapi.block.ud.es.addmwt import AddMwt as es_AddMwt
+from udapi.block.ud.fr.addmwt import AddMwt as fr_AddMwt
+from udapi.block.ud.pt.addmwt import AddMwt as pt_AddMwt
+from udapi.block.ud.joinasmwt import JoinAsMwt
+
+DEPREL_CHANGE = {
+ "ROOT": "root",
+ "prep": "case",
+ "ncomp": "case:loc", # only in Chinese; Herman proposes case:loc
+ "p": "punct",
+ "poss": "nmod:poss",
+ "ps": "case",
+ "num": "nummod",
+ "number": "nummod", # TODO ?
+ "tmod": "nmod:tmod",
+ "vmod": "acl",
+ "rcmod": "acl:relcl",
+ "npadvmod": "advmod",
+ "preconj": "cc:preconj",
+ "predet": "det:predet",
+ "gobj": "obj",
+ "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2
+ "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both
+ "oblcomp": "obl",
+ "mes": "clf", # TODO: structural transformation needed
+ "mwn": "compound:n", # nominal multi-word
+ "mwa": "compound:a", # adjectival multi-word
+ "mwv": "compound:v", # verbal multi-word
+ "asp": "aux", # aspectual particle
+ "rcmodrel": "mark:relcl",
+ "auxcaus": "aux", # redundant with Voice=Cau
+ "topic": "dep",
+ "possessive": "case",
+ "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words
+ "agent": "obl:agent",
+ # TODO: "ref" - in basic dependencies it should be rehanged and relabelled
+ "conjv": "compound:conjv",
+ "advphmod": "advmod",
+ "clas": "clf",
+ "narg": "nmod:arg", # Turkish only
+}
+
+FEATS_CHANGE = {
+ "proper=false": "",
+ "Proper=false": "",
+ "case=prep": "",
+ "case=unsp_c": "",
+ "gender=unsp_g": "",
+ "gender_antecedent=unsp_g": "",
+ "voice=unsp_v": "",
+ "number=unsp_n": "",
+ "number_antecedent=unsp_n": "",
+ "tense=unsp_t": "",
+ "mood=unsp_m": "",
+ "animacy=unsp_r": "",
+ "aspect=unsp_a": "",
+ "case=rel": "", # redundant with rcmodrel (mark:relcl)
+ "reciprocity=non-rcp": "",
+ "reciprocity=rcp": "PronType=Rcp",
+ "aspect=imperf": "Aspect=Imp",
+ "form=long": "Variant=Long",
+ "form=short": "Variant=Short",
+ "person=reflex": "Reflex=Yes",
+ "case=reflex": "Reflex=Yes",
+ "case=dir": "Case=Nom",
+ "gender=pl_tantum": "Number=Ptan",
+ "gender_antecedent=fem_a": "Gender[psor]=Fem",
+ "gender_antecedent=masc_a": "Gender[psor]=Masc",
+ "gender_antecedent=neut_a": "Gender[psor]=Neut",
+ "number_antecedent=sing_a": "Number[psor]=Sing",
+ "number_antecedent=plur_a": "Number[psor]=Plur",
+ "person_antecedent=1_a": "Person[psor]=1",
+ "person_antecedent=2_a": "Person[psor]=2",
+ "person_antecedent=3_a": "Person[psor]=3",
+ "definiteness=def": "Definite=Def",
+ "definiteness=indef": "Definite=Ind",
+ "mood=sub1": "Mood=Sub|Tense=Pres", # de
+ "mood=sub2": "Mood=Sub|Tense=Past", # de
+ "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese)
+ "tense=cnd": "Mood=Cnd",
+ "degree=sup_a": "Degree=Abs",
+ "degree=sup_r": "Degree=Sup",
+ "case=obl": "Case=Acc",
+ "tense=impf": "Tense=Imp",
+ "animacy=rat": "Animacy=Hum",
+ "animacy=irrat": "Animacy=Nhum",
+ "honorific=hon": "Polite=Form",
+ "mood=psm": "Tense=Fut", # TODO ?
+ "form=fin": "VerbForm=Fin",
+ "form=ger": "VerbForm=Ger",
+ "formality=fml": "Polite=Form",
+ "Evidentiality=Nfh": "Evident=Nfh",
+ "Evidentiality=Fh": "Evident=Fh",
+}
+
+FR_DAYS_MONTHS = ('lundi mardi mercredi jeudi vendredi samedi dimanche '
+ 'janvier février mars avril mai juin juillet août '
+ 'septembre octobre novembre décembre'.split())
+
+
+class Google2ud(Convert1to2):
+ """Convert Google Universal Dependency Treebank into UD style."""
+
+    def __init__(self, lang='unk', non_mwt_langs='ar en ja ko zh', **kwargs):
+        """Create the Google2ud block instance.
+
+        Args:
+        lang: language code of the processed treebank (default='unk' = unknown);
+            selects which language-specific sub-blocks and fixes are applied.
+        non_mwt_langs: space-separated list of languages where multiword tokens
+            should NOT be preferred when re-tokenizing (passed to ud.ComplyWithText).
+
+        See ``Convert1to2`` for all the other args.
+        """
+        super().__init__(**kwargs)
+        self.lang = lang
+
+        # Language-specific block for splitting contractions into multiword tokens.
+        self._addmwt_block = None
+        if lang == 'de':
+            self._addmwt_block = de_AddMwt()
+        elif lang == 'es':
+            self._addmwt_block = es_AddMwt()
+        elif lang == 'fr':
+            self._addmwt_block = fr_AddMwt()
+        elif lang == 'pt':
+            self._addmwt_block = pt_AddMwt()
+        self._joinasmwt_block = JoinAsMwt() if lang in {'es', 'tr'} else None
+
+        self._fixrigheaded_block = None
+        if lang in {'ar', 'de', 'en', 'fr', 'hi', 'ru', 'th', 'zh'}:
+            self._fixrigheaded_block = FixRightheaded()
+        elif lang == 'tr':
+            # Turkish coordination is restructured in process_tree, so conj is included here.
+            self._fixrigheaded_block = FixRightheaded(deprels='conj,flat,fixed,appos,goeswith,list')
+
+        # Normalize the attachment of punctuation for all languages.
+        self._fixpunct_block = FixPunct()
+
+        self._fixchain_block = None
+        if lang in {'pt', 'ru'}:
+            self._fixchain_block = FixChain()
+
+        # UD_English v2.0 still uses "do n't" with SpaceAfter=No,
+        # instead of annotating it as a multiword token.
+        # In several other languages it is also common
+        # that syntactic words are not separated with a space without being an MWT.
+        self._comply_block = ComplyWithText(prefer_mwt=bool(lang not in non_mwt_langs.split()))
+
+    def process_tree(self, root):
+        """Convert one tree: metadata, tokenization, goeswith, feats, upos, deprels, post-blocks."""
+        # Google-style comments: 1st line = sent_id, 2nd = text, 3rd = English translation.
+        comment_lines = root.comment.split("\n")
+        root.sent_id = comment_lines[0].strip().replace(' ', '-')
+        root.text = comment_lines[1].strip()
+        # The third line of comments contains the English translation.
+        root.comment = '' if self.lang == "en" or len(comment_lines) < 3 else comment_lines[2]
+
+        # ud.ComplyWithText is the very first step because it may change the tokenization
+        # and also it fills SpaceAfter=No, which is used in further steps.
+        if self._comply_block:
+            self._comply_block.process_tree(root)
+
+        # `deprel=goeswith` must be fixed now because it also changes the number of nodes.
+        # Unlike UDv2, Google style uses `goeswith` mostly to fix "wrong" tokenization,
+        # e.g. "e-mail" written correctly without spaces, but tokenized into three words.
+        # Moreover, the hyphen is not always marked with `goeswith`.
+        if self.lang in {'de', 'fr', 'it', 'pt', 'ru', 'tr'}:
+            for node in root.descendants:
+                if node.form == '-' and node.no_space_after and node.prev_node.no_space_after:
+                    if 'goeswith' in (node.prev_node.deprel, node.next_node.deprel):
+                        node.deprel = 'goeswith'
+                    if self.lang == 'fr':
+                        node.deprel = 'goeswith'
+                        node.parent = node.next_node
+        for node in root.descendants:
+            self.fix_goeswith(node)
+
+        # Google Turkish annotation of coordination is very different from both UDv1 and UDv2.
+        # Also some of the deprel=ig nodes should be merged with their parents.
+        if self.lang == 'tr':
+            for node in root.descendants:
+                conjs = [n for n in node.children if n.deprel == 'conj']
+                if conjs:
+                    conjs[0].parent = node.parent
+                    conjs[0].deprel = node.deprel
+                    node.deprel = 'conj'
+                    for nonfirst_conj in conjs[1:] + [node]:
+                        nonfirst_conj.parent = conjs[0]
+            for node in root.descendants:
+                if node.deprel == 'ig' and re.match('leş|laş', node.parent.form.lower()):
+                    self._merge_with(node.parent, node)
+
+        # Multi-word prepositions must be solved before fix_deprel() fixes pobj+pcomp.
+        for node in root.descendants:
+            self.fix_multiword_prep(node)
+
+        # Fixing feats, upos and deprel in separate steps (the order is important).
+        for node in root.descendants:
+            self.fix_feats(node)
+        for node in root.descendants:
+            self.fix_upos(node)
+        for node in root.descendants:
+            self.fix_deprel(node)
+
+        # This needs to be executed after all other deprels are converted
+        for node in root.descendants:
+            if node.deprel in ('acomp', 'attr'):  # TODO not sure about attr
+                # Turn the copula-as-head analysis into the UD complement-as-head one.
+                copula = node.parent
+                node.parent = copula.parent
+                node.deprel = copula.deprel
+                copula.parent = node
+                copula.deprel = 'cop'
+                for child in copula.children:
+                    child.parent = node
+
+        # call ud.Convert1to2
+        super().process_tree(root)
+
+        for block in (
+                self._addmwt_block,       # e.g. "im" -> "in dem" in de. Must follow Convert1to2.
+                self._joinasmwt_block,    # no pair of alphabetical words with SpaceAfter=No
+                self._fixrigheaded_block, # deprel=fixed,flat,... should be always head-initial
+                self._fixchain_block,     # and form a flat structure, not a chain.
+                self._fixpunct_block):    # commas should depend on the subord unit.
+            if block:
+                block.process_tree(root)
+
+        if self.lang == 'tr':
+            root.children[0].deprel = 'root'
+            for node in root.descendants:
+                if node.deprel in {'obl:poss', 'obl:arg'}:
+                    node.udeprel = 'nmod'
+
+ def fix_goeswith(self, node):
+ """Solve deprel=goeswith which is almost always wrong in the Google annotation."""
+ if node.deprel != 'goeswith':
+ return
+
+ # It has been decided German should use "compound" and keep e.g. "E-mail" as three words.
+ # The only two cases we want to merge are:
+ # * dots marking ordinal numbers (21. Oktober) should be merged with the number
+ # keeping the upos of the number (Google has the dot as parent, don't ask me why).
+ # There are still bugs in the output ("Oktober" depends on "21.") which I give up.
+ # * apostrophes in foreign words "don't" or "Smith'" (the original English was "Smith's").
+ if self.lang == 'de':
+ if (node.precedes(node.parent) and node.misc['SpaceAfter'] == 'No'
+ and node.next_node.form in ".'"):
+ node.next_node.upos = node.upos
+ self._merge_with(node.next_node, node)
+ elif (node.parent.precedes(node) and node.prev_node.misc['SpaceAfter'] == 'No'
+ and node.prev_node.form in ".'"):
+ node.prev_node.upos = node.upos
+ self._merge_with(node.prev_node, node)
+ else:
+ node.deprel = 'compound'
+
+ # Other languages use goeswith for marking Google-tokenization errors.
+ # In Portuguese, there are in addition cases like "Primeira Dama".
+ elif self.lang in {'fr', 'it', 'pt', 'ru', 'tr'}:
+ if node.precedes(node.parent) and node.misc['SpaceAfter'] == 'No':
+ self._merge_with(node.next_node, node)
+ elif node.parent.precedes(node) and node.prev_node.misc['SpaceAfter'] == 'No':
+ self._merge_with(node.prev_node, node)
+ elif self.lang in {'pt'}:
+ node.deprel = 'compound'
+
+ @staticmethod
+ def _merge_with(node, delete_node):
+ """Concat forms, merge feats, remove `delete_node`, and keep SpaceAfter of the right node.
+
+ Should be called only on neighboring nodes.
+ """
+ if node.precedes(delete_node):
+ node.form += delete_node.form
+ node.misc['SpaceAfter'] = delete_node.misc['SpaceAfter']
+ else:
+ node.form = delete_node.form + node.form
+ if node.parent == delete_node:
+ node.parent = delete_node.parent
+ for child in delete_node.children:
+ child.parent = node
+ delete_node.feats.update(node.feats)
+ node.feats = delete_node.feats
+ # node.misc['Merge'] = 1
+ delete_node.remove()
+
+ def fix_multiword_prep(self, node):
+ """Solve pobj/pcomp depending on pobj/pcomp.
+
+ Only some of these cases are multi-word prepositions (which should get deprel=fixed).
+ """
+ if node.deprel in ('pobj', 'pcomp') and node.parent.deprel in ('pobj', 'pcomp'):
+ lo_prep = node.parent
+ hi_prep = node.parent.parent
+ if hi_prep.deprel != 'prep':
+ return
+ # E.g. in "from A to B", the Google style attaches "to"/pcomp under "from"/prep.
+            # Let's use this heuristic: if the prepositions are not next to each other,
+ # they should be siblings (as in "from A to B").
+ if abs(lo_prep.ord - hi_prep.ord) != 1:
+ lo_prep.parent = hi_prep.parent
+ lo_prep.deprel = 'prep'
+ # Some languages (e.g. pt) in UDv2 do not use multi-word prepositions at all.
+ elif self.lang in {'pt'}:
+ node.parent = hi_prep
+ lo_prep.parent = node
+ lo_prep.deprel = 'case'
+ elif self.lang == 'es' and lo_prep.form in {'entre', 'en', 'a'}:
+ node.parent = hi_prep
+ lo_prep.parent = node
+ lo_prep.deprel = 'case'
+ elif self.lang == 'es' and lo_prep.form == 'para':
+ node.parent, node.deprel = hi_prep.parent, 'obj'
+ lo_prep.deprel, hi_prep.deprel = 'mark', 'mark'
+ lo_prep.parent, hi_prep.parent = node, node
+ # Otherwise, they are probably multi-word prepositions, e.g. "according to",
+ # but they can also be sibling prepositions, e.g. "out of".
+ # The Google style does not distinguish those and I don't see any heuristics,
+ # so let's mark these cases as ToDo.
+ else:
+ first_prep, second_prep = hi_prep, lo_prep
+ if lo_prep.precedes(hi_prep):
+ first_prep, second_prep = lo_prep, hi_prep
+ first_prep.parent = hi_prep.parent
+ second_prep.parent = first_prep
+ for prep_child in second_prep.children:
+ prep_child.parent = first_prep
+ second_prep.deprel = 'fixed'
+ if self.lang == 'es' and lo_prep.form == 'par':
+ pass
+ else:
+ self.log(second_prep, 'unsure-multi-prep', 'deprel=fixed, but may be siblings')
+
+ @staticmethod
+ def fix_feats(node):
+ """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE."""
+ orig_feats = dict(node.feats)
+ node.feats = None
+ for name, value in sorted(orig_feats.items()):
+ name = name.split('/')[1]
+ if name == 'inflection_type':
+ node.misc['InflectionType'] = value.capitalize()
+ continue
+ if "antecedent" in name and node.upos == 'PRON':
+ node.feats["PronType"] = "Prs"
+ new = FEATS_CHANGE.get(name + '=' + value)
+ if new is not None:
+ if new != '':
+ for new_pair in new.split('|'):
+ new_name, new_value = new_pair.split('=')
+ node.feats[new_name] = new_value
+ elif name[0].isupper():
+ node.feats[name] = value
+ else:
+ node.feats[name.capitalize()] = value.capitalize()
+
+        # Don't lose info about proper names which will not have upos=PROPN.
+ if node.feats['Proper'] == 'True':
+ if node.xpos not in {'NNP', 'NNPS'}:
+ node.misc['Proper'] = 'True'
+ del node.feats['Proper']
+
+ def fix_upos(self, node):
+ """PRT→PART, .→PUNCT, NOUN+Proper→PROPN, VERB+neg→AUX etc."""
+ if node.xpos == 'SYM': # These are almost always tagged as upos=X which is wrong.
+ node.upos = 'SYM'
+ if node.deprel in {'punct', 'p'}:
+ if node.form in "_-.؟”'":
+ node.upos = 'PUNCT'
+ else:
+ node.deprel = 'dep' # This is another way how to say deprel=todo.
+ elif node.upos == '.':
+ node.upos = 'PUNCT'
+ elif node.upos == 'PRT':
+ node.upos = 'PART'
+ elif node.upos == 'NOUN':
+ if node.xpos in {'NNP', 'NNPS'}:
+ node.upos = 'PROPN'
+
+ # Japanese uses negators with deprel=neg, which should be changed to advmod in Convert1to2.
+ if node.upos == "VERB" and node.deprel == "neg":
+ node.upos = "AUX"
+
+ # Indonesian uses prefixes (me, di, ber, ke,...) and suffixes (an, kan, i,...),
+ # which are written without spaces with the main word/stem (according to the raw text).
+ # These could be treated as syntactic words and annotated using multi-word tokens.
+ # However, there is no annotation about their dependency relations (just suff, pref)
+ # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style.
+ # Chinese AFFIXes are more tricky to convert.
+ # It seems these words are quite often tagged as PART in UD_Chinese.
+ if node.upos == 'AFFIX':
+ if node.deprel == 'suff':
+ node.prev_node.form += node.form
+ node.remove(children='rehang')
+ elif node.deprel == 'pref':
+ node.next_node.form = node.form + node.next_node.form
+ node.remove(children='rehang')
+ else:
+ self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel)
+ node.upos = 'PART'
+
+ if node.upos == 'PUNCT' and node.form in ('$', '£'):
+ node.upos = 'SYM'
+
+ if node.upos == "NUM" and node.deprel == "det" and not node.form.isnumeric():
+ node.upos = "DET"
+
+ if self.lang == 'de' and node.upos == 'CONJ' and node.form.lower() == 'zu':
+ node.deprel = 'mark'
+ node.upos = 'PART'
+ node.xpos = 'RP'
+ if node.parent.deprel == 'aux':
+ node.parent = node.parent.parent
+
+ if node.upos == 'CONJ' and node.deprel == 'mark':
+ node.upos = 'SCONJ'
+
+ if self.lang == 'fr':
+ if node.upos == 'PROPN' and node.form.lower() in FR_DAYS_MONTHS:
+ node.upos = 'NOUN'
+ if node.form == 'États-Unis':
+ node.upos = 'PROPN'
+
+ def fix_deprel(self, node):
+ """Convert Google dependency relations to UD deprels.
+
+ Change topology where needed.
+ """
+ try:
+ node.deprel = DEPREL_CHANGE[node.deprel]
+ except KeyError:
+ pass
+
+ if node.deprel in ('nn', 'compound'):
+ if node.upos == 'PROPN' and node.parent.upos == 'PROPN':
+ node.deprel = 'flat:name'
+ else:
+ node.deprel = 'compound'
+ elif node.deprel in ('pobj', 'pcomp'):
+ if node.parent.deprel in ('case', 'prep', 'conj'):
+ preposition = node.parent
+ node.parent = preposition.parent
+ preposition.parent = node
+
+ # ud.Convert1to2 will change 'nmod' to 'obl' if needed
+ if preposition.deprel == 'conj':
+ node.deprel = 'conj'
+ preposition.deprel = 'case'
+ elif node.deprel == 'pobj':
+ node.deprel = 'nmod'
+ else:
+ node.deprel = 'xcomp' # TODO check if pcomp -> xcomp is correct
+
+ # Prepositions should not have any children (except for deprel=fixed/mwe), see
+ # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words.
+            # Unfortunately, there are many annotation errors and it is almost always better
+ # to rehang the extra children (at least to prevent spurious non-projectivities).
+ # In case of PUNCTuation it is surely correct.
+ # Otherwise, let's mark it as ToDo.
+ for extra_prep_child in preposition.children:
+ if extra_prep_child.udeprel in ('fixed', 'mwe'):
+ continue
+ extra_prep_child.parent = node
+ if extra_prep_child.upos != 'PUNCT':
+ self.log(extra_prep_child, 'ex-adp-child', 'was an extra adposition child')
+ else:
+ self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case')
+ node.deprel = 'obj'
+ elif node.deprel == 'infmod':
+ node.deprel = 'xcomp'
+ node.feats['VerbForm'] = 'Inf'
+ elif node.deprel == 'partmod':
+ node.deprel = 'ccomp'
+ node.feats['VerbForm'] = 'Part'
+ elif node.deprel == 'suff':
+ node.misc['OrigDeprel'] = 'suff'
+ node.deprel = 'dep'
+ elif node.deprel == 'gmod':
+ node.deprel = 'nmod' if node.feats['Case'] == 'Gen' else 'nmod:gmod'
+ elif node.deprel == 'cc':
+ if node.upos == 'PUNCT' and node.form == ',':
+ node.deprel = 'punct'
+ elif node.deprel == 'parataxis':
+ if node.children:
+ cc_node = node.descendants[0].prev_node
+ if cc_node.udeprel == 'cc' and cc_node.parent == node.parent:
+ node.deprel = 'conj'
+ elif node.deprel == 'dislocated':
+ if self.lang == 'fr':
+ nsubj = next((n for n in node.parent.children if n.udeprel == 'nsubj'), None)
+ if nsubj is not None:
+ node.deprel = 'nsubj'
+ nsubj.deprel = 'expl' if nsubj.upos == 'PRON' else 'dislocated'
+ elif node.deprel == 'appos':
+ if self.lang == 'fr' and node.parent.form in {'M.', 'Mme', 'Dr'}:
+ node.deprel = 'flat:name'
+ elif node.deprel == 'prt':
+ if self.lang in {'en', 'de', 'nl', 'sv', 'da', 'no', 'th'}:
+ node.deprel = 'compound:prt'
+ elif self.lang == 'tr':
+ node.deprel = 'advmod:emph'
+ else:
+ node.deprel = 'dep:prt'
+ elif node.deprel == 'redup':
+ node.deprel = 'compound:plur' if self.lang == 'id' else 'compound:redup'
+ elif node.deprel == 'ig':
+ if node.parent.form == 'ki' and node.parent.deprel not in {'prep', 'pobj'}:
+ ki = node.parent
+ node.deprel = ki.deprel
+ ki.upos = 'ADP'
+ ki.deprel = 'case'
+ node.parent = ki.parent
+ ki.parent = node
+ elif node.upos == 'AUX' or node.form == 'ler': # dır, dir, ydi, dı, ydı, tu, değil,...
+ node.deprel = 'cop'
+ elif node.parent.upos == 'AUX': # yaşlıyken, gençken,...
+ copula = node.parent
+ node.parent = copula.parent
+ copula.parent = node
+ node.deprel = copula.deprel
+ copula.deprel = 'cop'
+ elif node.upos == 'PUNCT':
+ node.deprel = 'punct'
+ else:
+ node.deprel = 'dep:ig'
diff --git a/udapi/block/ud/he/fixneg.py b/udapi/block/ud/he/fixneg.py
index 5062854c..15325990 100644
--- a/udapi/block/ud/he/fixneg.py
+++ b/udapi/block/ud/he/fixneg.py
@@ -6,6 +6,7 @@
from udapi.core.block import Block
+
class FixNeg(Block):
"""Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Hebrew."""
diff --git a/udapi/block/ud/hi/fixaux.py b/udapi/block/ud/hi/fixaux.py
new file mode 100644
index 00000000..004ab4af
--- /dev/null
+++ b/udapi/block/ud/hi/fixaux.py
@@ -0,0 +1,170 @@
+"""
+Block to fix annotation of verbs that are currently treated as auxiliaries
+but they should be treated as normal verbs instead.
+"""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixAux(Block):
+
+ def process_node(self, node):
+ self.fix_lemma(node)
+ # The following verbs appear in verb-verb compounds as the semantically
+ # less salient element: le (to take), de (to give), ḍāla / phenka (to throw),
+ # baiṭha (to sit), uṭha (to rise), rakha (to keep), ā (to come), lā (to bring),
+ # pahuñca (to reach), dekha (to look), phara (to return), cala (to walk),
+ # caṛha (to climb), saṛa (to rot), nikala (to get out), nikāla (to remove), girā (to drop),
+ # samā (to encounter), dhamaka (to bully), khaḍā (to stand), daboca (to catch),
+ # gujara (to pass), ghera (to surround), baca (to escape).
+ # There are also jā (to go) and paṛa (to fall) but we do not list them here
+ # because they can also act as genuine auxiliaries.
+ hicompound = ['ले', 'दे', 'डाल', 'फेंक', 'बैठ', 'उठ', 'रख', 'आ', 'पहुंच', 'चल', 'निकल', 'निकाल', 'गिरा', 'समा', 'धमक', 'खडा', 'दबोच', 'गुजर', 'फूंक', 'घेर', 'बच']
+ urcompound = ['لے', 'دے', 'ڈال', 'پھینک', 'بیٹھ', 'اٹھ', 'رکھ', 'آ', 'لا', 'پہنچ', 'دیکھ', 'پھر', 'چل', 'چڑھ', 'سڑ']
+ recompound = r'^(' + '|'.join(hicompound + urcompound) + r')$'
+ # Control and raising verbs.
+ # चाहना چاہنا (cāhnā) “to want, to wish” is a control verb but not an auxiliary.
+ # Its form چاہیئے (cāhie) “should, ought to” (literally "is wanted"?) is treated as a separate, derived word, and it is a modal auxiliary.
+ # दिखाना دکھانا (dikhānā) “to show”
+ # बनना بننا (bananā) “to become”
+ hiphase = ['लग', 'चुक', 'चाह', 'दिखा', 'बन', 'करा']
+ urphase = ['لگ', 'چک', 'چاہ', 'دکھا', 'بن']
+ rephase = r'^(' + '|'.join(hiphase + urphase) + r')$'
+ if re.match(recompound, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux':
+ node.deprel = 'compound'
+ # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX.
+ node.upos = "VERB"
+ # वाला والا (vālā) with infinitive is annotated as auxiliary but it should not.
+ # It is not even a verb (it does not have a verbal paradigm); it is more
+ # like an adjective morphologically, and like a noun syntactically. It means
+ # “the one who does the action of the content verb infinitive.”
+ # Some occurrences in the original annotation are case or mark, so we do not
+ # check AUX/aux here.
+ elif node.lemma == 'वाला' or node.lemma == 'والا':
+ node.upos = 'ADJ'
+ node.feats['AdpType'] = ''
+ node.feats['VerbForm'] = ''
+ node.feats['Aspect'] = ''
+ node.deprel = 'compound'
+ elif re.match(rephase, node.lemma) and node.upos == 'AUX' and node.udeprel == 'aux':
+ secpred = node.parent
+ grandparent = secpred.parent
+ node.parent = grandparent
+ node.deprel = secpred.deprel
+ secpred.parent = node
+ secpred.deprel = "xcomp"
+ ###!!! We should also take care of DEPS if they exist.
+ # The word is no longer treated as an auxiliary, so it should be VERB rather than AUX.
+ node.upos = "VERB"
+ # Examine the children of the original parent.
+ # Those that modify the clause should be re-attached to me.
+ # Those that modify the word (noun, adjective) should stay there.
+ for c in secpred.children:
+ # obl is borderline. It could modify an adjective rather than a clause.
+ # obj and iobj should not occur in copular clauses but it sometimes
+ # occurs with pseudocopulas: "I declare him handsome."
+ if re.match("(nsubj|csubj|advmod|advcl|obj|iobj|obl|aux|mark|punct|cc|expl|dislocated|vocative|discourse|parataxis)", c.udeprel):
+ c.parent = node
+
+ def fix_lemma(self, node):
+ """
+ Some verbal forms have wrong lemmas in the Hindi/Urdu treebanks. If they
+ are tagged AUX, it means that either the validator fails to recognize a
+ correct auxiliary, or we fail here to recognize a spurious auxiliary that
+ must be fixed.
+ """
+ if node.upos == 'AUX':
+ # آنے is the oblique infinitive form of “to come”
+ if node.lemma == 'آنہ':
+ node.lemma = 'آ'
+ # بنانا बनाना “make, create, produce, cause to be/become”
+ # (I don't know why in some instances بنا was used as lemma for کر “to do”.)
+ if node.form == 'کر' and node.lemma == 'بنا':
+ node.lemma = 'کر'
+ # چاہئے (cāhie) “should, ought to” occurs with alternative spellings (should they also be labeled as typos?)
+ if node.form == 'چاہئے' or node.form == 'چاہیئے' or node.form == 'چاہیے':
+ node.lemma = 'چاہئے'
+ if node.form == 'چاہئیں':
+ node.lemma = 'چاہئے'
+ node.feats['Number'] = 'Plur'
+ # چاہے seems to be a wrong lemma of چاہیں_گے “would like”
+ if node.lemma == 'چاہے':
+ node.lemma = 'چاہ'
+ # चुका چکا is a perfective participle of चुकना چکنا (cuknā) “to be finished”
+ if node.lemma == 'चुका':
+ node.lemma = 'चुक'
+ if node.lemma == 'چکا':
+ node.lemma = 'چک'
+ # दिया دیا is a perfective participle of देना دینا (denā) “to give”
+ if node.lemma == 'दिया':
+ node.lemma = 'दे'
+ if node.lemma == 'دیا' or node.lemma == 'دی' or node.lemma == 'دیت':
+ node.lemma = 'دے'
+ # دکھائیں (dikhānā) “to show”
+ if node.form == 'دکھائیں':
+ node.lemma = 'دکھا'
+ # گا, گی, گے denote the future tense. They are written as separate
+ # words in Urdu (while they are just suffixes in Hindi). However,
+ # when written as a separate auxiliary, all these forms should share
+ # the same lemma.
+ if node.lemma == 'گی' or node.lemma == 'گے':
+ node.lemma = 'گا'
+ # گیا is a perfective participle of जाना جانا (jānā) “to go”
+ # जान جان is nonsense. It occurs with forms like جانی, which is a feminine form of the infinitive جانا.
+ if node.lemma == 'जाना' or node.lemma == 'जान':
+ node.lemma = 'जा'
+ if node.lemma == 'گیا' or node.lemma == 'جائے' or node.lemma == 'جاتا' or node.lemma == 'جاتی' or node.lemma == 'جان' or node.lemma == 'جانا' or node.lemma == 'جاؤ' or node.lemma == 'جائی' or node.lemma == 'جاتے' or node.lemma == 'جات':
+ node.lemma = 'جا'
+ # Wrongly lemmatized present forms of “to be”.
+ # In one instance, ہے had a lemma from a neighboring verb, so we also look at the form.
+ if node.lemma == 'हों' or node.lemma == 'है.':
+ node.lemma = 'है'
+ if node.lemma == 'ہوں' or node.lemma == 'ہوا' or node.form == 'ہے':
+ node.lemma = 'ہے'
+ # लिया لیا is a perfective participle of लेना لینا (lenā) “to take”
+ # In one instance, لیا had a lemma from a neighboring verb, so we also look at the form.
+ if node.lemma == 'लिया':
+ node.lemma = 'ले'
+ if node.lemma == 'لیا' or node.form == 'لیا' or node.lemma == 'لو' or node.lemma == 'لی' or node.lemma == 'لیجیے':
+ node.lemma = 'لے'
+ # लगा لگا is a perfective participle of लगना لگنا (lagnā) “to seem, to appear”
+ if node.lemma == 'लगा':
+ node.lemma = 'लग'
+ if node.lemma == 'لگا':
+ node.lemma = 'لگ'
+ # पहुंचा پہنچا is a perfective participle of पहुंचना پہنچنا (pahuñcnā) “to reach”
+ if node.lemma == 'पहुंचा' or node.lemma == 'पहुँच':
+ node.lemma = 'पहुंच'
+ # پڑے is a perfective participle of پڑنا (paṛnā) “to fall”
+ if node.lemma == 'پڑے':
+ node.lemma = 'پڑ'
+ # پھرے is a perfective participle of پھرنا (pharnā) “to return”
+ if node.lemma == 'پھرے':
+ node.lemma = 'پھر'
+ # रहा رہا is a perfective participle of रहना رہنا (rahnā) “to stay”
+ if node.lemma == 'रहा' or node.lemma == 'रहूं':
+ node.lemma = 'रह'
+ if node.lemma == 'رہا' or node.lemma == 'رہی' or node.lemma == 'رہے':
+ node.lemma = 'رہ'
+ # sakna to be able to
+ if node.lemma == 'سکے' or node.lemma == 'سکی' or node.lemma == 'سکتا' or node.lemma == 'سکت':
+ node.lemma = 'سک'
+ # Wrongly lemmatized past forms of “to be”.
+ if node.lemma == 'थी':
+ node.lemma = 'था'
+ if node.lemma == 'تھ' or node.lemma == 'تھے' or node.lemma == 'تھیں':
+ node.lemma = 'تھا'
+ # उठा اٹھا is a perfective participle of उठना اٹھنا (uṭhnā) “to rise, get up”
+ if node.lemma == 'उठा':
+ node.lemma = 'उठ'
+ if node.lemma == 'اٹھا':
+ node.lemma = 'اٹھ'
+ # The compound part vālā is not an auxiliary. We handle it in process_node()
+ # but it must be lemmatized properly.
+ if node.lemma == 'والی':
+ node.lemma = 'والا'
+ # The postposition ke after a verbal stem is not an auxiliary.
+ # Example: علحدہ علحدہ کیس رجسٹر کر کے “by registering separate cases”
+ if node.lemma == 'کا' and node.form == 'کے':
+ node.upos = 'ADP'
+ node.deprel = 'mark'
diff --git a/udapi/block/ud/id/__init__.py b/udapi/block/ud/id/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/id/addmwt.py b/udapi/block/ud/id/addmwt.py
new file mode 100644
index 00000000..a8d50748
--- /dev/null
+++ b/udapi/block/ud/id/addmwt.py
@@ -0,0 +1,219 @@
+"""
+Block ud.id.AddMwt cuts the clitic "-nya" in Indonesian (preprocessed with
+MorphInd whose output is stored in MISC attribute MorphInd).
+"""
+import udapi.block.ud.addmwt
+import logging
+import re
+
+class AddMwt(udapi.block.ud.addmwt.AddMwt):
+ """Detect and mark MWTs (split them into words and add the words to the tree)."""
+
+ def multiword_analysis(self, node):
+ """Return a dict with MWT info or None if `node` does not represent a multiword token."""
+        if re.search(r'^(ku|kau)', node.form, re.IGNORECASE) and re.search(r'^\^(aku_PS1|kamu_PS2)\+', node.misc['MorphInd']) and node.upos == 'VERB':
+ splitform = re.sub(r'^(ku|kau)', r'\1 ', node.form, flags=re.IGNORECASE)
+ # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
+ # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
+ node.feats['Number[psor]'] = ''
+ node.feats['Person[psor]'] = ''
+ upos = 'PRON VERB'
+ if re.search(r'^ku ', splitform.lower()):
+ lemma = re.sub(r'^ku ', 'aku ', splitform.lower())
+ feats = 'Number=Sing|Person=1|PronType=Prs *'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ if len(xpos.split())<2:
+ xpos = 'PS1 VSA'
+ else:
+ lemma = re.sub(r'^kau ', 'kamu ', splitform.lower())
+ feats = 'Number=Sing|Person=2|PronType=Prs *'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ if len(xpos.split())<2:
+ xpos = 'PS2 VSA'
+ deprel = 'nsubj *'
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel}
+        elif re.search(r'(nya|ku|mu)$', node.form, re.IGNORECASE) and re.search(r'\+(dia_PS3|aku_PS1|kamu_PS2)\$$', node.misc['MorphInd']):
+ if node.upos == 'VERB':
+ splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
+ # For transitive verbs with the meN- prefix, -nya is an object clitic.
+ # For passive verbs with the di- prefix, -nya refers to a passive agent.
+ # For verbs with prefixes ber-, ter-, and verbs without prefixes, -nya is a definite article and signals nominalization.
+ # The same would hold for intransitive verbs with the meN- prefix but we cannot recognize them (we will treat all meN- verbs as transitive).
+ menverb = True if re.match(r'^\^meN\+', node.misc['MorphInd']) else False
+ diverb = True if re.match(r'^\^di\+', node.misc['MorphInd']) else False
+ nominalization = not menverb and not diverb
+ # The verb with -nya typically has Number[psor]=Sing|Person[psor]=3.
+ # Remove these features from the verb and give the pronoun normal features Number=Sing|Person=3.
+ node.feats['Number[psor]'] = ''
+ node.feats['Person[psor]'] = ''
+ if nominalization:
+ lemma = splitform.lower()
+ upos = 'VERB DET'
+ feats = '* Definite=Def|PronType=Art'
+ deprel = '* det'
+ else:
+ upos = 'VERB PRON'
+ if re.search(r' nya$', splitform.lower()):
+ lemma = re.sub(r' nya$', ' dia', splitform.lower())
+ feats = '* Number=Sing|Person=3|PronType=Prs'
+ elif re.search(r' ku$', splitform.lower()):
+ lemma = re.sub(r' ku$', ' aku', splitform.lower())
+ feats = '* Number=Sing|Person=1|PronType=Prs'
+ else:
+ lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+ feats = '* Number=Sing|Person=2|PronType=Prs'
+ # The agent of the passive verb is coded like a direct object of an active verb,
+ # so we might want to use obj:agent rather than obl:agent. However, full nominals
+ # as passive agents can be optionally accompanied by the preposition _oleh_ "by",
+ # which is an argument in favor of saying that they are oblique. So we currently
+ # mark all passive agents as obliques, although it is disputable in Austronesian
+ # languages (unlike Indo-European passives).
+ deprel = '* obl:agent' if diverb else '* obj'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif re.match(r'(NOUN|PROPN|X)', node.upos):
+ splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
+ # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3.
+ # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3.
+ node.feats['Number[psor]'] = ''
+ node.feats['Person[psor]'] = ''
+ upos = '* PRON'
+ if re.search(r' nya$', splitform.lower()):
+ lemma = re.sub(r' nya$', ' dia', splitform.lower())
+ feats = '* Number=Sing|Person=3|PronType=Prs'
+ elif re.search(r' ku$', splitform.lower()):
+ lemma = re.sub(r' ku$', ' aku', splitform.lower())
+ feats = '* Number=Sing|Person=1|PronType=Prs'
+ else:
+ lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+ feats = '* Number=Sing|Person=2|PronType=Prs'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ deprel = '* nmod:poss'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif node.upos == 'PRON' and re.match(r'^diri(nya|ku|mu)$', node.form, re.IGNORECASE):
+ # dirinya = reflexive himself/herself/itself (similarly, diriku = myself, dirimu = yourself; somewhere else we should check that they have the right features)
+ splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
+ # The noun with -nya typically has Number[psor]=Sing|Person[psor]=3.
+ # Remove these features from the noun and give the pronoun normal features Number=Sing|Person=3.
+ node.feats['Number[psor]'] = ''
+ node.feats['Person[psor]'] = ''
+ upos = 'PRON PRON'
+ if re.search(r' nya$', splitform.lower()):
+ lemma = re.sub(r' nya$', ' dia', splitform.lower())
+ feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=3|PronType=Prs'
+ xpos = 'NSD PS3'
+ elif re.search(r' ku$', splitform.lower()):
+ lemma = re.sub(r' ku$', ' aku', splitform.lower())
+ feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=1|PronType=Prs'
+ xpos = 'NSD PS1'
+ else:
+ lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+ feats = 'PronType=Prs|Reflex=Yes Number=Sing|Person=2|PronType=Prs'
+ xpos = 'NSD PS2'
+ deprel = '* nmod:poss'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif node.upos == 'ADJ' and re.search(r'(nya)$', node.form, re.IGNORECASE):
+ # nominalized adjective
+ splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+ lemma = splitform.lower()
+ upos = 'ADJ DET'
+ feats = '* Definite=Def|PronType=Art'
+ if re.match(r' ', node.xpos):
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ else:
+ xpos = 'ASP PS3'
+ deprel = '* det'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif re.match(r'^(banyak|semua)nya$', node.form, re.IGNORECASE):
+ # semua = all (DET)
+ # semuanya = nominalization of semua, i.e., 'everything' (PRON)
+ # banyak = many, much (DET)
+ # banyaknya = nominalization of banyak, i.e., 'a lot' (PRON)
+ splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+ lemma = splitform.lower()
+ upos = 'DET DET'
+ feats = ('PronType=Tot' if lemma == 'semua nya' else 'PronType=Ind')+' Definite=Def|PronType=Art'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ deprel = '* det'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif re.match(r'^(satu)nya$', node.form, re.IGNORECASE):
+ # satu = one (NUM)
+ # satunya = nominalization of satu, meaning 'the only one'
+ splitform = re.sub(r'(nya)$', r' \1', node.form, flags=re.IGNORECASE)
+ lemma = splitform.lower()
+ upos = 'NUM DET'
+ feats = 'NumType=Card Definite=Def|PronType=Art'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ deprel = '* det'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ elif node.upos == 'ADP' and re.match(r'^R--\+PS[123]$', node.xpos) or re.match(r'^(bersama|dibawah|didalam|sekitar)nya$', node.form, re.IGNORECASE):
+ # Fused preposition and pronoun.
+ # Most of them are recognized as R--+PS3 by MorphInd. However, some are different:
+ # bersamanya = 'with him' = VSA+PS3
+ # dibawahnya = 'under it' = VSP+PS3
+ # didalamnya = 'inside it' = VSP+PS3
+ # sekitarnya = 'around it' = D--+PS3
+ # However:
+ # layaknya = 'like' is a derivation from 'layak' = 'worthy' (ASP+PS3)
+ splitform = re.sub(r'(nya|ku|mu)$', r' \1', node.form, flags=re.IGNORECASE)
+ upos = 'ADP PRON'
+ if re.search(r' nya$', splitform.lower()):
+ lemma = re.sub(r' nya$', ' dia', splitform.lower())
+ feats = '* Number=Sing|Person=3|PronType=Prs'
+ xpos = 'R-- PS3'
+ elif re.search(r' ku$', splitform.lower()):
+ lemma = re.sub(r' ku$', ' aku', splitform.lower())
+ feats = '* Number=Sing|Person=1|PronType=Prs'
+ xpos = 'R-- PS1'
+ else:
+ lemma = re.sub(r' mu$', ' kamu', splitform.lower())
+ feats = '* Number=Sing|Person=2|PronType=Prs'
+ xpos = 'R-- PS2'
+ if node.udeprel == 'case':
+ if re.match(r'^(NOUN|PROPN|PRON|DET|NUM|X|SYM)$', node.parent.upos):
+ deprel = 'nmod'
+ else:
+ deprel = 'obl'
+ else:
+ deprel = '*'
+ deprel = 'case '+deprel
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'main': 1, 'shape': 'subtree', 'deprel': deprel}
+ else:
+ # Do not warn about instances that are known exceptions.
+ # akibatnya = as a result (SCONJ); akibat = result
+ # bukannya = instead (PART); bukan = no, not
+ # layaknya = like (ADP); layak = worthy
+ # sebaiknya = should (AUX)
+ # sesampainya = once in / arriving at (ADP)
+ # tidaknya = whether or not (PART); tidak = no, not
+ # Adverbs are an exception, too. The -nya morpheme could be derivation. E.g., 'ironis' = 'ironic'; 'ironisnya' = 'ironically'.
+ if node.upos != 'ADV' and not re.match(r'^(akibat|bukan|layak|sebaik|sesampai|tidak)(nya|ku|mu)$', node.form, re.IGNORECASE):
+ logging.warning("Form '%s' analyzed by MorphInd as having the -nya|-ku|-mu clitic but the UPOS is '%s' and XPOS is '%s'" % (node.form, node.upos, node.xpos))
+ return None
+ elif re.search(r'(kah|lah|pun|tah)$', node.form, re.IGNORECASE) and re.search(r'\+(kah|lah|pun|tah)_T--\$$', node.misc['MorphInd']):
+ splitform = re.sub(r'(kah|lah|pun|tah)$', r' \1', node.form, flags=re.IGNORECASE)
+ lemma = splitform.lower()
+ upos = '* PART'
+ feats = '* _'
+ xpos = re.sub(r'\+', ' ', node.xpos)
+ if len(xpos.split()) < 2:
+ xpos = xpos + ' T--'
+ deprel = '* advmod:emph'
+ # 'main': 0 ... this is the default value (the first node will be the head and inherit children)
+ return {'form': splitform, 'lemma': lemma, 'upos': upos, 'feats': feats, 'xpos': xpos, 'shape': 'subtree', 'deprel': deprel}
+ return None
+
+ def postprocess_mwt(self, mwt):
+ """Distribute the MorphInd analysis to the two parts so that we can later use it to fix the lemmas of verbs."""
+        match = re.match(r'^\^(.*)\+(aku_PS1|kamu_PS2|dia_PS3|kah_T--|lah_T--|pun_T--|tah_T--)\$$', mwt.misc['MorphInd'])
+ if not match:
+            match = re.match(r'^\^(aku_PS1|kamu_PS2)\+(.*)\$$', mwt.misc['MorphInd'])
+ if match:
+ mwt.words[0].misc['MorphInd'] = '^'+match.group(1)+'$'
+ mwt.words[1].misc['MorphInd'] = '^'+match.group(2)+'$'
diff --git a/udapi/block/ud/id/fixgsd.py b/udapi/block/ud/id/fixgsd.py
new file mode 100644
index 00000000..4ea23d06
--- /dev/null
+++ b/udapi/block/ud/id/fixgsd.py
@@ -0,0 +1,447 @@
+"""Block to fix annotation of UD Indonesian-GSD."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixGSD(Block):
+
def fix_upos_based_on_morphind(self, node):
    """Retag ke-...-an words from VERB to NOUN when MorphInd says NSD.

    Example from the data: 'kesamaan' ('similarity') was tagged VERB but
    MorphInd assigns the noun XPOS 'NSD'. Observation so far: when GSD and
    MorphInd disagree on the POS, MorphInd tends to be right, especially
    for NOUN/VERB confusions. The deprel is remapped accordingly so that
    the node keeps a nominal relation.
    """
    looks_nominal = node.xpos == 'NSD' and re.match(r'^ke.+an$', node.form, re.IGNORECASE)
    if node.upos != 'VERB' or not looks_nominal:
        return
    node.upos = 'NOUN'
    # Clausal relations become their nominal counterparts.
    deprel_remap = {'acl': 'nmod', 'advcl': 'obl'}
    if node.udeprel in deprel_remap:
        node.deprel = deprel_remap[node.udeprel]
+
def fix_semua(self, node):
    """Disambiguate 'semua' ('everything, all') between DET and PRON.

    Originally it was annotated DET, PRON, or ADV. Annotation guideline
    (Ika): 'semua' is DET only when it modifies a following NOUN/PROPN;
    otherwise (followed by a DET such as '-nya', or not modifying a
    nominal at all) it is PRON. Either way it is a total pronominal
    (PronType=Tot).
    """
    if node.form.lower() != 'semua':
        return
    parent = node.parent
    modifies_following_nominal = parent.upos in ('NOUN', 'PROPN') and parent.ord > node.ord
    if modifies_following_nominal:
        node.upos = 'DET'
        if node.udeprel in ('nmod', 'advmod'):
            node.deprel = 'det'
    else:
        node.upos = 'PRON'
        if node.udeprel in ('det', 'advmod'):
            node.deprel = 'nmod'
    node.feats['PronType'] = 'Tot'
+
def fix_ordinal_numerals(self, node):
    """Harmonize the tagging of ordinal numerals as ADJ NumType=Ord.

    pertama = first, kedua = second, ketiga = third, keempat = fourth,
    kelima = fifth, keenam = sixth, ketujuh = seventh, kedelapan = eighth,
    kesembilan = ninth, ke-48 = 48th.

    The ke- forms (but not 'pertama') can also be total cardinals ('both',
    'all three'): preceding the noun they are total cardinals, following it
    they are ordinals. Exception: with the noun 'kali' ('time') they are
    always ordinal ('for the second time').
    """
    # We could also check the XPOS, which is derived from MorphInd: re.match(r'^CO-', node.xpos)
    def retag_as_ordinal():
        node.upos = 'ADJ'
        node.feats['NumType'] = 'Ord'
        if re.match(r'^(det|nummod|nmod)$', node.udeprel):
            node.deprel = 'amod'

    if re.match(r'^pertama(nya)?$', node.form, re.IGNORECASE):
        retag_as_ordinal()
    elif re.match(r'^(kedua|ketiga|keempat|kelima|keenam|ketujuh|kedelapan|kesembilan|ke-?\d+)(nya)?$', node.form, re.IGNORECASE):
        if node.parent.ord < node.ord or node.parent.lemma == 'kali':
            retag_as_ordinal()
        else:
            # Total cardinal reading ('both', 'all three', ...).
            node.upos = 'NUM'
            node.feats['NumType'] = 'Card'
            node.feats['PronType'] = 'Tot'
            if re.match(r'^(det|amod|nmod)$', node.udeprel):
                node.deprel = 'nummod'
+
def rejoin_ordinal_numerals(self, node):
    """
    If an ordinal numeral is spelled using digits ('ke-18'), it is often
    tokenized as multiple tokens, which is wrong. Fix it.

    Two spellings are handled: 'ke' + '-' + digits (three tokens) and
    'ke' + digits without the hyphen (two tokens; treated as a typo and
    recorded via Typo=Yes and MISC CorrectForm). All surviving material is
    merged into the 'ke' node; the dash and number nodes are removed.
    """
    if node.form.lower() == 'ke':
        dash = None
        number = None
        if node.next_node:
            if node.next_node.form == '-':
                dash = node.next_node
                if dash.next_node and re.match(r'^\d+$', dash.next_node.form):
                    number = dash.next_node
                    node.form = node.form + dash.form + number.form
                    node.lemma = node.lemma + dash.lemma + number.lemma
            # Hyphenless spelling: only merge when 'ke' and the digits are
            # directly related in the tree (one is the parent of the other).
            elif re.match(r'^\d+$', node.next_node.form) and (node.parent == node.next_node or node.next_node.parent == node):
                number = node.next_node
                node.feats['Typo'] = 'Yes'
                node.misc['CorrectForm'] = node.form + '-' + number.form
                node.form = node.form + number.form
                node.lemma = node.lemma + '-' + number.lemma
        if number:
            # Let us pretend that these forms are always ordinal numerals.
            # Situations where they act as total cardinals will be disambiguated
            # in a subsequent call to fix_ordinal_numerals().
            node.upos = 'ADJ'
            node.xpos = 'CO-'
            node.feats['NumType'] = 'Ord'
            node.misc['MorphInd'] = '^ke_R--+' + number.form + '_CC-$'
            # Find the parent node. Assume that the dash, if present, was not the head.
            if node.parent == number:
                node.parent = number.parent
                node.deprel = number.deprel
            if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel):
                node.deprel = 'amod'
            # Adjust SpaceAfter.
            node.misc['SpaceAfter'] = 'No' if number.no_space_after else ''
            # Remove the separate node of the dash and the number.
            # Any children they had are re-attached to the merged node first.
            if dash:
                if len(dash.children) > 0:
                    for c in dash.children:
                        c.parent = node
                dash.remove()
            if len(number.children) > 0:
                for c in number.children:
                    c.parent = node
            number.remove()
            # There may have been spaces around the dash, which are now gone. Recompute the sentence text.
            node.root.text = node.root.compute_text()
+
def rejoin_decades(self, node):
    """
    In Indonesian, the equivalent of English "1990s" is written as "1990-an".
    In GSD, it is often tokenized as multiple tokens, which is wrong. Fix it.

    Mirror image of rejoin_ordinal_numerals(): the suffix node 'an' looks
    leftwards for '-' + digits (or bare digits, recorded as a typo) and the
    material is merged into the 'an' node, which stays in the sentence.
    """
    if node.form.lower() == 'an':
        dash = None
        number = None
        if node.prev_node:
            if node.prev_node.form == '-':
                dash = node.prev_node
                if dash.prev_node and re.match(r'^\d+$', dash.prev_node.form):
                    number = dash.prev_node
                    node.form = number.form + dash.form + node.form
                    node.lemma = number.lemma + dash.lemma + node.lemma
            # Hyphenless spelling: only merge when the digits and 'an' are
            # directly related in the tree (one is the parent of the other).
            elif re.match(r'^\d+$', node.prev_node.form) and (node.parent == node.prev_node or node.prev_node.parent == node):
                number = node.prev_node
                node.feats['Typo'] = 'Yes'
                node.misc['CorrectForm'] = number.form + '-' + node.form
                node.form = number.form + node.form
                node.lemma = number.lemma + '-' + node.lemma
        if number:
            # The combined token is no longer a numeral. It cannot quantify an entity.
            # Instead, it is itself something like a noun (or perhaps proper noun).
            node.upos = 'NOUN'
            node.xpos = 'NSD'
            node.feats['NumType'] = ''
            # In some cases, "-an" is labeled as foreign for no obvious reason.
            node.feats['Foreign'] = ''
            node.misc['MorphInd'] = '^' + number.form + '_CC-+an_F--$'
            # Find the parent node. Assume that the dash, if present, was not the head.
            if node.parent == number:
                node.parent = number.parent
                node.deprel = number.deprel
            if re.match(r'(case|mark|det|nummod|nmod)', node.udeprel):
                node.deprel = 'nmod'
            # No need to adjust SpaceAfter, as the 'an' node was the last one in the complex.
            #node.misc['SpaceAfter'] = 'No' if number.no_space_after else ''
            # Remove the separate node of the dash and the number.
            # Any children they had are re-attached to the merged node first.
            if dash:
                if len(dash.children) > 0:
                    for c in dash.children:
                        c.parent = node
                dash.remove()
            if len(number.children) > 0:
                for c in number.children:
                    c.parent = node
            number.remove()
            # There may have been spaces around the dash, which are now gone. Recompute the sentence text.
            node.root.text = node.root.compute_text()
+
def merge_reduplication(self, node):
    """
    Reduplication is a common morphological device in Indonesian. Reduplicated
    nouns signal plural but some reduplications also encode emphasis, modification
    of meaning etc. In the previous annotation of GSD, reduplication was mostly
    analyzed as three tokens, e.g., for plurals, the second copy would be attached
    to the first one as compound:plur, and the hyphen would be attached to the
    second copy as punct. We want to analyze reduplication as a single token.
    Fix it.

    Two tree shapes are handled: (a) the first copy is the head and `node`
    is the second copy two positions later; (b) a non-/sub-/anti-like prefix
    is `node` and its head (the stem) follows two positions later.
    """
    # We assume that the previous token is a hyphen and the token before it is the parent.
    first = node.parent
    root = node.root
    # Example of identical reduplication: negara-negara = countries
    # Example of reduplication with -an: kopi-kopian = various coffee trees
    # Example of reduplication with vowel substitution: bolak-balik = alternating
    # Example of reduplication with di-: disebut-sebut = mentioned (the verb sebut is reduplicated, then passivized)
    # Example of reduplication with se-: sehari-hari = daily (hari = day)
    # The last pattern is not reduplication but we handle it here because the procedure is very similar: non-/sub-/anti- + a word.
    if first.ord == node.ord-2 and (first.form.lower() == node.form.lower() or first.form.lower() + 'an' == node.form.lower() or re.match(r'^(.)o(.)a(.)-\1a\2i\3$', first.form.lower() + '-' + node.form.lower()) or first.form.lower() == 'di' + node.form.lower() or first.form.lower() == 'se' + node.form.lower() or re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower())):
        hyph = node.prev_node
        if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form):
            # This is specific to the reduplicated plurals. The rest will be done for any reduplications.
            # Note that not all reduplicated plurals had compound:plur. So we will look at whether they are NOUN.
            ###!!! Also, reduplicated plural nouns always have exact copies on both sides of the hyphen.
            ###!!! Some other reduplications have slight modifications on one or the other side.
            if node.upos == 'NOUN' and first.form.lower() == node.form.lower():
                first.feats['Number'] = 'Plur'
            # For the non-/sub-/anti- prefix we want to take the morphology from the second word.
            if re.match(r'^(non|sub|anti|multi|kontra)$', first.form.lower()):
                first.lemma = first.lemma + '-' + node.lemma
                first.upos = node.upos
                first.xpos = node.xpos
                first.feats = node.feats
            # Concatenate the two MorphInd analyses, collapsing the '$'+'^' seam.
            first.misc['MorphInd'] = re.sub(r'\$\+\^', '+', first.misc['MorphInd'] + '+' + node.misc['MorphInd'])
            # Neither the hyphen nor the current node should have children.
            # If they do, re-attach the children to the first node.
            for c in hyph.children:
                c.parent = first
            for c in node.children:
                c.parent = first
            # Merge the three nodes.
            # It is possible that the last token of the original annotation
            # is included in a multi-word token. Then we must extend the
            # multi-word token to the whole reduplication! Example:
            # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-'
            # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'.
            mwt = node.multiword_token
            if mwt:
                # We assume that the MWT has only two words. We are not prepared for other possibilities.
                if len(mwt.words) > 2:
                    logging.critical('MWT of only two words is expected')
                mwtmisc = mwt.misc.copy()
                second = mwt.words[1]
                mwt.remove()
                first.form = first.form + '-' + node.form
                hyph.remove()
                node.remove()
                first.misc['SpaceAfter'] = ''
                mwt = root.create_multiword_token([first, second], form=first.form + second.form, misc=mwtmisc)
            else:
                first.form = first.form + '-' + node.form
                if node.no_space_after:
                    first.misc['SpaceAfter'] = 'No'
                else:
                    first.misc['SpaceAfter'] = ''
                hyph.remove()
                node.remove()
            # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen.
            # If it did not, then we have a mismatch with the sentence text, which we must fix.
            # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-').
            root.text = root.compute_text()
    # In some cases the non-/sub-/anti- prefix is annotated as the head of the phrase and the above pattern does not catch it.
    elif first.ord == node.ord+2 and re.match(r'^(non|sub|anti|multi|kontra)$', node.form.lower()):
        prefix = node
        stem = first # here it is not the first part at all
        hyph = stem.prev_node
        if hyph.is_descendant_of(first) and re.match(r'^(-|–|--)$', hyph.form):
            # For the non-/sub-/anti- prefix we want to take the morphology from the second word.
            stem.lemma = prefix.lemma + '-' + stem.lemma
            stem.misc['MorphInd'] = re.sub(r'\$\+\^', '+', prefix.misc['MorphInd'] + '+' + stem.misc['MorphInd'])
            # Neither the hyphen nor the prefix should have children.
            # If they do, re-attach the children to the stem.
            for c in hyph.children:
                c.parent = stem
            for c in prefix.children:
                c.parent = stem
            # Merge the three nodes.
            # It is possible that the last token of the original annotation
            # is included in a multi-word token. Then we must extend the
            # multi-word token to the whole reduplication! Example:
            # pemeran-pemerannya (the actors) ... originally 'pemeran' and '-'
            # are tokens, 'pemerannya' is a MWT split to 'pemeran' and 'nya'.
            mwt = stem.multiword_token
            if mwt:
                # We assume that the MWT has only two words. We are not prepared for other possibilities.
                if len(mwt.words) > 2:
                    logging.critical('MWT of only two words is expected')
                mwtmisc = mwt.misc.copy()
                second = mwt.words[1]
                mwt.remove()
                stem.form = prefix.form + '-' + stem.form
                prefix.remove()
                hyph.remove()
                stem.misc['SpaceAfter'] = ''
                mwt = root.create_multiword_token([stem, second], form=stem.form + second.form, misc=mwtmisc)
            else:
                stem.form = prefix.form + '-' + stem.form
                prefix.remove()
                hyph.remove()
            # We cannot be sure whether the original annotation correctly said that there are no spaces around the hyphen.
            # If it did not, then we have a mismatch with the sentence text, which we must fix.
            # The following will also fix cases where there was an n-dash ('–') instead of a hyphen ('-').
            root.text = root.compute_text()
+
def fix_plural_propn(self, node):
    """
    It is unlikely that a proper noun will have a plural form in Indonesian.
    All examples observed in GSD should actually be tagged as common nouns.
    A plural PROPN is retagged NOUN (its lemma lowercased, Number kept);
    any remaining PROPN gets its Number feature removed.
    """
    if node.upos != 'PROPN':
        return
    if node.feats['Number'] == 'Plur':
        node.upos = 'NOUN'
        node.lemma = node.lemma.lower()
    else:
        node.feats['Number'] = ''
+
def fix_satu_satunya(self, node):
    """
    'satu' = 'one' (NUM)
    'satu-satunya' = 'the only'

    Two passes over the 'satu - satu + nya' pattern: the first pass
    re-shapes the dependency structure when the second 'satu' is attached
    as 'fixed'; the second pass then merges 'satu' + '-' + 'satu' into one
    node and re-creates the MWT with the clitic 'nya'. Note that both
    passes may fire for the same 'nya' node within one call.
    """
    root = node.root
    if node.form == 'nya' and node.parent.form.lower() == 'satu' and node.parent.udeprel == 'fixed' and node.parent.parent.form.lower() == 'satu':
        satu0 = node.parent.parent
        satu1 = node.parent
        nya = node
        dash = None
        if satu1.ord == satu0.ord+2 and satu1.prev_node.form == '-':
            dash = satu1.prev_node
            satu0.misc['SpaceAfter'] = 'No'
            dash.misc['SpaceAfter'] = 'No'
            root.text = root.compute_text()
            satu1.deprel = 'compound:redup'
            nya.parent = satu0
    # We actually cannot leave the 'compound:redup' here because it is not used in Indonesian.
    if node.form == 'nya' and node.parent.form.lower() == 'satu':
        satu0 = node.parent
        nya = node
        # NOTE(review): satu0.next_node is assumed non-None here — 'nya'
        # follows satu0, so satu0 cannot be sentence-final; confirm.
        if satu0.next_node.form == '-':
            dash = satu0.next_node
            if dash.next_node.form.lower() == 'satu':
                satu1 = dash.next_node
                if satu1.ord == node.ord-1:
                    # Merge satu0 + dash + satu1 into one node.
                    satu0.form = satu0.form + dash.form + satu1.form
                    dash.remove()
                    satu1.remove()
                    # There should be a multi-word token comprising satu1 + nya.
                    mwt = nya.multiword_token
                    if mwt:
                        mwtmisc = mwt.misc.copy()
                        mwt.remove()
                        mwt = root.create_multiword_token([satu0, nya], form=satu0.form + nya.form, misc=mwtmisc)
                        satu0.misc['SpaceAfter'] = ''
                        root.text = root.compute_text()
    # MWT-internal words must not carry SpaceAfter=No themselves.
    if node.multiword_token and node.no_space_after:
        node.misc['SpaceAfter'] = ''
+
def lemmatize_from_morphind(self, node):
    """Re-derive the lemma of VERB/NOUN/ADJ nodes from their MorphInd analysis.

    The MISC column contains the output of MorphInd for the current word,
    e.g. '^meN+lihat<v>+kan_VSA$': optional prefixes, a stem with its POS
    category in angle brackets, optional suffixes, and a final XPOS tag.
    The analysis had been interpreted wrongly for some words, so we strip
    the expected affixes and use the bare stem as the lemma. Nodes with
    other UPOS are left untouched; unexpected morpheme counts are reported
    and the lemma is left unchanged.

    CONSISTENCY FIX: a missing MorphInd analysis is now reported for NOUN
    as well — originally only the VERB and ADJ branches logged a warning
    while the NOUN branch skipped silently.
    """
    if node.upos not in ('VERB', 'NOUN', 'ADJ'):
        return
    morphind = node.misc['MorphInd']
    if not morphind:
        logging.warning("No MorphInd analysis found for form '%s'" % (node.form))
        return
    # Remove the '^...$' delimiters (shared by all three branches).
    morphind = re.sub(r"^\^", "", morphind)
    morphind = re.sub(r"\$$", "", morphind)
    if node.upos == 'VERB':
        # Remove the final XPOS tag from morphind.
        morphind = re.sub(r"_V[SP][AP]$", "", morphind)
        # Split morphind to prefix, stem, and suffix.
        morphemes = re.split(r"\+", morphind)
        # Expected suffixes are -kan, -i, -an, or no suffix at all.
        # There is also the circumfix ke-...-an which seems to be nominalized adjective:
        # "sama" = "same, similar"; "kesamaan" = "similarity", lemma is "sama".
        if len(morphemes) > 1 and re.match(r"^(kan|i|an(_NSD)?)$", morphemes[-1]):
            del morphemes[-1]
        # Expected prefixes are meN-, di-, ber-, peN-, ke-, ter-, se-, per-.
        # There can be two prefixes in a row, e.g., "ber+ke+", or "ter+peN+".
        while len(morphemes) > 1 and re.match(r"^(meN|di|ber|peN|ke|ter|se|per)$", morphemes[0]):
            del morphemes[0]
        # Verb stems may carry a trailing tag after the category, hence '(_.*)?'.
        stem_tag_re = r"<[a-z]+>(_.*)?$"
    elif node.upos == 'NOUN':
        # Remove the final XPOS tag from morphind.
        morphind = re.sub(r'_(N[SP]D|VSA)$', '', morphind)
        # Do not proceed if there is an unexpected final XPOS tag.
        if re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind):
            return
        morphemes = re.split(r'\+', morphind)
        # Expected prefixes are peN-, per-, ke-, ber-; expected suffix is -an.
        if len(morphemes) > 1 and re.match(r'^an$', morphemes[-1]):
            del morphemes[-1]
        if len(morphemes) > 1 and re.match(r'^(peN|per|ke|ber)$', morphemes[0]):
            del morphemes[0]
        stem_tag_re = r'<[a-z]+>'
    else:  # ADJ
        # Remove the final XPOS tag from morphind.
        morphind = re.sub(r'_ASS$', '', morphind)
        # Do not proceed if there is an unexpected final XPOS tag.
        if re.search(r'_[A-Z][-A-Z][-A-Z]$', morphind):
            return
        morphemes = re.split(r'\+', morphind)
        # Expected prefix is ter-.
        if len(morphemes) > 1 and re.match(r'^ter$', morphemes[0]):
            del morphemes[0]
        stem_tag_re = r'<[a-z]+>'
    # Check that we are left with just one morpheme (the stem).
    if len(morphemes) != 1:
        logging.warning("One morpheme expected, found %d %s, morphind = '%s', form = '%s', feats = '%s'" % (len(morphemes), morphemes, morphind, node.form, node.feats))
        return
    # Remove the stem POS category (e.g. '<v>') and use the rest as the lemma.
    node.lemma = re.sub(stem_tag_re, "", morphemes[0])
+
def process_node(self, node):
    """Apply all GSD fixes to one node.

    The order matters: the re-tokenization fixes (rejoin_*, merge_*) must
    run so that the tag/lemma fixes can see the merged forms, and
    lemmatize_from_morphind() runs last on the final UPOS values.
    """
    fixes = (
        self.fix_plural_propn,
        self.fix_upos_based_on_morphind,
        self.fix_semua,
        self.rejoin_ordinal_numerals,
        self.fix_ordinal_numerals,
        self.rejoin_decades,
        self.merge_reduplication,
        self.fix_satu_satunya,
        self.lemmatize_from_morphind,
    )
    for fix in fixes:
        fix(node)
diff --git a/udapi/block/ud/joinasmwt.py b/udapi/block/ud/joinasmwt.py
new file mode 100644
index 00000000..be93bd3c
--- /dev/null
+++ b/udapi/block/ud/joinasmwt.py
@@ -0,0 +1,51 @@
+"""Block ud.JoinAsMwt for creating multi-word tokens
+
+if multiple neighboring words are not separated by a space
+and the boundaries between the word forms are alphabetical.
+"""
+from udapi.core.block import Block
+
+
class JoinAsMwt(Block):
    """Create multi-word tokens (MWTs) from runs of words written without spaces.

    Neighboring words are merged into one MWT if they are not separated by
    a space and the characters on both sides of each word boundary are
    alphabetical.
    """

    def __init__(self, revert_orig_form=True, **kwargs):
        """Args:
        revert_orig_form: if any node of the newly created MWT has `misc['OrigForm']`,
            it is used as the FORM (and deleted from MISC). Useful after `ud.ComplyWithText`.
            Default=True.
        """
        super().__init__(**kwargs)
        self.revert_orig_form = revert_orig_form

    def process_node(self, node):
        # Never start a new MWT from a word that is already inside one.
        if node.multiword_token:
            return
        mwt_nodes = [node]
        # Greedily extend the run of joinable words to the right.
        while (node.next_node and not node.next_node.multiword_token
               and self.should_join(node, node.next_node)):
            node = node.next_node
            mwt_nodes.append(node)
        if len(mwt_nodes) > 1:
            self.create_mwt(mwt_nodes)

    def should_join(self, node, next_node):
        """Can `node` and `next_node` belong to the same multi-word token?"""
        return node.no_space_after and node.form[-1].isalpha() and next_node.form[0].isalpha()

    def create_mwt(self, mwt_nodes):
        """Merge `mwt_nodes` into a single multi-word token."""
        mwt_form = ''.join([n.form for n in mwt_nodes])
        mwt = mwt_nodes[0].root.create_multiword_token(words=mwt_nodes, form=mwt_form)
        # The MWT inherits SpaceAfter from its LAST word: the inner words'
        # SpaceAfter=No merely encodes the joins inside the token.
        # BUG FIX: the original read `mwt_nodes[0].node.misc` — nodes have no
        # `.node` attribute (AttributeError), and the first word's SpaceAfter
        # is always 'No' by construction anyway.
        if mwt_nodes[-1].misc['SpaceAfter'] == 'No':
            mwt.misc['SpaceAfter'] = 'No'
        for mwt_node in mwt_nodes:
            del mwt_node.misc['SpaceAfter']
        if self.revert_orig_form:
            for mwt_node in mwt_nodes:
                if mwt_node.misc['OrigForm']:
                    mwt_node.form = mwt_node.misc['OrigForm']
                    del mwt_node.misc['OrigForm']
        # BUG FIX: the hook was called without its mandatory argument,
        # which would raise TypeError; pass the newly created MWT.
        self.postprocess_mwt(mwt)

    # a helper method to be overriden
    def postprocess_mwt(self, mwt):
        """Hook for subclasses to adjust the newly created MWT."""
        pass
diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py
new file mode 100644
index 00000000..43d2b30d
--- /dev/null
+++ b/udapi/block/ud/jointoken.py
@@ -0,0 +1,97 @@
+"""
+Block ud.JoinToken will join a given token with the preceding one.
+"""
+from udapi.core.block import Block
+import logging
+
+
class JoinToken(Block):
    """
    Merge two tokens into one. A MISC attribute is used to mark the tokens that
    should join the preceding token. (The attribute may have been set by an
    annotator or by a previous block that tests the specific conditions under
    which joining is desired.) Joining cannot be done across sentence
    boundaries; if necessary, apply util.JoinSentence first. Multiword tokens
    are currently not supported: None of the nodes to be merged can belong to
    a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.)
    Merging is simple if there is no space between the tokens (see SpaceAfter=No
    at the first token). If there is a space, there are three options in theory:

    1. Keep the tokens as two nodes but apply the UD goeswith relation
       (see https://universaldependencies.org/u/overview/typos.html) and
       the related annotation rules.
    2. Join them into one token that contains a space. Such "words with
       spaces" can be exceptionally allowed in UD if they are registered
       in the given language.
    3. Remove the space without any trace. Not recommended in UD unless the
       underlying text was created directly for UD and can be thus considered
       part of the annotation.

    At present, this block does not support merging with spaces at all, but
    in the future one or more of the options may be added.
    """

    def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the joining
            default: JoinToken
        misc_value: value of the MISC attribute to trigger the joining;
            if not specified, then simple occurrence of the attribute with any value will cause the joining
        MISC attributes that have triggered sentence joining will be removed from their node.
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name
        self.misc_value = misc_value

    def process_node(self, node):
        """
        The JoinToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be merged with the previous node and the
        attribute will be removed from MISC, or a warning will be issued that
        the merging cannot be done and the attribute will stay in MISC. Note
        that multiword token lines and empty nodes are not even scanned for
        the attribute, so if it is there, it will stay there but no warning
        will be printed.
        """
        if node.misc[self.misc_name] == '':
            return
        if self.misc_value and node.misc[self.misc_name] != self.misc_value:
            return
        prevnode = node.prev_node
        if not prevnode:
            logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if node.multiword_token or prevnode.multiword_token:
            logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name)
            node.misc['Bug'] = 'JoiningTokenNotSupportedHere'
            return
        if prevnode.misc['SpaceAfter'] != 'No':
            logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name)
            node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if prevnode.deps or node.deps:
            # BUG FIX: the original called logging.fatal(), which only logs a
            # CRITICAL message — execution would continue and merge the tokens,
            # corrupting the enhanced graph. Raise to actually abort.
            raise NotImplementedError('At present this block cannot be applied to data with enhanced dependencies.')
        # If the first token depends on the second token, re-attach it to the
        # second token's parent to prevent cycles.
        if prevnode in node.descendants:
            prevnode.parent = node.parent
            prevnode.deprel = node.deprel
        # Re-attach all children of the second token to the first token.
        for c in node.children:
            c.parent = prevnode
        # Concatenate the word forms of the two tokens. Assume that morphological
        # annotation, including the lemma, is already updated accordingly (we
        # cannot guess it anyway).
        prevnode.form += node.form
        # Remove SpaceAfter=No from the first token unless the second token has
        # this attribute, too (meaning that there is no space between the second
        # token and whatever comes next).
        prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter']
        # Remove the current node. The joining instruction was in its MISC, so
        # it will disappear together with the node.
        node.remove()
diff --git a/udapi/block/ud/kk/fixspuriousaux.py b/udapi/block/ud/kk/fixspuriousaux.py
new file mode 100644
index 00000000..044ff178
--- /dev/null
+++ b/udapi/block/ud/kk/fixspuriousaux.py
@@ -0,0 +1,27 @@
+"""Block to convert spurious auxiliaries to lexical verbs in Kazakh."""
+from udapi.core.block import Block
+import logging
+import re
+
class FixSpuriousAux(Block):
    """Convert selected spurious Kazakh auxiliaries into lexical verb heads."""

    def process_node(self, node):
        """
        Some verbs that are called auxiliary by the traditional grammar, should
        be analyzed in UD as VERB + non-finite xcomp.
        """
        if node.upos == 'AUX' and node.udeprel == 'aux':
            # баста = start
            # кет = presumably 'leave / go away' — TODO confirm gloss
            if re.match(r'^(баста|кет)$', node.lemma):
                node.upos = 'VERB'
                # The auxiliary inherits the incoming relation of its original parent.
                # Re-attaching the auxiliary BEFORE making the lexical verb its
                # child avoids creating a cycle in the tree.
                lexverb = node.parent
                node.parent = lexverb.parent
                node.deprel = lexverb.deprel
                # The auxiliary also inherits some but not all children of the lexical verb.
                for c in lexverb.children:
                    if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel):
                        c.parent = node
                # The lexical verb becomes an xcomp of the auxiliary.
                lexverb.parent = node
                lexverb.deprel = 'xcomp'
diff --git a/udapi/block/ud/la/addmwt.py b/udapi/block/ud/la/addmwt.py
new file mode 100644
index 00000000..27831151
--- /dev/null
+++ b/udapi/block/ud/la/addmwt.py
@@ -0,0 +1,41 @@
+""" Block ud.la.AddMwt for heuristic detection of multi-word (PRON + cum, nonne) and abbreviations-dots tokens. """
+import udapi.block.ud.addmwt
+
# Lookup table of closed-class Latin multi-word tokens: lowercased surface
# form -> split specification consumed by udapi.block.ud.addmwt.AddMwt.
# The '...cum' entries cover the enclitic preposition 'cum' attached to
# personal/reflexive pronoun ablatives; 'nonne' is split into the negative
# particle 'non' plus the interrogative clitic 'ne'.
MWTS = {
    'mecum': {'lemma': 'ego cum', 'form': 'me cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'},
    'tecum': {'lemma': 'tu cum', 'form': 'te cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Sing AdpType=Post|Clitic=Yes', 'deprel': 'obl case'},
    'nobiscum': {'lemma': 'nos cum', 'form': 'nobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Neut|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'},
    'vobiscum': {'lemma': 'vos cum', 'form': 'vobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'},
    'uobiscum': {'lemma': 'uos cum', 'form': 'uobis cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc|Number=Plur AdpType=Post|Clitic=Yes', 'deprel': 'obl case'},
    'secum': {'lemma': 'sui cum', 'form': 'se cum', 'upos': 'PRON ADP', 'feats': 'Case=Abl|Gender=Masc AdpType=Post|Clitic=Yes', 'deprel': 'obl case'}, # can be singular or plural
    'nonne': {'lemma': 'non ne', 'form': 'non ne', 'upos': 'PART PART', 'feats': 'Polarity=Neg Clitic=Yes|PartType=Int', 'deprel': 'advmod:neg discourse', 'shape': 'sibling'}
}

# shared values for all entries in MWTS
# ('main': 0 = the first word keeps the original node's attachment).
for v in MWTS.values():
    # v['xpos'] = '' # treebank-specific
    if 'shape' not in v:
        v['shape'] = 'subtree'
    v['main'] = 0
+
+
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token."""
        lowered = node.form.lower()
        if lowered in MWTS:
            return MWTS[lowered]
        # Abbreviation written with a trailing period, e.g. 'etc.' -> 'etc' + '.'
        # (the bare period and the ellipsis '...' are excluded);
        # currently under discussion.
        is_abbreviation = node.form.endswith('.') and len(node.form) > 1 and node.form != '...'
        if not is_abbreviation:
            return None
        return {
            'form': node.form[:-1] + ' .',
            'lemma': '* .',
            'upos': '* PUNCT',
            'xpos': '_ _',
            'feats': '* _',
            'deprel': '* punct',
            'main': 0,
            'shape': 'subtree',
        }
+
diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py
new file mode 100644
index 00000000..a7b506e8
--- /dev/null
+++ b/udapi/block/ud/la/markfeatsbugs.py
@@ -0,0 +1,338 @@
+"""
+Block to identify missing or ill-valued features in Latin. Any bugs that it
+finds will be saved in the MISC column as a Bug attribute, which can be later
+used in filters and highlighted in text output.
+
+Usage: cat *.conllu | udapy -HAMX layout=compact ud.la.MarkFeatsBugs > bugs.html
+Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.la.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
+"""
+import udapi.block.ud.markfeatsbugs
+import logging
+import re
+
+class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs):
+
+    def __init__(self, flavio=False, **kwargs):
+        """
+        Create the ud.la.MarkFeatsBugs block instance.
+
+        Args:
+            flavio=1: Accept features as defined by Flavio for treebanks he
+                maintains. By default, a more conservative set of features and
+                values is expected.
+        """
+        super().__init__(**kwargs)
+        # Flag toggling the extended (Flavio-style) feature inventory in process_node().
+        self.flavio = flavio
+
+    def process_node(self, node):
+        """
+        Check the FEATS of one node against the Latin feature inventory.
+
+        For each UPOS a list of required features (rf) and a dict of allowed
+        features with their allowed values (af) is assembled; the inherited
+        check_required_features() and check_allowed_features() then record any
+        violations in MISC as a Bug attribute.
+        """
+        rf = []
+        af = {}
+        # PROIEL-specific: greek words without features
+        # LLCT-specific: corrupted nodes
+        if node.lemma in ['greek.expression', 'missing^token']:
+            pass
+        # NOUNS ################################################################
+        elif node.upos == 'NOUN':
+            if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns
+                rf = ['Gender', 'Number', 'Case']
+            af = {
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Degree': ['Dim'],
+                'Abbr': ['Yes'],
+                'Foreign': ['Yes'],
+                'VerbForm': ['Part', 'Vnoun']}
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required.
+                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
+                af['Proper'] = ['Yes']
+                af['Polarity'] = ['Neg']
+                af['Compound'] = ['Yes']
+                af['Variant'] = ['Greek']
+                af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # PROPER NOUNS #########################################################
+        elif node.upos == 'PROPN':
+            if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: # abbreviated and indeclinable nouns
+                rf = ['Gender', 'Number', 'Case']
+            af = {
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Abbr': ['Yes'],
+                'Foreign': ['Yes']}
+            if self.flavio:
+                af['Compound'] = ['Yes']
+                af['Variant'] = ['Greek']
+                af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
+                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # ADJECTIVES ###########################################################
+        elif node.upos == 'ADJ':
+            if not node.feats['Abbr'] == 'Yes' and node.feats['Case']:
+                rf = ['Gender', 'Number', 'Case']
+            af = {
+                'NumType': ['Dist', 'Mult', 'Ord'],
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Degree': ['Cmp', 'Sup', 'Abs'],
+                'Abbr': ['Yes'],
+                'Foreign': ['Yes'],
+                'Polarity': ['Neg'],
+                'VerbForm': ['Part']}
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required.
+                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
+                af['Compound'] = ['Yes']
+                af['Proper'] = ['Yes']
+                af['Variant'] = ['Greek']
+                af['Degree'].append('Dim')
+                af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # PRONOUNS #############################################################
+        elif node.upos == 'PRON':
+            rf = ['PronType', 'Case']
+            af = {
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Proper': ['Yes'],
+                'Compound': ['Yes'],
+                'Polarity': ['Neg']
+            }
+            if node.feats['PronType'] == 'Prs':
+                af['Reflex'] = ['Yes']
+                if node.feats['Reflex'] == 'Yes': # seipsum, se
+                    rf.extend(['Person'])
+                    # seipsum has gender and number but se does not, so it is not required
+                    af['Gender'] = ['Masc', 'Fem', 'Neut']
+                    af['Number'] = ['Sing', 'Plur']
+                    af['Person'] = ['3']
+                    af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Abl']
+                else: # not reflexive: ego, tu, is, nos
+                    rf.extend(['Person', 'Number'])
+                    af['Person'] = ['1', '2', '3']
+                    af['Number'] = ['Sing', 'Plur']
+                    # 3rd person must have gender
+                    if node.feats['Person'] == '3': # is, id
+                        rf.append('Gender')
+                        af['Gender'] = ['Masc', 'Fem', 'Neut']
+            elif re.match(r'^(Rel|Int)$', node.feats['PronType']):
+                rf.extend(['Gender', 'Number'])
+                af['Gender'] = ['Masc', 'Fem', 'Neut']
+                af['Number'] = ['Sing', 'Plur']
+            elif node.feats['PronType'] == 'Ind':
+                rf = [f for f in rf if f != 'Case']
+                af['Gender'] = ['Masc', 'Fem', 'Neut']
+                af['Number'] = ['Sing', 'Plur']
+            # lexical check of PronTypes
+            af['PronType'] = []
+            if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']:
+                af['PronType'].append('Prs')
+            elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']:
+                af['PronType'].append('Ind')
+            elif node.lemma in ['inuicem', 'invicem']:
+                af['PronType'].append('Rcp')
+                # FIX: 'Case' may already have been dropped in the PronType=Ind
+                # branch above; guard the removal to avoid a ValueError.
+                if 'Case' in rf:
+                    rf.remove('Case')
+            if node.lemma in ['ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']:
+                af['PronType'].append('Int')
+            if node.lemma in ['qui', 'quicumque', 'quisquis']:
+                af['PronType'].append('Rel')
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required.
+                af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron']
+                af['Compound'] = ['Yes']
+                af['Polarity'] = ['Neg']
+                af['Form'] = ['Emp']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # DETERMINERS ##########################################################
+        elif node.upos == 'DET':
+            rf = ['PronType']
+            if node.feats['Case']:
+                rf.extend(['Gender', 'Number', 'Case'])
+            af = {
+                'Gender': ['Masc', 'Fem', 'Neut'],
+                'Number': ['Sing', 'Plur'],
+                'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'],
+                'Degree': ['Cmp', 'Abs', 'Sup'],
+                'Polarity': ['Neg'],
+                'Proper': ['Yes'],
+                'PronType': []
+            }
+            if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster'
+                rf.extend(['Poss', 'Person[psor]'])
+                af['PronType'] = ['Prs']
+                # FIX: the allowed values must be a list; a bare string would
+                # make check_allowed_features() do substring matching.
+                af['Poss'] = ['Yes']
+                af['Person[psor]'] = ['1', '2', '3']
+                af['Reflex'] = ['Yes']
+                # The possessor's number is distinguished in the first and second person (meus vs. noster) but not in the third person (suus).
+                if node.feats['Person[psor]'] != '3':
+                    rf.append('Number[psor]')
+                    af['Number[psor]'] = ['Sing', 'Plur']
+            if node.feats['PronType'] == 'Ind':
+                af['NumType'] = ['Card']
+            # lexical check of PronTypes
+            if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']:
+                if not af['PronType'] == ['Prs']:
+                    af['PronType'].append('Prs')
+            elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']:
+                af['PronType'].append('Ind')
+            elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']:
+                af['PronType'].append('Tot')
+            if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']:
+                af['PronType'].append('Rel')
+            if node.lemma in ['qui', 'quantus', 'quot']:
+                af['PronType'].append('Int')
+            elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']:
+                af['PronType'].append('Dem')
+            elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']:
+                af['PronType'].append('Con')
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required.
+                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron']
+                af['Compound'] = ['Yes']
+                af['Form'] = ['Emp']
+                af['NumType'] = ['Card']
+                af['Degree'].append('Dim')
+                af['PronType'].append('Art')
+                if re.match(r'^(unus|ambo)', node.lemma):
+                    af['NumValue'] = ['1', '2']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # NUMERALS #############################################################
+        elif node.upos == 'NUM':
+            rf = ['NumType', 'NumForm']
+            af = {
+                'NumType': ['Card', 'Ord'],
+                'NumForm': ['Word', 'Roman', 'Digit'],
+                'Proper': ['Yes']}
+            # Arabic digits and Roman numerals do not have inflection features.
+            if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
+                af['Gender'] = ['Masc', 'Fem', 'Neut']
+                af['Number'] = ['Sing', 'Plur']
+                af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim
+                af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron']
+                af['NumForm'].append('Reference')
+                af['Compound'] = ['Yes']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # VERBS AND AUXILIARIES ################################################
+        elif re.match(r'^(VERB|AUX)$', node.upos):
+            rf = ['VerbForm', 'Aspect']
+            af = {
+                'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'],
+                'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'],
+                'Polarity': ['Neg'],
+                'Typo': ['Yes']
+            }
+            if node.feats['VerbForm'] not in ['Part', 'Conv']:
+                rf.append('Tense')
+                af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut']
+            if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'):
+                rf.append('Voice')
+                af['Voice'] = ['Act', 'Pass']
+            if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive
+                rf.extend(['Mood', 'Person', 'Number'])
+                af['Mood'] = ['Ind', 'Sub', 'Imp']
+                af['Person'] = ['1', '2', '3']
+                af['Number'] = ['Sing', 'Plur']
+            elif node.feats['VerbForm'] == 'Part':
+                rf.extend(['Gender', 'Number', 'Case'])
+                af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing']
+                af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut']
+                af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl']
+                af['Degree'] = ['Abs', 'Cmp']
+                if node.misc['TraditionalMood'].startswith('Gerundi'):
+                    af['Voice'] = ['Pass']
+                    # FIX: must be a list (a bare string would make the
+                    # membership check in check_allowed_features() accept
+                    # any substring of 'Prosp').
+                    af['Aspect'] = ['Prosp']
+            elif node.feats['VerbForm'] == 'Conv':
+                rf.extend(['Case', 'Gender', 'Number'])
+                af['Case'] = ['Abl', 'Acc']
+                af['Gender'] = ['Masc']
+                af['Number'] = ['Sing']
+                af['Voice'] = ['Act']
+            elif node.feats['VerbForm'] == 'Inf':
+                af['Tense'].remove('Pqp')
+            if self.flavio:
+                # Flavio added InflClass but not everywhere, so it is not required.
+                af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX']
+                af['VerbType'] = ['Mod']
+                if 'Degree' in af:
+                    af['Degree'].append('Dim')
+                else:
+                    af['Degree'] = ['Dim']
+                af['Compound'] = ['Yes']
+                af['Proper'] = ['Yes']
+                if re.match(r'^(Part|Conv)$', node.feats['VerbForm']):
+                    af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX']
+                elif node.feats['VerbForm'] == 'Inf':
+                    af['Case'] = ['Nom', 'Acc', 'Abl']
+                    af['Gender'] = ['Neut']
+                    af['Number'] = ['Sing']
+                    af['InflClass[nominal]'] = ['Ind']
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # ADVERBS ##############################################################
+        elif node.upos == 'ADV':
+            af = {
+                'AdvType': ['Loc', 'Tim'],
+                'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'],
+                'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'],
+                'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum
+                'Polarity': ['Neg']
+            }
+            if self.flavio:
+                af['Compound'] = ['Yes']
+                af['Form'] = ['Emp']
+                af['VerbForm'] = ['Fin', 'Part']
+                af['Degree'].append('Dim')
+            self.check_allowed_features(node, af)
+        # PARTICLES ############################################################
+        elif node.upos == 'PART':
+            af = {
+                'PartType': ['Int', 'Emp'],
+                'Polarity': ['Neg']
+            }
+            if self.flavio:
+                af['Form'] = ['Emp']
+                af['PronType'] = ['Dem']
+                af['Compound'] = ['Yes']
+            self.check_allowed_features(node, af)
+        # CONJUNCTIONS #########################################################
+        elif re.match(r'^[CS]CONJ$', node.upos):
+            af = {
+                'PronType': ['Rel', 'Con'],
+                'Polarity': ['Neg'],
+                'Compound': ['Yes']}
+            if self.flavio:
+                af['Compound'] = ['Yes']
+                af['Form'] = ['Emp']
+                af['VerbForm'] = ['Fin']
+                af['NumType'] = ['Card']
+                af['ConjType'] = ['Expl']
+                af['AdvType'] = ['Loc']
+            self.check_allowed_features(node, af)
+        # ADPOSITIONS ##########################################################
+        elif node.upos == 'ADP':
+            rf = ['AdpType']
+            af = {
+                'AdpType': ['Prep', 'Post'],
+                'Abbr': ['Yes']
+            }
+            if self.flavio:
+                af['VerbForm'] = ['Part']
+                af['Proper'] = ['Yes']
+                af['Compound'] = ['Yes']
+            # FIX: the required-feature check was missing here, so the
+            # AdpType requirement was never enforced.
+            self.check_required_features(node, rf)
+            self.check_allowed_features(node, af)
+        # X ##########################################################
+        elif node.upos == 'X':
+            af = {'Abbr': ['Yes']}
+            # FIX: the check was missing here, so X tokens were never validated.
+            self.check_allowed_features(node, af)
+        # THE REST: NO FEATURES ################################################
+        else:
+            self.check_allowed_features(node, {})
diff --git a/udapi/block/ud/lemmatize.py b/udapi/block/ud/lemmatize.py
new file mode 100644
index 00000000..a234256f
--- /dev/null
+++ b/udapi/block/ud/lemmatize.py
@@ -0,0 +1,42 @@
+"""Block to add missing lemmas in cases where it seems obvious what the lemma should be."""
+from udapi.core.block import Block
+import logging
+import re
+
+class Lemmatize(Block):
+    """Add missing lemmas in cases where it seems obvious what the lemma should be."""
+
+    def process_node(self, node):
+        """
+        Some treebanks lack lemmas for some or all words. Occasionally we may be
+        able to guess that the lemma is identical to the word form. This block
+        will then fill out the lemma.
+
+        For some parts of speech, we can only say that the form is the lemma if
+        we have morphological features that will confirm it is the right form.
+        """
+        # FIX: parentheses added around the lemma test. Previously 'and' bound
+        # tighter than 'or', so an empty lemma was filled in even for Typo=Yes
+        # words and for forms that are themselves '_'.
+        if (node.lemma == '' or node.lemma == '_') and node.form != '_' and node.feats['Typo'] != 'Yes':
+            # Many closed classes do not inflect and have the same lemma as the form (just lowercased).
+            if re.match(r'^(PUNCT|SYM|ADP|CCONJ|SCONJ|PART|INTJ|X)$', node.upos):
+                node.lemma = node.form.lower()
+            # NOUN PROPN ADJ PRON DET NUM VERB AUX ADV
+            # ADV: use positive affirmative
+            elif re.match(r'^(ADV)$', node.upos) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
+                node.lemma = node.form.lower()
+            # VERB and AUX: use the infinitive
+            elif re.match(r'^(VERB|AUX)$', node.upos) and node.feats['VerbForm'] == 'Inf' and re.match(r'^(Pos)?$', node.feats['Polarity']):
+                node.lemma = node.form.lower()
+            # NOUN and PROPN: use singular nominative (but do not lowercase for PROPN)
+            # Note: This rule is wrong in German, where no nouns should be lowercased.
+            elif re.match(r'^(NOUN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
+                node.lemma = node.form.lower()
+            elif re.match(r'^(PROPN)$', node.upos) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
+                node.lemma = node.form
+            # ADJ: use masculine singular nominative positive affirmative
+            elif re.match(r'^(ADJ)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']) and re.match(r'^(Pos)?$', node.feats['Degree']) and re.match(r'^(Pos)?$', node.feats['Polarity']):
+                node.lemma = node.form.lower()
+            # ADJ, PRON, DET: use masculine singular nominative (pronouns: each person has its own lemma)
+            elif re.match(r'^(ADJ|PRON|DET)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Sing)?$', node.feats['Number']) and re.match(r'^(Nom)?$', node.feats['Case']):
+                node.lemma = node.form.lower()
+            # NUM: use masculine nominative (number, if present at all, is lexical)
+            elif re.match(r'^(NUM)$', node.upos) and re.match(r'^(Masc)?$', node.feats['Gender']) and re.match(r'^(Nom)?$', node.feats['Case']):
+                node.lemma = node.form.lower()
diff --git a/udapi/block/ud/lt/fixedeprels.py b/udapi/block/ud/lt/fixedeprels.py
new file mode 100644
index 00000000..9b1cb98d
--- /dev/null
+++ b/udapi/block/ud/lt/fixedeprels.py
@@ -0,0 +1,144 @@
+"""Block to fix case-enhanced dependency relations in Lithuanian."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixEdeprels(Block):
+    """Normalize case-enhanced dependency relations in Lithuanian enhanced UD graphs."""
+
+    # Sometimes there are multiple layers of case marking and only the outermost
+    # layer should be reflected in the relation. For example, the semblative 'jako'
+    # is used with the same case (preposition + morphology) as the nominal that
+    # is being compared ('jako_v:loc' etc.) We do not want to multiply the relations
+    # by all the inner cases.
+    # NOTE(review): 'jako' is a Czech example inherited from the block this was
+    # adapted from; the Lithuanian markers are the keys below ('kaip', 'lyg', ...).
+    # The list in the value contains exceptions that should be left intact.
+    outermost = {
+        'kaip': [],
+        'lyg': [],
+        'negu': [],
+        'nei': [],
+        'nes': []
+    }
+
+    # Secondary prepositions sometimes have the lemma of the original part of
+    # speech. We want the grammaticalized form instead. List even those that
+    # will have the same lexical form, as we also want to check the morphological
+    # case. And include all other prepositions that have unambiguous morphological
+    # case, even if they are not secondary.
+    unambiguous = {
+        'apie': 'apie:acc', # about (topic)
+        'dėl': 'dėl:gen', # because of
+        'iki': 'iki:gen', # until
+        'iš': 'iš:gen', # from, out of
+        'į': 'į:acc', # to, into, in
+        'jei': 'jei', # remove morphological case # if
+        'jeigu': 'jeigu', # remove morphological case # if
+        'jog': 'jog', # remove morphological case # because
+        'kadangi': 'kadangi', # remove morphological case # since, because
+        'kai': 'kai', # remove morphological case # when
+        'kaip': 'kaip', # remove morphological case # as, than
+        'lyg': 'lyg', # remove morphological case # like
+        'negu': 'negu', # remove morphological case # than
+        'nei': 'nei', # remove morphological case # more than
+        'nes': 'nes', # remove morphological case # because
+        'nors': 'nors', # remove morphological case # though, although, when, if
+        'nuo': 'nuo:gen', # from
+        'pagal': 'pagal:acc', # according to, under, by
+        'pagal_dėl': 'pagal:acc',
+        'per': 'per:acc', # through, over
+        'prie': 'prie:gen', # to, at, near, under
+        'prieš': 'prieš:acc', # against
+        'su': 'su:ins', # with
+        'tarp': 'tarp:gen', # between
+        'tarsi': 'tarsi', # remove morphological case # as if
+        'virš': 'virš:gen' # above
+    }
+
+    def copy_case_from_adposition(self, node, adposition):
+        """
+        In some treebanks, adpositions have the Case feature and it denotes the
+        valency case that the preposition's nominal must be in.
+
+        Returns 'adposition:case' (with the case lower-cased) based on the first
+        child of `node` whose lemma equals `adposition`, or None if no such
+        child with a non-empty Case feature exists.
+        """
+        # The following is only a partial solution. We will not see
+        # some children because they may be shared children of coordination.
+        prepchildren = [x for x in node.children if x.lemma == adposition]
+        if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
+            return adposition+':'+prepchildren[0].feats['Case'].lower()
+        else:
+            return None
+
+    def process_node(self, node):
+        """
+        Occasionally the automatically derived edeprels do not match the
+        whitelist. For example, the noun is an abbreviation and its
+        morphological case is unknown.
+
+        NOTE(review): the original docstring referred to "Czech basic trees"
+        and the early fixes below use the Czech prepositions 'do' and 'k';
+        this block looks adapted from the Czech FixEdeprels — confirm those
+        rules are intended for Lithuanian data.
+        """
+        for edep in node.deps:
+            m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel'])
+            if m:
+                # NOTE(review): bdeprel is assigned but never used below.
+                bdeprel = m.group(1)
+                solved = False
+                # Issues caused by errors in the original annotation must be fixed early.
+                # Especially if acl|advcl occurs with a preposition that unambiguously
+                # receives a morphological case in the subsequent steps, and then gets
+                # flagged as solved.
+                # ###!!! We should also fix the dependency in the basic tree!
+                edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel'])
+                edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel'])
+                # If one of the following expressions occurs followed by another preposition
+                # or by morphological case, remove the additional case marking. For example,
+                # 'jako_v' becomes just 'jako'.
+                for x in self.outermost:
+                    exceptions = self.outermost[x]
+                    m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel'])
+                    if m and m.group(2) and not x+m.group(2) in exceptions:
+                        edep['deprel'] = m.group(1)+':'+x
+                        solved = True
+                        break
+                if solved:
+                    continue
+                for x in self.unambiguous:
+                    # All secondary prepositions have only one fixed morphological case
+                    # they appear with, so we can replace whatever case we encounter with the correct one.
+                    m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
+                    if m:
+                        edep['deprel'] = m.group(1)+':'+self.unambiguous[x]
+                        solved = True
+                        break
+                if solved:
+                    continue
+                # The following prepositions have more than one morphological case
+                # available. Thanks to the Case feature on prepositions, we can
+                # identify the correct one. Exclude 'nom' and 'voc', which cannot
+                # be correct.
+                m = re.match(r'^(obl(?::arg)?|nmod):(po|už)(?::(?:nom|voc))?$', edep['deprel'])
+                if m:
+                    adpcase = self.copy_case_from_adposition(node, m.group(2))
+                    if adpcase and not re.search(r':(nom|voc)$', adpcase):
+                        edep['deprel'] = m.group(1)+':'+adpcase
+                        continue
+                    # The remaining instance of 'po' should be ':acc'.
+                    elif m.group(2) == 'po':
+                        edep['deprel'] = m.group(1)+':po:acc'
+                        continue
+                    # The remaining 'už' are ':acc' (they are second conjuncts
+                    # in coordinated oblique modifiers).
+                    elif m.group(2) == 'už':
+                        edep['deprel'] = m.group(1)+':už:acc'
+                        continue
+
+    def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+        '''
+        Modifies the incoming relation of a node both in the basic tree and in
+        the enhanced graph: the node is re-attached to `parent` with `deprel`
+        in the basic tree, all enhanced relations coming from the node's
+        original basic parent are removed, and one new enhanced relation
+        `edeprel` from `parent` is added.
+
+        NOTE(review): the original docstring claimed the removal was
+        conditional on existing relations from the *current* parent; the code
+        below unconditionally removes relations from the *old* parent —
+        docstring rewritten to match the code.
+        '''
+        old_parent = node.parent
+        node.parent = parent
+        node.deprel = deprel
+        # Drop enhanced relations that pointed to the old basic parent.
+        node.deps = [x for x in node.deps if x['parent'] != old_parent]
+        new_edep = {}
+        new_edep['parent'] = parent
+        new_edep['deprel'] = edeprel
+        node.deps.append(new_edep)
diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py
index 37fd94bd..ee58084a 100644
--- a/udapi/block/ud/markbugs.py
+++ b/udapi/block/ud/markbugs.py
@@ -8,6 +8,13 @@
Usage:
udapy -s ud.MarkBugs < in.conllu > marked.conllu 2> log.txt
+Some tests may be customized for individual languages if the language code is
+available as the zone id. The zone id can be provided in the sentence id after
+the slash (e.g., "sent_id = s125/en" for English), or as a parameter of the
+reader:
+
+udapy -s read.Conllu zone=en ud.MarkBugs < in.conllu > marked.conllu 2> log.txt
+
Errors are both logged to stderr and marked within the nodes' MISC field,
e.g. `node.misc['Bug'] = 'aux-chain'`, so the output conllu file can be
searched for "Bug=" occurences.
@@ -28,25 +35,40 @@
'VERB': 'VerbForm',
}
+
class MarkBugs(Block):
"""Block for checking suspicious/wrong constructions in UD v2."""
- def __init__(self, save_stats=True, skip=None, **kwargs):
+ def __init__(self, save_stats=True, tests=None, skip=None, max_cop_lemmas=2, **kwargs):
"""Create the MarkBugs block object.
Args:
save_stats: store the bug statistics overview into `document.misc["bugs"]`?
- skip: a regex. If `re.search(skip, short_msg)` the node is not reported.
+ tests: a regex of tests to include.
+ If `not re.search(tests, short_msg)` the node is not reported.
+ You can use e.g. `tests=aux-chain|cop-upos` to apply only those two tests.
+ Default = None (or empty string or '.*'), which means all tests are applied.
+ skip: a regex of tests to exclude.
+ If `re.search(skip, short_msg)` the node is not reported.
You can use e.g. `skip=no-(VerbForm|NumType|PronType)`.
+ This has higher priority than the `tests` regex.
Default = None (or empty string) which means no skipping.
+ max_cop_lemmas: how many different lemmas are allowed to have deprel=cop.
+ Default = 2, so all except for the two most frequent lemmas are reported as bugs.
"""
super().__init__(**kwargs)
self.save_stats = save_stats
self.stats = collections.Counter()
+ self.tests_re = re.compile(tests) if (tests is not None and tests != '') else None
self.skip_re = re.compile(skip) if (skip is not None and skip != '') else None
+ self.max_cop_lemmas = max_cop_lemmas
+ self.cop_count = collections.Counter()
+ self.cop_nodes = collections.defaultdict(list)
def log(self, node, short_msg, long_msg):
"""Log node.address() + long_msg and add ToDo=short_msg to node.misc."""
+ if self.tests_re is not None and not self.tests_re.search(short_msg):
+ return
if self.skip_re is not None and self.skip_re.search(short_msg):
return
logging.debug('node %s %s: %s', node.address(), short_msg, long_msg)
@@ -57,35 +79,47 @@ def log(self, node, short_msg, long_msg):
node.misc['Bug'] = short_msg
self.stats[short_msg] += 1
- # pylint: disable=too-many-branches
+ # pylint: disable=too-many-branches, too-many-statements
def process_node(self, node):
- form, deprel, upos, feats = node.form, node.deprel, node.upos, node.feats
+ form, udeprel, upos, feats = node.form, node.udeprel, node.upos, node.feats
parent = node.parent
- for dep in ('aux', 'fixed', 'appos', 'goeswith'):
- if deprel == dep and parent.deprel == dep:
+ for dep in ('aux', 'fixed', 'goeswith', 'list'):
+ if udeprel == dep and parent.udeprel == dep:
self.log(node, dep + '-chain', dep + ' dependencies should not form a chain.')
- for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith'):
- if deprel == dep and node.precedes(parent):
+ # 'appos-chain' is more difficult to test because nested appositions are allowed.
+ # The commented-out code below prevents just some of the false alarms
+ # (those where changing the nested appos into flat would result in non-projectivity).
+ # Unfortunately, there are still too many false alarms, so let's skip this test completely.
+ # It seems that multiple appositions as siblings are much less common than nested.
+ # if deprel == 'appos' and parent.deprel == 'appos':
+ # if not node.precedes(parent.children[-1]):
+ # self.log(node, 'appos-chain', 'appos should not form a chain except when nested.')
+
+ for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith', 'list'):
+ if udeprel == dep and node.precedes(parent):
self.log(node, dep + '-rightheaded',
dep + ' relations should be left-headed, not right.')
- if deprel == 'cop' and upos not in ('AUX', 'PRON'):
+ if udeprel == 'cop' and upos not in ('AUX', 'PRON'):
self.log(node, 'cop-upos', 'deprel=cop upos!=AUX|PRON (but %s)' % upos)
- if deprel == 'mark' and upos == 'PRON':
+ if udeprel == 'mark' and upos == 'PRON':
self.log(node, 'mark-upos', 'deprel=mark upos=PRON')
- if deprel == 'det' and upos not in ('DET', 'PRON'):
+ if udeprel == 'det' and upos not in ('DET', 'PRON'):
self.log(node, 'det-upos', 'deprel=det upos!=DET|PRON (but %s)' % upos)
- if deprel == 'punct' and upos != 'PUNCT':
+ if udeprel == 'punct' and upos != 'PUNCT':
self.log(node, 'punct-upos', 'deprel=punct upos!=PUNCT (but %s)' % upos)
for i_upos, i_feat in REQUIRED_FEATURE_FOR_UPOS.items():
- if upos == i_upos and not node.feats[i_feat]:
- self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat))
+ if upos == i_upos and not feats[i_feat]:
+ # Some languages do not distinguish finite and non-finite forms of verbs.
+ # The VerbForm feature is not obligatory in those languages.
+ if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}:
+ self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat))
if feats['VerbForm'] == 'Fin':
if upos not in ('VERB', 'AUX'):
@@ -93,22 +127,22 @@ def process_node(self, node):
if not feats['Mood']:
self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing')
- if feats['Degree'] and upos not in ('ADJ', 'ADV'):
- self.log(node, 'degree-upos',
- 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos))
-
- subject_children = [n for n in node.children if 'subj' in n.deprel]
+ subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer']
if len(subject_children) > 1:
- self.log(node, 'multi-subj', 'More than one [nc]subj(:pass)? child')
-
- object_children = [n for n in node.children if n.deprel in ('obj', 'ccomp')]
+ self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child')
+
+ # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2,
+ # one may conclude that "obj" and "ccomp" are mutually exclusive.
+ # However, this has always been a gray zone and people have occasionally
+ # brought up examples where they would want the two relations to co-occur.
+ # Also, there is no clausal counterpart for "iobj", which may cause some
+ # of the problems. It is probably safer not to consider "ccomp" in this
+ # test. Nevertheless, two "obj" under the same parent are definitely an
+ # error.
+ object_children = [n for n in node.children if n.udeprel == 'obj']
if len(object_children) > 1:
self.log(node, 'multi-obj', 'More than one obj|ccomp child')
- # In addition to http://universaldependencies.org/svalidation.html
- if parent.deprel == 'punct':
- self.log(node, 'punct-child', 'parent.deprel=punct')
-
# See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words
# TODO: Promotion by Head Elision: It is difficult to detect this exception.
# So far, I have just excluded "det" from the forbidded parent.deprel set
@@ -119,15 +153,15 @@ def process_node(self, node):
# It seems the documentation does not allow any other deprel than advmod,
# so there should be no false alarms. Some errors are not reported, i.e. the cases
# when advmod incorrectly depends on a function word ("right before midnight").
- if parent.deprel in ('aux', 'cop', 'mark', 'clf', 'case'):
- if deprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'):
+ if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'):
+ if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'):
self.log(node, parent.deprel + '-child',
'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel)
# goeswith should be left-headed, but this is already checked, so let's skip right-headed.
- if deprel == 'goeswith' and parent.precedes(node):
+ if udeprel == 'goeswith' and parent.precedes(node):
span = node.root.descendants(add_self=1)[parent.ord:node.ord]
- intruder = next((n for n in span[1:] if n.deprel != "goeswith"), None)
+ intruder = next((n for n in span[1:] if n.udeprel != "goeswith"), None)
if intruder is not None:
self.log(intruder, 'goeswith-gap', "deprel!=goeswith but lies within goeswith span")
else:
@@ -138,10 +172,30 @@ def process_node(self, node):
if upos == 'SYM' and form.isalpha():
self.log(node, 'sym-alpha', "upos=SYM but all form chars are alphabetical: " + form)
- if upos == 'PUNCT' and any(char.isalpha() for char in form):
+ if upos == 'PUNCT' and any(char.isalpha() for char in form):
self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form)
+ if upos == 'PUNCT' and udeprel not in ('punct', 'fixed', 'goeswith', 'root'):
+ self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)'
+ % udeprel)
+
+ if upos == 'PUNCT' and node.is_nonprojective():
+ self.log(node, 'punct-nonproj', 'upos=PUNCT and edge is non-projective')
+ if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap():
+ self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity')
+
+ if udeprel == 'cop':
+ lemma = node.lemma if node.lemma != '_' else form
+ self.cop_nodes[lemma].append(node)
+ self.cop_count[lemma] += 1
+
def after_process_document(self, document):
+ for lemma, _count in self.cop_count.most_common()[self.max_cop_lemmas:]:
+ for node in self.cop_nodes[lemma]:
+ self.log(node, 'cop-many-lemmas', 'deprel=cop but lemma=%s not in top-%d'
+ % (lemma, self.max_cop_lemmas))
+ self.cop_count.clear()
+ self.cop_nodes.clear()
total = 0
message = 'ud.MarkBugs Error Overview:'
for bug, count in sorted(self.stats.items(), key=lambda pair: (pair[1], pair[0])):
diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py
new file mode 100644
index 00000000..26c5624d
--- /dev/null
+++ b/udapi/block/ud/markfeatsbugs.py
@@ -0,0 +1,73 @@
+"""
+Block to identify missing or ill-valued features in a treebank. Any bugs that it
+finds will be saved in the MISC column as a Bug attribute, which can be later
+used in filters and highlighted in text output. This is a base block that only
+implements service methods. A language-specific block must be derived from this
+one and define the actual rules valid in that language.
+
+Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html
+"""
+from udapi.core.block import Block
+
class MarkFeatsBugs(Block):
    """Base block for marking feature-annotation bugs in MISC.

    This generic block only implements service methods; a language-specific
    subclass overrides process_node() and calls check_required_features() /
    check_allowed_features() with the rules valid for that language. Any
    violation is recorded in the node's MISC column as Bug=..., with multiple
    bug strings joined by '+'.
    """

    def bug(self, node, bugstring):
        """Append bugstring to the node's MISC Bug attribute, avoiding duplicates."""
        bugs = []
        if node.misc['Bug']:
            bugs = node.misc['Bug'].split('+')
        # PEP 8 idiom: `x not in y` instead of `not x in y`.
        if bugstring not in bugs:
            bugs.append(bugstring)
            node.misc['Bug'] = '+'.join(bugs)

    def check_allowed_features(self, node, allowed):
        """
        We need a dictionary indexed by feature names that are allowed; for each
        feature name, there is a list of allowed values.
        """
        # Check for features that are not allowed but the node has them.
        # For features that are allowed, check that their values are allowed.
        for f in node.feats:
            if f in allowed:
                if node.feats[f] not in allowed[f]:
                    self.bug(node, 'Feat' + f + 'Value' + node.feats[f] + 'NotAllowed')
            else:
                self.bug(node, 'Feat' + f + 'NotAllowed')

    def check_required_features(self, node, required):
        """
        We need a list of names of features whose values must not be empty.
        """
        for f in required:
            if f not in node.feats:
                self.bug(node, 'Feat' + f + 'Missing')

    def process_node(self, node):
        """
        This is a generic block, do nothing here. In a language-specific block
        based on this one, rules similar to the examples below can be specified:

        # NOUNS ################################################################
        if node.upos == 'NOUN':
            self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity'])
            if node.feats['Gender'] == 'Masc':
                self.check_required_features(node, ['Animacy'])
                self.check_allowed_features(node, {
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Animacy': ['Anim', 'Inan'],
                    'Number': ['Sing', 'Dual', 'Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
                    'Polarity': ['Pos', 'Neg'],
                    'Foreign': ['Yes']})
            else:
                self.check_allowed_features(node, {
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Number': ['Sing', 'Dual', 'Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'],
                    'Polarity': ['Pos', 'Neg'],
                    'Foreign': ['Yes']})
        #...
        # THE REST: NO FEATURES ################################################
        else:
            self.check_allowed_features(node, {})
        """
        return
diff --git a/udapi/block/ud/ml/markfeatsbugs.py b/udapi/block/ud/ml/markfeatsbugs.py
new file mode 100644
index 00000000..13c8434c
--- /dev/null
+++ b/udapi/block/ud/ml/markfeatsbugs.py
@@ -0,0 +1,279 @@
+"""
+Block to identify missing or ill-valued features in Malayalam. Any bugs that it
+finds will be saved in the MISC column as a Bug attribute, which can be later
+used in filters and highlighted in text output.
+
+Usage: cat *.conllu | udapy -HAMX layout=compact ud.ml.MarkFeatsBugs > bugs.html
+Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.ml.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc
+"""
+import udapi.block.ud.markfeatsbugs
+import logging
+import re
+
class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs):
    """Malayalam-specific feature validation rules.

    For each UPOS category, the required features and the allowed
    feature-value pairs are checked with the service methods inherited from
    the generic ud.MarkFeatsBugs block; violations end up in MISC as Bug=...
    """

    def process_node(self, node):
        # FOREIGN WORDS ########################################################
        # Do not put any restrictions on words that have Foreign=Yes. These may
        # also have Lang=xx in MISC, which would mean that the official
        # validator would judge them by the rules for language [xx]. But even
        # if they are not fully code-switched (e.g. because they are written in
        # the Malayalam script, like the English verb പ്ലാന്റ് plānṟ "plant"),
        # they still may not have the regular features of Malayalam morphology.
        if node.feats['Foreign'] == 'Yes':
            pass
        # NOUNS AND PROPER NOUNS ###############################################
        elif re.match(r'^(NOUN|PROPN)$', node.upos):
            self.check_required_features(node, ['Animacy', 'Number', 'Case'])
            self.check_allowed_features(node, {
                'Animacy': ['Anim', 'Inan'],
                'Number': ['Sing', 'Plur'],
                'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                'Abbr': ['Yes'],
                'Foreign': ['Yes'],
                'Typo': ['Yes']})
        # ADJECTIVES ###########################################################
        elif node.upos == 'ADJ':
            self.check_allowed_features(node, {
                'VerbForm': ['Part'],
                'NumType': ['Ord'],
                'Abbr': ['Yes'],
                'Foreign': ['Yes'],
                'Typo': ['Yes']})
        # PRONOUNS #############################################################
        elif node.upos == 'PRON':
            # rf = required features, af = allowed features; both are refined
            # below depending on PronType/Person/Number/... of the pronoun.
            rf = ['PronType', 'Case']
            af = {
                'PronType': ['Prs', 'Int', 'Ind'],  # demonstrative pronouns are treated as third person personal pronouns
                'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                'Abbr': ['Yes'],
                'Typo': ['Yes']
            }
            if node.feats['PronType'] == 'Prs':
                af['Reflex'] = ['Yes']
                if node.feats['Reflex'] == 'Yes':
                    rf = ['PronType']
                else:  # not reflexive
                    rf.extend(['Person', 'Number'])
                    af['Person'] = ['1', '2', '3']
                    af['Number'] = ['Sing', 'Plur']
                    # 1st and 2nd person do not have gender: ഞാൻ ñān, നീ nī; or 3rd person താൻ tān̕
                    if node.feats['Person'] == '3' and not node.lemma == 'താൻ':  # അവൻ avan, അവൾ avaḷ, അത് at, അവർ avaṟ; but not താൻ tān̕
                        rf.append('Deixis')
                        af['Deixis'] = ['Prox', 'Remt']
                        if node.feats['Number'] == 'Sing':
                            rf.append('Gender')
                            af['Gender'] = ['Masc', 'Fem', 'Neut']
                            # third person singular neuter pronouns also distinguish animacy (animate neuter are animals and plants, they have a different accusative form)
                            if node.feats['Gender'] == 'Neut':
                                rf.append('Animacy')
                                af['Animacy'] = ['Anim', 'Inan']
                        else:  # plural pronouns do not distinguish gender but they do distinguish animacy
                            rf.append('Animacy')
                            af['Animacy'] = ['Anim', 'Inan']
                    elif node.feats['Person'] == '1' and node.feats['Number'] == 'Plur':
                        rf.append('Clusivity')
                        af['Clusivity'] = ['In', 'Ex']
            # Interrogative pronouns, too, can be case-marked. Therefore, the
            # base form must have Case=Nom.
            # ആര് ār "who" (Nom) എന്ത് ent "what" (Nom, Acc.Inan)
            # ആരെ āre "who" (Acc) എന്തെ ente "what" (Acc.Anim) എന്തിനെ entine "what" (Acc.Anim or maybe Inan but optional)
            # ആരുടെ āruṭe "who" (Gen) എന്തിന് entin "what" (Gen) or "why"
            # ആരൊക്കെ ārokke "who" (Dat?) എന്തൊക്കെ entokke "what" (Dat?)
            #elif node.feats['PronType'] == 'Int':
            #    rf.append('Animacy')
            #    af['Animacy'] = ['Anim', 'Inan']
            self.check_required_features(node, rf)
            self.check_allowed_features(node, af)
        # DETERMINERS ##########################################################
        elif node.upos == 'DET':
            if node.feats['PronType'] == 'Art':
                self.check_required_features(node, ['PronType', 'Definite'])
                self.check_allowed_features(node, {
                    'PronType': ['Art'],
                    'Definite': ['Ind'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
            else:
                self.check_required_features(node, ['PronType'])
                self.check_allowed_features(node, {
                    'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'],
                    'Deixis': ['Prox', 'Remt'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
        # NUMERALS #############################################################
        elif node.upos == 'NUM':
            self.check_required_features(node, ['NumType', 'NumForm'])
            # Arabic digits and Roman numerals do not have inflection features.
            if re.match(r'^(Digit|Roman)$', node.feats['NumForm']):
                self.check_allowed_features(node, {
                    'NumType': ['Card'],
                    'NumForm': ['Digit', 'Roman'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
            else:
                self.check_required_features(node, ['NumType', 'NumForm', 'Case'])
                self.check_allowed_features(node, {
                    'NumType': ['Card', 'Frac'],
                    'NumForm': ['Word'],
                    'Number': ['Plur'],
                    'Case': ['Nom', 'Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
        # VERBS ################################################################
        elif node.upos == 'VERB':
            self.check_required_features(node, ['VerbForm'])
            if node.feats['VerbForm'] == 'Inf':
                self.check_allowed_features(node, {
                    'VerbForm': ['Inf'],
                    'Polarity': ['Pos', 'Neg'],
                    'Voice': ['Act', 'Pass', 'Cau'],
                    'Foreign': ['Yes'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
            elif node.feats['VerbForm'] == 'Fin':
                if node.feats['Mood'] == 'Imp':
                    # Unlike other forms, the imperative distinguishes politeness.
                    # The verb stem serves as an informal imperative: തുറ tuṟa "open"
                    # The citation form may serve as a formal imperative: തുറക്കുക tuṟakkūka "open"
                    # Finally, there is another formal imperative with -kkū: തുറക്കൂ tuṟakkū "open"
                    self.check_required_features(node, ['Mood', 'Polite'])
                    self.check_allowed_features(node, {
                        'Aspect': ['Imp', 'Perf', 'Prog'],
                        'VerbForm': ['Fin'],
                        'Mood': ['Imp'],
                        'Polarity': ['Pos', 'Neg'],
                        'Polite': ['Infm', 'Form'],
                        'Abbr': ['Yes'],
                        'Foreign': ['Yes'],
                        'Typo': ['Yes']
                    })
                elif node.feats['Mood'] == 'Nec':
                    self.check_required_features(node, ['Mood', 'Voice'])
                    self.check_allowed_features(node, {
                        'Aspect': ['Imp', 'Perf', 'Prog'],
                        'VerbForm': ['Fin'],
                        'Mood': ['Nec'],
                        'Polarity': ['Pos', 'Neg'],
                        'Voice': ['Act', 'Pass', 'Cau'],
                        'Abbr': ['Yes'],
                        'Foreign': ['Yes'],
                        'Typo': ['Yes']
                    })
                else:
                    self.check_required_features(node, ['Mood', 'Tense', 'Voice'])
                    self.check_allowed_features(node, {
                        'Aspect': ['Imp', 'Perf', 'Prog'],
                        'VerbForm': ['Fin'],
                        'Mood': ['Ind', 'Pot', 'Cnd'],
                        'Tense': ['Past', 'Imp', 'Pres', 'Fut'],  # only in indicative
                        'Polarity': ['Pos', 'Neg'],
                        'Voice': ['Act', 'Pass', 'Cau'],
                        'Abbr': ['Yes'],
                        'Foreign': ['Yes'],
                        'Typo': ['Yes']
                    })
            elif node.feats['VerbForm'] == 'Part':
                self.check_required_features(node, ['Tense'])
                self.check_allowed_features(node, {
                    'Aspect': ['Imp', 'Perf', 'Prog'],
                    'VerbForm': ['Part'],
                    'Tense': ['Past'],
                    'Polarity': ['Pos', 'Neg'],
                    'Voice': ['Act', 'Pass', 'Cau'],
                    'Abbr': ['Yes'],
                    'Foreign': ['Yes'],
                    'Typo': ['Yes']
                })
            else:  # verbal noun
                # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice.
                # Currently both forms are VerbForm=Vnoun.
                #self.check_required_features(node, ['Tense', 'Voice'])
                self.check_allowed_features(node, {
                    'Aspect': ['Imp', 'Perf', 'Prog'],
                    'VerbForm': ['Vnoun'],
                    'Tense': ['Past', 'Pres'],
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Polarity': ['Pos', 'Neg'],
                    'Voice': ['Act', 'Pass', 'Cau'],
                    # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix.
                    'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                    'Abbr': ['Yes'],
                    'Foreign': ['Yes'],
                    'Typo': ['Yes']
                })
        # AUXILIARIES ##########################################################
        elif node.upos == 'AUX':
            self.check_required_features(node, ['VerbForm'])
            if node.feats['VerbForm'] == 'Fin':
                if node.feats['Mood'] == 'Imp':
                    self.check_required_features(node, ['Mood'])
                    self.check_allowed_features(node, {
                        'Aspect': ['Imp', 'Perf', 'Prog'],
                        'VerbForm': ['Fin'],
                        'Mood': ['Imp'],
                        'Polarity': ['Pos', 'Neg'],
                        'Abbr': ['Yes'],
                        'Typo': ['Yes']
                    })
                else:  # indicative or subjunctive
                    self.check_required_features(node, ['Mood', 'Tense'])
                    self.check_allowed_features(node, {
                        'Aspect': ['Imp', 'Perf', 'Prog'],
                        'VerbForm': ['Fin'],
                        'Mood': ['Ind', 'Sub', 'Cnd'],
                        'Tense': ['Past', 'Imp', 'Pres', 'Fut'],  # only in indicative
                        'Polarity': ['Pos', 'Neg'],
                        'Abbr': ['Yes'],
                        'Typo': ['Yes']
                    })
            else:  # verbal noun
                # The "actual Malayalam verbal noun" (unlike the "nominalized form") does not inflect for Tense and Voice.
                # Currently both forms are VerbForm=Vnoun.
                #self.check_required_features(node, ['Tense', 'Voice'])
                self.check_allowed_features(node, {
                    'Aspect': ['Imp', 'Perf', 'Prog'],
                    'VerbForm': ['Vnoun'],
                    'Tense': ['Past', 'Pres'],
                    'Gender': ['Masc', 'Fem', 'Neut'],
                    'Polarity': ['Pos', 'Neg'],
                    # We only annotate case of verbal nouns if it is not Nom, i.e., there is an actual case suffix.
                    'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                    'Abbr': ['Yes'],
                    'Typo': ['Yes']
                })
        # ADVERBS ##############################################################
        elif node.upos == 'ADV':
            if node.feats['PronType'] != '':
                # Pronominal adverbs are neither compared nor negated.
                self.check_allowed_features(node, {
                    'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot'],
                    'Typo': ['Yes']
                })
            else:
                # The remaining adverbs are neither pronominal, nor compared or
                # negated.
                self.check_allowed_features(node, {'Typo': ['Yes']})
        # ADPOSITIONS ##########################################################
        elif node.upos == 'ADP':
            self.check_allowed_features(node, {
                # Case suffixes after numbers are separate tokens, they are attached
                # via the 'case' relation and they bear the Case feature (the number does not).
                'Case': ['Gen', 'Dat', 'Ben', 'Acc', 'Voc', 'Loc', 'Abl', 'Ins', 'Cmp', 'Com', 'All'],
                'Abbr': ['Yes'],
                'Typo': ['Yes']})
        # PARTICLES ############################################################
        elif node.upos == 'PART':
            self.check_allowed_features(node, {
                'Polarity': ['Neg'],
                'Abbr': ['Yes'],
                'Typo': ['Yes']
            })
        # THE REST: NO FEATURES ################################################
        else:
            self.check_allowed_features(node, {'Abbr': ['Yes'], 'Typo': ['Yes']})
diff --git a/udapi/block/ud/mr/addformsinmwt.py b/udapi/block/ud/mr/addformsinmwt.py
new file mode 100644
index 00000000..bd63ee7d
--- /dev/null
+++ b/udapi/block/ud/mr/addformsinmwt.py
@@ -0,0 +1,94 @@
+"""
+Block ud.mr.AddFormsInMwt looks for multiword tokens whose words lack forms.
+Based on the form of the surface token and on the information provided in
+the lemmas and UPOS, tries to reconstruct the forms of individual words.
+"""
+from udapi.core.block import Block
+import re
+import logging
+
+
class AddFormsInMwt(Block):
    """Guess forms of syntactic words within a multiword token.

    For every word whose FORM is '_' and which belongs to a multiword token,
    try to reconstruct the word form from the surface form of the token and
    from the lemmas and UPOS of the participating words.
    """

    def process_node(self, node):
        if node.form == '_' and node.multiword_token:
            mwt = node.multiword_token
            # Many multiword tokens consist of NOUN + ADP. Beware: The adposition
            # may have a form different from its lemma. It happens with possessive
            # postpositions चा, चे, which distinguish the gender and number of
            # the possessed entity.
            if len(mwt.words) == 2 and re.match(r'^(ADP|PART)$', mwt.words[1].upos):
                # Occasionally the lemma of the possessive postposition is mistakenly 'ची' instead of 'चा'.
                if mwt.words[1].lemma == 'चा' or mwt.words[1].lemma == 'ची':
                    mwt.words[1].lemma = 'चा'
                    # चा (cā) ... Masc Sing
                    # ची (cī) ... Fem Sing, Neut Plur
                    # चे (ce) ... Neut Sing, Masc Plur
                    # च्या (cyā) ... Fem Plur
                    # चं (caṁ) ... ?
                    m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)$', mwt.form)
                    # The resulting form is different with personal pronouns.
                    # माझा (mājhā), माझी (mājhī), माझे (mājhe), माझ्या (mājhyā)
                    # तुझी (tujhī), तुझे (tujhe)
                    # आपला (āpalā), आपली (āpalī), आपल्या (āpalyā)
                    # त्याचं (tyācaṁ)
                    m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)$', mwt.form)
                    if m:
                        if node == mwt.words[0]:
                            node.form = m.group(1)
                        else:
                            node.form = m.group(2)
                    elif m2:
                        if node == mwt.words[0]:
                            node.form = m2.group(1)
                        else:
                            node.form = 'च' + m2.group(2)
                    else:
                        logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma))
                elif mwt.words[1].lemma == 'वरती':
                    # वर (vara) and वरती (varatī) "on, above" can both occur on the surface.
                    m = re.match(r'^(.+)(वर(?:ती)?)$', mwt.form)
                    if m:
                        if node == mwt.words[0]:
                            node.form = m.group(1)
                        else:
                            node.form = m.group(2)
                    else:
                        logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma))
                else:  # not the possessive 'चा'
                    # Assume the adposition appears on the surface identical to its lemma.
                    m = re.match(r'^(.+)' + mwt.words[1].lemma + r'$', mwt.form)
                    if m:
                        if node == mwt.words[0]:
                            node.form = m.group(1)
                        else:
                            node.form = node.lemma
                    else:
                        logging.info("Cannot decompose %s+ADP multiword token '%s'. Part lemmas are '%s' and '%s'." % (mwt.words[0].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma))
            elif len(mwt.words) == 3 and re.match(r'^(ADP|PART)$', mwt.words[1].upos) and re.match(r'^(ADP|PART)$', mwt.words[2].upos):
                # Compound postpositions where the middle word is the possessive 'चा'.
                # The lemma of the middle word should be 'चा' but sometimes it is 'च्या'.
                if re.match(r'^(चा|च्या)$', mwt.words[1].lemma):
                    m = re.match(r'^(.+)(चा|ची|चे|च्या|चं)(.+)$', mwt.form)
                    m2 = re.match(r'^(माझ|तुझ|आपल)(ा|ी|े|्या)(.+)$', mwt.form)
                    if m:
                        if node == mwt.words[0]:
                            node.form = m.group(1)
                        elif node == mwt.words[1]:
                            node.form = m.group(2)
                            node.lemma = 'चा'
                        else:
                            node.form = m.group(3)
                    elif m2:
                        if node == mwt.words[0]:
                            node.form = m2.group(1)
                        elif node == mwt.words[1]:
                            node.form = 'च' + m2.group(2)
                            node.lemma = 'चा'
                        else:
                            node.form = m2.group(3)
                    else:
                        # Bug fix: the third lemma slot used to print mwt.words[1].lemma twice;
                        # it must be the lemma of the third word.
                        logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[2].lemma))
                else:
                    # Bug fix: same wrong lemma index as above.
                    logging.info("Cannot decompose %s+%s+%s multiword token '%s'. Part lemmas are '%s', '%s', and '%s'." % (mwt.words[0].upos, mwt.words[1].upos, mwt.words[2].upos, mwt.form, mwt.words[0].lemma, mwt.words[1].lemma, mwt.words[2].lemma))
            else:
                logging.info("Cannot decompose multiword token '%s' of %d parts: %s" % (mwt.form, len(mwt.words), str([x.lemma + '/' + x.upos for x in mwt.words])))
diff --git a/udapi/block/ud/printfixed.py b/udapi/block/ud/printfixed.py
new file mode 100644
index 00000000..313943bb
--- /dev/null
+++ b/udapi/block/ud/printfixed.py
@@ -0,0 +1,104 @@
+"""
+Block PrintFixed prints occurrences of fixed multiword expressions in UD. It
+can be run twice in a row, first collecting known fixed expressions and then
+also reporting other occurrences of these expressions where they are not
+annotated as fixed.
+
+Usage:
+udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt
+udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less
+
+Author: Dan Zeman
+"""
+from udapi.core.block import Block
+import re
+import logging
+
class PrintFixed(Block):
    """
    Print fixed multiword expressions.

    With only_forms=1 it prints just the lowercased word forms, which is
    suitable for building a list of known expressions. With a list supplied
    via known_expressions, it additionally reports occurrences of known
    expressions that are NOT annotated as fixed.
    """

    def __init__(self, only_forms=False, known_expressions=None, **kwargs):
        """
        Create the PrintFixed block.

        Parameters:
        only_forms=1: print the word forms but not tags and other info;
            This can be used to create the list of known forms that we want to
            identify even if they are not annotated as fixed.
        known_expressions: the name of the text file with the expressions
        """
        super().__init__(**kwargs)
        self.only_forms = only_forms
        self.known_expressions = {}   # expression string -> number of times read
        self.first_words = {}         # first word of each known expression (fast pre-filter)
        self.max_length = 2           # length (in words) of the longest known expression
        if known_expressions:
            # Bug fix: the file handle used to be left open; a with-block
            # guarantees it is closed even if reading fails.
            n = 0
            with open(known_expressions, 'r', encoding='utf-8') as fh:
                for expression in fh:
                    expression = expression.replace('\n', '')
                    if expression in self.known_expressions:
                        self.known_expressions[expression] += 1
                    else:
                        self.known_expressions[expression] = 1
                    logging.info("Read known fixed expression '%s'" % expression)
                    n += 1
                    words = expression.split(' ')
                    first_word = words[0]
                    self.first_words[first_word] = 1
                    length = len(words)
                    if length > self.max_length:
                        self.max_length = length
            logging.info('Read %d known fixed expressions.' % n)

    def process_node(self, node):
        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
        if len(fixed_children) > 0:
            # Fixed children are always to the right of the parent. But there
            # may be other nodes in between that are not fixed children (for
            # example, there may be punctuation that is attached to one of the
            # fixed nodes).
            n = node
            list_of_forms = [node.form.lower()]
            list_of_tags = [node.upos]
            while n != fixed_children[-1]:
                n = n.next_node
                if n.parent == node and n.udeprel == 'fixed':
                    list_of_forms.append(n.form.lower())
                    list_of_tags.append(n.upos)
                else:
                    # A gap inside the expression is represented by 'X'.
                    list_of_forms.append('X')
                    list_of_tags.append('X')
            forms = ' '.join(list_of_forms)
            tags = ' '.join(list_of_tags)
            if self.only_forms:
                print(forms)
            else:
                print("%s / %s / %s" % (forms, tags, node.deprel))
        else:
            # If this is not the first word of a fixed expression, check whether
            # something that looks like a known fixed expression starts here.
            # Note that it is also possible that a known expression starts here
            # but only a subset is actually marked as such; we currently do not
            # account for this.
            if node.form.lower() in self.first_words:
                n = node
                list_of_forms = [node.form.lower()]
                list_of_tags = [node.upos]
                for _ in range(self.max_length - 1):
                    n = n.next_node
                    if not n:
                        break
                    ###!!! At present we cannot identify known expressions with gaps ('X').
                    list_of_forms.append(n.form.lower())
                    list_of_tags.append(n.upos)
                    forms = ' '.join(list_of_forms)
                    if forms in self.known_expressions:
                        if self.only_forms:
                            print(forms)
                        else:
                            tags = ' '.join(list_of_tags)
                            print("%s / %s / NOT FIXED" % (forms, tags))
                        break
diff --git a/udapi/block/ud/pt/__init__.py b/udapi/block/ud/pt/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/ud/pt/addhyphenmwt.py b/udapi/block/ud/pt/addhyphenmwt.py
new file mode 100644
index 00000000..9492b1a2
--- /dev/null
+++ b/udapi/block/ud/pt/addhyphenmwt.py
@@ -0,0 +1,37 @@
+"""Block ud.pt.AddHyphenMwt for transforming hyphen compounds into multiword tokens in Portuguese-GSD.
+
+See https://github.com/UniversalDependencies/UD_Portuguese-GSD/issues/39
+"""
+from udapi.core.block import Block
+
class AddHyphenMwt(Block):
    """Merge hyphenated compounds (word-hyphen-word...) into multiword tokens."""

    def _ok(self, token):
        # A compound part must be purely alphanumeric.
        # The hyphen in "al-Assad" perhaps should be kept as a separate word.
        return token.form.isalnum() and token.form.lower() != 'al'

    def process_tree(self, root):
        tokens, i = root.token_descendants, 1
        while i+1 < len(tokens):
            # start_i points at the first part of a potential compound.
            start_i = i-1
            if tokens[i].form == "-" and self._ok(tokens[i-1]) and self._ok(tokens[i+1]):
                # Extend the compound over further "-word" repetitions.
                while i+3 < len(tokens) and tokens[i+2].form == "-" and self._ok(tokens[i+3]):
                    i += 2
                compound, words = tokens[start_i:i+2], []
                for token in compound:
                    words += token.words
                # heads = words whose parent lies outside the compound;
                # cuckolds = non-head words with at least one child outside.
                heads = [w for w in words if w.parent not in words]
                cuckolds = [w for w in words if w not in heads and any(c not in words for c in w.children)]
                if len(heads) > 1:
                    # The compound is not a catena; only mark it for manual inspection.
                    for h in heads:
                        h.misc["ToDo"] = 'NonCatenaCompound'
                elif cuckolds:
                    for c in cuckolds:
                        c.misc["ToDo"] = 'HasChildrenOutsideCompound'
                else:
                    # Safe to merge: drop the hyphen nodes (every odd position)
                    # and wrap the remaining words in one multiword token.
                    compound_form = "".join(t.form for t in compound)
                    for hyphen in compound[1::2]:
                        hyphen.remove()
                    root.create_multiword_token([w for w in words if w.form != '-'], compound_form)
                    # Invalidate the cached sentence text; it will be recomputed.
                    root.text = None
            i += 1
diff --git a/udapi/block/ud/pt/addmwt.py b/udapi/block/ud/pt/addmwt.py
new file mode 100644
index 00000000..daa605b2
--- /dev/null
+++ b/udapi/block/ud/pt/addmwt.py
@@ -0,0 +1,148 @@
+"""Block ud.pt.AddMwt for heuristic detection of Portuguese contractions.
+
+According to the UD guidelines, contractions such as "dele" = "de ele"
+should be annotated using multi-word tokens.
+
+Note that this block should be used only for converting legacy conllu files.
+Ideally a tokenizer should have already split the MWTs.
+"""
+import udapi.block.ud.addmwt
+
# Table of Portuguese contractions: surface form -> attributes of the two
# syntactic words it should be split into. 'form' and 'lemma' are
# space-separated values for the two words; 'upos'/'deprel'/'feats' defaults
# are filled in by the loop below.
MWTS = {
    'à': {'form': 'a a', 'lemma': 'a o'},
    'às': {'form': 'a as', 'lemma': 'a o'},
    'ao': {'form': 'a o', 'lemma': 'a o'},
    'aos': {'form': 'a os', 'lemma': 'a o'},
    'da': {'form': 'de a', 'lemma': 'de o'},
    'das': {'form': 'de as', 'lemma': 'de o'},
    'dessa': {'form': 'de essa', 'lemma': 'de esse'},
    'dessas': {'form': 'de essas', 'lemma': 'de esse'},
    'desse': {'form': 'de esse', 'lemma': 'de esse'},
    'desses': {'form': 'de esses', 'lemma': 'de esse'},
    'desta': {'form': 'de esta', 'lemma': 'de este'},
    'destas': {'form': 'de estas', 'lemma': 'de este'},
    'deste': {'form': 'de este', 'lemma': 'de este'},
    'destes': {'form': 'de estes', 'lemma': 'de este'},
    'disso': {'form': 'de isso', 'lemma': 'de este'},
    'disto': {'form': 'de isto', 'lemma': 'de este'},
    'do': {'form': 'de o', 'lemma': 'de o'},  # 'upos': 'ADP PRON', 'deprel': 'case *''
    'dos': {'form': 'de os', 'lemma': 'de o'},
    'dum': {'form': 'de um', 'lemma': 'de um'},
    'duma': {'form': 'de uma', 'lemma': 'de um'},
    'dumas': {'form': 'de umas', 'lemma': 'de um'},
    'duns': {'form': 'de uns', 'lemma': 'de um'},
    'na': {'form': 'em a', 'lemma': 'em o'},
    'nas': {'form': 'em as', 'lemma': 'em o'},  # ADP PRON
    'nesses': {'form': 'em esses', 'lemma': 'em esse'},
    'nesta': {'form': 'em esta', 'lemma': 'em este'},
    'neste': {'form': 'em este', 'lemma': 'em este'},
    'nisso': {'form': 'em isso', 'lemma': 'em este'},
    # NOTE(review): 'nisto' overrides upos/main/shape while the parallel
    # 'nisso' relies on the defaults — confirm this asymmetry is intentional.
    'nisto': {'form': 'em isto', 'lemma': 'em este',
              'upos': 'ADP PRON', 'main': 1, 'shape': 'subtree'},
    'no': {'form': 'em o', 'lemma': 'em o'},  # PRON cases are excluded below
    'nos': {'form': 'em os', 'lemma': 'em o'},  # PRON cases are excluded below
    'num': {'form': 'em um', 'lemma': 'em um'},
    'numa': {'form': 'em uma', 'lemma': 'em um'},
    'numas': {'form': 'em umas', 'lemma': 'em um'},
    'nuns': {'form': 'em uns', 'lemma': 'em um'},
    'pela': {'form': 'por a', 'lemma': 'por o'},
    'pelas': {'form': 'por as', 'lemma': 'por o'},
    'pelos': {'form': 'por os', 'lemma': 'por o'},
    'pelo': {'form': 'por o', 'lemma': 'por o'},
    # TODO daí = de aí = ADP ADV = case advmod
}

# shared values for all entries in MWTS
for v in MWTS.values():
    if not v.get('upos'):
        v['upos'] = 'ADP DET'
    if not v.get('deprel'):
        v['deprel'] = 'case det'
    v['feats'] = '_ *'
    # The following are the default values
    # v['main'] = 0 # which of the two words will inherit the original children (if any)
    # v['shape'] = 'siblings', # the newly created nodes will be siblings

# ADP + personal pronoun contractions (dele = de ele etc.); the pronoun is
# the syntactic head, hence main=1 and shape='subtree'.
for pronoun in 'ela ele eles elas'.split():
    MWTS['d' + pronoun] = {
        'form': 'de ' + pronoun,
        'lemma': 'de ' + pronoun,
        'upos': 'ADP PRON',
        'deprel': 'case *',
        'main': 1,
        'shape': 'subtree',
    }
+
+
class AddMwt(udapi.block.ud.addmwt.AddMwt):
    """Detect and mark MWTs (split them into words and add the words to the tree)."""

    def multiword_analysis(self, node):
        """Return a dict with MWT info or None if `node` does not represent a multiword token."""

        # "no" can be either a contraction of "em o", or a pronoun
        if node.form.lower() in ('no', 'nos') and node.upos == 'PRON':
            return

        analysis = MWTS.get(node.form.lower(), None)

        # If the input is e.g.:
        # 1 na _ ADP _ _ deprel_x ?
        # 2 verdade _ NOUN _ _ fixed 1
        # The expected output is:
        # 1-2 na _ _ _ _ _ _
        # 1 em _ ADP _ _ deprel_x ?
        # 2 a _ DET _ _ fixed 1
        # 3 verdade _ NOUN _ _ fixed 1
        if analysis and analysis['deprel'] == 'case det' and node.udeprel != 'case':
            # The contraction itself is not a case marker here, so the first
            # word keeps the original deprel ('*') and the DET becomes its child.
            copy = dict(analysis)
            copy['deprel'] = '* det'
            copy['shape'] = 'subtree'
            first_child = next((c for c in node.children if node.precedes(c)), None)
            if first_child is not None and first_child.udeprel == 'fixed':
                copy['deprel'] = '* fixed'
            return copy
        if analysis is not None:
            return analysis

        # Verb + clitic contractions spelled with a hyphen.
        # TODO(review): feminine clitics ('-la', '-las') are not handled here —
        # confirm whether they occur in the data.
        if node.form.lower().endswith('-se') and node.upos == 'VERB':
            return {
                'form': node.form.lower()[:-3] + ' se',
                'lemma': '* se',
                'upos': '* PRON',
                'feats': '* _',
                'deprel': '* nsubj',  # or '* expl'
                'main': 0,
                'shape': 'subtree',
            }
        elif node.form.lower().endswith('-lo') and node.upos == 'VERB':
            return {
                'form': node.form.lower()[:-3] + ' lo',
                'lemma': '* ele',
                'upos': '* PRON',
                'feats': '* _',
                'deprel': '* obj',
                'main': 0,
                'shape': 'subtree',
            }
        elif node.form.lower().endswith('-los') and node.upos == 'VERB':
            return {
                'form': node.form.lower()[:-4] + ' los',
                'lemma': '* eles',
                'upos': '* PRON',
                'feats': '* _',
                'deprel': '* obj',
                'main': 0,
                'shape': 'subtree',
            }
        elif node.form.lower().endswith('-o') and node.upos == 'VERB':
            return {
                'form': node.form.lower()[:-2] + ' o',
                'lemma': '* ele',
                'upos': '* PRON',
                'feats': '* _',
                'deprel': '* obj',
                'main': 0,
                'shape': 'subtree',
            }
        return None
diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py
new file mode 100644
index 00000000..99c37b4d
--- /dev/null
+++ b/udapi/block/ud/removemwt.py
@@ -0,0 +1,38 @@
+"""Block ud.RemoveMwt for removing multi-word tokens."""
+from udapi.core.block import Block
+
+
class RemoveMwt(Block):
    """Substitute MWTs with one word representing the whole MWT.

    The first word of each multiword token takes over the token's surface
    form and MISC, plus UPOS/FEATS/DEPREL guessed from all member words;
    the remaining words are removed and their children re-attached.
    """

    def process_tree(self, root):
        for mwt in root.multiword_tokens:
            members = mwt.words
            survivor = members[0]
            survivor.form = mwt.form
            survivor.misc = mwt.misc
            survivor.upos = self.guess_upos(members)
            survivor.feats = self.guess_feats(members)
            survivor.deprel = self.guess_deprel(members)
            # Drop the MWT wrapper first, then the now-redundant words.
            mwt.remove()
            for redundant in members[1:]:
                redundant.remove(children='rehang')

    @staticmethod
    def guess_upos(words):
        """UPOS of the whole MWT"""
        return words[0].upos

    @staticmethod
    def guess_deprel(words):
        """DEPREL of the whole MWT"""
        return words[0].deprel
        # Alternatively, we could define deprel subtypes
        # return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]])

    @staticmethod
    def guess_feats(words):
        """FEATS of the whole MWT"""
        merged = words[0].feats
        for other in words[1:]:
            merged.update(other.feats)
        return merged
diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py
new file mode 100644
index 00000000..14d16464
--- /dev/null
+++ b/udapi/block/ud/ro/fixfixed.py
@@ -0,0 +1,20 @@
+"""Block ud.ro.FixFixed
+
+Author: Dan Zeman
+"""
+import logging
+
+from udapi.core.block import Block
+
+
class FixFixed(Block):
    """Block for fixing annotation of some 'fixed' expressions."""

    def process_node(self, node):
        # Only heads of fixed expressions without an ExtPos are of interest.
        fixed_members = [child for child in node.children if child.udeprel == 'fixed']
        if not fixed_members:
            return
        if node.feats['ExtPos'] != '':
            return
        if node.udeprel == 'advmod':
            node.feats['ExtPos'] = 'ADV'
        else:
            logging.info('Another case: ' + node.lemma + ' ' + ' '.join([x.form for x in fixed_members]))
diff --git a/udapi/block/ud/ro/fixneg.py b/udapi/block/ud/ro/fixneg.py
index a22131b2..68888aa6 100644
--- a/udapi/block/ud/ro/fixneg.py
+++ b/udapi/block/ud/ro/fixneg.py
@@ -6,13 +6,14 @@
from udapi.core.block import Block
+
class FixNeg(Block):
"""Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Romanian."""
def process_node(self, node):
if node.deprel == "neg":
if node.upos == "PRON" and node.form == "ne":
- node.feats = 'Polarity=Neg' # delete other features
+ node.feats = 'Polarity=Neg' # delete other features
elif node.upos != "ADJ":
logging.warning("Strange node %s with deprel=neg", node)
node.upos = "ADV"
diff --git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py
index 80bfda8f..6c4b27e3 100644
--- a/udapi/block/ud/ro/setspaceafter.py
+++ b/udapi/block/ud/ro/setspaceafter.py
@@ -1,7 +1,8 @@
"""Block ud.ro.SetSpaceAfter for heuristic setting of SpaceAfter=No in Romanian.
-Usage:
-udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu
+Usage::
+
+ udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu
Author: Martin Popel
"""
@@ -9,17 +10,21 @@
import udapi.block.ud.setspaceafter
+
class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter):
"""Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian.
Romanian uses many contractions, e.g.
- raw | meaning | tokenized | lemmatized
- -------|---------|-----------|-----------
- n-ar | nu ar | n- ar | nu avea
- să-i | să îi | să -i | să el
- într-o | în o | într- o | întru un
- nu-i | nu îi | nu -i | nu el
- nu-i | nu e | nu -i | nu fi
+
+ ======= ======= ========= ==========
+ raw meaning tokenized lemmatized
+ ======= ======= ========= ==========
+ n-ar nu ar n- ar nu avea
+ să-i să îi să -i să el
+ într-o în o într- o întru un
+ nu-i nu îi nu -i nu el
+ nu-i nu e nu -i nu fi
+ ======= ======= ========= ==========
Detokenization is quite simple: no space after word-final hyphen and before word-initial hyphen.
There are just two exceptions, I have found:
@@ -33,7 +38,7 @@ def process_tree(self, root):
# Mark contractions like -i, -și, -l, -urilor, but not negative numbers like -12,3.
# Store SpaceAfter=No to the previous node.
- next_form = nodes[i+1].form
+ next_form = nodes[i + 1].form
if re.match('-.*[^0-9,.]', next_form):
self.mark_no_space(node)
diff --git a/udapi/block/ud/ru/fixedeprels.py b/udapi/block/ud/ru/fixedeprels.py
new file mode 100644
index 00000000..6fa73460
--- /dev/null
+++ b/udapi/block/ud/ru/fixedeprels.py
@@ -0,0 +1,279 @@
+"""Block to fix case-enhanced dependency relations in Russian."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixEdeprels(Block):
+
+ # Sometimes there are multiple layers of case marking and only the outermost
+ # layer should be reflected in the relation. For example, the semblative 'как'
+ # is used with the same case (preposition + morphology) as the nominal that
+ # is being compared ('как_в:loc' etc.) We do not want to multiply the relations
+ # by all the inner cases.
+ # The list in the value contains exceptions that should be left intact.
+ outermost = {
+ 'более_чем': [],
+ 'будто': [],
+ 'ведь': [],
+ 'ежели': [],
+ 'если': [],
+ 'как': ['как_только'],
+ 'когда': [],
+ 'кроме_как': [],
+ 'менее_чем': [],
+ 'минус': [],
+ 'нежели': [],
+ 'плюс': [],
+ 'пока': [],
+ 'поскольку': [],
+ 'потому_что': [],
+ 'пусть': [],
+ 'равно_как': [],
+ 'раз': [],
+ 'словно': [],
+ 'так_что': [],
+ 'хоть': [],
+ 'хотя': [],
+ 'чем': [],
+ 'что': [],
+ 'чтобы': [],
+ 'яко': []
+ }
+
+ # Secondary prepositions sometimes have the lemma of the original part of
+ # speech. We want the grammaticalized form instead. List even those that
+ # will have the same lexical form, as we also want to check the morphological
+ # case. And include all other prepositions that have unambiguous morphological
+ # case, even if they are not secondary.
+ unambiguous = {
+ 'versus': 'версус:nom',
+ 'loc': 'в:loc',
+ 'в_вид': 'в_виде:gen',
+ 'в_во_глава': 'в:acc', # annotation error: 'входил в группу во главе с геологом'
+ 'в_для': 'в:acc',
+ 'в_качество': 'в_качестве:gen',
+ 'в_отношение': 'в_отношении:gen',
+ 'в_с': 'в:loc', # annotation error: 'в партнерстве с ACCELS' lacks the second level
+ 'в_связь_с': 'в_связи_с:ins',
+ 'в_случай_если': 'в_случае_если',
+ 'в_случай_когда': 'в_случае_когда',
+ 'в_соответствие_с': 'в_соответствии_с:ins',
+ 'в_течение': 'в_течение:gen',
+ 'в_то_быть': 'в:loc',
+ 'в_тот_время_как': 'в_то_время_как',
+ 'в_угода': 'в_угоду:dat',
+ 'в_ход': 'в_ходе:gen',
+ 'вблизи': 'вблизи:gen',
+ 'взамен': 'взамен:gen',
+ 'вместо': 'вместо:gen',
+ 'во_глава': 'во_главе_с:ins',
+ 'во_глава_с': 'во_главе_с:ins',
+ 'во_избежание': 'во_избежание:gen',
+ 'возле': 'возле:gen',
+ 'вокруг': 'вокруг:gen',
+ 'вплоть_до': 'вплоть_до:gen',
+ 'вроде': 'вроде:gen',
+ 'выше': 'выше:gen',
+ 'для': 'для:gen',
+ 'для_в': 'для:gen',
+ 'до_то_как': 'до:gen', # до того, как ...
+ 'за_исключение': 'за_исключением:gen',
+ 'из_более_чем': 'из:gen',
+ 'к': 'к:dat',
+ 'ко': 'ко:dat',
+ 'коли_скоро': 'коль_скоро',
+ 'кроме': 'кроме:gen',
+ 'между_во_глава': 'между:ins', # annotation error: 'между делегацией Минобороны во главе с замминистра Владимиром Исаковым и лидером Приднестровья Игорем Смирновым'
+ 'на_вперед': 'на:acc',
+ 'над': 'над:ins', # at least I have not encountered any genuine example of accusative
+ 'насчет': 'насчет:gen',
+ 'несмотря_на': 'несмотря_на:acc',
+ 'ниже': 'ниже:gen',
+ 'около': 'около:gen',
+ 'от_до': 'от:gen',
+ 'от_от': 'от:gen',
+ 'от_с': 'от:gen',
+ 'относительно': 'относительно:gen',
+ 'перед': 'перед:ins',
+ 'по_мера': 'по_мере:gen',
+ 'по_мера_то_как': 'по_мере_того_как',
+ 'по_отношение_ко?': 'по_отношению_к:dat',
+ 'по_повод': 'по_поводу:gen',
+ 'по_сравнение_с': 'по_сравнению_с:ins',
+ 'помимо': 'помимо:gen',
+ 'порядка': 'порядка:gen',
+ 'после': 'после:gen',
+ 'посредством_как': 'посредством:gen',
+ 'при': 'при:loc',
+ 'при_помощь': 'при_помощи:gen',
+ 'при_условие_что': 'при_условии_что',
+ 'про': 'про:acc',
+ 'против': 'против:gen',
+ 'с_более_чем': 'с:gen',
+ 'с_во_глава': 'с:ins',
+ 'с_на': 'с:par',
+ 'с_помощь': 'с_помощью:gen',
+ 'с_тем': 'с:ins',
+ 'с_тот_пора_как': 'с_тех_пор_как',
+ 'с_что': 'с:ins',
+ 'свыше': 'свыше:gen',
+ 'со_сторона': 'со_стороны:gen',
+ 'согласно': 'согласно:dat',
+ 'спустя': 'спустя:acc',
+ 'среди': 'среди:gen',
+ 'среди_в': 'среди:gen',
+ 'так_чтобы': 'чтобы',
+ 'тем_между': 'между:ins',
+ 'у': 'у:gen',
+ 'у_без': 'у:gen',
+ 'через': 'через:acc',
+ 'чтоб': 'чтобы'
+ }
+
+ def copy_case_from_adposition(self, node, adposition):
+ """
+ In some treebanks, adpositions have the Case feature and it denotes the
+ valency case that the preposition's nominal must be in.
+ """
+        # The following is only a partial solution. We will not see
+ # some children because they may be shared children of coordination.
+ prepchildren = [x for x in node.children if x.lemma == adposition]
+ if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
+ return adposition+':'+prepchildren[0].feats['Case'].lower()
+ else:
+ return None
+
+ def process_node(self, node):
+ """
+ Occasionally the edeprels automatically derived from the Russian basic
+ trees do not match the whitelist. For example, the noun is an
+ abbreviation and its morphological case is unknown.
+ """
+ for edep in node.deps:
+ # Although in theory allowed by the EUD guidelines, Russian does not enhance the ccomp relation with case markers.
+ edep['deprel'] = re.sub(r'^ccomp:чтобы$', r'ccomp', edep['deprel'])
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel'])
+ if m:
+ bdeprel = m.group(1)
+ solved = False
+ # If the marker is 'быть', discard it. It represents the phrase 'то есть', which should not be analyzed as introducing a subordinate clause.
+ edep['deprel'] = re.sub(r':(быть|сколь|столько|типа).*', '', edep['deprel'])
+ # Some markers should be discarded only if they occur as clause markers (acl, advcl).
+ edep['deprel'] = re.sub(r'^(advcl|acl(?::relcl)?):(в|вместо|при)$', r'\1', edep['deprel'])
+ # Some markers should not occur as clause markers (acl, advcl) and should be instead considered nominal markers (nmod, obl).
+ edep['deprel'] = re.sub(r'^advcl:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'obl:\1\2', edep['deprel'])
+ edep['deprel'] = re.sub(r'^acl(?::relcl)?:(взамен|для|до|из|на|насчет|от|перед|по|после|с|среди|у)(:|$)', r'nmod:\1\2', edep['deprel'])
+ # If the case marker starts with 'столько', remove this part.
+ # It occurs in the expressions of the type 'сколько...столько' but the real case marker of the modifier is something else.
+ # Similarly, 'то' occurs in 'то...то' and should be removed.
+ edep['deprel'] = re.sub(r':(столько|то|точно)[_:]', ':', edep['deprel'])
+ # If one of the following expressions occurs followed by another preposition
+ # or by morphological case, remove the additional case marking. For example,
+ # 'словно_у' becomes just 'словно'.
+ for x in self.outermost:
+ exceptions = self.outermost[x]
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel'])
+ if m and m.group(2) and not x+m.group(2) in exceptions:
+ edep['deprel'] = m.group(1)+':'+x
+ solved = True
+ break
+ if solved:
+ continue
+ for x in self.unambiguous:
+ # All secondary prepositions have only one fixed morphological case
+ # they appear with, so we can replace whatever case we encounter with the correct one.
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'(?::(?:nom|gen|par|dat|acc|voc|loc|ins))?$', edep['deprel'])
+ if m:
+ edep['deprel'] = m.group(1)+':'+self.unambiguous[x]
+ solved = True
+ break
+ if solved:
+ continue
+ # The following prepositions have more than one morphological case
+ # available.
+ m = re.match(r'^(obl(?::arg)?|nmod):(до|из|от)(?::(?:nom|dat|acc|voc|loc|ins))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Genitive or partitive are possible. Pick genitive.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':gen'
+ continue
+ # Both "на" and "в" also occur with genitive. However, this
+ # is only because there are numerals in the phrase ("в 9 случаев из 10")
+ # and the whole phrase should not be analyzed as genitive.
+ m = re.match(r'^(obl(?::arg)?|nmod):(в|во|на|о)(?::(?:nom|gen|dat|voc|ins))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Accusative or locative are possible. Pick locative.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc'
+ continue
+ # Unlike in Czech, 'над' seems to allow only instrumental and not accusative.
+ m = re.match(r'^(obl(?::arg)?|nmod):(за|под)(?::(?:nom|gen|dat|voc|loc))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Accusative or instrumental are possible. Pick accusative.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':acc'
+ continue
+ m = re.match(r'^(obl(?::arg)?|nmod):(между)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Genitive or instrumental are possible. Pick genitive.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':gen'
+ continue
+ m = re.match(r'^(obl(?::arg)?|nmod):(по)(?::(?:nom|gen|voc|ins))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Dative, accusative or locative are possible. Pick dative.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':dat'
+ continue
+ m = re.match(r'^(obl(?::arg)?|nmod):(с)(?::(?:nom|dat|acc|voc|loc))?$', edep['deprel'])
+ if m:
+ adpcase = self.copy_case_from_adposition(node, m.group(2))
+ if adpcase:
+ edep['deprel'] = m.group(1)+':'+adpcase
+ else:
+ # Genitive or instrumental are possible. Pick instrumental.
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':ins'
+ continue
+ if re.match(r'^(nmod|obl):', edep['deprel']):
+ if edep['deprel'] == 'nmod:loc' and node.parent.feats['Case'] == 'Loc' or edep['deprel'] == 'nmod:voc' and node.parent.feats['Case'] == 'Voc':
+ # This is a same-case noun-noun modifier, which just happens to be in the locative.
+                    # (Example carried over from the analogous Czech block: in 'v Ostravě-Porubě', 'Porubě' is
+                    # attached to 'Ostravě' as nmod:v:loc, but the locative on 'Porubě' itself says nothing significant.)
+ edep['deprel'] = 'nmod'
+ elif edep['deprel'] == 'nmod:loc':
+ edep['deprel'] = 'nmod:nom'
+ elif edep['deprel'] == 'nmod:voc':
+ edep['deprel'] = 'nmod:nom'
+
+ def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+ '''
+ Modifies the incoming relation of a node both in the basic tree and in
+ the enhanced graph. If the node does not yet depend in the enhanced
+ graph on the current basic parent, the new relation will be added without
+ removing any old one. If the node already depends multiple times on the
+ current basic parent in the enhanced graph, all such enhanced relations
+ will be removed before adding the new one.
+ '''
+ old_parent = node.parent
+ node.parent = parent
+ node.deprel = deprel
+ node.deps = [x for x in node.deps if x['parent'] != old_parent]
+ new_edep = {}
+ new_edep['parent'] = parent
+ new_edep['deprel'] = edeprel
+ node.deps.append(new_edep)
diff --git a/udapi/block/ud/ru/fixremnant.py b/udapi/block/ud/ru/fixremnant.py
index d94b0e5c..b41431db 100644
--- a/udapi/block/ud/ru/fixremnant.py
+++ b/udapi/block/ud/ru/fixremnant.py
@@ -4,6 +4,7 @@
"""
from udapi.core.block import Block
+
class FixRemnant(Block):
"""ad-hoc fixing the remaining cases (after ud.Convert1to2) of deprel=remnant in UD_Russian."""
diff --git a/udapi/block/ud/ru/fixtoest.py b/udapi/block/ud/ru/fixtoest.py
new file mode 100644
index 00000000..1b603e96
--- /dev/null
+++ b/udapi/block/ud/ru/fixtoest.py
@@ -0,0 +1,35 @@
+"""Block to fix annotation of то есть in Russian."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixToEst(Block):
+
+ def process_node(self, node):
+ """
+ In the converted data from Kira, the fixed expression "то есть" ("that is")
+ is treated as a subordinator and attached as "mark", which later makes it
+ part of complex enhanced relation labels. I believe that this analysis is
+ wrong and that it will be better to label these expressions as "cc".
+ """
+ if node.udeprel == 'mark' and node.lemma == 'то':
+ if len([c for c in node.children if c.udeprel == 'fixed' and c.lemma == 'быть']) > 0:
+ self.set_basic_and_enhanced(node, node.parent, 'cc', 'cc')
+
+ def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+ '''
+ Modifies the incoming relation of a node both in the basic tree and in
+ the enhanced graph. If the node does not yet depend in the enhanced
+ graph on the current basic parent, the new relation will be added without
+ removing any old one. If the node already depends multiple times on the
+ current basic parent in the enhanced graph, all such enhanced relations
+ will be removed before adding the new one.
+ '''
+ old_parent = node.parent
+ node.parent = parent
+ node.deprel = deprel
+ node.deps = [x for x in node.deps if x['parent'] != old_parent]
+ new_edep = {}
+ new_edep['parent'] = parent
+ new_edep['deprel'] = edeprel
+ node.deps.append(new_edep)
diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py
index 00193770..04c9fffb 100644
--- a/udapi/block/ud/setspaceafter.py
+++ b/udapi/block/ud/setspaceafter.py
@@ -9,13 +9,15 @@
from udapi.core.block import Block
+
class SetSpaceAfter(Block):
"""Block for heuristic setting of the SpaceAfter=No MISC attribute."""
- def __init__(self, not_after='¡¿([{„', not_before='.,;:!?}])', fix_text=True, **kwargs):
+ def __init__(self, not_after='¡ ¿ ( [ { „ /', not_before='. , ; : ! ? } ] ) / ?? ??? !! !!! ... …',
+ fix_text=True, extra_not_after='', extra_not_before='', **kwargs):
super().__init__(**kwargs)
- self.not_after = not_after
- self.not_before = not_before
+ self.not_after = (not_after + ' ' + extra_not_after).split(' ')
+ self.not_before = (not_before + ' ' + extra_not_before).split(' ')
self.fix_text = fix_text
self.changed = False
@@ -25,7 +27,7 @@ def process_tree(self, root):
self.changed = False
# Undirected double quotes are ambiguous.
- # If there is an even number of quotes in a sentence, supposed they are not nested
+ # If there is an even number of quotes in a sentence, suppose they are not nested
# and treat odd-indexed ones as opening and even-indexed ones as closing.
# Otherwise (odd number, e.g. when quoting multiple sentences), don't remove any space.
matching_quotes = not bool(count_of_form['"'] % 2)
@@ -35,22 +37,25 @@ def process_tree(self, root):
# Some languages use directed „quotes“ and some “quotes”,
# so the symbol “ (U+201C) is ambiguous and we heuristically check for presence of „.
if count_of_form['„']:
- not_before += '“'
+ not_before += ['“']
else:
- not_after += '“'
+ not_after += ['“']
for i, node in enumerate(nodes[:-1]):
- next_form = nodes[i+1].form
+ next_form = nodes[i + 1].form
if node.form in self.not_after or next_form in not_before:
self.mark_no_space(node)
- if matching_quotes and node.form == '"':
- if odd_indexed_quote:
+ if node.form == '"':
+ if matching_quotes:
+ if odd_indexed_quote:
+ self.mark_no_space(node)
+ elif i:
+ self.mark_no_space(nodes[i - 1])
+ odd_indexed_quote = not odd_indexed_quote
+ elif i==0:
self.mark_no_space(node)
- elif i:
- self.mark_no_space(nodes[i-1])
- odd_indexed_quote = not odd_indexed_quote
- if matching_quotes and nodes[-1].form == '"':
+ if nodes[-1].form == '"':
self.mark_no_space(nodes[-2])
if self.fix_text and self.changed:
diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py
index 0c4d8d9d..ec7ab658 100644
--- a/udapi/block/ud/setspaceafterfromtext.py
+++ b/udapi/block/ud/setspaceafterfromtext.py
@@ -9,13 +9,19 @@
from udapi.core.block import Block
+
class SetSpaceAfterFromText(Block):
"""Block for setting of the SpaceAfter=No MISC attribute according to the sentence text."""
def process_tree(self, root):
+ # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid.
+ for empty_node in root.empty_nodes:
+ del empty_node.misc['SpaceAfter']
+
text = root.text
- computed = root.compute_text()
- if text == computed:
+ if text is None:
+ raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root)
+ if text == root.compute_text():
return
for node in root.token_descendants:
diff --git a/udapi/block/ud/settranslation.py b/udapi/block/ud/settranslation.py
new file mode 100644
index 00000000..487cca06
--- /dev/null
+++ b/udapi/block/ud/settranslation.py
@@ -0,0 +1,59 @@
+"""
+Block SetTranslation for setting of sentence-level translation (the attribute
+text_en for English translation) from a separate text file (one sentence per
+line). For example, one can export the original sentences using write.SentencesHtml,
+then Google-translate them in the web browser, then CTRL+C CTRL+V to a plain
+text editor, save them as translations.txt and import them using this block.
+
+Usage:
+udapy -s ud.SetTranslation file=translations.txt < in.conllu > out.conllu
+
+Author: Dan Zeman
+"""
+from udapi.core.block import Block
+import re
+import logging
+
+class SetTranslation(Block):
+ """
+ Set text_en to the next available translation.
+ """
+
+ def __init__(self, file, overwrite=False, **kwargs):
+ """
+ Create the SetTranslation block.
+
+ Parameters:
+ file: the name of the text file with the translations (one sentence per line)
+ overwrite=1: set the translation even if the sentence already has one
+ (default: do not overwrite existing translations)
+ """
+ super().__init__(**kwargs)
+ self.file = file
+ fh = open(self.file, 'r', encoding='utf-8')
+ self.trlines = fh.readlines()
+ self.nlines = len(self.trlines)
+ self.iline = 0
+ self.overwrite = overwrite
+
+ def process_tree(self, tree):
+ if self.iline < self.nlines:
+ translation = self.trlines[self.iline]
+ self.iline += 1
+ comments = []
+ if tree.comment:
+ comments = tree.comment.split('\n')
+ i_tr = -1
+ for i in range(len(comments)):
+ # The initial '#' character has been stripped.
+ if re.match(r'\s*text_en\s*=', comments[i]):
+ i_tr = i
+ break
+ if i_tr >= 0:
+ if self.overwrite:
+ comments[i_tr] = ' text_en = ' + translation
+ else:
+ comments.append(' text_en = ' + translation)
+ tree.comment = '\n'.join(comments)
+ elif self.iline == self.nlines:
+ logging.warning('There are only %d translation lines but there are more input sentences.' % self.nlines)
diff --git a/udapi/block/ud/sk/fixedeprels.py b/udapi/block/ud/sk/fixedeprels.py
new file mode 100644
index 00000000..7de53881
--- /dev/null
+++ b/udapi/block/ud/sk/fixedeprels.py
@@ -0,0 +1,138 @@
+"""Block to fix case-enhanced dependency relations in Slovak."""
+from udapi.core.block import Block
+import re
+
+class FixEdeprels(Block):
+
+ # Secondary prepositions sometimes have the lemma of the original part of
+ # speech. We want the grammaticalized form instead. List even those that
+ # will have the same lexical form, as we also want to check the morphological
+ # case. And include all other prepositions that have unambiguous morphological
+ # case, even if they are not secondary.
+ unambiguous = {
+ 'a_hoci': 'hoci',
+ 'ako': 'ako', # remove morphological case
+ 'ako_na': 'ako',
+ 'ako_z': 'ako',
+ 'akoby_z': 'z:gen',
+ 'akže': 'ak',
+ 'ani_keby': 'keby',
+ 'ani_keď': 'keď',
+ 'až_keď': 'keď',
+ 'do': 'do:gen',
+ 'k': 'k:dat',
+ 'kto': 'kým', ###!!! The lemma should be fixed! The pronoun has grammaticalized as a subordinator.
+ 'mimo': 'mimo:gen',
+ 'na_rozdiel_od': 'na_rozdiel_od:gen',
+ 'na_základ': 'na_základe:gen',
+ 'od': 'od:gen',
+ 'pod_vplyv': 'pod_vplyvom:gen',
+ 'pomoc': 'pomocou:gen',
+ 'pre': 'pre:acc',
+ 'prostredníctvom': 'prostredníctvom:gen',
+ 'prv_ako': 'ako',
+ 's': 's:ins',
+ 's_cieľ': 's_cieľom', # no case, used with infinitives (advcl)
+ 's_dôraz_na': 's_dôrazom_na:acc',
+ 's_ohľad_na': 's_ohľadom_na:acc',
+ 's_pomoc': 's_pomocou:gen',
+ 'smer_k': 'smerom_k:dat',
+ 'spoločne_s': 'spoločne_s:ins',
+ 'spolu_s': 'spolu_s:ins',
+ 'v_dôsledok': 'v_dôsledku:gen',
+ 'v_meno': 'v_mene:gen',
+ 'v_oblasť': 'v_oblasti:gen',
+ 'v_porovnanie_s': 'v_porovnaní_s:ins',
+ 'v_porovnaniu_s': 'v_porovnaní_s:ins',
+ 'v_priebeh': 'v_priebehu:gen',
+ 'v_prípad': 'v_prípade:gen',
+ 'v_prospech': 'v_prospech:gen',
+ 'v_rámec': 'v_rámci:gen',
+ 'v_spolupráca_s': 'v_spolupráci_s:ins',
+ 'v_súlad_s': 'v_súlade_s:ins',
+ 'v_súvislosť_s': 'v_súvislosti_s:ins',
+ 'v_ústrety': 'v_ústrety:dat',
+ 'v_vzťah_k': 'vo_vzťahu_k:dat',
+ 'v_závislosť_na': 'v_závislosti_na:loc',
+ 'vzhľad_na': 'vzhľadom_na:acc',
+ 'z': 'z:gen',
+ 'z_hľadisko': 'z_hľadiska:gen',
+ 'začiatkom': 'začiatkom:gen'
+ }
+
+ def process_node(self, node):
+ """
+ Occasionally the edeprels automatically derived from the Slovak basic
+ trees do not match the whitelist. For example, the noun is an
+ abbreviation and its morphological case is unknown.
+ """
+ for edep in node.deps:
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):', edep['deprel'])
+ if m:
+ bdeprel = m.group(1)
+ solved = False
+ for x in self.unambiguous:
+ # All secondary prepositions have only one fixed morphological case
+ # they appear with, so we can replace whatever case we encounter with the correct one.
+ m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl):'+x+r'(?::(?:nom|gen|dat|acc|voc|loc|ins))?$', edep['deprel'])
+ if m:
+ edep['deprel'] = m.group(1)+':'+self.unambiguous[x]
+ solved = True
+ break
+ # The following prepositions have more than one morphological case
+ # available. Thanks to the Case feature on prepositions, we can
+ # identify the correct one.
+ if not solved:
+ m = re.match(r'^(obl(?::arg)?|nmod):(medzi|na|o|po|pred|v|za)(?::(?:nom|gen|dat|voc))?$', edep['deprel'])
+ if m:
+                    # The following is only a partial solution. We will not see
+ # some children because they may be shared children of coordination.
+ prepchildren = [x for x in node.children if x.lemma == m.group(2)]
+ if len(prepchildren) > 0 and prepchildren[0].feats['Case'] != '':
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':'+prepchildren[0].feats['Case'].lower()
+ solved = True
+ # If we failed to identify the case of the preposition in the
+ # preceding steps, pick a default. It applies mostly to 'o'
+ # with wrongly split time values.
+ if not solved:
+ m = re.match(r'^(obl(?::arg)?|nmod):o$', edep['deprel'])
+ if m:
+ edep['deprel'] = m.group(1)+':o:acc'
+ solved = True
+ m = re.match(r'^(obl(?::arg)?|nmod):(po|v)$', edep['deprel'])
+ if m:
+ edep['deprel'] = m.group(1)+':'+m.group(2)+':loc'
+ solved = True
+ # Some cases do not occur with nominal modifiers without preposition.
+ # If we see them, chances are that it is the same-case modifier,
+ # and the same case just happens to be the one we see. For vocatives,
+ # it is also possible that they have been confused with nominatives.
+ if not solved:
+ m = re.match(r'^(obl(?::arg)?|nmod):(voc|loc)$', edep['deprel'])
+ if m:
+ edep['deprel'] = m.group(1)
+ solved = True
+ # Annotation and conversion errors.
+ if not solved:
+ # Povedal som jej „na zdorovie“.
+ if edep['deprel'] == 'obl:arg:na' and node.form == 'zdorovie':
+ self.set_basic_and_enhanced(node, edep['parent'], 'ccomp', 'ccomp')
+ solved = True
+
+ def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
+ '''
+ Modifies the incoming relation of a node both in the basic tree and in
+ the enhanced graph. If the node does not yet depend in the enhanced
+ graph on the current basic parent, the new relation will be added without
+ removing any old one. If the node already depends multiple times on the
+ current basic parent in the enhanced graph, all such enhanced relations
+ will be removed before adding the new one.
+ '''
+ old_parent = node.parent
+ node.parent = parent
+ node.deprel = deprel
+ node.deps = [x for x in node.deps if x['parent'] != old_parent]
+ new_edep = {}
+ new_edep['parent'] = parent
+ new_edep['deprel'] = edeprel
+ node.deps.append(new_edep)
diff --git a/udapi/block/ud/splittoken.py b/udapi/block/ud/splittoken.py
new file mode 100644
index 00000000..16c60a38
--- /dev/null
+++ b/udapi/block/ud/splittoken.py
@@ -0,0 +1,107 @@
+"""
+Block ud.SplitToken will split a given token into multiple tokens.
+"""
+from udapi.core.block import Block
+import re
+import logging
+
+
+class SplitToken(Block):
+ """
+ Split a token into two or more. A MISC attribute is used to mark the tokens
+ that should be split. (The attribute may have been set by an annotator or
+ by a previous block that tests the specific conditions under which splitting
+ is desired.) Multiword tokens are currently not supported: The node to be
+ split cannot belong to a MWT. Note that the result will not be a MWT either
+ (use the block ud.AddMwt if that is desired). There will be simply a new
+ attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes
+ (indicating that this was an error in the source text).
+ """
+
+ def __init__(self, misc_name='SplitToken', **kwargs):
+ """
+ Args:
+ misc_name: name of the MISC attribute that can trigger the splitting
+ default: SplitToken
+ The value of the attribute should indicate where to split the token.
+ It should be a string that is identical to node.form except that
+            there are one or more spaces where the token should be split.
+ """
+ super().__init__(**kwargs)
+ self.misc_name = misc_name
+
+ def process_node(self, node):
+ """
+ The SplitToken (or equivalent) attribute in MISC will trigger action.
+ Either the current node will be split to multiple nodes and the
+ attribute will be removed from MISC, or a warning will be issued that
+ the splitting cannot be done and the attribute will stay in MISC. Note
+ that multiword token lines and empty nodes are not even scanned for
+ the attribute, so if it is there, it will stay there but no warning
+ will be printed.
+ """
+ value = node.misc[self.misc_name]
+ if value == '':
+ return
+ if node.multiword_token:
+ logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")
+ node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
+ return
+ ###!!! This block currently must not be applied on data containing
+ ###!!! enhanced dependencies. We must first implement adjustments of
+ ###!!! the enhanced structure.
+ if node.deps:
+ logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
+ # Verify that the value of the MISC attribute can be used as specification
+ # of the split.
+ if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):
+ logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")
+ node.misc['Bug'] = f'{self.misc_name}BadValue'
+ return
+ if re.search(r'\s', node.form):
+ logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")
+ node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
+ return
+ if re.sub(r' ', '', value) != node.form:
+ logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.")
+ node.misc['Bug'] = f'{self.misc_name}BadValue'
+ return
+ # Do the split.
+ space_after = node.misc['SpaceAfter']
+ forms = value.split(' ')
+ # Optionally, SplitTokenMorpho in MISC can have the morphological annotation
+ # of the new tokens. For example:
+ # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act
+ if node.misc['SplitTokenMorpho'] != '':
+ morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ')
+ del node.misc['SplitTokenMorpho']
+ else:
+ morphoblocks = ['' for x in forms]
+ node.form = forms[0]
+ last_node = node
+ for form, morpho in zip(forms[1:], morphoblocks[1:]):
+ last_node.misc['SpaceAfter'] = 'No'
+ last_node.misc['CorrectSpaceAfter'] = 'Yes'
+ lemma = form
+ upos = node.upos
+ feats = str(node.feats)
+ xpos = node.xpos
+ if morpho != '':
+ cols = morpho.split('\\t')
+ for c in cols:
+ colname, value = c.split('=', 1)
+ if colname == 'LEMMA':
+ lemma = value
+ elif colname == 'UPOS':
+ upos = value
+ elif colname == 'FEATS':
+ feats = re.sub(r'\\p', '|', value)
+ elif colname == 'XPOS':
+ xpos = value
+ else:
+ logging.fatal(f"c = {c}")
+ new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep')
+ new_node.shift_after_node(last_node)
+ last_node = new_node
+ last_node.misc['SpaceAfter'] = space_after
+ del node.misc[self.misc_name]
diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py
index 25caeb3b..44575e0c 100644
--- a/udapi/block/ud/splitunderscoretokens.py
+++ b/udapi/block/ud/splitunderscoretokens.py
@@ -8,6 +8,7 @@
import logging
from udapi.core.block import Block
+
class SplitUnderscoreTokens(Block):
"""Block for spliting tokens with underscores and attaching the new nodes using deprel=flat.
@@ -22,7 +23,7 @@ class SplitUnderscoreTokens(Block):
Real-world use cases: UD_Irish (`default_deprel=fixed`) and UD_Czech-CLTT v1.4.
"""
- def __init__(self, deprel=None, default_deprel='flat', **kwargs):
+ def __init__(self, deprel=None, default_deprel='flat', lemma='split', **kwargs):
"""Create the SplitUnderscoreTokens block instance.
Args:
@@ -30,14 +31,21 @@ def __init__(self, deprel=None, default_deprel='flat', **kwargs):
Most common values are: flat, fixed, compound. Default=None.
default_deprel: Which deprel to use for the newly created nodes if the heuristics
in `deprel_for()` method fail. Default=flat.
+ lemma: What to do with the lemmas?
+ - 'split' (the default) means to split them on underscores as well
+ (and warn in case of a different number of underscores than in the form).
+ - 'form' means to copy the forms to the lemmas
"""
super().__init__(**kwargs)
self.deprel = deprel
self.default_deprel = default_deprel
+ self.lemma = lemma
def process_node(self, node):
if node.form != '_' and '_' in node.form:
forms = node.form.split('_')
+ if self.lemma == 'form':
+ node.lemma = node.form
lemmas = node.lemma.split('_')
if len(forms) != len(lemmas):
logging.warning("Different number of underscores in %s and %s, skipping.",
diff --git a/udapi/block/ud/ug/fixspuriousaux.py b/udapi/block/ud/ug/fixspuriousaux.py
new file mode 100644
index 00000000..952644f8
--- /dev/null
+++ b/udapi/block/ud/ug/fixspuriousaux.py
@@ -0,0 +1,46 @@
+"""Block to convert spurious auxiliaries to lexical verbs in Uyghur."""
+from udapi.core.block import Block
+import logging
+import re
+
+class FixSpuriousAux(Block):
+
+ def process_node(self, node):
+ """
+ Some verbs that are called auxiliary by the traditional grammar, should
+ be analyzed in UD as VERB + non-finite xcomp.
+ """
+ # Sometimes there is a double error: it should not be auxiliary, it is
+ # attached as aux but it is not tagged AUX. So we only look at the deprel.
+ if node.udeprel == 'aux':
+ # بەر/بار = give (used with actions done for the benefit of somebody)
+ # چىق = go out
+ # چىقىش = come out
+ # يۈر = walk (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur)
+ # ئولتۇر = sit (the equivalent in Kazakh is considered to be a progressive auxiliary but it does not seem to be the case in Uyghur)
+ # باق = do ever?
+ # ئۆت = pass
+ # كۆرۈش = see
+ # باشلى = start
+ # يەت = be enough
+ # قايت = return
+ # چۈش = fall down
+ # قىل = do
+ # چاپ = jump
+ # قورق = fear
+ # كەلتۈر = cause
+ # كىر = enter
+ # _ ... some putative auxiliaries do not even have a lemma
+ if re.match(r'^(بەر|بار|چىق|چىقىش|يۈر|ئولتۇر|باق|ئۆت|_|كۆرۈش|باشلى|يەت|قايت|چۈش|قىل|چاپ|قورق|كەلتۈر|كىر)$', node.lemma):
+ node.upos = 'VERB'  # re-tag: this word acts as a full lexical verb
+ # The auxiliary inherits the incoming relation of its original parent.
+ lexverb = node.parent
+ node.parent = lexverb.parent  # reattach where the lexical verb was attached
+ node.deprel = lexverb.deprel
+ # The auxiliary also inherits some but not all children of the lexical verb.
+ for c in lexverb.children:
+ if re.match(r'^(nsubj|csubj|obl|advmod|advcl|vocative|discourse|parataxis|punct)$', c.udeprel):
+ c.parent = node  # subjects, modifiers and punctuation follow the new head
+ # The lexical verb becomes an xcomp of the auxiliary.
+ lexverb.parent = node
+ lexverb.deprel = 'xcomp'  # non-finite complement of the promoted verb
diff --git a/udapi/block/ud/yue/lemmatize.py b/udapi/block/ud/yue/lemmatize.py
new file mode 100644
index 00000000..87279dc1
--- /dev/null
+++ b/udapi/block/ud/yue/lemmatize.py
@@ -0,0 +1,43 @@
+"""Block to add missing lemmas in cases where it seems obvious what the lemma should be."""
+from udapi.core.block import Block
+import logging
+import re
+
+class Lemmatize(Block):
+
+ # dictionary: form --> lemma (class-level normalization table, shared by all instances)
+ lemma = {
+ '𡃁仔': '笭仔',
+ '仲': '重',
+ '企': '徛',
+ '係咪': '係',
+ '出嚟': '出唻',
+ '可': '可以',
+ '啦': '喇',
+ '㗎喇': '㗎嘑',
+ '喇': '嘑',
+ '嚟': '唻',
+ '就嚟': '就唻',
+ '死𡃁妹': '死笭妹',
+ '老豆': '老頭',
+ '蚊': '緡',
+ '蛋撻': '蛋澾',
+ '返嚟': '返唻',
+ '過嚟人': '過唻人',
+ '過嚟': '過唻'
+ }
+
+ def process_node(self, node):
+ """
+ Parts of the Cantonese treebank lack lemmas. Fortunately, lemmatization
+ of Sino-Tibetan languages is pretty straightforward most of the time,
+ as the lemma typically equals to the actual word form.
+
+ For Cantonese, lemmatization includes normalization of some characters.
+ These are the few cases where lemma differs from the surface form.
+ """
+ if node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes':  # precedence: '' always triggers; '_' only when form != '_' and Typo != 'Yes'
+ if node.form in self.lemma:
+ node.lemma = self.lemma[node.form]  # use the character-normalized lemma
+ else:
+ node.lemma = node.form  # default: lemma equals the surface form
diff --git a/udapi/block/ud/zh/lemmatize.py b/udapi/block/ud/zh/lemmatize.py
new file mode 100644
index 00000000..abacf29f
--- /dev/null
+++ b/udapi/block/ud/zh/lemmatize.py
@@ -0,0 +1,81 @@
+"""Block to add missing lemmas in cases where it seems obvious what the lemma should be."""
+from udapi.core.block import Block
+import logging
+import re
+
+class Lemmatize(Block):
+
+ def __init__(self, rewrite='empty', **kwargs):
+ """
+ Create the ud.zh.Lemmatize block instance.
+
+ Args:
+ rewrite=empty: set the lemma if it was empty so far; do not touch the rest
+ rewrite=form: set the lemma if it was empty or equal to form; do not touch the rest
+ rewrite=all: set the lemma regardless of what it was previously
+ """
+ super().__init__(**kwargs)
+ if not re.match(r'^(empty|form|all)$', rewrite):
+ raise ValueError("Unexpected value of parameter 'rewrite'")
+ self.rewrite = rewrite
+
+ # dictionary: form --> lemma (class-level table, shared by all instances)
+ lemma = {
+ # The plural suffix -men.
+ '我們': '我', # trad
+ '我们': '我', # simp
+ '他們': '他', # trad
+ '他们': '他', # simp
+ '它們': '它', # trad
+ '它们': '它', # simp
+ '牠們': '牠', # trad
+ '她們': '她', # trad
+ '她们': '她', # simp
+ '人們': '人', # trad
+ '人们': '人' # simp
+ }
+
+ def process_node(self, node):
+ """
+ Parts of the Chinese treebanks lack lemmas. Fortunately, lemmatization
+ of Sino-Tibetan languages is pretty straightforward most of the time,
+ as the lemma typically equals to the actual word form.
+ """
+ if self.rewrite == 'empty' and not (node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):  # note: 'and' binds tighter than 'or'
+ return
+ elif self.rewrite == 'form' and not (node.lemma == node.form or node.lemma == '' or node.lemma == '_' and node.form != '_' and node.feats['Typo'] != 'Yes'):
+ return
+ # Lemmatize negated verbs to their affirmative forms.
+ # 不是 bùshì = not be
+ # 沒有 没有 méiyǒu = not exist
+ # 沒能 没能 méinéng = cannot
+ # 未能 wèinéng = cannot
+ # Lemmatize question verbs to their base forms.
+ # 要不要 yàobùyào = do (you) want?
+ # 有没有 yǒuméiyǒu = do (you) have?
+ # Verbs that are derived from the copula and tagged as the copula need
+ # to have the lemma of the copula (是 shì 爲 為 为 wèi/wéi).
+ # 亦為 亦为 yìwèi = also
+ # 則為 则为 zéwèi = then
+ # 更為 更为 gèngwèi = more
+ # 認為 认为 rènwéi = think, believe
+ # 以為 以为 yǐwéi = think, believe
+ # 以爲 以为 yǐwéi = think, believe
+ if re.match(r'^(AUX|VERB)$', node.upos):
+ m1 = re.match(r'^([不没沒未])(.+)$', node.form)
+ m2 = re.match(r'^(.+)([不没沒未])\1$', node.form)
+ m3 = re.search(r'([是爲為为])', node.form)  # first copula character found anywhere in the form
+ if m1:
+ node.lemma = m1.group(2)
+ node.feats['Polarity'] = 'Neg'
+ elif m2:
+ node.lemma = m2.group(1)
+ node.feats['Mood'] = 'Int'
+ elif m3:
+ node.lemma = m3.group(1)
+ if node.lemma == '爲':  # normalize the traditional variant 爲 to 為
+ node.lemma = '為'
+ elif node.form in self.lemma:
+ node.lemma = self.lemma[node.form]
+ else:
+ node.lemma = node.form  # default: lemma equals the surface form
diff --git a/udapi/block/udpipe/__init__.py b/udapi/block/udpipe/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py
new file mode 100644
index 00000000..9d053cb7
--- /dev/null
+++ b/udapi/block/udpipe/base.py
@@ -0,0 +1,270 @@
+"""Block udpipe.Base for tagging and parsing using UDPipe."""
+from udapi.core.block import Block
+from udapi.tool.udpipeonline import UDPipeOnline
+from udapi.core.bundle import Bundle
+
+# Import UDPipe only if available (requires ufal.udpipe)
+try:
+ from udapi.tool.udpipe import UDPipe
+ UDPIPE_AVAILABLE = True
+except ImportError:
+ UDPIPE_AVAILABLE = False
+
+KNOWN_MODELS = {  # language / treebank alias --> path to a UDPipe 2.4 model within the Udapi share
+ 'af': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe',
+ 'af_afribooms': 'models/udpipe/2.4/afrikaans-afribooms-ud-2.4-190531.udpipe',
+ 'grc': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe',
+ 'grc_perseus': 'models/udpipe/2.4/ancient_greek-perseus-ud-2.4-190531.udpipe',
+ 'grc_proiel': 'models/udpipe/2.4/ancient_greek-proiel-ud-2.4-190531.udpipe',
+ 'ar': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe',
+ 'ar_padt': 'models/udpipe/2.4/arabic-padt-ud-2.4-190531.udpipe',
+ 'hy': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
+ 'hy_armtdp': 'models/udpipe/2.4/armenian-armtdp-ud-2.4-190531.udpipe',
+ 'eu': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
+ 'eu_bdt': 'models/udpipe/2.4/basque-bdt-ud-2.4-190531.udpipe',
+ 'be': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
+ 'be_hse': 'models/udpipe/2.4/belarusian-hse-ud-2.4-190531.udpipe',
+ 'bg': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
+ 'bg_btb': 'models/udpipe/2.4/bulgarian-btb-ud-2.4-190531.udpipe',
+ 'ca': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
+ 'ca_ancora': 'models/udpipe/2.4/catalan-ancora-ud-2.4-190531.udpipe',
+ 'zh': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
+ 'zh_gsd': 'models/udpipe/2.4/chinese-gsd-ud-2.4-190531.udpipe',
+ 'lzh': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
+ 'lzh_kyoto': 'models/udpipe/2.4/classical_chinese-kyoto-ud-2.4-190531.udpipe',
+ 'cop': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
+ 'cop_scriptorium': 'models/udpipe/2.4/coptic-scriptorium-ud-2.4-190531.udpipe',
+ 'hr': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
+ 'hr_set': 'models/udpipe/2.4/croatian-set-ud-2.4-190531.udpipe',
+ 'cs': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
+ 'cs_pdt': 'models/udpipe/2.4/czech-pdt-ud-2.4-190531.udpipe',
+ 'cs_cac': 'models/udpipe/2.4/czech-cac-ud-2.4-190531.udpipe',
+ 'cs_cltt': 'models/udpipe/2.4/czech-cltt-ud-2.4-190531.udpipe',
+ 'cs_fictree': 'models/udpipe/2.4/czech-fictree-ud-2.4-190531.udpipe',
+ 'da': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
+ 'da_ddt': 'models/udpipe/2.4/danish-ddt-ud-2.4-190531.udpipe',
+ 'nl': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
+ 'nl_alpino': 'models/udpipe/2.4/dutch-alpino-ud-2.4-190531.udpipe',
+ 'nl_lassysmall': 'models/udpipe/2.4/dutch-lassysmall-ud-2.4-190531.udpipe',
+ 'en': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
+ 'en_ewt': 'models/udpipe/2.4/english-ewt-ud-2.4-190531.udpipe',
+ 'en_gum': 'models/udpipe/2.4/english-gum-ud-2.4-190531.udpipe',
+ 'en_lines': 'models/udpipe/2.4/english-lines-ud-2.4-190531.udpipe',
+ 'en_partut': 'models/udpipe/2.4/english-partut-ud-2.4-190531.udpipe',
+ 'et_edt': 'models/udpipe/2.4/estonian-edt-ud-2.4-190531.udpipe',
+ 'et_ewt': 'models/udpipe/2.4/estonian-ewt-ud-2.4-190531.udpipe',
+ 'fi': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
+ 'fi_tdt': 'models/udpipe/2.4/finnish-tdt-ud-2.4-190531.udpipe',
+ 'fi_ftb': 'models/udpipe/2.4/finnish-ftb-ud-2.4-190531.udpipe',
+ 'fr_gsd': 'models/udpipe/2.4/french-gsd-ud-2.4-190531.udpipe',
+ 'fr_partut': 'models/udpipe/2.4/french-partut-ud-2.4-190531.udpipe',
+ 'fr_sequoia': 'models/udpipe/2.4/french-sequoia-ud-2.4-190531.udpipe',
+ 'fr_spoken': 'models/udpipe/2.4/french-spoken-ud-2.4-190531.udpipe',
+ 'gl_ctg': 'models/udpipe/2.4/galician-ctg-ud-2.4-190531.udpipe',
+ 'gl_treegal': 'models/udpipe/2.4/galician-treegal-ud-2.4-190531.udpipe',
+ 'de': 'models/udpipe/2.4/german-gsd-ud-2.4-190531.udpipe',
+ 'got': 'models/udpipe/2.4/gothic-proiel-ud-2.4-190531.udpipe',
+ 'el': 'models/udpipe/2.4/greek-gdt-ud-2.4-190531.udpipe',
+ 'he': 'models/udpipe/2.4/hebrew-htb-ud-2.4-190531.udpipe',
+ 'hi': 'models/udpipe/2.4/hindi-hdtb-ud-2.4-190531.udpipe',
+ 'hu': 'models/udpipe/2.4/hungarian-szeged-ud-2.4-190531.udpipe',
+ 'id': 'models/udpipe/2.4/indonesian-gsd-ud-2.4-190531.udpipe',
+ 'ga': 'models/udpipe/2.4/irish-idt-ud-2.4-190531.udpipe',
+ 'it_isdt': 'models/udpipe/2.4/italian-isdt-ud-2.4-190531.udpipe',
+ 'it_partut': 'models/udpipe/2.4/italian-partut-ud-2.4-190531.udpipe',
+ 'it_postwita': 'models/udpipe/2.4/italian-postwita-ud-2.4-190531.udpipe',
+ 'it_vit': 'models/udpipe/2.4/italian-vit-ud-2.4-190531.udpipe',
+ 'ja': 'models/udpipe/2.4/japanese-gsd-ud-2.4-190531.udpipe',
+ 'ko_gsd': 'models/udpipe/2.4/korean-gsd-ud-2.4-190531.udpipe',
+ 'ko_kaist': 'models/udpipe/2.4/korean-kaist-ud-2.4-190531.udpipe',
+ 'la_ittb': 'models/udpipe/2.4/latin-ittb-ud-2.4-190531.udpipe',
+ 'la_perseus': 'models/udpipe/2.4/latin-perseus-ud-2.4-190531.udpipe',
+ 'la_proiel': 'models/udpipe/2.4/latin-proiel-ud-2.4-190531.udpipe',
+ 'lv': 'models/udpipe/2.4/latvian-lvtb-ud-2.4-190531.udpipe',
+ 'lt_alksnis': 'models/udpipe/2.4/lithuanian-alksnis-ud-2.4-190531.udpipe',
+ 'lt_hse': 'models/udpipe/2.4/lithuanian-hse-ud-2.4-190531.udpipe',
+ 'mt': 'models/udpipe/2.4/maltese-mudt-ud-2.4-190531.udpipe',
+ 'mr': 'models/udpipe/2.4/marathi-ufal-ud-2.4-190531.udpipe',
+ 'sme': 'models/udpipe/2.4/north_sami-giella-ud-2.4-190531.udpipe',
+ 'no_bokmaal': 'models/udpipe/2.4/norwegian-bokmaal-ud-2.4-190531.udpipe',
+ 'no_nynorsklia': 'models/udpipe/2.4/norwegian-nynorsklia-ud-2.4-190531.udpipe',
+ 'no_nynorsk': 'models/udpipe/2.4/norwegian-nynorsk-ud-2.4-190531.udpipe',
+ 'cu': 'models/udpipe/2.4/old_church_slavonic-proiel-ud-2.4-190531.udpipe',
+ 'fro': 'models/udpipe/2.4/old_french-srcmf-ud-2.4-190531.udpipe',
+ 'orv': 'models/udpipe/2.4/old_russian-torot-ud-2.4-190531.udpipe',
+ 'fa': 'models/udpipe/2.4/persian-seraji-ud-2.4-190531.udpipe',
+ 'pl_lfg': 'models/udpipe/2.4/polish-lfg-ud-2.4-190531.udpipe',
+ 'pl_pdb': 'models/udpipe/2.4/polish-pdb-ud-2.4-190531.udpipe',
+ 'pt_bosque': 'models/udpipe/2.4/portuguese-bosque-ud-2.4-190531.udpipe',
+ 'pt_gsd': 'models/udpipe/2.4/portuguese-gsd-ud-2.4-190531.udpipe',
+ 'ro_nonstandard': 'models/udpipe/2.4/romanian-nonstandard-ud-2.4-190531.udpipe',
+ 'ro_rrt': 'models/udpipe/2.4/romanian-rrt-ud-2.4-190531.udpipe',
+ 'ru_gsd': 'models/udpipe/2.4/russian-gsd-ud-2.4-190531.udpipe',
+ 'ru_syntagrus': 'models/udpipe/2.4/russian-syntagrus-ud-2.4-190531.udpipe',
+ 'ru_taiga': 'models/udpipe/2.4/russian-taiga-ud-2.4-190531.udpipe',
+ 'sr': 'models/udpipe/2.4/serbian-set-ud-2.4-190531.udpipe',
+ 'sk': 'models/udpipe/2.4/slovak-snk-ud-2.4-190531.udpipe',
+ 'sl_ssj': 'models/udpipe/2.4/slovenian-ssj-ud-2.4-190531.udpipe',
+ 'sl_sst': 'models/udpipe/2.4/slovenian-sst-ud-2.4-190531.udpipe',
+ 'es_ancora': 'models/udpipe/2.4/spanish-ancora-ud-2.4-190531.udpipe',
+ 'es_gsd': 'models/udpipe/2.4/spanish-gsd-ud-2.4-190531.udpipe',
+ 'sv_lines': 'models/udpipe/2.4/swedish-lines-ud-2.4-190531.udpipe',
+ 'sv_talbanken': 'models/udpipe/2.4/swedish-talbanken-ud-2.4-190531.udpipe',
+ 'ta': 'models/udpipe/2.4/tamil-ttb-ud-2.4-190531.udpipe',
+ 'te': 'models/udpipe/2.4/telugu-mtg-ud-2.4-190531.udpipe',
+ 'tr': 'models/udpipe/2.4/turkish-imst-ud-2.4-190531.udpipe',
+ 'uk': 'models/udpipe/2.4/ukrainian-iu-ud-2.4-190531.udpipe',
+ 'ur': 'models/udpipe/2.4/urdu-udtb-ud-2.4-190531.udpipe',
+ 'ug': 'models/udpipe/2.4/uyghur-udt-ud-2.4-190531.udpipe',
+ 'vi': 'models/udpipe/2.4/vietnamese-vtb-ud-2.4-190531.udpipe',
+ 'wo': 'models/udpipe/2.4/wolof-wtb-ud-2.4-190531.udpipe',
+}
+
+
+class Base(Block):
+ """Base class for all UDPipe blocks."""
+
+ # pylint: disable=too-many-arguments
+ def __init__(self, model=None, model_alias=None, online=False,
+ tokenize=True, tag=True, parse=True, resegment=False,
+ ranges=False, delete_nodes=False, **kwargs):
+ super().__init__(**kwargs)
+ self.model, self.model_alias, self.online = model, model_alias, online
+ self._tool = None  # UDPipe wrapper; created lazily by the tool property
+ self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment
+ self.ranges, self.delete_nodes = ranges, delete_nodes
+
+ @property
+ def tool(self):
+ """Return the tool (UDPipe in this case), created lazily."""
+ if self._tool:
+ return self._tool
+ if not self.model:
+ if not self.model_alias:
+ raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!')
+ if self.online:
+ self.model = self.model_alias  # the online service resolves aliases itself
+ else:
+ self.model = KNOWN_MODELS[self.model_alias]  # raises KeyError for unknown aliases
+ if self.online:
+ self._tool = UDPipeOnline(model=self.model)
+ else:
+ if not UDPIPE_AVAILABLE:
+ raise ImportError("UDPipe is not available. Install ufal.udpipe or use online=1")
+ self._tool = UDPipe(model=self.model)
+ return self._tool
+
+ def process_document(self, doc):
+ tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges  # shortcuts
+ if self.zones == "all" and self.online:
+ self.tool.process_document(doc, tok, tag, par, reseg, ranges)  # the online service can handle whole documents in one call
+ return
+ old_bundles = doc.bundles
+ new_bundles = []
+ for bundle in old_bundles:
+ for tree in bundle:
+ new_bundles.append(bundle)  # NOTE(review): appended once per tree — duplicates bundles with multiple zones; confirm intended
+ if self._should_process_tree(tree):
+ if self.delete_nodes:
+ for subroot in tree.children:
+ subroot.remove()  # start the annotation from an empty tree
+ if tok:
+ new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg,
+ tag=tag, parse=par, ranges=ranges)
+ if self.resegment and len(new_trees) > 1:  # UDPipe split the text into more sentences
+ orig_bundle_id = bundle.bundle_id
+ bundle.bundle_id = orig_bundle_id + '-1'  # original bundle keeps suffix -1
+ for i, new_tree in enumerate(new_trees[1:], 2):
+ new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}")
+ new_tree.zone = tree.zone
+ new_bundle.add_tree(new_tree)
+ new_bundles.append(new_bundle)
+ elif not tok and not reseg and (tag or par):
+ self.tool.tag_parse_tree(tree, tag=tag, parse=par)
+ elif not tok and reseg and not tag and not par:
+ sentences = self.tool.segment_text(tree.text)  # resegment-only mode works on the raw sentence text
+ if len(sentences) > 1:
+ orig_bundle_id = bundle.bundle_id
+ bundle.bundle_id = orig_bundle_id + '-1'
+ tree.text = sentences[0]
+ for i, sentence in enumerate(sentences[1:], 2):
+ new_bundle = Bundle(document=doc, bundle_id=f"{orig_bundle_id}-{i}")
+ new_tree = new_bundle.create_tree(zone=tree.zone)
+ new_tree.text = sentence
+ new_bundles.append(new_bundle)
+ else:
+ raise ValueError(f"Unimplemented tokenize={tok} tag={tag} parse={par} resegment={reseg}")
+ doc.bundles = new_bundles  # replace the bundle list (includes bundles created by resegmentation)
+
+'''
+Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD
+
+=head1 SYNOPSIS
+
+ # from the command line
+ echo John loves Mary | udapi.pl Read::Sentences UDPipe::Base model_alias=en Write::TextModeTrees
+
+ # in scenario
+ UDPipe::Base model=/home/me/english-ud-1.2-160523.udpipe
+ UDPipe::Base model_alias=en
+ UDPipe::EN # shortcut for the above
+ UDPipe::EN tokenize=1 tag=1 parse=0
+
+=head1 DESCRIPTION
+
+This block loads L (a wrapper for the UDPipe C++ tool) with
+the given C for analysis into the Universal Dependencies (UD) style.
+UDPipe can do tokenization, tagging (plus lemmatization and universal features)
+and parsing (with deprel labels) and users of this block can select which of the
+subtasks should be done using parameters C, C and C.
+The default is to do all three.
+
+=head1 TODO
+
+UDPipe can also do sentence segmentation, but L does not support it yet.
+
+Similarly with multi-word tokens.
+
+=head1 PARAMETERS
+
+=head2 C
+
+Path to the model file within Udapi share
+(or relative path starting with "./" or absolute path starting with "/").
+This parameter is required if C is not supplied.
+
+=head2 C
+
+The C parameter can be omitted if this parameter is supplied.
+Currently available model aliases are:
+
+B.
+
+They correspond to paths where the language code in the alias is substituted
+with the respective language name, e.g. B expands to
+C.
+
+=head1 tokenize
+
+Do tokenization, i.e. create new nodes with attributes
+C