From 3755ee90d38ec0b1fbc0e9fbbef3ecf7da70fc22 Mon Sep 17 00:00:00 2001 From: Francis Tyers Date: Mon, 20 Feb 2017 22:36:54 +0100 Subject: [PATCH 0001/1374] Update vislcg.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the бир нәрсе problem, but leaves a problem with multiwords... e.g. 2-3 екеуі де _ _ _ _ _ _ _ _ 2 екеуі екеу _ num px3sp|nom 1 appos _ _ 3 де "да _ postadv _ 2 advmod _ _ --- udapi/block/read/vislcg.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 3c5852d7..d5906650 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -23,6 +23,7 @@ def read_tree(self, document=None): break if line[0] == '#': # Are comments allowed in VISL-cg? + # FMT: Yes :) continue if line[0].isspace(): @@ -60,15 +61,17 @@ def read_tree(self, document=None): raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) return root - + @staticmethod def _node(line, root): - fields = shlex.split(line) - lemma = fields[0] - xpos = fields[1] - feats_list = fields[2:-2] + delim = line.rfind('"'); + lemma = line[2:delim] + fields = line[delim+1:].split() + xpos = fields[0] + feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' deprel = fields[-2][1:] parent_ord = int(fields[-1].split('->')[1]) node = root.create_child(lemma=lemma, xpos=xpos, feats=feats, deprel=deprel) return node, parent_ord + From c1348ab59dee8bf72bbbb25afe590e6f641ccf3c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 20 Feb 2017 21:01:17 +0100 Subject: [PATCH 0002/1374] ud.MarkBugs tests= If we want to apply just one or two tests. 
--- udapi/block/ud/markbugs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 37fd94bd..be3235f8 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -31,22 +31,31 @@ class MarkBugs(Block): """Block for checking suspicious/wrong constructions in UD v2.""" - def __init__(self, save_stats=True, skip=None, **kwargs): + def __init__(self, save_stats=True, tests=None, skip=None, **kwargs): """Create the MarkBugs block object. Args: save_stats: store the bug statistics overview into `document.misc["bugs"]`? - skip: a regex. If `re.search(skip, short_msg)` the node is not reported. + tests: a regex of tests to include. + If `not re.search(tests, short_msg)` the node is not reported. + You can use e.g. `tests=aux-chain|cop-upos` to apply only those two tests. + Default = None (or empty string or '.*') which all tests. + skip: a regex of tests to exclude. + If `re.search(skip, short_msg)` the node is not reported. You can use e.g. `skip=no-(VerbForm|NumType|PronType)`. + This has higher priority than the `tests` regex. Default = None (or empty string) which means no skipping. 
""" super().__init__(**kwargs) self.save_stats = save_stats self.stats = collections.Counter() + self.tests_re = re.compile(tests) if (tests is not None and tests != '') else None self.skip_re = re.compile(skip) if (skip is not None and skip != '') else None def log(self, node, short_msg, long_msg): """Log node.address() + long_msg and add ToDo=short_msg to node.misc.""" + if self.tests_re is not None and not self.tests_re.search(short_msg): + return if self.skip_re is not None and self.skip_re.search(short_msg): return logging.debug('node %s %s: %s', node.address(), short_msg, long_msg) From 58ba3e6ecb48300b434a487df6635e8b0be5ad8d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 19:13:31 +0100 Subject: [PATCH 0003/1374] node.sdeprel for language-specific deprel subtype --- udapi/core/node.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/udapi/core/node.py b/udapi/core/node.py index fd87bc13..7666c428 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -106,6 +106,21 @@ def udeprel(self): """ return self.deprel.split(':')[0] if self.deprel is not None else None + @property + def sdeprel(self): + """Return the language-specific part of dependency relation. + + E.g. if deprel = `acl:relcl` then sdeprel = `relcl`. + If deprel=`acl` then sdeprel = empty string. + If deprel is `None` then `node.sdeprel` will return `None` as well. + """ + if self.deprel is None: + return None + parts = self.deprel.split(':', 1) + if len(parts) == 2: + return parts[1] + return '' + @property def feats(self): """Property for morphological features stored as a `Feats` object. From ea524a647102ba70850320fe8a2b66e5991a9a7c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 19:14:04 +0100 Subject: [PATCH 0004/1374] convert e.g. 
dobj:lvc to obj:lvc --- udapi/block/ud/convert1to2.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index 72d08ab8..ce549256 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -114,15 +114,21 @@ def change_upos_copula(node): if node.deprel == 'cop' and node.upos not in ("AUX", "PRON"): node.upos = "AUX" - @staticmethod - def change_deprel_simple(node): + def change_deprel_simple(self, node): """mwe→fixed, dobj→obj, *pass→*:pass, name→flat, foreign→flat+Foreign=Yes.""" - if node.deprel == 'foreign': + if node.udeprel == 'foreign': node.feats['Foreign'] = 'Yes' + udeprel, sdeprel = node.udeprel, node.sdeprel try: - node.deprel = DEPREL_CHANGE[node.deprel] + node.deprel = DEPREL_CHANGE[udeprel] except KeyError: - pass + return + if sdeprel: + if ':' in node.deprel: + self.log(node, 'deprel', 'deprel=%s:%s new_deprel=%s but %s is lost' % + (udeprel, sdeprel, node.deprel, sdeprel)) + else: + node.deprel += ':' + sdeprel def change_neg(self, node): """neg→advmod/det/ToDo + Polarity=Neg. 
From 0146cf4e4977df300e5ab9ab4fe98878fcce97ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 21 Feb 2017 20:04:02 +0100 Subject: [PATCH 0005/1374] fix read.Vislcg Mea culpa: `line.lstrip(line)` is not `line = line.lstrip()` --- udapi/block/read/conllu.py | 2 +- udapi/block/read/vislcg.py | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 8c80a779..79ccfaea 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -72,7 +72,7 @@ def parse_comment_line(line, root): root.newdoc = value return - root.comment = root.comment + line[1:] + "\n" + root.comment += line[1:] + "\n" # pylint: disable=too-many-locals,too-many-branches,too-many-statements # Maybe the code could be refactored, but it is speed-critical, diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index d5906650..3d8b7637 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -1,6 +1,4 @@ """Vislcg is a reader block the VISL-cg format.""" -import shlex - from udapi.core.basereader import BaseReader from udapi.core.root import Root @@ -22,13 +20,11 @@ def read_tree(self, document=None): if line == '': break if line[0] == '#': - # Are comments allowed in VISL-cg? - # FMT: Yes :) + root.comment += line[1:] + "\n" continue if line[0].isspace(): - line.lstrip(line) - node, parent_ord = self._node(line, root) + node, parent_ord = self._node(line.lstrip(), root) words.append(node) parents.append(parent_ord) else: @@ -61,12 +57,16 @@ def read_tree(self, document=None): raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord])) return root - + @staticmethod def _node(line, root): - delim = line.rfind('"'); - lemma = line[2:delim] - fields = line[delim+1:].split() + # line contains "lemma" xpos feat1 feat2 .. 
featN @deprel #ord->parent.ord + # Lemma can contain spaces, but quotes within lemma are not escaped, + # so we cannot use fields = shlex.split(line) + # Let's hope that xpos, feats and deprel do not contain any quotes. + end_quote_pos = line.rfind('"'); + lemma = line[2:end_quote_pos] + fields = line[end_quote_pos+1:].split() xpos = fields[0] feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' @@ -74,4 +74,3 @@ def _node(line, root): parent_ord = int(fields[-1].split('->')[1]) node = root.create_child(lemma=lemma, xpos=xpos, feats=feats, deprel=deprel) return node, parent_ord - From 1aa0cdcf8abd08dd10140f09c402461ffea433fa Mon Sep 17 00:00:00 2001 From: Francis Tyers Date: Tue, 21 Feb 2017 20:11:30 +0100 Subject: [PATCH 0006/1374] Update vislcg.py From the second character, not the third, e.g. \t"foo --- udapi/block/read/vislcg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 3d8b7637..4c2eb618 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -65,7 +65,7 @@ def _node(line, root): # so we cannot use fields = shlex.split(line) # Let's hope that xpos, feats and deprel do not contain any quotes. 
end_quote_pos = line.rfind('"'); - lemma = line[2:end_quote_pos] + lemma = line[1:end_quote_pos] fields = line[end_quote_pos+1:].split() xpos = fields[0] feats_list = fields[3:-2] From bc37e233002c30647a3542f2c47aed93169d2bbe Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 13:50:02 +0100 Subject: [PATCH 0007/1374] prevent "BrokenPipeError: [Errno 32] Broken pipe" --- bin/udapy | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/udapy b/bin/udapy index 03b9e3d2..773661c8 100755 --- a/bin/udapy +++ b/bin/udapy @@ -66,5 +66,13 @@ if __name__ == "__main__": args.scenario = args.scenario + ['marked_only=1'] if args.no_color: args.scenario = args.scenario + ['color=0'] + runner = Run(args) - runner.execute() + # udapy is often piped to head etc., e.g. + # `seq 1000 | udapy -s read.Sentences | head` + # Let's prevent Python from reporting (with distracting stacktrace) + # "BrokenPipeError: [Errno 32] Broken pipe" + try: + runner.execute() + except (BrokenPipeError, IOError): + pass From 0380d3d719e40af81a7cad9b38bf2991b0c12df2 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 14:37:56 +0100 Subject: [PATCH 0008/1374] UD_Galician-specific conversion of UDv1 to UDv2 or rather fixing the errors in the UDv1.4 --- udapi/block/ud/gl/__init__.py | 0 udapi/block/ud/gl/to2.py | 59 +++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 udapi/block/ud/gl/__init__.py create mode 100644 udapi/block/ud/gl/to2.py diff --git a/udapi/block/ud/gl/__init__.py b/udapi/block/ud/gl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py new file mode 100644 index 00000000..f5f0f451 --- /dev/null +++ b/udapi/block/ud/gl/to2.py @@ -0,0 +1,59 @@ +"""Block ud.gl.To2 UD_Galician-specific conversion of UDv1 to UDv2 + +Author: Martin Popel +""" +from udapi.core.block import Block + +ADP_HEAD_PREFERENCES = { + 'NOUN': 10, + 'PRON': 9, + 
'ADJ': 8, + 'VERB': 8, + 'PUNCT': -10, +} + +class To2(Block): + """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician.""" + + def process_node(self, node): + + # UD_Galician v1.4 uses incorrectly deprel=cop not for the copula verb, + # but for its complement (typically ADJ) and also copula is the head. + if node.deprel == 'cop': + copula = node.parent + # In UDv2 discussions it has been decided that only a limited set of verbs + # can be annotated as copula. For Spanish, "estar" was questionable, but accepted. + # I guess in Galician it is the same. The rest (considerar, resultar, quedar,...) + # should not be annotated as copulas. Luckily, in UD_Galician v1.4 they are + # governing the clause, so no change of topology is needed, just deprel=xcomp. + if copula.lemma in ('ser', 'estar'): + node.parent = copula.parent + for cop_child in copula.children: + cop_child.parent = node + copula.parent = node + node.deprel = copula.deprel + copula.deprel = 'cop' + else: + node.deprel = 'xcomp' + + # Prepositions should depend on the noun, not vice versa. + # This is easy to fix, but unfortunatelly, there are many nodes with deprel=case + # which are not actually prepostions or case markes, but standard NOUNs, VERBs etc. + # These are left as ToDo. + if node.deprel == 'case' and node.children: + if node.upos not in ('ADP', 'CONJ', 'PART'): + node.misc['ToDo'] = 'case-upos' + else: + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] + + # Punctuation should have no children. 
+ if node.deprel == 'punct' and node.children and node.upos == 'PUNCT': + children = sorted(node.children, key=lambda n: -ADP_HEAD_PREFERENCES.get(n.upos, 0)) + children[0].parent = node.parent + node.parent = children[0] + for child in children[1:]: + child.parent = children[0] From 41eaf1f28e07227aaebb5fca2b1f91c437faa146 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 17:08:54 +0100 Subject: [PATCH 0009/1374] allow e.g. sent_id_filter=101 --- udapi/core/basereader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index 0146ca90..bb126ae7 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -21,7 +21,7 @@ def __init__(self, files='-', zone='keep', bundles_per_doc=0, encoding='utf-8', self.finished = False self.sent_id_filter = None if sent_id_filter is not None: - self.sent_id_filter = re.compile(sent_id_filter) + self.sent_id_filter = re.compile(str(sent_id_filter)) logging.debug('Using sent_id_filter=%s', sent_id_filter) self.split_docs = split_docs From 5fe11898506d2920ae571105e70045ae54a3582e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 17:31:27 +0100 Subject: [PATCH 0010/1374] pylint --- udapi/block/read/vislcg.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 4c2eb618..26b3d787 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -27,20 +27,21 @@ def read_tree(self, document=None): node, parent_ord = self._node(line.lstrip(), root) words.append(node) parents.append(parent_ord) - else: - if words: - words[0].form = form - if len(words) > 1: - split_forms = form.split() - if len(words) == len(split_forms): - for word, split_form in zip(words, split_forms): - word.form = split_form - else: - for word in words[1:]: - word.form = '_' - root.create_multiword_token(words, form=form) - words = [] - 
form = line[2:-2] + continue + + if words: + words[0].form = form + if len(words) > 1: + split_forms = form.split() + if len(words) == len(split_forms): + for word, split_form in zip(words, split_forms): + word.form = split_form + else: + for word in words[1:]: + word.form = '_' + root.create_multiword_token(words, form=form) + words = [] + form = line[2:-2] if words: words[0].form = form @@ -64,7 +65,7 @@ def _node(line, root): # Lemma can contain spaces, but quotes within lemma are not escaped, # so we cannot use fields = shlex.split(line) # Let's hope that xpos, feats and deprel do not contain any quotes. - end_quote_pos = line.rfind('"'); + end_quote_pos = line.rfind('"') lemma = line[1:end_quote_pos] fields = line[end_quote_pos+1:].split() xpos = fields[0] From 19db31ab1094280cf45d97b49dbe34db1b8c08ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 18:31:13 +0100 Subject: [PATCH 0011/1374] block for fixing punct nodes with children --- udapi/block/ud/fixpunctchild.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 udapi/block/ud/fixpunctchild.py diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py new file mode 100644 index 00000000..a9f16b8c --- /dev/null +++ b/udapi/block/ud/fixpunctchild.py @@ -0,0 +1,9 @@ +"""Block ud.FixPunctChild for making sure punctuation nodes have no children.""" +from udapi.core.block import Block + +class FixPunctChild(Block): + """Make sure punct nodes have no children by rehanging the children upwards.""" + + def process_node(self, node): + while node.parent.deprel == 'punct': + node.parent = node.parent.parent From c31300d1427316145685dd65ec77414b44d5f0d1 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 22 Feb 2017 23:03:34 +0100 Subject: [PATCH 0012/1374] deprel=punct iff upos=PUNCT --- udapi/block/ud/markbugs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index be3235f8..14b07797 
100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,6 +118,9 @@ def process_node(self, node): if parent.deprel == 'punct': self.log(node, 'punct-child', 'parent.deprel=punct') + if upos == 'PUNCT' and deprel != 'punct': + self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct (but %s)' % deprel) + # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -147,7 +150,7 @@ def process_node(self, node): if upos == 'SYM' and form.isalpha(): self.log(node, 'sym-alpha', "upos=SYM but all form chars are alphabetical: " + form) - if upos == 'PUNCT' and any(char.isalpha() for char in form): + if upos == 'PUNCT' and any(char.isalpha() for char in form): self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form) def after_process_document(self, document): From c07fa8b03dda39b53adf6437d70a2dec74146753 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 15:55:24 +0100 Subject: [PATCH 0013/1374] add `cc-upos` test, make `punct-deprel` test less strict --- udapi/block/ud/markbugs.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 14b07797..1897c39c 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -66,7 +66,7 @@ def log(self, node, short_msg, long_msg): node.misc['Bug'] = short_msg self.stats[short_msg] += 1 - # pylint: disable=too-many-branches + # pylint: disable=too-many-branches, too-many-statements def process_node(self, node): form, deprel, upos, feats = node.form, node.deprel, node.upos, node.feats parent = node.parent @@ -118,9 +118,6 @@ def process_node(self, node): if parent.deprel == 'punct': self.log(node, 'punct-child', 'parent.deprel=punct') - if upos == 'PUNCT' and deprel != 'punct': - 
self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct (but %s)' % deprel) - # See http://universaldependencies.org/u/overview/syntax.html#the-status-of-function-words # TODO: Promotion by Head Elision: It is difficult to detect this exception. # So far, I have just excluded "det" from the forbidded parent.deprel set @@ -153,6 +150,17 @@ def process_node(self, node): if upos == 'PUNCT' and any(char.isalpha() for char in form): self.log(node, 'punct-alpha', "upos=PUNCT but form has alphabetical char(s): " + form) + if upos == 'PUNCT' and deprel not in ('punct', 'fixed', 'goeswith', 'root'): + self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)' + % deprel) + + # http://universaldependencies.org/u/dep/cc.html says + # "cc is the relation between a conjunct and a preceding + # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." + # No other upos is allowed in the documentation, although e.g. PART is common in the data. + if deprel == 'cc' and upos != 'CCONJ': + self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) + def after_process_document(self, document): total = 0 message = 'ud.MarkBugs Error Overview:' From 51e3e9853525cad6e17031deab455a11324d41e6 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 20:36:48 +0100 Subject: [PATCH 0014/1374] don't silence all IOError I had included IOError because SO says, Windows raise it instead of BrokenPipeError, but now I see it was not a good idea. 
--- bin/udapy | 5 +++-- udapi/block/read/addsentences.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/udapy b/bin/udapy index 773661c8..c756c5cb 100755 --- a/bin/udapy +++ b/bin/udapy @@ -48,7 +48,8 @@ elif args.quiet: else: level = logging.INFO -logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=level) +logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', + level=level) # Process and provide the scenario. if __name__ == "__main__": @@ -74,5 +75,5 @@ if __name__ == "__main__": # "BrokenPipeError: [Errno 32] Broken pipe" try: runner.execute() - except (BrokenPipeError, IOError): + except BrokenPipeError: pass diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py index 75c4ac7d..67c79ee8 100644 --- a/udapi/block/read/addsentences.py +++ b/udapi/block/read/addsentences.py @@ -34,7 +34,7 @@ def process_document(self, document): for bundle in document.bundles: line = self.filehandle.readline() if line == '': - raise IOError('File does not have enoush lines') + raise IOError('File does not have enough lines') root = bundle.get_tree(zone=self.zone) root.text = line.rstrip() self.finished = not self.files.has_next_file() From 30ac74b178638ec849718b30e1e09da8f911b4e5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 21:18:10 +0100 Subject: [PATCH 0015/1374] escape html special chars in write.TextModeTreesHtml --- udapi/block/write/textmodetrees.py | 14 +++++++++----- udapi/block/write/textmodetreeshtml.py | 9 +++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 3c9f7308..a2c949cc 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -214,17 +214,21 @@ def process_tree(self, root): stack = sorted(stack, key=lambda x: -self._gaps[x.ord]) # Print headers (if required) and the tree itself + 
self.print_headers(root) + for line in self.lines: + print(line) + + if self.add_empty_line: + print('') + + def print_headers(self, root): + """Print sent_id, text and other comments related to the tree.""" if self.print_sent_id: print('# sent_id = ' + root.address()) if self.print_text: print("# text = " + (root.get_sentence() if root.is_root() else root.compute_text())) if self.print_comments and root.comment: print('#' + self.colorize_comment(root.comment.rstrip().replace('\n', '\n#'))) - for line in self.lines: - print(line) - - if self.add_empty_line: - print('') def _ends(self, idx, chars): return bool(self.lines[idx] and self.lines[idx][-1] in chars) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index ecb0efb8..21bd8e92 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -72,3 +72,12 @@ def colorize_comment(self, comment): def colorize_attr(attr, value, marked): """Return a string with color markup for a given attr and its value.""" return "%s" % (attr, escape(value)) + + def print_headers(self, root): + if self.print_sent_id: + print('# sent_id = ' + escape(root.address())) + if self.print_text: + text = "# text = " + (root.get_sentence() if root.is_root() else root.compute_text()) + print(escape(text)) + if self.print_comments and root.comment: + print(escape('#' + self.colorize_comment(root.comment.rstrip().replace('\n', '\n#')))) From 23d7814f187a6224b17a68b596cbe8e29ab91345 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Feb 2017 21:24:43 +0100 Subject: [PATCH 0016/1374] skip `appos-chain`, add `list-chain` and `list-rightheaded` --- udapi/block/ud/markbugs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 1897c39c..44cfb5cc 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -71,11 +71,20 @@ def process_node(self, node): form, 
deprel, upos, feats = node.form, node.deprel, node.upos, node.feats parent = node.parent - for dep in ('aux', 'fixed', 'appos', 'goeswith'): + for dep in ('aux', 'fixed', 'goeswith', 'list'): if deprel == dep and parent.deprel == dep: self.log(node, dep + '-chain', dep + ' dependencies should not form a chain.') - for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith'): + # 'appos-chain' is more difficult to test because nested appositions are allowed. + # The commented-out code below prevents just some of the false alarms + # (those where changing the nested appos into flat would result in non-projectivity). + # Unfortunatelly, there are still too many false alarms, so let's skip this test completely. + # It seems that multiple appositions as siblings are much less common than nested. + # if deprel == 'appos' and parent.deprel == 'appos': + # if not node.precedes(parent.children[-1]): + # self.log(node, 'appos-chain', 'appos should not form a chain except when nested.') + + for dep in ('flat', 'fixed', 'conj', 'appos', 'goeswith', 'list'): if deprel == dep and node.precedes(parent): self.log(node, dep + '-rightheaded', dep + ' relations should be left-headed, not right.') From e9474fe8c2846630f74b9f68a17b7b30a3127dc4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 15 Mar 2017 22:56:39 +0100 Subject: [PATCH 0017/1374] DualDict can be initialized with string or dict this allows us to use e.g. 
`root.create_multiword_token(nodes, mwt_form, another_node.misc)` instead of unintuitive `root.create_multiword_token(nodes, mwt_form, str(another_node.misc))` --- udapi/core/dualdict.py | 15 +++++++-------- udapi/core/mwt.py | 2 +- udapi/core/node.py | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index a68cb0bb..2b98f45c 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -28,14 +28,13 @@ class DualDict(collections.abc.MutableMapping): """ __slots__ = ['_string', '_dict'] - def __init__(self, string=None, *args, **kwargs): - if string is not None: - if args: - raise ValueError('If string is specified, no other arg is allowed ' + str(args)) - if kwargs: - raise ValueError('If string is specified, no other kwarg is allowed ' + str(kwargs)) - self._dict = dict(args, **kwargs) - self._string = string + def __init__(self, value=None, **kwargs): + if value is not None and kwargs: + raise ValueError('If value is specified, no other kwarg is allowed ' + str(kwargs)) + self._dict = dict(**kwargs) + self._string = None + if value is not None: + self.set_mapping(value) def __str__(self): if self._string is None: diff --git a/udapi/core/mwt.py b/udapi/core/mwt.py index 8a623804..19bbdbb5 100644 --- a/udapi/core/mwt.py +++ b/udapi/core/mwt.py @@ -9,7 +9,7 @@ class MWT(object): def __init__(self, words=None, form=None, misc=None, root=None): self.words = words if words is not None else [] self.form = form - self._misc = DualDict(string=misc) + self._misc = DualDict(misc) self.root = root for word in self.words: word._mwt = self # pylint: disable=W0212 diff --git a/udapi/core/node.py b/udapi/core/node.py index 7666c428..0d745baa 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -85,9 +85,9 @@ def __init__(self, form=None, lemma=None, upos=None, # pylint: disable=too-many- self.lemma = lemma self.upos = upos self.xpos = xpos - self._feats = Feats(string=feats) + self._feats = 
Feats(feats) self.deprel = deprel - self._misc = DualDict(string=misc) + self._misc = DualDict(misc) self._raw_deps = '_' self._deps = None self._parent = None From a364bd76736fff4975c4af46913cac3e4ce7576d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 16 Mar 2017 19:19:13 +0100 Subject: [PATCH 0018/1374] new blocks ud.RemoveMwt and ud.cs.AddMwt Plus abstract base class ud.AddMwt. --- udapi/block/ud/addmwt.py | 69 ++++++++++++++++++++++++++++++++ udapi/block/ud/cs/__init__.py | 0 udapi/block/ud/cs/addmwt.py | 74 +++++++++++++++++++++++++++++++++++ udapi/block/ud/removemwt.py | 37 ++++++++++++++++++ 4 files changed, 180 insertions(+) create mode 100644 udapi/block/ud/addmwt.py create mode 100644 udapi/block/ud/cs/__init__.py create mode 100644 udapi/block/ud/cs/addmwt.py create mode 100644 udapi/block/ud/removemwt.py diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py new file mode 100644 index 00000000..6d74d4a2 --- /dev/null +++ b/udapi/block/ud/addmwt.py @@ -0,0 +1,69 @@ +"""Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" +from udapi.core.block import Block + +class AddMwt(Block): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def process_node(self, node): + analysis = self.multiword_analysis(node) + if analysis is None: + return + orig_attr = {} + for attr in 'form lemma upos xpos feats deprel misc'.split(): + orig_attr[attr] = getattr(node, attr) + + forms = analysis['form'].split() + main = analysis.get('main', 0) + parent = node if analysis.get('shape', '') == 'subtree' else node.parent + nodes = [] + for form in forms[0:main]: + new_node = parent.create_child(form=form) + new_node.shift_before_node(node) + nodes.append(new_node) + node.form = forms[main] + nodes.append(node) + for form in forms[main+1:]: + new_node = parent.create_child(form=form) + new_node.shift_after_node(nodes[-1]) + nodes.append(new_node) + + if orig_attr['form'].isupper(): + for new_node in 
nodes: + new_node.form = new_node.form.upper() + elif orig_attr['form'][0].isupper(): + nodes[0].form = nodes[0].form.title() + + for attr in 'lemma upos xpos feats deprel misc'.split(): + if attr in analysis: + values = analysis[attr].split() + for i, new_node in enumerate(nodes): + if values[i] == '*': + setattr(new_node, attr, orig_attr[attr]) + else: + setattr(new_node, attr, values[i]) + + mwt = node.root.create_multiword_token(nodes, orig_attr['form'], orig_attr['misc']) + node.misc = None + self.postprocess_mwt(mwt) + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token. + + An example return value is:: + { + 'form': 'aby bych', + 'lemma': 'aby být', + 'upos': 'SCONJ AUX', + 'xpos': 'J,------------- Vc-S---1-------', + 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin', # _ means empty FEATS + 'deprel': '* aux', # * means keep the original deprel + 'main': 0, # which of the two words will inherit the original children (if any) + 'shape': 'siblings', # the newly created nodes will be siblings or alternatively + #'shape': 'subtree', # the main-indexed node will be the head + } + """ + raise NotImplementedError('multiword_analysis must be overriden in subclasses') + + def postprocess_mwt(self, mwt): + """Optional postprocessing of newly created MWTs.""" + pass diff --git a/udapi/block/ud/cs/__init__.py b/udapi/block/ud/cs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py new file mode 100644 index 00000000..17e0648c --- /dev/null +++ b/udapi/block/ud/cs/addmwt.py @@ -0,0 +1,74 @@ +"""Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" +import udapi.block.ud.addmwt + +MWTS = { + 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 
'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, +} +for v in MWTS.values(): + v['upos'] = 'SCONJ AUX' + number = '-' + if 'Sing' in v['feats']: + number = 'S' + elif 'Plur' in v['feats']: + number = 'P' + person = '-' + if 'Person=1' in v['feats']: + person = '1' + elif 'Person=2' in v['feats']: + person = '2' + + v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) + v['deprel'] = '* aux' + v['lemma'] = v['form'].split()[0] + ' být' + v['main'] = 0 + v['shape'] = 'siblings' + +# nač -> na + co +for prep in 'na za o'.split(): + MWTS[prep + 'č'] = { + 'form': prep + ' co', + 'lemma': prep + ' co', + 'upos': 'ADP PRON', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + # There is no VerbType=verbconj in the UD_Czech data. + # The purpose of this rule is rather to show that + # it is possible to write such "dynamic" rules + # (which cannot be included in static MWTS). 
+ if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': + return { + 'form': node.form.lower()[:-1] + ' neboť', + 'lemma': '* neboť', + 'upos': '* CCONJ', + 'xpos': 'Vt-S---3P-NA--2 J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + return None + + def postprocess_mwt(self, mwt): + if mwt.words[0].deprel == 'fixed' and mwt.words[0].parent.parent.upos == 'VERB': + mwt.words[1].parent = mwt.words[0].parent.parent diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py new file mode 100644 index 00000000..462e9fbd --- /dev/null +++ b/udapi/block/ud/removemwt.py @@ -0,0 +1,37 @@ +"""Block ud.RemoveMwt for removing multi-word tokens.""" +from udapi.core.block import Block + +class RemoveMwt(Block): + """Substitute MWTs with one word representing the whole MWT.""" + + def process_tree(self, root): + for mwt in root.multiword_tokens: + words = mwt.words + words[0].form = mwt.form + words[0].misc = mwt.misc + words[0].upos = self.guess_upos(words) + words[0].feats = self.guess_feats(words) + words[0].deprel = self.guess_deprel(words) + mwt.remove() + for word in words[1:]: + word.remove(children='rehang') + + @staticmethod + def guess_upos(words): + """UPOS of the whole MWT""" + return words[0].upos + + @staticmethod + def guess_deprel(words): + """DEPREL of the whole MWT""" + return words[0].deprel + # Alternatively, we could define deprel subtypes + #return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) + + @staticmethod + def guess_feats(words): + """FEATS of the whole MWT""" + feats = words[0].feats + for word in words[1:]: + feats.update(word.feats) + return feats From f0dff5a2d3f79c7ab6f71d7fa4c99b06012307e5 Mon Sep 17 00:00:00 2001 From: Prokopis Prokopidis Date: Fri, 17 Mar 2017 14:31:17 +0200 Subject: [PATCH 0019/1374] add new block ud.el.AddMwt --- udapi/block/ud/el/__init__.py | 0 udapi/block/ud/el/addmwt.py | 48 +++++++++++++++++++++++++++++++++++ 2 files 
changed, 48 insertions(+) create mode 100644 udapi/block/ud/el/__init__.py create mode 100644 udapi/block/ud/el/addmwt.py diff --git a/udapi/block/ud/el/__init__.py b/udapi/block/ud/el/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py new file mode 100644 index 00000000..fd16a0f5 --- /dev/null +++ b/udapi/block/ud/el/addmwt.py @@ -0,0 +1,48 @@ +"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs.""" +import udapi.block.ud.addmwt + +MWTS = { + 'στη': {'form' : 'σ τη', + 'lemma' : 'σε ο', + 'upos' : 'ADP DET', + 'xpos' : 'AsPpSp AtDf', + 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', + 'deprel': 'case det', + 'main' : 0, # which of the two words will inherit the original children (if any) + 'shape' : 'siblings', # the newly created nodes will be siblings}, + }, + 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ 
Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, +} + +#for v in MWTS.values(): +# pass + +class AddMwt(udapi.block.ud.addmwt.AddMwt): + """Detect and mark MWTs (split them into words and add the words to the tree).""" + + def multiword_analysis(self, node): + """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + analysis = MWTS.get(node.form.lower(), None) + if analysis is not None: + return analysis + + # Write a rule for ambigous prep+article MWTs + if node.form.lower() == 'στο' and node.feats['Gender'] == 'Masc': + return { + 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', + 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', + } + elif node.form.lower() == 'στο': + return { + 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', + 'feats': '_ Case=Acc|Gender=Neut|Number=Sing', + } + + return None + +# def postprocess_mwt(self, mwt): +# pass From f060ccc335d4d7201f667b7477697f1e2a57bc00 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:13:59 +0100 Subject: [PATCH 0020/1374] introduce node.feats.copy(), ud.AddMwt supports e.g. 
Case=* rule --- udapi/block/ud/addmwt.py | 9 ++++++++- udapi/core/dualdict.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 6d74d4a2..653cfa26 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -9,8 +9,10 @@ def process_node(self, node): if analysis is None: return orig_attr = {} - for attr in 'form lemma upos xpos feats deprel misc'.split(): + for attr in 'form lemma upos xpos deprel'.split(): orig_attr[attr] = getattr(node, attr) + orig_attr['feats'] = node.feats.copy() + orig_attr['misc'] = node.misc.copy() forms = analysis['form'].split() main = analysis.get('main', 0) @@ -39,6 +41,11 @@ def process_node(self, node): for i, new_node in enumerate(nodes): if values[i] == '*': setattr(new_node, attr, orig_attr[attr]) + elif attr == 'feats' and '*' in values[i]: + new_node.feats = values[i] + for feat_name, feat_value in list(new_node.feats.items()): + if feat_value == '*': + new_node.feats[feat_name] = orig_attr['feats'][feat_name] else: setattr(new_node, attr, values[i]) diff --git a/udapi/core/dualdict.py b/udapi/core/dualdict.py index 2b98f45c..edad9c37 100644 --- a/udapi/core/dualdict.py +++ b/udapi/core/dualdict.py @@ -1,5 +1,6 @@ """DualDict is a dict with lazily synchronized string representation.""" import collections.abc +import copy class DualDict(collections.abc.MutableMapping): """DualDict class serves as dict with lazily synchronized string representation. @@ -93,6 +94,10 @@ def clear(self): self._string = '_' self._dict.clear() + def copy(self): + """Return a deep copy of this instance.""" + return copy.deepcopy(self) + def set_mapping(self, value): """Set the mapping from a dict or string. 
From 5b814e651d820361599da28cb75d19e179c5b2b4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:17:36 +0100 Subject: [PATCH 0021/1374] simpify ud.el.AddMwt by the newly introduced Gender=* --- udapi/block/ud/el/addmwt.py | 41 +++++++++---------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index fd16a0f5..d61f7616 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -2,47 +2,26 @@ import udapi.block.ud.addmwt MWTS = { - 'στη': {'form' : 'σ τη', - 'lemma' : 'σε ο', - 'upos' : 'ADP DET', + 'στη': {'form' : 'σ τη', + 'lemma' : 'σε ο', + 'upos' : 'ADP DET', 'xpos' : 'AsPpSp AtDf', - 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', - 'deprel': 'case det', + 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', + 'deprel': 'case det', 'main' : 0, # which of the two words will inherit the original children (if any) - 'shape' : 'siblings', # the newly created nodes will be siblings}, + 'shape' : 'siblings', # the newly created nodes will be siblings}, }, - 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 
'shape': 'siblings'}, 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στο': {'form': 'σ το', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=*|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, } -#for v in MWTS.values(): -# pass - class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" - analysis = MWTS.get(node.form.lower(), None) - if analysis is not None: - return analysis - - # Write a rule for ambigous prep+article MWTs - if node.form.lower() == 'στο' and node.feats['Gender'] == 'Masc': - return { - 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', - 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', - } - elif node.form.lower() == 'στο': - return { - 'form': 'σ το','lemma': 'σε ο','upos': 'ADP DET','xpos': 'AsPpSp AtDf','deprel': 'case det','main': 0,'shape': 'siblings', - 'feats': '_ Case=Acc|Gender=Neut|Number=Sing', - } - - return None - -# def postprocess_mwt(self, mwt): -# pass + return MWTS.get(node.form.lower(), None) From 843c1dfa02aa0f0be62f2e6aa52c84ba980b1967 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 15:31:42 +0100 Subject: [PATCH 0022/1374] fix pylint warning (line length) and further simplify --- udapi/block/ud/el/addmwt.py | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py 
index d61f7616..39e23620 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -1,24 +1,32 @@ -"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs.""" +"""Block ud.el.AddMwt for heuristic detection of multi-word (σε+DET) tokens. + +Notice that this should be used only for converting existing conllu files. +Ideally a tokenizer should have already split the MWTs. +Also notice that this block does not deal with the relatively rare +PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs. +""" import udapi.block.ud.addmwt MWTS = { - 'στη': {'form' : 'σ τη', - 'lemma' : 'σε ο', - 'upos' : 'ADP DET', - 'xpos' : 'AsPpSp AtDf', - 'feats' : '_ Case=Acc|Gender=Fem|Number=Sing', - 'deprel': 'case det', - 'main' : 0, # which of the two words will inherit the original children (if any) - 'shape' : 'siblings', # the newly created nodes will be siblings}, - }, - 'στην': {'form': 'σ την', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στις': {'form': 'σ τις', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στα': {'form': 'σ τα', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στους': {'form': 'σ τους', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στον': {'form': 'σ τον', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp 
AtDf', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, - 'στο': {'form': 'σ το', 'lemma': 'σε ο', 'upos':'ADP DET', 'xpos':'AsPpSp AtDf', 'feats': '_ Case=Acc|Gender=*|Number=Sing', 'deprel': 'case det', 'main': 0, 'shape': 'siblings'}, + 'στη': {'form': 'σ τη', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, + 'στην': {'form': 'σ την', 'feats': '_ Case=Acc|Gender=Fem|Number=Sing'}, + 'στα': {'form': 'σ τα', 'feats': '_ Case=Acc|Gender=Neut|Number=Plur'}, + 'στους': {'form': 'σ τους', 'feats': '_ Case=Acc|Gender=Masc|Number=Plur'}, + 'στις': {'form': 'σ τις', 'feats': '_ Case=Acc|Gender=Fem|Number=Plur'}, + 'στον': {'form': 'σ τον', 'feats': '_ Case=Acc|Gender=Masc|Number=Sing'}, + 'στο': {'form': 'σ το', 'feats': '_ Case=Acc|Gender=*|Number=Sing'}, } +# shared values for all entries in MWTS +for v in MWTS.values(): + v['lemma'] = 'σε ο' + v['upos'] = 'ADP DET' + v['xpos'] = 'AsPpSp AtDf' + v['deprel'] = 'case det' + # The following are the default values + #v['main'] = 0 # which of the two words will inherit the original children (if any) + #v['shape'] = 'siblings', # the newly created nodes will be siblings + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" From 292591e3745567f6c8692cd7ea6f85d2733a044e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 17 Mar 2017 22:14:52 +0100 Subject: [PATCH 0023/1374] =?UTF-8?q?de/projectivization=20=C3=A0=20la=20N?= =?UTF-8?q?ivre=20&=20Nilsson=20(2005)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- udapi/block/transform/__init__.py | 0 udapi/block/transform/deproj.py | 42 +++++++++++++++++++++ udapi/block/transform/proj.py | 62 +++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 udapi/block/transform/__init__.py create mode 100644 udapi/block/transform/deproj.py create mode 100644 udapi/block/transform/proj.py 
diff --git a/udapi/block/transform/__init__.py b/udapi/block/transform/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py new file mode 100644 index 00000000..3a6dcda5 --- /dev/null +++ b/udapi/block/transform/deproj.py @@ -0,0 +1,42 @@ +"""Block Deproj for deprojectivization of pseudo-projective trees à la Nivre & Nilsson (2005). + +See ud.transform.Proj for details. +TODO: implement also path and head+path strategies. +""" +from udapi.core.block import Block + +class Deproj(Block): + """De-projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', label='misc', **kwargs): + """Create the Deproj block object.""" + super().__init__(**kwargs) + self.strategy = strategy + self.label = label + + def process_node(self, node): + if self.label == 'misc': + label = node.misc['pproj'] + elif self.label == 'deprel': + parts = node.sdeprel.split('+', 1) + if len(parts) == 2: + label = parts[1] + node.deprel = node.udeprel + (':' + parts[0] if parts[0] else '') + else: + label = '' + else: + raise(ValueError('Unknown parameter label=%s' % self.label)) + if label == '': + return + reconstructed_parent = self.head_strategy(node, label) + if reconstructed_parent: + node.parent = reconstructed_parent + + def head_strategy(self, node, label): + queue = [n for n in node.parent.children if n!=node] # TODO deque + while queue: + adept = queue.pop(0) + if adept.udeprel == label: + return adept + queue.extend(adept.children) + return None diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py new file mode 100644 index 00000000..f15e46e9 --- /dev/null +++ b/udapi/block/transform/proj.py @@ -0,0 +1,62 @@ +"""Block Proj for (pseudo-)projectivization of trees à la Nivre & Nilsson (2005). + +See http://www.aclweb.org/anthology/P/P05/P05-1013.pdf. 
+This block tries to replicate Malt parser's projectivization: +http://www.maltparser.org/userguide.html#singlemalt_proj +http://www.maltparser.org/optiondesc.html#pproj-marking_strategy + +TODO: implement also path and head+path strategies. +TODO: Sometimes it would be better (intuitively) + to lower the gap-node (if its whole subtree is in the gap + and if this does not cause more non-projectivities) + rather than to lift several nodes whose parent-edge crosses this gap. + We would need another label value (usually the lowering is of depth 1), + but the advantage is that reconstruction of lowered edges + during deprojectivization is simple and needs no heuristics. +""" +from udapi.core.block import Block + +class Proj(Block): + """Projectivize the trees à la Nivre & Nilsson (2005).""" + + def __init__(self, strategy='head', lifting_order='deepest', label='misc', **kwargs): + """Create the Proj block object.""" + super().__init__(**kwargs) + self.lifting_order = lifting_order + self.strategy = strategy + self.label = label + + def process_tree(self, tree): + nonprojs = [self.nonproj_info(n) for n in tree.descendants if n.is_nonprojective()] + for nonproj in sorted(nonprojs, key=lambda info: info[0]): + self.lift(nonproj[1]) + + def nonproj_info(self, node): + if self.lifting_order == 'shortest': + return (abs(node.ord - node.parent.ord), node) + orig_parent = node.parent + node.parent = node.parent.parent + depth = 1 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + node.parent = orig_parent + return (-depth, node) + + def lift(self, node): + orig_parent = node.parent + depth = 0 + while node.is_nonprojective(): + node.parent = node.parent.parent + depth += 1 + if depth == 0: + return + self.mark(node, orig_parent.udeprel) + + def mark(self, node, label): + if self.label == 'misc': + node.misc['pproj'] = label + elif self.label == 'deprel': + node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label) + else: + 
raise(ValueError('Unknown parameter label=%s' % self.label)) From 2e2617eec9310d67d750a763177b067224383c2a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 18 Mar 2017 15:14:42 +0100 Subject: [PATCH 0024/1374] add draft of blocks eval.Parsing and eval.LcsF1 --- udapi/block/eval/__init__.py | 0 udapi/block/eval/lcsf1.py | 201 +++++++++++++++++++++++++++++++++++ udapi/block/eval/parsing.py | 35 ++++++ 3 files changed, 236 insertions(+) create mode 100644 udapi/block/eval/__init__.py create mode 100644 udapi/block/eval/lcsf1.py create mode 100644 udapi/block/eval/parsing.py diff --git a/udapi/block/eval/__init__.py b/udapi/block/eval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py new file mode 100644 index 00000000..09d84e61 --- /dev/null +++ b/udapi/block/eval/lcsf1.py @@ -0,0 +1,201 @@ +"""Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" +from udapi.core.basewriter import BaseWriter + +class LcsF1(BaseWriter): + """Evaluate differences between sentences (in different zones) with P/R/F1.""" + + def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): + """Create the LcsF1 block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attributes = attributes + self.focus = focus + self.details = details + self._stats = {} + self.correct, self.pred, self.gold = 0, 0, 0 + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + #self._stats['zones'][tree.zone] += 1 + + attrs = self.attributes.split(',') + pred_tokens = ['_'.join(n.get_attrs(attrs)) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(attrs)) for n in gold_tree.descendants] + common = find_lcs(pred_tokens, gold_tokens) + + # my $focus = $self->focus; + # if ($focus ne '.*') { + # @common = grep {/$focus/} @common; + # @pred_tokens = grep {/$focus/} @pred_tokens; + # 
@gold_tokens = grep {/$focus/} @gold_tokens; + # } + + self.correct += len(common) + self.pred += len(pred_tokens) + self.gold += len(gold_tokens) + + # if ($self->details){ + # $self->_stats->{C}{$_}++ for (@common); + # $self->_stats->{P}{$_}++ for (@pred_tokens); + # $self->_stats->{G}{$_}++ for (@gold_tokens); + # $self->_stats->{T}{$_}++ for (@gold_tokens, @pred_tokens); + # } + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + # my %pred_zones = %{$self->_stats->{zones}}; + # my @pz = keys %pred_zones; + # if (!@pz) { + # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones='.$self->zones; + # } elsif (@pz > 1){ + # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " + # . 'The results are mixed together. Check the parameter zones='.$self->zones; + # } + # say "Comparing predicted trees (zone=@pz) with gold trees (zone=" + # . $self->gold_zone . "), sentences=$pred_zones{$pz[0]}"; + # + # if ($self->details){ + # say '=== Details ==='; + # my $total_count = $self->_stats->{T}; + # my @tokens = sort {$total_count->{$b} <=> $total_count->{$a}} keys %{$total_count}; + # splice @tokens, $self->details; + # printf "%-10s %5s %5s %5s %6s %6s %6s\n", qw(token pred gold corr prec rec F1); + # foreach my $token (@tokens){ + # my ($p, $g, $c) = map {$self->_stats->{$_}{$token}||0} (qw(P G C)); + # my $pr = $c / ($p || 1); + # my $re = $c / ($g || 1); + # my $f = 2 * $pr * $re / (($pr + $re)||1); + # printf "%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%\n", + # $token, $p, $g, $c, 100*$pr, 100*$re, 100*$f + # } + # say '=== Totals ===' + # } + + + print("%-9s = %7d\n"*3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) + # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero + # my $prec = $correct / $pred; + # my $rec = $correct / $gold; + # my $f1 = 2 * $prec * $rec / (($prec + $rec)||1); + # printf "%-9s 
= %6.2f%%\n"x3, precision=>100*$prec, recall=>100*$rec, F1=>100*$f1; + + +# difflib.SequenceMatcher does not compute LCS, so let's implement it here +# TODO: make faster by trimming common prefix and sufix +def find_lcs(x, y): + m, n = len(x), len(y) + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m+1): + for j in range(1, n+1): + C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m-1] == y[n-1]: + lcs[index-1] = x[m-1] + m, n, index = m-1, n-1, index-1 + elif C[m-1][n] > C[m][n-1]: + m -= 1 + else: + n -= 1 + return lcs + + +''' +Udapi::Block::Eval::LcsF1 - evaluate differences between sentences with P/R/F1 + +=head1 SYNOPSIS + + Eval::LcsF1 zones=en_pred gold_zone=en_gold to=results.txt + + # prints something like + predicted = 210 + gold = 213 + correct = 210 + precision = 100.00% + recall = 98.59% + F1 = 99.29% + + Eval::LcsF1 gold_zone=y attributes=form,upos focus='^(?i:an?|the)_DET$' details=4 + + # prints something like + === Details === + token pred gold corr prec rec F1 + the_DET 711 213 188 26.44% 88.26% 40.69% + The_DET 82 25 19 23.17% 76.00% 35.51% + a_DET 0 62 0 0.00% 0.00% 0.00% + an_DET 0 16 0 0.00% 0.00% 0.00% + === Totals === + predicted = 793 + gold = 319 + correct = 207 + precision = 26.10% + recall = 64.89% + F1 = 37.23% + +=head1 DESCRIPTION + +This block finds differences between nodes of trees in two zones +and reports the overall precision, recall and F1. +The two zones are "predicted" (on which this block is applied) +and "gold" (which needs to be specified with parameter C). + +This block also reports the number of total nodes in the predicted zone +and in the gold zone and the number of "correct" nodes, +that is predicted nodes which are also in the gold zone. +By default two nodes are considered "the same" if they have the same C
, +but it is possible to check also for other nodes' attributes +(with parameter C). + +As usual: + + precision = correct / predicted + recall = correct / gold + F1 = 2 * precision * recall / (precision + recall) + +The implementation is based on finding the longest common subsequence (LCS) +between the nodes in the two trees. +This means that the two zones do not need to be explicitly word-aligned. + +=head1 PARAMETERS + +=head2 zones + +Which zone contains the "predicted" trees? +Make sure that you specify just one zone. +If you leave the default value "all" and the document contains more zones, +the results will be mixed, which is most likely not what you wanted. +Exception: If the document conaints just two zones (predicted and gold trees), +you can keep the default value "all" because this block +will skip comparison of the gold zone with itself. + +=head2 gold_zone + +Which zone contains the gold-standard trees? + +=head2 attributes + +comma separated list of attributes which should be checked +when deciding whether two nodes are equivalent in LCS + +=head2 focus + +Regular expresion constraining the tokens we are interested in. +If more attributes were specified in the C parameter, +their values are concatenated with underscore, so C should reflect that +e.g. C. + +For case-insensitive focus use e.g. C +(which is equivalent to C) + +=head2 details + +Print also detailed statistics for each token (matching the C). +The value of this parameter C
specifies the number of tokens to include. +The tokens are sorted according to the sum of their I and I counts. + +''' diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py new file mode 100644 index 00000000..3c7f5da8 --- /dev/null +++ b/udapi/block/eval/parsing.py @@ -0,0 +1,35 @@ +"""Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens.""" +from udapi.core.basewriter import BaseWriter + +class Parsing(BaseWriter): + """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" + + def __init__(self, gold_zone, **kwargs): + """Create the eval.Parsing block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.correct_las, self.correct_uas, self.total = 0, 0, 0 + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + pred_nodes = tree.descendants + gold_nodes = gold_tree.descendants + if len(pred_nodes) != len(gold_nodes): + raise ValueError('The sentences do not match (%d vs. 
%d nodes)' + % (len(pred_nodes), len(gold_nodes))) + + self.total += len(pred_nodes) + for pred_node, gold_node in zip(pred_nodes, gold_nodes): + if pred_node.parent.ord == gold_node.parent.ord: + self.correct_uas += 1 + if pred_node.deprel == gold_node.deprel: + self.correct_las += 1 + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + print('nodes = %d' % self.total) + print('UAS = %6.2f' % (100 * self.correct_uas / self.total)) + print('LAS = %6.2f' % (100 * self.correct_las / self.total)) From 5477d3c739644df10de6fce34fee8d7f40081177 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 16:41:58 +0100 Subject: [PATCH 0025/1374] simple tokenization: morpho.TokenizeOnWhitespace --- udapi/block/eval/lcsf1.py | 8 ++-- udapi/block/morpho/__init__.py | 0 udapi/block/morpho/tokenizeonwhitespace.py | 46 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 udapi/block/morpho/__init__.py create mode 100644 udapi/block/morpho/tokenizeonwhitespace.py diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py index 09d84e61..2c085779 100644 --- a/udapi/block/eval/lcsf1.py +++ b/udapi/block/eval/lcsf1.py @@ -1,7 +1,7 @@ """Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" from udapi.core.basewriter import BaseWriter -class LcsF1(BaseWriter): +class LcsF1(BaseWriter): # pylint: disable=too-many-instance-attributes """Evaluate differences between sentences (in different zones) with P/R/F1.""" def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): @@ -50,7 +50,8 @@ def process_end(self): # my %pred_zones = %{$self->_stats->{zones}}; # my @pz = keys %pred_zones; # if (!@pz) { - # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones='.$self->zones; + # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones=' + # . 
$self->zones; # } elsif (@pz > 1){ # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " # . 'The results are mixed together. Check the parameter zones='.$self->zones; @@ -76,7 +77,8 @@ def process_end(self): # } - print("%-9s = %7d\n"*3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) + print("%-9s = %7d\n"*3 + % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero # my $prec = $correct / $pred; # my $rec = $correct / $gold; diff --git a/udapi/block/morpho/__init__.py b/udapi/block/morpho/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/morpho/tokenizeonwhitespace.py b/udapi/block/morpho/tokenizeonwhitespace.py new file mode 100644 index 00000000..ef6f5d6f --- /dev/null +++ b/udapi/block/morpho/tokenizeonwhitespace.py @@ -0,0 +1,46 @@ +"""Block morpho.TokenizeOnWhitespace""" +from udapi.core.block import Block + +class TokenizeOnWhitespace(Block): + """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return string + + def process_tree(self, root): + if root.children: + raise ValueError('Tree %s is already tokenized.' % root) + sentence = ' '.join(root.text.split()) + tokens = self.tokenize_sentence(sentence).split() + for i, token in enumerate(tokens, 1): + space_after = False + + # Delete the token from the begining of the sentence. + if sentence.startswith(token): + sentence = sentence[len(token):] + # This is the expected case. The sentence starts with the token. + # If it is followed by a space, delete the space and set space_after=True. + if not len(sentence): + space_after = True + elif sentence.startswith(' '): + space_after = True + sentence = sentence[1:] + else: + # The token (returned from tokenization) does not match the start of sentence. + # E.g. '. . . 
word' is tokenized as '... word'. + # Let's delete the start of sentence anyway, + # using a non-greedy regex and the expected next token + # returned from the tokenization. + # my $next_token = $tokens[$i+1]; + # my ($first, $rest) = ($sentence =~ /^(.*?)(\Q$next_token\E.*)$/); + # $no_space_after = 1 if (defined $first && $first !~ /\s$/); + # $sentence = $rest if (defined $rest); + raise ValueError('tokenization does not match: "%s" vs "%s"' % (token, sentence)) + + # create a new node + node = root.create_child(form=token) + node.ord = i + if not space_after: + node.misc = 'SpaceAfter=No' From ca3e822d25656b4c59e17865ec934363df9a9d7e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 16:43:19 +0100 Subject: [PATCH 0026/1374] draft of UDPipe integration --- udapi/block/udpipe/__init__.py | 0 udapi/block/udpipe/base.py | 180 +++++++++++++++++++++++++++++++++ udapi/block/udpipe/en.py | 10 ++ udapi/tool/__init__.py | 0 udapi/tool/udpipe.py | 40 ++++++++ 5 files changed, 230 insertions(+) create mode 100644 udapi/block/udpipe/__init__.py create mode 100644 udapi/block/udpipe/base.py create mode 100644 udapi/block/udpipe/en.py create mode 100644 udapi/tool/__init__.py create mode 100644 udapi/tool/udpipe.py diff --git a/udapi/block/udpipe/__init__.py b/udapi/block/udpipe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py new file mode 100644 index 00000000..914c160f --- /dev/null +++ b/udapi/block/udpipe/base.py @@ -0,0 +1,180 @@ +"""Block udpipe.Base for tagging and parsing using UDPipe.""" +from udapi.core.block import Block +from udapi.tool.udpipe import UDPipe + +KNOWN_MODELS = { + 'grc': 'models/udpipe/2.0/ancient_greek-proiel-ud-2.0-conll17-170315.udpipe', + 'grc_proiel': 'models/udpipe/2.0/ancient_greek-ud-2.0-conll17-170315.udpipe', + 'ar': 'models/udpipe/2.0/arabic-ud-2.0-conll17-170315.udpipe', + 'eu': 'models/udpipe/2.0/basque-ud-2.0-conll17-170315.udpipe', + 'bg': 
'models/udpipe/2.0/bulgarian-ud-2.0-conll17-170315.udpipe', + 'ca': 'models/udpipe/2.0/catalan-ud-2.0-conll17-170315.udpipe', + 'zh': 'models/udpipe/2.0/chinese-ud-2.0-conll17-170315.udpipe', + 'hr': 'models/udpipe/2.0/croatian-ud-2.0-conll17-170315.udpipe', + 'cs_cac': 'models/udpipe/2.0/czech-cac-ud-2.0-conll17-170315.udpipe', + 'cs_cltt': 'models/udpipe/2.0/czech-cltt-ud-2.0-conll17-170315.udpipe', + 'cs': 'models/udpipe/2.0/czech-ud-2.0-conll17-170315.udpipe', + 'da': 'models/udpipe/2.0/danish-ud-2.0-conll17-170315.udpipe', + 'nl_lassysmall': 'models/udpipe/2.0/dutch-lassysmall-ud-2.0-conll17-170315.udpipe', + 'nl': 'models/udpipe/2.0/dutch-ud-2.0-conll17-170315.udpipe', + 'en_lines': 'models/udpipe/2.0/english-lines-ud-2.0-conll17-170315.udpipe', + 'en_partut': 'models/udpipe/2.0/english-partut-ud-2.0-conll17-170315.udpipe', + 'en': 'models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe', + 'et': 'models/udpipe/2.0/estonian-ud-2.0-conll17-170315.udpipe', + 'fi_ftb': 'models/udpipe/2.0/finnish-ftb-ud-2.0-conll17-170315.udpipe', + 'fi': 'models/udpipe/2.0/finnish-ud-2.0-conll17-170315.udpipe', + 'fr_partut': 'models/udpipe/2.0/french-partut-ud-2.0-conll17-170315.udpipe', + 'fr_sequoia': 'models/udpipe/2.0/french-sequoia-ud-2.0-conll17-170315.udpipe', + 'fr': 'models/udpipe/2.0/french-ud-2.0-conll17-170315.udpipe', + 'gl_treegal': 'models/udpipe/2.0/galician-treegal-ud-2.0-conll17-170315.udpipe', + 'gl': 'models/udpipe/2.0/galician-ud-2.0-conll17-170315.udpipe', + 'de': 'models/udpipe/2.0/german-ud-2.0-conll17-170315.udpipe', + 'got': 'models/udpipe/2.0/gothic-ud-2.0-conll17-170315.udpipe', + 'el': 'models/udpipe/2.0/greek-ud-2.0-conll17-170315.udpipe', + 'he': 'models/udpipe/2.0/hebrew-ud-2.0-conll17-170315.udpipe', + 'hi': 'models/udpipe/2.0/hindi-ud-2.0-conll17-170315.udpipe', + 'hu': 'models/udpipe/2.0/hungarian-ud-2.0-conll17-170315.udpipe', + 'id': 'models/udpipe/2.0/indonesian-ud-2.0-conll17-170315.udpipe', + 'ga': 
'models/udpipe/2.0/irish-ud-2.0-conll17-170315.udpipe', + 'it_partut': 'models/udpipe/2.0/italian-partut-ud-2.0-conll17-170315.udpipe', + 'it': 'models/udpipe/2.0/italian-ud-2.0-conll17-170315.udpipe', + 'ja': 'models/udpipe/2.0/japanese-ud-2.0-conll17-170315.udpipe', + 'kk': 'models/udpipe/2.0/kazakh-ud-2.0-conll17-170315.udpipe', + 'ko': 'models/udpipe/2.0/korean-ud-2.0-conll17-170315.udpipe', + 'la_ittb': 'models/udpipe/2.0/latin-ittb-ud-2.0-conll17-170315.udpipe', + 'la_proiel': 'models/udpipe/2.0/latin-proiel-ud-2.0-conll17-170315.udpipe', + 'la': 'models/udpipe/2.0/latin-ud-2.0-conll17-170315.udpipe', + 'lv': 'models/udpipe/2.0/latvian-ud-2.0-conll17-170315.udpipe', + 'no_bokmaal': 'models/udpipe/2.0/norwegian-bokmaal-ud-2.0-conll17-170315.udpipe', + 'no_nynorsk': 'models/udpipe/2.0/norwegian-nynorsk-ud-2.0-conll17-170315.udpipe', + 'cu': 'models/udpipe/2.0/old_church_slavonic-ud-2.0-conll17-170315.udpipe', + 'fa': 'models/udpipe/2.0/persian-ud-2.0-conll17-170315.udpipe', + 'pl': 'models/udpipe/2.0/polish-ud-2.0-conll17-170315.udpipe', + 'pt_br': 'models/udpipe/2.0/portuguese-br-ud-2.0-conll17-170315.udpipe', + 'pt': 'models/udpipe/2.0/portuguese-ud-2.0-conll17-170315.udpipe', + 'ro': 'models/udpipe/2.0/romanian-ud-2.0-conll17-170315.udpipe', + 'ru_syntagrus': 'models/udpipe/2.0/russian-syntagrus-ud-2.0-conll17-170315.udpipe', + 'ru': 'models/udpipe/2.0/russian-ud-2.0-conll17-170315.udpipe', + 'sk': 'models/udpipe/2.0/slovak-ud-2.0-conll17-170315.udpipe', + 'sl_sst': 'models/udpipe/2.0/slovenian-sst-ud-2.0-conll17-170315.udpipe', + 'sl': 'models/udpipe/2.0/slovenian-ud-2.0-conll17-170315.udpipe', + 'es_ancora': 'models/udpipe/2.0/spanish-ancora-ud-2.0-conll17-170315.udpipe', + 'es': 'models/udpipe/2.0/spanish-ud-2.0-conll17-170315.udpipe', + 'sv_lines': 'models/udpipe/2.0/swedish-lines-ud-2.0-conll17-170315.udpipe', + 'sv': 'models/udpipe/2.0/swedish-ud-2.0-conll17-170315.udpipe', + 'tr': 'models/udpipe/2.0/turkish-ud-2.0-conll17-170315.udpipe', + 'uk': 
'models/udpipe/2.0/ukrainian-ud-2.0-conll17-170315.udpipe', + 'ur': 'models/udpipe/2.0/urdu-ud-2.0-conll17-170315.udpipe', + 'ug': 'models/udpipe/2.0/uyghur-ud-2.0-conll17-170315.udpipe', + 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', +} + +class Base(Block): + """Base class for all UDPipe blocks.""" + + # pylint: disable=too-many-arguments + def __init__(self, model=None, model_alias=None, + tokenize=False, tag=True, parse=True, **kwargs): + """Create the udpipe.En block object.""" + super().__init__(**kwargs) + self.model, self.model_alias = model, model_alias + self._tool = None + self.tokenize, self.tag, self.parse = tokenize, tag, parse + + @property + def tool(self): + """Return the tool (UDPipe in this case), created lazily.""" + if self._tool: + return self._tool + if not self.model: + if not self.model_alias: + raise ValueError('model (path/to/model) or model_alias (e.g. en) must be set!') + self.model = KNOWN_MODELS[self.model_alias] + self._tool = UDPipe(model=self.model) + return self._tool + + def process_tree(self, root): + tok, tag, par = self.tokenize, self.tag, self.parse + if not tok and tag and par: + return self.tool.tag_parse_tree(root) + # TODO + # return $self->tool->tokenize_tag_parse_tree($root) if $tok && $tag && $par; + # return $self->tool->tokenize_tag_tree($root) if $tok && $tag && !$par; + # return $self->tool->tokenize_tree($root) if $tok && !$tag && !$par; + # return $self->tool->tag_parse_tree($root) if !$tok && $tag && $par; + # return $self->tool->tag_tree($root) if !$tok && $tag && !$par; + # return $self->tool->parse_tree($root) if !$tok && !$tag && $par; + raise ValueError("Unimplemented tokenize=%s tag=%s parse=%s" % (tok, tag, par)) + +''' +Udapi::Block::UDPipe::Base - tokenize, tag and parse into UD + +=head1 SYNOPSIS + + # from the command line + echo John loves Mary | udapi.pl Read::Sentences UDPipe::Base model_alias=en Write::TextModeTrees + + # in scenario + UDPipe::Base 
model=/home/me/english-ud-1.2-160523.udpipe + UDPipe::Base model_alias=en + UDPipe::EN # shortcut for the above + UDPipe::EN tokenize=1 tag=1 parse=0 + +=head1 DESCRIPTION + +This block loads L<Udapi::Tool::UDPipe> (a wrapper for the UDPipe C++ tool) with +the given C<model> for analysis into the Universal Dependencies (UD) style. +UDPipe can do tokenization, tagging (plus lemmatization and universal features) +and parsing (with deprel labels) and users of this block can select which of the +subtasks should be done using parameters C<tokenize>, C<tag> and C<parse>. +The default is to do all three. + +=head1 TODO + +UDPipe can also do sentence segmentation, but L<Udapi::Tool::UDPipe> does not support it yet. + +Similarly with multi-word tokens. + +=head1 PARAMETERS + +=head2 C<model> + +Path to the model file within Udapi share +(or relative path starting with "./" or absolute path starting with "/"). +This parameter is required if C<model_alias> is not supplied. + +=head2 C<model_alias> + +The C<model> parameter can be omitted if this parameter is supplied. +Currently available model aliases are: + +B. + +They correspond to paths where the language code in the alias is substituted +with the respective language name, e.g. B<en> expands to +C<models/udpipe/2.0/english-ud-2.0-conll17-170315.udpipe>. + +=head1 tokenize + +Do tokenization, i.e. create new nodes with attributes +C<form>, C<misc> (if SpaceAfter=No) and C<ord>. +The sentence string is taken from the root's attribute C<text>. + +=head1 tag + +Fill node attributes: C<lemma>, C<upos>, C<xpos> and C<feats>. +On the input, just the attribute C<form> is expected. + +=head1 parse + +Fill node attributes: C<deprel> and rehang the nodes to their parent. +On the input, attributes C<form>, C<lemma>, C<upos> and C<feats> are expected. 
+ +=head1 SEE ALSO + +L + +L +''' diff --git a/udapi/block/udpipe/en.py b/udapi/block/udpipe/en.py new file mode 100644 index 00000000..7cb74a25 --- /dev/null +++ b/udapi/block/udpipe/en.py @@ -0,0 +1,10 @@ +"""Block udpipe.En for tagging and parsing English.""" +from udapi.block.udpipe.base import Base + + +class En(Base): + """Tag and parse English.""" + + def __init__(self, **kwargs): + """Create the udpipe.En block object.""" + super().__init__(model_alias='en', **kwargs) diff --git a/udapi/tool/__init__.py b/udapi/tool/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py new file mode 100644 index 00000000..dbfb1e74 --- /dev/null +++ b/udapi/tool/udpipe.py @@ -0,0 +1,40 @@ +'''Wrapper for UDPipe (more pythonic than ufal.udpipe).''' +import io +import os + +from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module +from udapi.block.read.conllu import Conllu as ConlluReader + +class UDPipe: + '''Wrapper for UDPipe (more pythonic than ufal.udpipe).''' + + def __init__(self, model): + """Create the UDPipe tool object.""" + self.model = model + path = self.model_path() + self.tool = Model.load(path) + if not self.tool: + raise IOError("Cannot load model from file '%s'" % path) + self.error = ProcessingError() + self.conllu_reader = ConlluReader() + + def model_path(self): + """Return absolute path to the model file to be loaded.""" + if self.model.startswith('/') or self.model.startswith('.'): + return self.model + elif os.environ.get('UDAPI_DATA'): + return os.environ['UDAPI_DATA'] + '/' + self.model + else: + return os.environ.get('HOME') + '/' + self.model + + def tag_parse_tree(self, root): + """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" + pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') + in_data = " ".join([n.form for n in root.descendants]) + out_data = pipeline.process(in_data, self.error) + if 
self.error.occurred(): + raise IOError("UDPipe error " + self.error.message) + self.conllu_reader.files.filehandle = io.StringIO(out_data) + parsed = self.conllu_reader.read_tree() + # pylint: disable=protected-access + root._children, root._descendants = parsed._children, parsed._descendants From 6eb2a6e236756c1cb621f7d4a43c5b28e9c9aa4a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:40:55 +0100 Subject: [PATCH 0027/1374] alternative loading of the UDPipe-parsed trees which preserves MISC --- udapi/tool/udpipe.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index dbfb1e74..c4487e23 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -35,6 +35,16 @@ def tag_parse_tree(self, root): if self.error.occurred(): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) - parsed = self.conllu_reader.read_tree() - # pylint: disable=protected-access - root._children, root._descendants = parsed._children, parsed._descendants + parsed_root = self.conllu_reader.read_tree() + nodes = [root] + root.descendants + for parsed_node in parsed_root.descendants: + node = nodes[parsed_node.ord] + node.parent = nodes[parsed_node.parent.ord] + for attr in 'upos xpos lemma feats'.split(): + setattr(node, attr, getattr(parsed_node, attr)) + + # TODO: benchmark which solution is the fastest one. E.g. 
we could also do + #for node, parsed_node in zip(root.descendants, parsed_root.descendants): + # parsed_node.misc = node.misc + ## pylint: disable=protected-access + #root._children, root._descendants = parsed_root._children, parsed_root._descendants From e2c17e69ad5db4dffb7674bb2b182dad9cf9d201 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:42:34 +0100 Subject: [PATCH 0028/1374] add tokenize.Simple Also * rename morpho.TokenizeOnWhitespace tokenize.OnWhitespace * let the `tokenize_sentence` method return directly the list of tokens --- udapi/block/{morpho => tokenize}/__init__.py | 0 .../onwhitespace.py} | 8 ++++---- udapi/block/tokenize/simple.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) rename udapi/block/{morpho => tokenize}/__init__.py (100%) rename udapi/block/{morpho/tokenizeonwhitespace.py => tokenize/onwhitespace.py} (92%) create mode 100644 udapi/block/tokenize/simple.py diff --git a/udapi/block/morpho/__init__.py b/udapi/block/tokenize/__init__.py similarity index 100% rename from udapi/block/morpho/__init__.py rename to udapi/block/tokenize/__init__.py diff --git a/udapi/block/morpho/tokenizeonwhitespace.py b/udapi/block/tokenize/onwhitespace.py similarity index 92% rename from udapi/block/morpho/tokenizeonwhitespace.py rename to udapi/block/tokenize/onwhitespace.py index ef6f5d6f..544c4da6 100644 --- a/udapi/block/morpho/tokenizeonwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -1,19 +1,19 @@ -"""Block morpho.TokenizeOnWhitespace""" +"""Block tokenize.OnWhitespace""" from udapi.core.block import Block -class TokenizeOnWhitespace(Block): +class OnWhitespace(Block): """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" @staticmethod def tokenize_sentence(string): """A method to be overriden in subclasses.""" - return string + return string.split() def process_tree(self, root): if root.children: raise ValueError('Tree %s is already tokenized.' 
% root) sentence = ' '.join(root.text.split()) - tokens = self.tokenize_sentence(sentence).split() + tokens = self.tokenize_sentence(sentence) for i, token in enumerate(tokens, 1): space_after = False diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py new file mode 100644 index 00000000..82403cee --- /dev/null +++ b/udapi/block/tokenize/simple.py @@ -0,0 +1,12 @@ +"""Block tokenize.Simple""" +import re + +from udapi.block.tokenize.onwhitespace import OnWhitespace + +class Simple(OnWhitespace): + """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No.""" + + @staticmethod + def tokenize_sentence(string): + """A method to be overriden in subclasses.""" + return re.findall(r'\w+|[^\w\s]', string) From c46092f25d5014eae744a9c9ec3835d44310b9d0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 21:46:47 +0100 Subject: [PATCH 0029/1374] revert back to the old UDPipe models (UDv1.2) We can switch to the new UDv2.0 models once UDPipe 1.1 is released on PyPI --- udapi/block/udpipe/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index 914c160f..eea4b64d 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -3,6 +3,11 @@ from udapi.tool.udpipe import UDPipe KNOWN_MODELS = { + 'en': 'models/udpipe/english-ud-1.2-160523.udpipe', +} + +# TODO use the new models once UDPipe 1.1 is published and available on PyPI as ufal.udpipe +V2_KNOWN_MODELS = { 'grc': 'models/udpipe/2.0/ancient_greek-proiel-ud-2.0-conll17-170315.udpipe', 'grc_proiel': 'models/udpipe/2.0/ancient_greek-ud-2.0-conll17-170315.udpipe', 'ar': 'models/udpipe/2.0/arabic-ud-2.0-conll17-170315.udpipe', From a142807b160b59920d28fd4d0f82d9884de71823 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 19 Mar 2017 22:59:19 +0100 Subject: [PATCH 0030/1374] UDPipe tokenizer (without segmenter) --- udapi/block/udpipe/base.py | 4 +++- udapi/tool/udpipe.py | 43 
+++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index eea4b64d..d72cbf16 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -79,7 +79,7 @@ class Base(Block): # pylint: disable=too-many-arguments def __init__(self, model=None, model_alias=None, - tokenize=False, tag=True, parse=True, **kwargs): + tokenize=True, tag=True, parse=True, **kwargs): """Create the udpipe.En block object.""" super().__init__(**kwargs) self.model, self.model_alias = model, model_alias @@ -100,6 +100,8 @@ def tool(self): def process_tree(self, root): tok, tag, par = self.tokenize, self.tag, self.parse + if tok and tag and par: + return self.tool.tokenize_tag_parse_tree(root) if not tok and tag and par: return self.tool.tag_parse_tree(root) # TODO diff --git a/udapi/tool/udpipe.py b/udapi/tool/udpipe.py index c4487e23..2fdbc337 100644 --- a/udapi/tool/udpipe.py +++ b/udapi/tool/udpipe.py @@ -2,7 +2,7 @@ import io import os -from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module +from ufal.udpipe import Model, Pipeline, ProcessingError, Sentence # pylint: disable=no-name-in-module from udapi.block.read.conllu import Conllu as ConlluReader class UDPipe: @@ -17,6 +17,7 @@ def __init__(self, model): raise IOError("Cannot load model from file '%s'" % path) self.error = ProcessingError() self.conllu_reader = ConlluReader() + self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) def model_path(self): """Return absolute path to the model file to be loaded.""" @@ -48,3 +49,43 @@ def tag_parse_tree(self, root): # parsed_node.misc = node.misc ## pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants + + def tokenize_tag_parse_tree(self, root): + """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.""" + if root.children: + raise 
ValueError('Tree already contained nodes before tokenization') + + # tokenization (I cannot turn off segmenter, so I need to join the segments) + self.tokenizer.setText(root.text) + u_sentence = Sentence() + is_another = self.tokenizer.nextSentence(u_sentence) + u_words = u_sentence.words + n_words = u_words.size() - 1 + if is_another: + u_sent_cont = Sentence() + while self.tokenizer.nextSentence(u_sent_cont): + n_cont = u_sent_cont.words.size() - 1 + for i in range(1, n_cont+1): + u_w = u_sent_cont.words[i] + n_words += 1 + u_w.id = n_words + u_words.append(u_w) + + # tagging and parsing + self.tool.tag(u_sentence, Model.DEFAULT) + self.tool.parse(u_sentence, Model.DEFAULT) + + # converting UDPipe nodes to Udapi nodes + heads, nodes = [], [root] + for i in range(1, u_words.size()): + u_w = u_words[i] + node = root.create_child( + form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, + xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, + ) + node.misc = u_w.misc + heads.append(u_w.head) + nodes.append(node) + for node in nodes[1:]: + head = heads.pop(0) + node.parent = nodes[head] From 22a1acdcdb1b9b1d31ab6b8bc4f612e7c290343f Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Mar 2017 11:29:16 +0100 Subject: [PATCH 0031/1374] add util.Mark for marking nodes `util.Mark node='XY'` is a shortcut for `util.Eval node='if XY: node.misc["Mark"] = self.mark'` and in combination with `udapy -TM` it is also a shortcut for `util.Filter keep_tree_if_node='XY'` But I think it is useful enough to become a separate block. 
--- udapi/block/util/filter.py | 2 +- udapi/block/util/mark.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 udapi/block/util/mark.py diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index 88bedb76..b812fb64 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -15,7 +15,7 @@ class Filter(Block): # keep only trees which contain ToDo|Bug nodes udapy -s util.Filter keep_tree_if_node='re.match("ToDo|Bug", str(node.misc))' < in > filtered - # keep only non-projective trees, annotate non-projective edges with Mark=nofeats and show. + # keep only non-projective trees, annotate non-projective edges with Mark=nonproj and show. udapy -T util.Filter keep_tree_if_node='node.is_nonprojective()' mark=nonproj < in | less -R # delete trees which contain deprel=remnant diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py new file mode 100644 index 00000000..42052336 --- /dev/null +++ b/udapi/block/util/mark.py @@ -0,0 +1,30 @@ +"""util.Mark is a special block for marking nodes specified by parameters.""" +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +# We need eval in this block +# pylint: disable=eval-used +class Mark(Block): + """Mark nodes specified by parameters. + + Example usage from command line:: + # see non-projective trees with non-projective edges highlighted + udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R + """ + def __init__(self, node, mark=1, **kwargs): + """Create the Mark block object. + + Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be marked. + + `mark`: the node will be marked with `Mark=` in `node.misc`. Default=1. 
+ """ + super().__init__(**kwargs) + self.mark = mark + self.node = node + + def process_node(self, node): + if eval(self.node): + node.misc['Mark'] = self.mark From 60a5c645a52bd6f2262faa144d42584fee4a84c4 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 23 Mar 2017 11:36:27 +0100 Subject: [PATCH 0032/1374] template for tutorial.Adpositions --- udapi/block/tutorial/__init__.py | 0 udapi/block/tutorial/adpositions.py | 33 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 udapi/block/tutorial/__init__.py create mode 100644 udapi/block/tutorial/adpositions.py diff --git a/udapi/block/tutorial/__init__.py b/udapi/block/tutorial/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py new file mode 100644 index 00000000..cc8bafba --- /dev/null +++ b/udapi/block/tutorial/adpositions.py @@ -0,0 +1,33 @@ +"""tutorial.Adpositions block template. + +Example usage: +for a in */sample.conllu; do + printf '%50s ' $a; + udapy tutorial.Adpositions < $a; +done | tee results.txt + +# What are the English postpositions? 
+cat UD_English/sample.conllu | udapy -TM util.Mark \ + node='node.upos == "ADP" and node.parent.precedes(node)' | less -R +""" +from udapi.core.block import Block + +class Adpositions(Block): + """Compute the number of prepositions and postpositions.""" + + def __init__(self, **kwargs): + """Create the Adpositions block object.""" + super().__init__(**kwargs) + self.prepositions = 0 + self.postpositions = 0 + + def process_node(self, node): + # TODO: Your task: distinguish prepositions and postpositions + if node.upos == "ADP": + self.prepositions += 1 + + def process_end(self): + total = self.prepositions + self.postpositions or 1 + prep = 100 * self.prepositions / total + post = 100 * self.postpositions / total + print("prepositions %5.1f%%, postpositions %5.1f%%" % (prep, post)) From 17d6a83cd5cee6c47bffa2e1a2fda73c7e98438e Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Mar 2017 02:47:53 +0100 Subject: [PATCH 0033/1374] add punct-nonproj test to ud.MarkBugs --- udapi/block/ud/markbugs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 44cfb5cc..f785f556 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -163,6 +163,9 @@ def process_node(self, node): self.log(node, 'punct-deprel', 'upos=PUNCT deprel!=punct|fixed|goeswith|root (but %s)' % deprel) + if upos == 'PUNCT' and node.is_nonprojective(): + self.log(node, 'punct-nonproj', 'upos=PUNCT and edge is non-projective') + # http://universaldependencies.org/u/dep/cc.html says # "cc is the relation between a conjunct and a preceding # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." 
From 70d6d86b3c611e05b4b7ea8d87529ea028a0d0db Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 17:59:14 +0200 Subject: [PATCH 0034/1374] generate documentation with Sphinx --- README.md | 1 + docs/.gitignore | 4 ++ docs/Makefile | 20 ++++++ docs/api.rst | 17 +++++ docs/conf.py | 176 +++++++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 24 +++++++ docs/install.rst | 21 ++++++ 7 files changed, 263 insertions(+) create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/api.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/install.rst diff --git a/README.md b/README.md index 721d1354..d0666b70 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ Python framework for processing Universal Dependencies data [![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) +[![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) ## Requirements - You need Python 3.3 or higher. diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..a1d82581 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,4 @@ +_build +udapi.rst +udapi.*.rst +modules.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..17d5375a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Udapi +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 00000000..0857cc98 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,17 @@ +.. _api: + +================= +API Documentation +================= + +``udapi`` package +======================== + +.. automodule:: udapi + :members: + +------------------------ + +**Sub-modules** + +.. toctree:: modules diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..3e7864a5 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Udapi documentation build configuration file, created by +# sphinx-quickstart on Mon Mar 27 17:08:03 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. 
+# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Udapi' +copyright = '2017, Martin Popel' +author = 'Martin Popel' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0' +# The full version, including alpha/beta/rc tags. +release = '1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'alabaster' +import sphinx_rtd_theme +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. 
They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Udapidoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Udapi.tex', 'Udapi Documentation', + 'Martin Popel', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'udapi', 'Udapi Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Udapi', 'Udapi Documentation', + author, 'Udapi', 'API and framework for processing Universal Dependencies', + 'Miscellaneous'), +] + + +def run_apidoc(_): + + cur_dir = os.path.abspath(os.path.dirname(__file__)) + print(cur_dir) + module = os.path.abspath(os.path.join(cur_dir, "..", "udapi")) + print(module) + + from sphinx.apidoc import main + main(['--separate', '-o', cur_dir, module, '--force']) + +def setup(app): + app.connect('builder-inited', run_apidoc) + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..78a2d540 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,24 @@ +.. Udapi documentation master file, created by + sphinx-quickstart on Mon Mar 27 17:08:03 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Udapi's documentation! +================================= + +Udapi is a framework providing an API for processing +`Universal Dependencies `_ data. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + install + api + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 00000000..14f81527 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,21 @@ +.. _instalation: + +============ +Installation +============ + +You need Python 3.3 or higher, pip3 and git. + + +Let's clone the git repo to ``~/udapi-python/``, install dependencies +and setup ``$PATH`` and ``$PYTHONPATH`` accordingly: + +.. 
code-block:: bash + + cd + git clone https://github.com/udapi/udapi-python.git + pip3 install --user -r udapi-python/requirements.txt + echo '## Use Udapi from ~/udapi-python/ ##' >> ~/.bashrc + echo 'export PATH="$HOME/udapi-python/bin:$PATH"' >> ~/.bashrc + echo 'export PYTHONPATH="$HOME/udapi-python/:$PYTHONPATH"' >> ~/.bashrc + source ~/.bashrc # or open new bash From 30f308f7d2401b5af579ffa0fba0fd097b35df99 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 19:14:24 +0200 Subject: [PATCH 0035/1374] link udapi.readthedocs.io --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d0666b70..7183cd6c 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ Python framework for processing Universal Dependencies data [![Build Status](https://travis-ci.org/udapi/udapi-python.svg?branch=master)](https://travis-ci.org/udapi/udapi-python) [![Website](https://img.shields.io/website-up-down-green-red/http/udapi.github.io.svg)](http://udapi.github.io) +[![Documentation Status](https://readthedocs.org/projects/udapi/badge/)](http://udapi.readthedocs.io) ## Requirements - You need Python 3.3 or higher. 
From 4cc3e13e7c7b89180da2662b37f9634e78b32656 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 27 Mar 2017 20:49:30 +0200 Subject: [PATCH 0036/1374] fix rst syntax (work in progress) --- udapi/block/transform/proj.py | 15 ++--- udapi/block/tutorial/adpositions.py | 13 +++-- udapi/block/ud/addmwt.py | 1 + udapi/block/ud/convert1to2.py | 13 +++-- udapi/block/ud/el/addmwt.py | 2 +- udapi/block/ud/ro/setspaceafter.py | 22 ++++--- udapi/block/write/html.py | 24 ++++---- udapi/block/write/sdparse.py | 28 ++++----- udapi/block/write/textmodetrees.py | 91 +++++++++++++++-------------- udapi/block/write/tikz.py | 15 ++--- udapi/block/write/vislcg.py | 64 ++++++++++---------- 11 files changed, 153 insertions(+), 135 deletions(-) diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py index f15e46e9..6254b917 100644 --- a/udapi/block/transform/proj.py +++ b/udapi/block/transform/proj.py @@ -6,13 +6,14 @@ http://www.maltparser.org/optiondesc.html#pproj-marking_strategy TODO: implement also path and head+path strategies. + TODO: Sometimes it would be better (intuitively) - to lower the gap-node (if its whole subtree is in the gap - and if this does not cause more non-projectivities) - rather than to lift several nodes whose parent-edge crosses this gap. - We would need another label value (usually the lowering is of depth 1), - but the advantage is that reconstruction of lowered edges - during deprojectivization is simple and needs no heuristics. +to lower the gap-node (if its whole subtree is in the gap +and if this does not cause more non-projectivities) +rather than to lift several nodes whose parent-edge crosses this gap. +We would need another label value (usually the lowering is of depth 1), +but the advantage is that reconstruction of lowered edges +during deprojectivization is simple and needs no heuristics. 
""" from udapi.core.block import Block @@ -59,4 +60,4 @@ def mark(self, node, label): elif self.label == 'deprel': node.deprel = '%s:%s+%s' % (node.udeprel, node.sdeprel, label) else: - raise(ValueError('Unknown parameter label=%s' % self.label)) + raise ValueError('Unknown parameter label=%s' % self.label) diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py index cc8bafba..cf2ad514 100644 --- a/udapi/block/tutorial/adpositions.py +++ b/udapi/block/tutorial/adpositions.py @@ -1,14 +1,15 @@ """tutorial.Adpositions block template. -Example usage: -for a in */sample.conllu; do +Example usage:: + + for a in */sample.conllu; do printf '%50s ' $a; udapy tutorial.Adpositions < $a; -done | tee results.txt + done | tee results.txt -# What are the English postpositions? -cat UD_English/sample.conllu | udapy -TM util.Mark \ - node='node.upos == "ADP" and node.parent.precedes(node)' | less -R + # What are the English postpositions? + cat UD_English/sample.conllu | udapy -TM util.Mark \ + node='node.upos == "ADP" and node.parent.precedes(node)' | less -R """ from udapi.core.block import Block diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index 653cfa26..eab2158b 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -57,6 +57,7 @@ def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token. An example return value is:: + { 'form': 'aby bych', 'lemma': 'aby být', diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index ce549256..e389e476 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -275,13 +275,16 @@ def fix_remnants_in_tree(self, root): Remnant's parent is always the correlate (same-role) node. Usually, correlate's parent is the head of the whole ellipsis subtree, i.e. the first conjunct. However, sometimes remnants are deeper, e.g. - 'Over 300 Iraqis are reported dead and 500 wounded.' 
with edges: - nsubjpass(reported, Iraqis) - nummod(Iraqis, 300) - remnant(300, 500) + 'Over 300 Iraqis are reported dead and 500 wounded.' with edges:: + + nsubjpass(reported, Iraqis) + nummod(Iraqis, 300) + remnant(300, 500) + Let's expect all remnants in one tree are part of the same ellipsis structure. + TODO: theoretically, there may be more ellipsis structures with remnants in one tree, - but I have no idea how to distinguish them from the deeper-remnants cases. + but I have no idea how to distinguish them from the deeper-remnants cases. """ remnants = [n for n in root.descendants if n.deprel == 'remnant'] if not remnants: diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 39e23620..81a98836 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -3,7 +3,7 @@ Notice that this should be used only for converting existing conllu files. Ideally a tokenizer should have already split the MWTs. Also notice that this block does not deal with the relatively rare -PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο") MWTs. +``PRON(Person=2)+'*+PRON(Person=3, i.e. "σ'το" and "στο")`` MWTs. """ import udapi.block.ud.addmwt diff --git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py index 80bfda8f..bc18f364 100644 --- a/udapi/block/ud/ro/setspaceafter.py +++ b/udapi/block/ud/ro/setspaceafter.py @@ -1,7 +1,8 @@ """Block ud.ro.SetSpaceAfter for heuristic setting of SpaceAfter=No in Romanian. -Usage: -udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu +Usage:: + + udapy -s ud.ro.SetSpaceAfter < in.conllu > fixed.conllu Author: Martin Popel """ @@ -13,13 +14,16 @@ class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian. Romanian uses many contractions, e.g. 
- raw | meaning | tokenized | lemmatized - -------|---------|-----------|----------- - n-ar | nu ar | n- ar | nu avea - să-i | să îi | să -i | să el - într-o | în o | într- o | întru un - nu-i | nu îi | nu -i | nu el - nu-i | nu e | nu -i | nu fi + + ======= ======= ========= ========== + raw meaning tokenized lemmatized + ======= ======= ========= ========== + n-ar nu ar n- ar nu avea + să-i să îi să -i să el + într-o în o într- o întru un + nu-i nu îi nu -i nu el + nu-i nu e nu -i nu fi + ======= ======= ========= ========== Detokenization is quite simple: no space after word-final hyphen and before word-initial hyphen. There are just two exceptions, I have found: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 2fd76bf7..85b8dcc9 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -5,17 +5,19 @@ class Html(BaseWriter): """A writer for HTML+JavaScript+SVG visualization of dependency trees. - Usage: - # from the command line - udapy write.Html < file.conllu > file.html - firefox file.html - - # for offline use, we need to download first three JavaScript libraries - wget https://code.jquery.com/jquery-2.1.4.min.js - wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js - wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js - udapy write.Html path_to_js=. < file.conllu > file.html - firefox file.html + .. code-block:: bash + + # from the command line + udapy write.Html < file.conllu > file.html + firefox file.html + + For offline use, we need to download first three JavaScript libraries:: + + wget https://code.jquery.com/jquery-2.1.4.min.js + wget https://cdn.rawgit.com/eligrey/FileSaver.js/master/FileSaver.min.js + wget https://cdn.rawgit.com/ufal/js-treex-view/gh-pages/js-treex-view.js + udapy write.Html path_to_js=. 
< file.conllu > file.html + firefox file.html This writer produces an html file with drawings of the dependency trees in the document (there are buttons for selecting which bundle will be shown). diff --git a/udapi/block/write/sdparse.py b/udapi/block/write/sdparse.py index 209938b5..60b78d6d 100644 --- a/udapi/block/write/sdparse.py +++ b/udapi/block/write/sdparse.py @@ -8,33 +8,35 @@ class Sdparse(BaseWriter): """A writer of files in the Stanford dependencies format, suitable for Brat visualization. Usage: - udapy write.Sdparse print_upos=0 < in.conllu + ``udapy write.Sdparse print_upos=0 < in.conllu`` Example output:: - ~~~ sdparse - Corriere Sport da pagina 23 a pagina 26 - name(Corriere, Sport) - case(pagina-4, da) - nmod(Corriere, pagina-4) - nummod(pagina-4, 23) - case(pagina-7, a) - nmod(Corriere, pagina-7) - nummod(pagina-7, 26) - ~~~ + ~~~ sdparse + Corriere Sport da pagina 23 a pagina 26 + name(Corriere, Sport) + case(pagina-4, da) + nmod(Corriere, pagina-4) + nummod(pagina-4, 23) + case(pagina-7, a) + nmod(Corriere, pagina-7) + nummod(pagina-7, 26) + ~~~ To visualize it, use embedded Brat, e.g. go to - http://universaldependencies.org/visualization.html#editing + http://universaldependencies.org/visualization.html#editing. Click the edit button and paste the output of this writer excluding the `~~~` marks. Notes: - Original Stanford dependencies format (http://nlp.stanford.edu/software/dependencies_manual.pdf) + The original `Stanford dependencies format + `_ allows explicit specification of the root dependency, e.g. `root(ROOT-0, makes-8)`. However, this is not allowed by Brat, so this writer does not print it. UD v2.0 allows tokens with spaces, but I am not aware of any Brat support. 
Alternatives: + * `write.Conllu` Brat recently supports also the CoNLL-U input * `write.TextModeTrees` may be more readable/useful in some usecases * `write.Html` dtto, press "Save as SVG" button, convert to pdf diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index a2c949cc..b5968b3a 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -20,64 +20,67 @@ class TextModeTrees(BaseWriter): """An ASCII pretty printer of dependency trees. - SYNOPSIS - # from command line (visualize CoNLL-U files) - udapy write.TextModeTrees color=1 < file.conllu | less -R + .. code-block:: bash - # is scenario (examples of other parameters) - write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 - write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 + # from the command line (visualize CoNLL-U files) + udapy write.TextModeTrees color=1 < file.conllu | less -R + + In scenario (examples of other parameters):: + + write.TextModeTrees indent=1 print_sent_id=1 print_sentence=1 + write.TextModeTrees zones=en,cs attributes=form,lemma,upos minimize_cross=0 - DESCRIPTION This block prints dependency trees in plain-text format. - For example the following CoNLL-U file (with tabs instead of spaces) - - 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ - 2 saw see VERB VBD Tense=Past 0 root _ _ - 3 a a DET DT Definite=Ind 4 det _ _ - 4 dog dog NOUN NN Number=Sing 2 dobj _ _ - 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No - 6 , , PUNCT , _ 2 punct _ _ - 7 which which DET WDT PronType=Rel 10 nsubj _ _ - 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ - 9 a a DET DT Definite=Ind 10 det _ _ - 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No - 11 . . PUNCT . 
_ 2 punct _ _ - - will be printed (with the default parameters) as - ─┮ - │ ╭─╼ I PRON nsubj - ╰─┾ saw VERB root - │ ╭─╼ a DET det - ├────────────────────────┾ dog NOUN dobj - ├─╼ today NOUN nmod:tmod │ - ├─╼ , PUNCT punct │ - │ │ ╭─╼ which DET nsubj - │ │ ├─╼ was VERB cop - │ │ ├─╼ a DET det - │ ╰─┶ boxer NOUN acl:relcl - ╰─╼ . PUNCT punct + For example the following CoNLL-U file (with tabs instead of spaces):: + + 1 I I PRON PRP Number=Sing|Person=1 2 nsubj _ _ + 2 saw see VERB VBD Tense=Past 0 root _ _ + 3 a a DET DT Definite=Ind 4 det _ _ + 4 dog dog NOUN NN Number=Sing 2 dobj _ _ + 5 today today NOUN NN Number=Sing 2 nmod:tmod _ SpaceAfter=No + 6 , , PUNCT , _ 2 punct _ _ + 7 which which DET WDT PronType=Rel 10 nsubj _ _ + 8 was be VERB VBD Person=3|Tense=Past 10 cop _ _ + 9 a a DET DT Definite=Ind 10 det _ _ + 10 boxer boxer NOUN NN Number=Sing 4 acl:relcl _ SpaceAfter=No + 11 . . PUNCT . _ 2 punct _ _ + + will be printed (with the default parameters) as:: + + ─┮ + │ ╭─╼ I PRON nsubj + ╰─┾ saw VERB root + │ ╭─╼ a DET det + ├────────────────────────┾ dog NOUN dobj + ├─╼ today NOUN nmod:tmod │ + ├─╼ , PUNCT punct │ + │ │ ╭─╼ which DET nsubj + │ │ ├─╼ was VERB cop + │ │ ├─╼ a DET det + │ ╰─┶ boxer NOUN acl:relcl + ╰─╼ . PUNCT punct Some non-projective trees cannot be printed witout crossing edges. - TextModeTrees uses a special "bridge" symbol ─╪─ to mark this: - ─┮ - │ ╭─╼ 1 - ├─╪───┮ 2 - ╰─┶ 3 │ - ╰─╼ 4 - - By default parameter `color=auto`, so if the output is printed to the console + TextModeTrees uses a special "bridge" symbol ─╪─ to mark this:: + + ─┮ + │ ╭─╼ 1 + ├─╪───┮ 2 + ╰─┶ 3 │ + ╰─╼ 4 + + By default parameter ``color=auto``, so if the output is printed to the console (not file or pipe), each node attribute is printed in different color. If a given node's MISC contains any of `ToDo`, `Bug` or `Mark` attributes (or any other specified in the parameter `mark`), the node will be highlighted (by reveresing the background and foreground colors). 
This block's method `process_tree` can be called on any node (not only root), - which is useful for printing subtrees using `node.print_subtree()`, + which is useful for printing subtrees using ``node.print_subtree()``, which is internally implemented using this block. SEE ALSO - `write.TextModeTreesHtml` + :py:class:`.TextModeTreesHtml` """ def __init__(self, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, diff --git a/udapi/block/write/tikz.py b/udapi/block/write/tikz.py index 4eb69221..7e95454e 100644 --- a/udapi/block/write/tikz.py +++ b/udapi/block/write/tikz.py @@ -7,19 +7,20 @@ class Tikz(BaseWriter): r"""A writer of files in the LaTeX with tikz-dependency format. - Usage: - udapy write.Tikz < my.conllu > my.tex - pdflatex my.tex - xdg-open my.pdf + Usage:: + + udapy write.Tikz < my.conllu > my.tex + pdflatex my.tex + xdg-open my.pdf Long sentences may result in too large pictures. You can tune the width (in addition to changing fontsize or using minipage and rescaling) with - ``\begin{deptext}[column sep=0.2cm]`` + ``\begin{deptext}[column sep=0.2cm]`` or individually for each word: - ``My \&[.5cm] dog \& etc.`` + ``My \&[.5cm] dog \& etc.`` By default, the height of the horizontal segment of a dependency edge is proportional to the distance between the linked words. 
You can tune the height with: - ``\depedge[edge unit distance=1.5ex]{9}{1}{deprel}`` + ``\depedge[edge unit distance=1.5ex]{9}{1}{deprel}`` See `tikz-dependency documentation `_ diff --git a/udapi/block/write/vislcg.py b/udapi/block/write/vislcg.py index 9179eeb2..5dd5f52b 100644 --- a/udapi/block/write/vislcg.py +++ b/udapi/block/write/vislcg.py @@ -19,44 +19,44 @@ class Vislcg(BaseWriter): See https://visl.sdu.dk/visl/vislcg-doc.html Usage: - udapy write.Vislcg < in.conllu > out.vislcg + ``udapy write.Vislcg < in.conllu > out.vislcg`` Example output:: - "<Қыз>" - "қыз" n nom @nsubj #1->3 - "<оның>" - "ол" prn pers p3 sg gen @nmod:poss #2->3 - "<қарындасы>" - "қарындас" n px3sp nom @parataxis #3->8 - "е" cop aor p3 sg @cop #4->3 - "<,>" - "," cm @punct #5->8 - "<ол>" - "ол" prn pers p3 sg nom @nsubj #6->8 - "<бес>" - "бес" num @nummod #7->8 - "<жаста>" - "жас" n loc @root #8->0 - "е" cop aor p3 sg @cop #9->8 - "<.>" - "." sent @punct #10->8 + "<Қыз>" + "қыз" n nom @nsubj #1->3 + "<оның>" + "ол" prn pers p3 sg gen @nmod:poss #2->3 + "<қарындасы>" + "қарындас" n px3sp nom @parataxis #3->8 + "е" cop aor p3 sg @cop #4->3 + "<,>" + "," cm @punct #5->8 + "<ол>" + "ол" prn pers p3 sg nom @nsubj #6->8 + "<бес>" + "бес" num @nummod #7->8 + "<жаста>" + "жас" n loc @root #8->0 + "е" cop aor p3 sg @cop #9->8 + "<.>" + "." sent @punct #10->8 Example input:: - # text = Қыз оның қарындасы, ол бес жаста. - 1 Қыз қыз _ n nom 3 nsubj _ _ - 2 оның ол _ prn pers|p3|sg|gen 3 nmod:poss _ _ - 3-4 қарындасы _ _ _ _ _ _ _ _ - 3 қарындасы қарындас _ n px3sp|nom 8 parataxis _ _ - 4 _ е _ cop aor|p3|sg 3 cop _ _ - 5 , , _ cm _ 8 punct _ _ - 6 ол ол _ prn pers|p3|sg|nom 8 nsubj _ _ - 7 бес бес _ num _ 8 nummod _ _ - 8-9 жаста _ _ _ _ _ _ _ _ - 8 жаста жас _ n loc 0 root _ _ - 9 _ е _ cop aor|p3|sg 8 cop _ _ - 10 . . _ sent _ 8 punct _ _ + # text = Қыз оның қарындасы, ол бес жаста. 
+ 1 Қыз қыз _ n nom 3 nsubj _ _ + 2 оның ол _ prn pers|p3|sg|gen 3 nmod:poss _ _ + 3-4 қарындасы _ _ _ _ _ _ _ _ + 3 қарындасы қарындас _ n px3sp|nom 8 parataxis _ _ + 4 _ е _ cop aor|p3|sg 3 cop _ _ + 5 , , _ cm _ 8 punct _ _ + 6 ол ол _ prn pers|p3|sg|nom 8 nsubj _ _ + 7 бес бес _ num _ 8 nummod _ _ + 8-9 жаста _ _ _ _ _ _ _ _ + 8 жаста жас _ n loc 0 root _ _ + 9 _ е _ cop aor|p3|sg 8 cop _ _ + 10 . . _ sent _ 8 punct _ _ """ def process_tree(self, tree): From bc09e9e5039f7aadcfdc2fe759ed253c17fa130c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 28 Mar 2017 15:12:36 +0200 Subject: [PATCH 0037/1374] new tutorial task template --- udapi/block/tutorial/addcommas.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 udapi/block/tutorial/addcommas.py diff --git a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py new file mode 100644 index 00000000..8c04e242 --- /dev/null +++ b/udapi/block/tutorial/addcommas.py @@ -0,0 +1,22 @@ +"""tutorial.AddCommas block template.""" +from udapi.core.block import Block + +class AddCommas(Block): + """Heuristically insert nodes for missing commas.""" + + def process_node(self, node): + if self.should_add_comma_before(node): + comma = node.create_child(form=',', deprel='punct', upos='PUNCT') + comma.shift_before_node(node) + + def should_add_comma_before(self, node): + # TODO: Your task: implement some heuristics + prev_node = node.prev_node + if prev_node is None: + return False + if prev_node.lemma == 'however': + return True + if any(n.deprel == 'appos' for n in prev_node.children): + return True + + return False From 48e0ca452b0d42d87c3237146441d11521cb9419 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Mar 2017 22:20:00 +0200 Subject: [PATCH 0038/1374] block parameter `zones` --- udapi/core/block.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index 7326f7f8..453b1d65 100644 --- 
a/udapi/core/block.py +++ b/udapi/core/block.py @@ -5,8 +5,8 @@ class Block(object): """The smallest processing unit for processing Universal Dependencies data.""" - def __init__(self, **kwargs): - pass + def __init__(self, zones='all'): + self.zones = zones def process_start(self): """A hook method that is executed before processing UD data""" @@ -28,7 +28,8 @@ def process_tree(self, tree): def process_bundle(self, bundle): """Process a UD bundle""" for tree in bundle: - self.process_tree(tree) + if self._should_process_tree(tree): + self.process_tree(tree) def process_document(self, document): """Process a UD document""" @@ -44,3 +45,12 @@ def before_process_document(self, document): def after_process_document(self, document): """This method is called after each process_document.""" pass + + def _should_process_tree(self, tree): + if self.zones == 'all': + return True + if self.zones == '' and tree.zone == '': + return True + if tree.zone in self.zones.split(','): + return True + return False From 20ebcec7551fab001d94036f64d718a4c4681a56 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 30 Mar 2017 23:10:26 +0200 Subject: [PATCH 0039/1374] implement all the functionality of eval.F1 --- udapi/block/eval/f1.py | 191 +++++++++++++++++++++++++++++++++++ udapi/block/eval/lcsf1.py | 203 -------------------------------------- 2 files changed, 191 insertions(+), 203 deletions(-) create mode 100644 udapi/block/eval/f1.py delete mode 100644 udapi/block/eval/lcsf1.py diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py new file mode 100644 index 00000000..90be94ec --- /dev/null +++ b/udapi/block/eval/f1.py @@ -0,0 +1,191 @@ +"""Block eval.F1 for evaluating differences between sentences with P/R/F1. 
+ +``eval.F1 zones=en_pred gold_zone=en_gold details=0`` +prints something like:: + + predicted = 210 + gold = 213 + correct = 210 + precision = 100.00% + recall = 98.59% + F1 = 99.29% + +``eval.F1 gold_zone=y attributes=form,upos focus='(?i:an?|the)_DET' details=4`` +prints something like:: + + === Details === + token pred gold corr prec rec F1 + the_DET 711 213 188 26.44% 88.26% 40.69% + The_DET 82 25 19 23.17% 76.00% 35.51% + a_DET 0 62 0 0.00% 0.00% 0.00% + an_DET 0 16 0 0.00% 0.00% 0.00% + === Totals === + predicted = 793 + gold = 319 + correct = 207 + precision = 26.10% + recall = 64.89% + F1 = 37.23% + +This block finds differences between nodes of trees in two zones +and reports the overall precision, recall and F1. +The two zones are "predicted" (on which this block is applied) +and "gold" (which needs to be specified with parameter ``gold``). + +This block also reports the number of total nodes in the predicted zone +and in the gold zone and the number of "correct" nodes, +that is predicted nodes which are also in the gold zone. +By default two nodes are considered "the same" if they have the same ``form``, +but it is possible to check also for other nodes' attributes +(with parameter ``attributes``). + +As usual:: + + precision = correct / predicted + recall = correct / gold + F1 = 2 * precision * recall / (precision + recall) + +The implementation is based on finding the longest common subsequence (LCS) +between the nodes in the two trees. +This means that the two zones do not need to be explicitly word-aligned. +""" +from collections import Counter +import logging +import re + +from udapi.core.basewriter import BaseWriter + +# pylint: disable=too-many-instance-attributes,invalid-name +class F1(BaseWriter): + """Evaluate differences between sentences (in different zones) with P/R/F1. + + Args: + zones: Which zone contains the "predicted" trees? + Make sure that you specify just one zone. 
+ If you leave the default value "all" and the document contains more zones, + the results will be mixed, which is most likely not what you wanted. + Exception: If the document conaints just two zones (predicted and gold trees), + you can keep the default value "all" because this block + will skip comparison of the gold zone with itself. + + gold_zone: Which zone contains the gold-standard trees? + + attributes: comma separated list of attributes which should be checked + when deciding whether two nodes are equivalent in LCS + + focus: Regular expresion constraining the tokens we are interested in. + If more attributes were specified in the ``attributes`` parameter, + their values are concatenated with underscore, so ``focus`` should reflect that + e.g. ``attributes=form,upos focus='(a|the)_DET'``. + For case-insensitive focus use e.g. ``focus='(?i)the'`` + (which is equivalent to ``focus='[Tt][Hh][Ee]'``). + + details: Print also detailed statistics for each token (matching the ``focus``). + The value of this parameter ``details`` specifies the number of tokens to include. + The tokens are sorted according to the sum of their *predicted* and *gold* counts. 
+ """ + + def __init__(self, gold_zone, attributes='form', focus=None, details=4, **kwargs): + """Create the eval.F1 block object.""" + super().__init__(**kwargs) + self.gold_zone = gold_zone + self.attrs = attributes.split(',') + self.focus = None + if focus is not None: + self.focus = re.compile(focus) + self.details = details + self.correct, self.pred, self.gold = 0, 0, 0 + self.visited_zones = Counter() + if details: + self._common = Counter() + self._pred = Counter() + self._gold = Counter() + self._total = Counter() + + def process_tree(self, tree): + gold_tree = tree.bundle.get_tree(self.gold_zone) + if tree == gold_tree: + return + self.visited_zones[tree.zone] += 1 + + pred_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in tree.descendants] + gold_tokens = ['_'.join(n.get_attrs(self.attrs)) for n in gold_tree.descendants] + common = find_lcs(pred_tokens, gold_tokens) + + if self.focus is not None: + common = [x for x in common if self.focus.fullmatch(x)] + pred_tokens = [x for x in pred_tokens if self.focus.fullmatch(x)] + gold_tokens = [x for x in gold_tokens if self.focus.fullmatch(x)] + + self.correct += len(common) + self.pred += len(pred_tokens) + self.gold += len(gold_tokens) + + if self.details: + for x in common: + self._common[x] += 1 + for x in gold_tokens: + self._gold[x] += 1 + self._total[x] += 1 + for x in pred_tokens: + self._pred[x] += 1 + self._total[x] += 1 + + def process_end(self): + # Redirect the default filehandle to the file specified by self.files + self.before_process_document(None) + + if not self.visited_zones: + logging.warning('Block eval.F1 was not applied to any zone. ' + 'Check the parameter zones=%s', self.zones) + elif len(self.visited_zones) > 1: + logging.warning('Block eval.F1 was applied to more than one zone %s. ' + 'The results are mixed together. 
Check the parameter zones=%s', + list(self.visited_zones.elements()), self.zones) + print('Comparing predicted trees (zone=%s) with gold trees (zone=%s), sentences=%d' + % (next(self.visited_zones.elements()), self.gold_zone, + self.visited_zones.most_common(1)[0][1])) + if self.details: + print('=== Details ===') + print('%-10s %5s %5s %5s %6s %6s %6s' + % ('token', 'pred', 'gold', 'corr', 'prec', 'rec', 'F1')) + tokens = self._total.most_common(self.details) + for token, _ in tokens: + _prec = self._common[token] / (self._pred[token] or 1) + _rec = self._common[token] / (self._gold[token] or 1) + _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1) + print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%' + % (token, self._pred[token], self._gold[token], self._common[token], + 100*_prec, 100*_rec, 100*_f1)) + print('=== Totals ===') + + print("%-9s = %7d\n"*3 + % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='') + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + f1 = 2 * precision * recall / ((precision + recall) or 1) + print("%-9s = %6.2f%%\n" * 3 + % ('precision', 100*precision, 'recall', 100*recall, 'F1', 100*f1), end='') + + +# difflib.SequenceMatcher does not compute LCS, so let's implement it here +# TODO: make faster by trimming common prefix and sufix +def find_lcs(x, y): + """Find longest common subsequence.""" + m, n = len(x), len(y) + C = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m+1): + for j in range(1, n+1): + C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + index = C[m][n] + lcs = [None] * index + while m > 0 and n > 0: + if x[m-1] == y[n-1]: + lcs[index-1] = x[m-1] + m, n, index = m-1, n-1, index-1 + elif C[m-1][n] > C[m][n-1]: + m -= 1 + else: + n -= 1 + return lcs diff --git a/udapi/block/eval/lcsf1.py b/udapi/block/eval/lcsf1.py deleted file mode 100644 index 2c085779..00000000 --- 
a/udapi/block/eval/lcsf1.py +++ /dev/null @@ -1,203 +0,0 @@ -"""Block eval.LcsF1 for evaluating differences between sentences with P/R/F1.""" -from udapi.core.basewriter import BaseWriter - -class LcsF1(BaseWriter): # pylint: disable=too-many-instance-attributes - """Evaluate differences between sentences (in different zones) with P/R/F1.""" - - def __init__(self, gold_zone, attributes='form', focus='.*', details=4, **kwargs): - """Create the LcsF1 block object.""" - super().__init__(**kwargs) - self.gold_zone = gold_zone - self.attributes = attributes - self.focus = focus - self.details = details - self._stats = {} - self.correct, self.pred, self.gold = 0, 0, 0 - - def process_tree(self, tree): - gold_tree = tree.bundle.get_tree(self.gold_zone) - if tree == gold_tree: - return - #self._stats['zones'][tree.zone] += 1 - - attrs = self.attributes.split(',') - pred_tokens = ['_'.join(n.get_attrs(attrs)) for n in tree.descendants] - gold_tokens = ['_'.join(n.get_attrs(attrs)) for n in gold_tree.descendants] - common = find_lcs(pred_tokens, gold_tokens) - - # my $focus = $self->focus; - # if ($focus ne '.*') { - # @common = grep {/$focus/} @common; - # @pred_tokens = grep {/$focus/} @pred_tokens; - # @gold_tokens = grep {/$focus/} @gold_tokens; - # } - - self.correct += len(common) - self.pred += len(pred_tokens) - self.gold += len(gold_tokens) - - # if ($self->details){ - # $self->_stats->{C}{$_}++ for (@common); - # $self->_stats->{P}{$_}++ for (@pred_tokens); - # $self->_stats->{G}{$_}++ for (@gold_tokens); - # $self->_stats->{T}{$_}++ for (@gold_tokens, @pred_tokens); - # } - - def process_end(self): - # Redirect the default filehandle to the file specified by self.files - self.before_process_document(None) - - # my %pred_zones = %{$self->_stats->{zones}}; - # my @pz = keys %pred_zones; - # if (!@pz) { - # warn 'Block Eval::LcsF1 was not applied to any zone. Check the parameter zones=' - # . 
$self->zones; - # } elsif (@pz > 1){ - # warn "Block Eval::LcsF1 was applied to more than one zone (@pz). " - # . 'The results are mixed together. Check the parameter zones='.$self->zones; - # } - # say "Comparing predicted trees (zone=@pz) with gold trees (zone=" - # . $self->gold_zone . "), sentences=$pred_zones{$pz[0]}"; - # - # if ($self->details){ - # say '=== Details ==='; - # my $total_count = $self->_stats->{T}; - # my @tokens = sort {$total_count->{$b} <=> $total_count->{$a}} keys %{$total_count}; - # splice @tokens, $self->details; - # printf "%-10s %5s %5s %5s %6s %6s %6s\n", qw(token pred gold corr prec rec F1); - # foreach my $token (@tokens){ - # my ($p, $g, $c) = map {$self->_stats->{$_}{$token}||0} (qw(P G C)); - # my $pr = $c / ($p || 1); - # my $re = $c / ($g || 1); - # my $f = 2 * $pr * $re / (($pr + $re)||1); - # printf "%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%\n", - # $token, $p, $g, $c, 100*$pr, 100*$re, 100*$f - # } - # say '=== Totals ===' - # } - - - print("%-9s = %7d\n"*3 - % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct)) - # ($pred, $gold) = map {$_||1} ($pred, $gold); # prevent division by zero - # my $prec = $correct / $pred; - # my $rec = $correct / $gold; - # my $f1 = 2 * $prec * $rec / (($prec + $rec)||1); - # printf "%-9s = %6.2f%%\n"x3, precision=>100*$prec, recall=>100*$rec, F1=>100*$f1; - - -# difflib.SequenceMatcher does not compute LCS, so let's implement it here -# TODO: make faster by trimming common prefix and sufix -def find_lcs(x, y): - m, n = len(x), len(y) - C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m+1): - for j in range(1, n+1): - C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) - index = C[m][n] - lcs = [None] * index - while m > 0 and n > 0: - if x[m-1] == y[n-1]: - lcs[index-1] = x[m-1] - m, n, index = m-1, n-1, index-1 - elif C[m-1][n] > C[m][n-1]: - m -= 1 - else: - n -= 1 - return lcs - - -''' -Udapi::Block::Eval::LcsF1 - evaluate differences 
between sentences with P/R/F1 - -=head1 SYNOPSIS - - Eval::LcsF1 zones=en_pred gold_zone=en_gold to=results.txt - - # prints something like - predicted = 210 - gold = 213 - correct = 210 - precision = 100.00% - recall = 98.59% - F1 = 99.29% - - Eval::LcsF1 gold_zone=y attributes=form,upos focus='^(?i:an?|the)_DET$' details=4 - - # prints something like - === Details === - token pred gold corr prec rec F1 - the_DET 711 213 188 26.44% 88.26% 40.69% - The_DET 82 25 19 23.17% 76.00% 35.51% - a_DET 0 62 0 0.00% 0.00% 0.00% - an_DET 0 16 0 0.00% 0.00% 0.00% - === Totals === - predicted = 793 - gold = 319 - correct = 207 - precision = 26.10% - recall = 64.89% - F1 = 37.23% - -=head1 DESCRIPTION - -This block finds differences between nodes of trees in two zones -and reports the overall precision, recall and F1. -The two zones are "predicted" (on which this block is applied) -and "gold" (which needs to be specified with parameter C). - -This block also reports the number of total nodes in the predicted zone -and in the gold zone and the number of "correct" nodes, -that is predicted nodes which are also in the gold zone. -By default two nodes are considered "the same" if they have the same C, -but it is possible to check also for other nodes' attributes -(with parameter C). - -As usual: - - precision = correct / predicted - recall = correct / gold - F1 = 2 * precision * recall / (precision + recall) - -The implementation is based on finding the longest common subsequence (LCS) -between the nodes in the two trees. -This means that the two zones do not need to be explicitly word-aligned. - -=head1 PARAMETERS - -=head2 zones - -Which zone contains the "predicted" trees? -Make sure that you specify just one zone. -If you leave the default value "all" and the document contains more zones, -the results will be mixed, which is most likely not what you wanted. 
-Exception: If the document conaints just two zones (predicted and gold trees), -you can keep the default value "all" because this block -will skip comparison of the gold zone with itself. - -=head2 gold_zone - -Which zone contains the gold-standard trees? - -=head2 attributes - -comma separated list of attributes which should be checked -when deciding whether two nodes are equivalent in LCS - -=head2 focus - -Regular expresion constraining the tokens we are interested in. -If more attributes were specified in the C parameter, -their values are concatenated with underscore, so C should reflect that -e.g. C. - -For case-insensitive focus use e.g. C -(which is equivalent to C) - -=head2 details - -Print also detailed statistics for each token (matching the C). -The value of this parameter C
specifies the number of tokens to include. -The tokens are sorted according to the sum of their I and I counts. - -''' From 09d964dbbcbbaa2fb64b84f2ab61305f771ece48 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 20:01:10 +0200 Subject: [PATCH 0040/1374] keep root.sent_id and root.bundle.bundle_id in sync and both are writeable --- udapi/core/basereader.py | 1 - udapi/core/bundle.py | 15 +++++++++++++-- udapi/core/root.py | 5 +++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index bb126ae7..f9e22fbb 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -128,7 +128,6 @@ def process_document(self, document): root.zone = parts[1] add_to_the_last_bundle = bundle_id == last_bundle_id last_bundle_id = bundle_id - root.sent_id = None if self.zone != 'keep': root.zone = self.zone diff --git a/udapi/core/bundle.py b/udapi/core/bundle.py index 0a94c799..d29b2268 100644 --- a/udapi/core/bundle.py +++ b/udapi/core/bundle.py @@ -16,13 +16,24 @@ class Bundle(object): Trees in one bundle are distinguished by a zone label. 
""" - __slots__ = ["trees", "number", "bundle_id", "_document"] + __slots__ = ["trees", "number", "_bundle_id", "_document"] def __init__(self, bundle_id=None, document=None): self.trees = [] - self.bundle_id = bundle_id + self._bundle_id = bundle_id self._document = document + @property + def bundle_id(self): + """ID of this bundle.""" + return self._bundle_id + + @bundle_id.setter + def bundle_id(self, bundle_id): + self._bundle_id = bundle_id + for tree in self.trees: + tree._sent_id = bundle_id + '/' + tree.zone # pylint: disable=protected-access + def __str__(self): if self.bundle_id is None: return 'bundle without id' diff --git a/udapi/core/root.py b/udapi/core/root.py index 5b141391..e5435c59 100644 --- a/udapi/core/root.py +++ b/udapi/core/root.py @@ -44,6 +44,11 @@ def sent_id(self): @sent_id.setter def sent_id(self, sent_id): + if self._bundle is not None: + parts = sent_id.split('/', 1) + self._bundle.bundle_id = parts[0] + if len(parts) == 2: + self.zone = parts[1] self._sent_id = sent_id @property From a1e0abbb6af36b322d5647bde27ec2de7ed7b022 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 21:16:11 +0200 Subject: [PATCH 0041/1374] util.FindBug --- udapi/block/util/findbug.py | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 udapi/block/util/findbug.py diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py new file mode 100644 index 00000000..d01221b7 --- /dev/null +++ b/udapi/block/util/findbug.py @@ -0,0 +1,60 @@ +"""Block util.FindBug for debugging. + +Usage: +If block xy.Z fails with a Python exception, +insert "util.FindBug block=" into the scenario, +e.g. to debug ``second.Block``, use + +udapy first.Block util.FindBug block=second.Block > bug.conllu + +This will create the file bug.conllu with the bundle, which caused the bug. 
+""" +import copy +import logging + +from udapi.core.basewriter import BaseWriter +from udapi.block.write.conllu import Conllu +from udapi.core.run import _parse_block_name + +class FindBug(BaseWriter): + """Debug another block by finding a minimal testcase conllu file.""" + + def __init__(self, block, first_error_only=True, **kwargs): + """Args: block, first_error_only""" + super().__init__(**kwargs) + self.block = block + self.first_error_only = first_error_only + + def process_document(self, document): + sub_path, class_name = _parse_block_name(self.block) + module = "udapi.block." + sub_path + "." + class_name.lower() + try: + command = "from " + module + " import " + class_name + " as b" + logging.debug("Trying to run command: %s", command) + exec(command) # pylint: disable=exec-used + except Exception: + logging.warning("Error when trying import the block %s", self.block) + raise + + command = "b()" # TODO params as kwargs + logging.debug("Trying to evaluate this: %s", command) + new_block = eval(command) # pylint: disable=eval-used + + doc_copy = copy.deepcopy(document) + writer = Conllu(files=self.orig_files) + + for bundle_no, bundle in enumerate(doc_copy.bundles, 1): + logging.debug('Block %s processing bundle #%d (id=%s)', + self.block, bundle_no, bundle.bundle_id) + try: + new_block.process_bundle(bundle) + except Exception as exc: # pylint: disable=broad-except + logging.warning('util.FindBug found a problem in bundle %d in block %s: %r', + bundle_no, self.block, exc) + logging.warning('Printing a minimal example to %s', self.orig_files) + + for tree in document.bundles[bundle_no-1].trees: + writer.process_tree(tree) + + if self.first_error_only: + raise From cf35654eee088384f41043c506d2cca1f00fb189 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 4 Apr 2017 23:04:32 +0200 Subject: [PATCH 0042/1374] ud.Google2ud draft of conversion script --- udapi/block/ud/google2ud.py | 93 +++++++++++++++++++++++++ udapi/block/ud/setspaceafterfromtext.py | 2 + 
2 files changed, 95 insertions(+) create mode 100644 udapi/block/ud/google2ud.py diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py new file mode 100644 index 00000000..12081ecc --- /dev/null +++ b/udapi/block/ud/google2ud.py @@ -0,0 +1,93 @@ +"""Block ud.google2ud for converting Google Universal Dependency Treebank into UD. + +Usage: +udapy -s ud.Google2ud ud.SetSpaceAfterFromText ud.Convert1to2 < google.conllu > ud2.conllu +""" +from udapi.block.ud.convert1to2 import Convert1to2 + +DEPREL_CHANGE = { + "ROOT": "root", + "prep": "case", + "p": "punct", + "poss": "nmod:poss", + "ps": "case", + "num": "nummod", # TODO ?? + "number": "nummod", # TODO ?? + "tmod": "nmod:tmod", + "vmod": "acl", + "rcmod": "acl:relcl", + "npadvmod": "advmod", + "prt": "compound:prt", + "preconj": "cc:preconj", + "predet": "det:predet", + "gmod": "amod", + "gobj": "obj", +} + + +class Google2ud(Convert1to2): + """Convert Google Universal Dependency Treebank into UD style.""" + + def process_tree(self, root): + comment_lines = root.comment.split("\n") + root.sent_id = comment_lines[0].strip() + root.text = comment_lines[1].strip() + root.comment = '' + + for node in root.descendants: + self.process_node(node) + + # This needs to be executed after all other deprels are converted + for node in root.descendants: + if node.deprel in ('acomp', 'attr'): # TODO not sure about attr + copula = node.parent + node.parent = copula.parent + node.deprel = copula.deprel + copula.parent = node + copula.deprel = 'cop' + for child in copula.children: + child.parent = node + + def process_node(self, node): + orig_feats = dict(node.feats) + node.feats = None + for name, value in orig_feats.items(): + if value != 'false': + name = name.split('/')[1].capitalize() + node.misc[name] = value.capitalize() + + if node.misc['Proper'] and node.upos == 'NOUN': + node.upos = 'PROPN' + del node.misc['Proper'] + + try: + node.deprel = DEPREL_CHANGE[node.deprel] + except KeyError: + pass + + if 
node.deprel == 'nn': + if node.upos == 'PROPN' and node.parent.upos == 'PROPN': + node.deprel = 'flat' + else: + node.deprel = 'compound' + elif node.deprel in ('pobj', 'pcomp'): + if node.parent.deprel == 'case': + preposition = node.parent + node.parent = preposition.parent + preposition.parent = node + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + # ud.Convert1to2 will change 'nmod' to 'obl' if needed + else: + self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case') + node.deprel = 'obj' + elif node.deprel == 'infmod': + node.deprel = 'xcomp' + node.feats['VerbForm'] = 'Inf' + elif node.deprel == 'partmod': + node.deprel = 'ccomp' + node.feats['VerbForm'] = 'Part' + + if node.upos == '.': + node.upos = 'PUNCT' + elif node.upos == 'PRT': + node.upos = 'PART' diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index 0c4d8d9d..3dcd12f2 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,8 @@ class SetSpaceAfterFromText(Block): def process_tree(self, root): text = root.text + if text is None: + raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) computed = root.compute_text() if text == computed: return From 7dfdf9bcc629024103b6bd9b2558a7a28783a970 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 5 Apr 2017 15:17:21 +0200 Subject: [PATCH 0043/1374] adapting for other languages --- udapi/block/ud/google2ud.py | 109 +++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 14 deletions(-) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 12081ecc..ad7e1436 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -1,9 +1,10 @@ """Block ud.google2ud for converting Google Universal Dependency Treebank into UD. 
Usage: -udapy -s ud.Google2ud ud.SetSpaceAfterFromText ud.Convert1to2 < google.conllu > ud2.conllu +udapy -s ud.Google2ud < google.conllu > ud2.conllu """ from udapi.block.ud.convert1to2 import Convert1to2 +from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText DEPREL_CHANGE = { "ROOT": "root", @@ -24,18 +25,62 @@ "gobj": "obj", } +FEATS_CHANGE = { + "proper=false": "", + "case=prep": "", + "gender=unsp_g": "", + "voice=unsp_v": "", + "number=unsp_n": "", + "tense=unsp_t": "", + "reciprocity=non-rcp": "", + "reciprocity=rcp": "PronType=Rcp", + "aspect=imperf": "Aspect=Imp", + "form=long": "Variant=Long", + "form=short": "Variant=Short", + "person=reflex": "Reflex=Yes", + "case=reflex": "Reflex=Yes", + "gender=pl_tantum": "Number=Ptan", + "gender_antecedent=fem_a": "Gender=Fem", + "gender_antecedent=masc_a": "Gender=Masc", + "gender_antecedent=neut_a": "Gender=Neut", + "number_antecedent=sing_a": "Number=Sing", + "number_antecedent=plur_a": "Number=Plur", + "person_antecedent=1_a": "Person=1", + "person_antecedent=2_a": "Person=2", + "person_antecedent=3_a": "Person=3", + "definiteness=def": "Definite=Def", + "definiteness=indef": "Definite=Ind", + "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? + "mood=sub2": "Mood=Sub", + "tense=cnd": "Mood=Cnd", + "degree=sup_a": "Degree=Abs", + "degree=sup_r": "Degree=Sup", + "case=obl": "Case=Acc", +} class Google2ud(Convert1to2): """Convert Google Universal Dependency Treebank into UD style.""" + def __init__(self, lang='unk', **kwargs): + """Create the Google2ud block instance. + + See ``Convert1to2`` for all the args. 
+ """ + super().__init__(**kwargs) + self.lang = lang + self._spaceafter_block = SetSpaceAfterFromText() + def process_tree(self, root): comment_lines = root.comment.split("\n") - root.sent_id = comment_lines[0].strip() + root.sent_id = comment_lines[0].strip().replace(' ', '-') root.text = comment_lines[1].strip() root.comment = '' for node in root.descendants: - self.process_node(node) + self.fix_feats(node) + self.fix_upos(node) + self.fix_deprel(node) + #self.fix_quotes(node) # This needs to be executed after all other deprels are converted for node in root.descendants: @@ -48,18 +93,52 @@ def process_tree(self, root): for child in copula.children: child.parent = node - def process_node(self, node): + # call ud.SetSpaceAfterFromText + self._spaceafter_block.process_tree(root) + + # call ud.Convert1to2 + super().process_tree(root) + + @staticmethod + def fix_feats(node): + """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE.""" orig_feats = dict(node.feats) node.feats = None for name, value in orig_feats.items(): - if value != 'false': - name = name.split('/')[1].capitalize() - node.misc[name] = value.capitalize() + name = name.split('/')[1] + if name == 'inflection_type': + node.misc['InflectionType'] = value.capitalize() + continue + if "antecedent" in name and node.upos == 'PRON': + node.feats["PronType"] = "Prs" + new = FEATS_CHANGE.get(name + '=' + value) + if new is not None: + if new != '': + new_name, new_value = new.split('=') + node.feats[new_name] = new_value + else: + node.feats[name.capitalize()] = value.capitalize() + + def fix_upos(self, node): + """PRT→PART, .→PUNCT, NOUN+Proper→PROPN.""" + if node.upos == '.': + node.upos = 'PUNCT' + elif node.upos == 'PRT': + node.upos = 'PART' + if node.feats['Proper']: + if node.upos == 'NOUN': + node.upos = 'PROPN' + if node.feats['Proper'] != 'True': + self.log(node, 'unexpected-proper', 'Proper=' + node.feats['Proper']) + else: + node.misc['Proper'] = node.feats['Proper'] + del 
node.feats['Proper'] - if node.misc['Proper'] and node.upos == 'NOUN': - node.upos = 'PROPN' - del node.misc['Proper'] + def fix_deprel(self, node): + """Convert Google dependency relations to UD deprels. + Change topology where needed. + """ try: node.deprel = DEPREL_CHANGE[node.deprel] except KeyError: @@ -87,7 +166,9 @@ def process_node(self, node): node.deprel = 'ccomp' node.feats['VerbForm'] = 'Part' - if node.upos == '.': - node.upos = 'PUNCT' - elif node.upos == 'PRT': - node.upos = 'PART' + def fix_quotes(self, node): + """Reconstruct the original quotes.""" + if node.xpos == '``': + node.form = '„' if self.lang == 'de' else '"' + elif node.xpos == "''": + node.form = '“' if self.lang == 'de' else '"' From bf81ad21d4bf161806e11eb795a1bf722119940d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:28:37 +0200 Subject: [PATCH 0044/1374] node.get_attrs() more powerfull --- udapi/core/node.py | 59 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 0d745baa..e717ccbe 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -465,15 +465,68 @@ def is_leaf(self): """Is this node a leaf, ie. 
a node without any children?""" return not self.children + def _get_attr(self, name): # pylint: disable=too-many-return-statements + if name == 'dir': + if self.parent.is_root(): + return 'root' + return 'left' if self.precedes(self.parent) else 'right' + if name == 'edge': + if self.parent.is_root(): + return 0 + return self.ord - self.parent.ord + if name == 'children': + return len(self.children) + if name == 'siblings': + return len(self.parent.children) - 1 + if name == 'depth': + value = 0 + tmp = self + while not tmp.is_root(): + tmp = tmp.parent + value += 1 + return value + if name == 'feats_split': + return str(self.feats).split('|') + return getattr(self, name) + def get_attrs(self, attrs, undefs=None, stringify=True): - """Return multiple attributes, possibly subsitituting empty ones. + """Return multiple attributes or pseudo-attributes, possibly substituting empty ones. + + Pseudo-attributes: + p_xy is the (pseudo) attribute xy of the parent node. + c_xy is a list of the (pseudo) attributes xy of the children nodes. + dir: 'left' = the node is a left child of its parent, + 'right' = the node is a rigth child of its parent, + 'root' = the node's parent is the technical root. + edge: length of the edge to parent (`node.ord - node.parent.ord`) or 0 if parent is root + children: number of children nodes. + siblings: number of siblings nodes. + depth: depth in the dependency tree (technical root has depth=0, highest word has depth=1). + feats_split: list of name=value formatted strings of the FEATS. Args: - attrs: A list of attribute names, e.g. ['form', 'lemma']. + attrs: A list of attribute names, e.g. ``['form', 'lemma', 'p_upos']``. undefs: A value to be used instead of None for empty (undefined) values. 
stringify: Apply `str()` on each value (except for None) """ - values = [getattr(self, name) for name in attrs] + values = [] + for name in attrs: + if name.startswith('p_'): + if name == 'p_feats_split': + values.extend(self.parent._get_attr(name[2:])) + else: + values.append(self.parent._get_attr(name[2:])) + elif name.startswith('c_'): + for child in self.children: + if name == 'c_feats_split': + values.extend(child._get_attr(name[2:])) + else: + values.append(child._get_attr(name[2:])) + elif name == 'feats_split': + values.extend(self._get_attr(name)) + else: + values.append(self._get_attr(name)) + if undefs is not None: values = [x if x is not None else undefs for x in values] if stringify: From 81c0c872d4fe6b96fe23651418fb1a996c3d2d43 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:29:53 +0200 Subject: [PATCH 0045/1374] util.See prints useful statistics on matching nodes --- udapi/block/util/see.py | 119 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 udapi/block/util/see.py diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py new file mode 100644 index 00000000..57cdf81c --- /dev/null +++ b/udapi/block/util/see.py @@ -0,0 +1,119 @@ +"""Block util.See prints statistics about the nodes matching a given condition. 
+ +Example usage from the command line:: + +udapy util.See node='node.is_nonprojective()' n=3 \ + stats=dir,children,c_upos,p_lemma,deprel,feats_split < in.conllu + +Example output:: + +node.is_nonprojective() +matches 245 out of 35766 nodes (0.7%) in 174 out of 1478 trees (11.8%) +=== dir (2 values) === + right 193 78% delta=+37% + left 52 21% delta=-33% +=== children (9 values) === + 0 64 26% delta=-38% + 2 58 23% delta=+14% + 3 38 15% delta= +7% +=== c_upos (15 values) === + NOUN 118 23% delta= +4% + DET 61 12% delta= -3% + PROPN 47 9% delta= +1% +=== p_lemma (187 values) === + il 5 2% delta= +1% + fonction 4 1% delta= +1% + écrire 4 1% delta= +1% +=== deprel (22 values) === + appos 41 16% delta=+15% + conj 41 16% delta=+13% + punct 36 14% delta= +4% +=== feats_split (20 values) === + Number=Sing 114 21% delta= +2% + Gender=Masc 81 15% delta= +3% + _ 76 14% delta= -6% + +In addition to absolute counts for each value, the percentage within matching nodes is printed +and a delta relative to percentage within all nodes. +This helps to highlight what is special about the matching nodes. +""" +from collections import Counter +import re # may be useful in eval, thus pylint: disable=unused-import + +from udapi.core.block import Block + +STATS = 'dir,edge,depth,children,siblings,p_upos,p_lemma,c_upos,form,lemma,upos,deprel,feats_split' + +# We need eval in this block +# pylint: disable=eval-used +class See(Block): + """Print statistics about the nodes specified by the parameter `node`.""" + + def __init__(self, node, n=5, stats=STATS, **kwargs): + """Args: + `node`: Python expression to be evaluated for each node and if True, + the node will be considered "matching". + `n`: Top n values will be printed for each statistic. + `stats`: a list of comma-separated statistics to be printed. 
+ A statistic can be an attribute (`form`, `lemma`) or a pseudo-attribute + (`depth` = depth of a node in dependency tree, + `children` = number of children nodes, + `p_lemma` = lemma of a parent node, etc). + See `udapi.core.Node.get_attrs` for a full list of statistics. + """ + super().__init__(**kwargs) + self.node = node + self.n_limit = n + self.stats = stats.split(',') + self.match = dict() + self.every = dict() + for stat in self.stats: + self.match[stat] = Counter() + self.every[stat] = Counter() + self.overall = Counter() + + def process_tree(self, root): + self.overall['trees'] += 1 + tree_match = False + for node in root.descendants: + matching = self.process_node(node) + self.overall['nodes'] += 1 + if matching: + self.overall['matching_nodes'] += 1 + if not tree_match: + self.overall['matching_trees'] += 1 + tree_match = True + + def process_node(self, node): + matching = eval(self.node) + for stat in self.stats: + for value in node.get_attrs([stat], undefs=''): + self.every[stat][value] += 1 + self.every[stat]['T O T A L'] += 1 + if matching: + self.match[stat][value] += 1 + self.match[stat]['T O T A L'] += 1 + return matching + + def process_end(self): + print(self.node) + print("matches %d out of %d nodes (%.1f%%) in %d out of %d trees (%.1f%%)" + % (self.overall['matching_nodes'], + self.overall['nodes'], + self.overall['matching_nodes'] * 100 / self.overall['nodes'], + self.overall['matching_trees'], + self.overall['trees'], + self.overall['matching_trees'] * 100 / self.overall['trees'])) + for stat in self.stats: + vals = len(self.match[stat].keys()) - 1 + print("=== %s (%d value%s) ===" % (stat, vals, 's' if vals > 1 else '')) + match_total = self.match[stat]['T O T A L'] or 1 + every_total = self.every[stat]['T O T A L'] or 1 + for value, match_count in self.match[stat].most_common(self.n_limit + 1): + if value == 'T O T A L': + continue + every_count = self.every[stat][value] + match_perc = 100 * match_count / match_total + every_perc = 100 * 
every_count / every_total + print("%15s %5d %3d%% delta=%+3d%%" + % (value, match_count, match_perc, match_perc - every_perc)) From 884cf18511f22aefe476d159833c9ae70bb9a2e0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 6 Apr 2017 12:34:48 +0200 Subject: [PATCH 0046/1374] draft of Indonesian conversion --- udapi/block/ud/google2ud.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index ad7e1436..95ee5fa2 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -23,6 +23,9 @@ "predet": "det:predet", "gmod": "amod", "gobj": "obj", + "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 + "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both + "redup": "compound:plur", } FEATS_CHANGE = { @@ -56,6 +59,7 @@ "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", "case=obl": "Case=Acc", + "tense=impf": "Tense=Imp", } class Google2ud(Convert1to2): @@ -134,6 +138,21 @@ def fix_upos(self, node): node.misc['Proper'] = node.feats['Proper'] del node.feats['Proper'] + # Indonesian uses prefixes (me, di, ber, ke,...) and suffixes (an, kan, i,...), + # which are written without spaces with the main word/stem (according to the raw text). + # These could be treated as syntactic words and annotated using multi-word tokens. + # However, there is no annotation about their dependency relations (just suff, pref) + # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style. + if node.upos == 'AFFIX': + if node.deprel == 'suff': + node.prev_node.form += node.form + elif node.deprel == 'pref': + node.next_node.form = node.form + node.next_node.form + else: + self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) + return + node.remove(children='rehang') + def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. 
From fc46d4e40f3e56d108dea97facfe8fb31be6c9af Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 8 Apr 2017 01:02:05 +0200 Subject: [PATCH 0047/1374] ud.Google2ud update --- udapi/block/ud/google2ud.py | 77 +++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 95ee5fa2..510c6b2f 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -9,11 +9,12 @@ DEPREL_CHANGE = { "ROOT": "root", "prep": "case", + "ncomp": "case", # TODO ? "p": "punct", "poss": "nmod:poss", "ps": "case", - "num": "nummod", # TODO ?? - "number": "nummod", # TODO ?? + "num": "nummod", + "number": "nummod", # TODO ? "tmod": "nmod:tmod", "vmod": "acl", "rcmod": "acl:relcl", @@ -26,15 +27,35 @@ "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both "redup": "compound:plur", + "oblcomp": "obl", + "mes": "dep", # TODO ? 
+ "mwn": "compound:n", # nominal multi-word + "mwa": "compound:a", # adjectival multi-word + "mwv": "compound:v", # verbal multi-word + "asp": "aux", # aspectual particle + "rcmodrel": "mark:relcl", + "auxcaus": "aux", # redundant with Voice=Cau + "topic": "dep", + "possessive": "case", + "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words + # TODO: "ref" - in basic dependencies it should be rehanged and relabelled + "conjv": "compound:conjv", } FEATS_CHANGE = { "proper=false": "", "case=prep": "", + "case=unsp_c": "", "gender=unsp_g": "", + "gender_antecedent=unsp_g": "", "voice=unsp_v": "", "number=unsp_n": "", + "number_antecedent=unsp_n": "", "tense=unsp_t": "", + "mood=unsp_m": "", + "animacy=unsp_r": "", + "aspect=unsp_a": "", + "case=rel": "", # redundant with rcmodrel (mark:relcl) "reciprocity=non-rcp": "", "reciprocity=rcp": "PronType=Rcp", "aspect=imperf": "Aspect=Imp", @@ -42,24 +63,30 @@ "form=short": "Variant=Short", "person=reflex": "Reflex=Yes", "case=reflex": "Reflex=Yes", + "case=dir": "Case=Nom", "gender=pl_tantum": "Number=Ptan", - "gender_antecedent=fem_a": "Gender=Fem", - "gender_antecedent=masc_a": "Gender=Masc", - "gender_antecedent=neut_a": "Gender=Neut", - "number_antecedent=sing_a": "Number=Sing", - "number_antecedent=plur_a": "Number=Plur", - "person_antecedent=1_a": "Person=1", - "person_antecedent=2_a": "Person=2", - "person_antecedent=3_a": "Person=3", + "gender_antecedent=fem_a": "Gender[psor]=Fem", + "gender_antecedent=masc_a": "Gender[psor]=Masc", + "gender_antecedent=neut_a": "Gender[psor]=Neut", + "number_antecedent=sing_a": "Number[psor]=Sing", + "number_antecedent=plur_a": "Number[psor]=Plur", + "person_antecedent=1_a": "Person[psor]=1", + "person_antecedent=2_a": "Person[psor]=2", + "person_antecedent=3_a": "Person[psor]=3", "definiteness=def": "Definite=Def", "definiteness=indef": "Definite=Ind", "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? 
"mood=sub2": "Mood=Sub", + "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) "tense=cnd": "Mood=Cnd", "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", "case=obl": "Case=Acc", "tense=impf": "Tense=Imp", + "animacy=rat": "Animacy=Hum", + "animacy=irrat": "Animacy=Nhum", + "honorific=hon": "Polite=Form", + "mood=psm": "Tense=Fut", # TODO ? } class Google2ud(Convert1to2): @@ -108,7 +135,7 @@ def fix_feats(node): """Remove language prefixes, capitalize names and values, apply FEATS_CHANGE.""" orig_feats = dict(node.feats) node.feats = None - for name, value in orig_feats.items(): + for name, value in sorted(orig_feats.items()): name = name.split('/')[1] if name == 'inflection_type': node.misc['InflectionType'] = value.capitalize() @@ -143,15 +170,18 @@ def fix_upos(self, node): # These could be treated as syntactic words and annotated using multi-word tokens. # However, there is no annotation about their dependency relations (just suff, pref) # and UD_Indonesian v2.0 keeps them as one word with the stem. So let's follow this style. + # Chinese AFFIXes are more tricky to convert. + # It seems these words are quite often tagged as PART in UD_Chinese. if node.upos == 'AFFIX': if node.deprel == 'suff': node.prev_node.form += node.form + node.remove(children='rehang') elif node.deprel == 'pref': node.next_node.form = node.form + node.next_node.form + node.remove(children='rehang') else: self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) - return - node.remove(children='rehang') + node.upos = 'PART' def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. 
@@ -169,12 +199,26 @@ def fix_deprel(self, node): else: node.deprel = 'compound' elif node.deprel in ('pobj', 'pcomp'): - if node.parent.deprel == 'case': + if node.parent.deprel in ('case', 'prep'): preposition = node.parent node.parent = preposition.parent preposition.parent = node - node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + # ud.Convert1to2 will change 'nmod' to 'obl' if needed + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + + # Prepositions should not have any children (except for deprel=fixed/mwe), see + # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words. + # Unfortunatelly, there are many annotation errors and it is almost always better + # to rehang the extra children (at least to prevent spurious non-projectivities). + # In case of PUNCTuation it is surely correct. + # Otherwise, let's mark it as ToDo. + for extra_prep_child in preposition.children: + if extra_prep_child.udeprel in ('fixed', 'mwe'): + continue + extra_prep_child.parent = node + if extra_prep_child.upos != 'PUNCT': + self.log(extra_prep_child, 'ex-adp-child', 'was an extra adposition child') else: self.log(node, node.deprel, node.deprel + ' but parent.deprel!=case') node.deprel = 'obj' @@ -184,6 +228,9 @@ def fix_deprel(self, node): elif node.deprel == 'partmod': node.deprel = 'ccomp' node.feats['VerbForm'] = 'Part' + elif node.deprel == 'suff': + node.misc['OrigDeprel'] = 'suff' + node.deprel = 'dep' def fix_quotes(self, node): """Reconstruct the original quotes.""" From 2530a11fd75454ec67565701e9564b2fa242baf9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 8 Apr 2017 18:00:35 +0200 Subject: [PATCH 0048/1374] autopep8 --- udapi/block/demo/rehangprepositions.py | 1 + udapi/block/eval/f1.py | 24 ++++++++------- udapi/block/eval/parsing.py | 1 + udapi/block/read/addsentences.py | 2 ++ udapi/block/read/conllu.py | 6 ++-- udapi/block/read/vislcg.py | 3 +- 
udapi/block/tokenize/onwhitespace.py | 1 + udapi/block/tokenize/simple.py | 1 + udapi/block/transform/deproj.py | 3 +- udapi/block/transform/proj.py | 1 + udapi/block/tutorial/addcommas.py | 5 ++-- udapi/block/tutorial/adpositions.py | 1 + udapi/block/ud/addmwt.py | 3 +- udapi/block/ud/bg/removedotafterabbr.py | 1 + udapi/block/ud/convert1to2.py | 5 ++-- udapi/block/ud/cs/addmwt.py | 1 + udapi/block/ud/el/addmwt.py | 5 ++-- udapi/block/ud/fixpunctchild.py | 1 + udapi/block/ud/ga/to2.py | 1 + udapi/block/ud/gl/to2.py | 1 + udapi/block/ud/goeswithfromtext.py | 3 +- udapi/block/ud/google2ud.py | 40 ++++++++++++++----------- udapi/block/ud/he/fixneg.py | 1 + udapi/block/ud/markbugs.py | 1 + udapi/block/ud/removemwt.py | 3 +- udapi/block/ud/ro/fixneg.py | 3 +- udapi/block/ud/ro/setspaceafter.py | 3 +- udapi/block/ud/ru/fixremnant.py | 1 + udapi/block/ud/setspaceafter.py | 5 ++-- udapi/block/ud/setspaceafterfromtext.py | 1 + udapi/block/ud/splitunderscoretokens.py | 1 + udapi/block/udpipe/base.py | 1 + udapi/block/util/eval.py | 8 +++-- udapi/block/util/filter.py | 9 +++--- udapi/block/util/findbug.py | 11 +++---- udapi/block/util/mark.py | 5 +++- udapi/block/util/see.py | 4 ++- udapi/block/util/wc.py | 1 + udapi/block/write/conllu.py | 2 +- udapi/block/write/html.py | 5 ++-- udapi/block/write/textmodetrees.py | 10 ++++--- udapi/block/write/textmodetreeshtml.py | 7 +++-- udapi/block/write/tikz.py | 2 +- udapi/block/write/vislcg.py | 1 + udapi/core/basereader.py | 3 +- udapi/core/basewriter.py | 1 + udapi/core/bundle.py | 2 +- udapi/core/dualdict.py | 1 + udapi/core/feats.py | 1 + udapi/core/files.py | 3 +- udapi/core/mwt.py | 4 +-- udapi/core/node.py | 38 ++++++++++++----------- udapi/core/root.py | 3 +- udapi/core/run.py | 10 ++++--- udapi/core/tests/test_node.py | 4 +-- udapi/tool/udpipe.py | 13 ++++---- 56 files changed, 170 insertions(+), 108 deletions(-) diff --git a/udapi/block/demo/rehangprepositions.py b/udapi/block/demo/rehangprepositions.py index 
8d641b49..d25e29bc 100644 --- a/udapi/block/demo/rehangprepositions.py +++ b/udapi/block/demo/rehangprepositions.py @@ -4,6 +4,7 @@ class RehangPrepositions(Block): """This block takes all prepositions (upos=ADP) and rehangs them above their parent.""" + def process_node(self, node): if node.upos == "ADP": origparent = node.parent diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index 90be94ec..a4f93a1b 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -56,6 +56,8 @@ from udapi.core.basewriter import BaseWriter # pylint: disable=too-many-instance-attributes,invalid-name + + class F1(BaseWriter): """Evaluate differences between sentences (in different zones) with P/R/F1. @@ -156,17 +158,17 @@ def process_end(self): _f1 = 2 * _prec * _rec / ((_prec + _rec) or 1) print('%-10s %5d %5d %5d %6.2f%% %6.2f%% %6.2f%%' % (token, self._pred[token], self._gold[token], self._common[token], - 100*_prec, 100*_rec, 100*_f1)) + 100 * _prec, 100 * _rec, 100 * _f1)) print('=== Totals ===') - print("%-9s = %7d\n"*3 + print("%-9s = %7d\n" * 3 % ('predicted', self.pred, 'gold', self.gold, 'correct', self.correct), end='') - pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero precision = self.correct / pred recall = self.correct / gold f1 = 2 * precision * recall / ((precision + recall) or 1) print("%-9s = %6.2f%%\n" * 3 - % ('precision', 100*precision, 'recall', 100*recall, 'F1', 100*f1), end='') + % ('precision', 100 * precision, 'recall', 100 * recall, 'F1', 100 * f1), end='') # difflib.SequenceMatcher does not compute LCS, so let's implement it here @@ -175,16 +177,16 @@ def find_lcs(x, y): """Find longest common subsequence.""" m, n = len(x), len(y) C = [[0] * (n + 1) for _ in range(m + 1)] - for i in range(1, m+1): - for j in range(1, n+1): - C[i][j] = C[i-1][j-1] + 1 if x[i-1] == y[j-1] else max(C[i][j-1], C[i-1][j]) + for i in range(1, m + 1): + for j in 
range(1, n + 1): + C[i][j] = C[i - 1][j - 1] + 1 if x[i - 1] == y[j - 1] else max(C[i][j - 1], C[i - 1][j]) index = C[m][n] lcs = [None] * index while m > 0 and n > 0: - if x[m-1] == y[n-1]: - lcs[index-1] = x[m-1] - m, n, index = m-1, n-1, index-1 - elif C[m-1][n] > C[m][n-1]: + if x[m - 1] == y[n - 1]: + lcs[index - 1] = x[m - 1] + m, n, index = m - 1, n - 1, index - 1 + elif C[m - 1][n] > C[m][n - 1]: m -= 1 else: n -= 1 diff --git a/udapi/block/eval/parsing.py b/udapi/block/eval/parsing.py index 3c7f5da8..86d7b089 100644 --- a/udapi/block/eval/parsing.py +++ b/udapi/block/eval/parsing.py @@ -1,6 +1,7 @@ """Block eval.Parsing for evaluating UAS and LAS - gold and pred must have the same tokens.""" from udapi.core.basewriter import BaseWriter + class Parsing(BaseWriter): """Evaluate labeled and unlabeled attachment score (LAS and UAS).""" diff --git a/udapi/block/read/addsentences.py b/udapi/block/read/addsentences.py index 67c79ee8..f7f8d764 100644 --- a/udapi/block/read/addsentences.py +++ b/udapi/block/read/addsentences.py @@ -3,6 +3,8 @@ # pylint: disable=abstract-method # read_tree() does not need to be installed here + + class AddSentences(BaseReader): """A reader for adding plain-text sentences (one sentence per line) files. 
diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index 79ccfaea..ba10d163 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -12,6 +12,7 @@ RE_TEXT = re.compile(r'^# text\s*=\s*(.+)') RE_NEWPARDOC = re.compile(r'^# (newpar|newdoc) (?:\s*id\s*=\s*(.+))?') + class Conllu(BaseReader): """A reader of the CoNLL-U files.""" @@ -49,7 +50,6 @@ def __init__(self, strict=False, separator='tab', self.strict = strict self.separator = separator - @staticmethod def parse_comment_line(line, root): """Parse one line of CoNLL-U and fill sent_id, text, newpar, newdoc in root.""" @@ -112,7 +112,7 @@ def read_tree(self, document=None): empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3], xpos=fields[4], feats=fields[5], misc=fields[9]) empty.ord = fields[0] - empty.raw_deps = fields[8] # TODO + empty.raw_deps = fields[8] # TODO continue node = root.create_child() @@ -163,7 +163,7 @@ def read_tree(self, document=None): # Create multi-word tokens. for fields in mwts: range_start, range_end = fields[0].split('-') - words = nodes[int(range_start):int(range_end)+1] + words = nodes[int(range_start):int(range_end) + 1] root.create_multiword_token(words, form=fields[1], misc=fields[-1]) return root diff --git a/udapi/block/read/vislcg.py b/udapi/block/read/vislcg.py index 26b3d787..9ad272e3 100644 --- a/udapi/block/read/vislcg.py +++ b/udapi/block/read/vislcg.py @@ -2,6 +2,7 @@ from udapi.core.basereader import BaseReader from udapi.core.root import Root + class Vislcg(BaseReader): """A reader of the VISL-cg format, suitable for VISL Constraint Grammer Parser.""" @@ -67,7 +68,7 @@ def _node(line, root): # Let's hope that xpos, feats and deprel do not contain any quotes. 
end_quote_pos = line.rfind('"') lemma = line[1:end_quote_pos] - fields = line[end_quote_pos+1:].split() + fields = line[end_quote_pos + 1:].split() xpos = fields[0] feats_list = fields[3:-2] feats = '|'.join(feats_list) if feats_list else '_' diff --git a/udapi/block/tokenize/onwhitespace.py b/udapi/block/tokenize/onwhitespace.py index 544c4da6..5451b3a1 100644 --- a/udapi/block/tokenize/onwhitespace.py +++ b/udapi/block/tokenize/onwhitespace.py @@ -1,6 +1,7 @@ """Block tokenize.OnWhitespace""" from udapi.core.block import Block + class OnWhitespace(Block): """"Base tokenizer, splits on whitespaces, fills SpaceAfter=No.""" diff --git a/udapi/block/tokenize/simple.py b/udapi/block/tokenize/simple.py index 82403cee..f7010d13 100644 --- a/udapi/block/tokenize/simple.py +++ b/udapi/block/tokenize/simple.py @@ -3,6 +3,7 @@ from udapi.block.tokenize.onwhitespace import OnWhitespace + class Simple(OnWhitespace): """Simple tokenizer, splits on whitespaces and punctuation, fills SpaceAfter=No.""" diff --git a/udapi/block/transform/deproj.py b/udapi/block/transform/deproj.py index 3a6dcda5..581f5a6b 100644 --- a/udapi/block/transform/deproj.py +++ b/udapi/block/transform/deproj.py @@ -5,6 +5,7 @@ """ from udapi.core.block import Block + class Deproj(Block): """De-projectivize the trees à la Nivre & Nilsson (2005).""" @@ -33,7 +34,7 @@ def process_node(self, node): node.parent = reconstructed_parent def head_strategy(self, node, label): - queue = [n for n in node.parent.children if n!=node] # TODO deque + queue = [n for n in node.parent.children if n != node] # TODO deque while queue: adept = queue.pop(0) if adept.udeprel == label: diff --git a/udapi/block/transform/proj.py b/udapi/block/transform/proj.py index 6254b917..6e284b4c 100644 --- a/udapi/block/transform/proj.py +++ b/udapi/block/transform/proj.py @@ -17,6 +17,7 @@ """ from udapi.core.block import Block + class Proj(Block): """Projectivize the trees à la Nivre & Nilsson (2005).""" diff --git 
a/udapi/block/tutorial/addcommas.py b/udapi/block/tutorial/addcommas.py index 8c04e242..ccc26a66 100644 --- a/udapi/block/tutorial/addcommas.py +++ b/udapi/block/tutorial/addcommas.py @@ -1,10 +1,11 @@ """tutorial.AddCommas block template.""" from udapi.core.block import Block + class AddCommas(Block): """Heuristically insert nodes for missing commas.""" - def process_node(self, node): + def process_node(self, node): if self.should_add_comma_before(node): comma = node.create_child(form=',', deprel='punct', upos='PUNCT') comma.shift_before_node(node) @@ -18,5 +19,5 @@ def should_add_comma_before(self, node): return True if any(n.deprel == 'appos' for n in prev_node.children): return True - + return False diff --git a/udapi/block/tutorial/adpositions.py b/udapi/block/tutorial/adpositions.py index cf2ad514..9c4e131b 100644 --- a/udapi/block/tutorial/adpositions.py +++ b/udapi/block/tutorial/adpositions.py @@ -13,6 +13,7 @@ """ from udapi.core.block import Block + class Adpositions(Block): """Compute the number of prepositions and postpositions.""" diff --git a/udapi/block/ud/addmwt.py b/udapi/block/ud/addmwt.py index eab2158b..ffa78bbb 100644 --- a/udapi/block/ud/addmwt.py +++ b/udapi/block/ud/addmwt.py @@ -1,6 +1,7 @@ """Abstract base class ud.AddMwt for heuristic detection of multi-word tokens.""" from udapi.core.block import Block + class AddMwt(Block): """Detect and mark MWTs (split them into words and add the words to the tree).""" @@ -24,7 +25,7 @@ def process_node(self, node): nodes.append(new_node) node.form = forms[main] nodes.append(node) - for form in forms[main+1:]: + for form in forms[main + 1:]: new_node = parent.create_child(form=form) new_node.shift_after_node(nodes[-1]) nodes.append(new_node) diff --git a/udapi/block/ud/bg/removedotafterabbr.py b/udapi/block/ud/bg/removedotafterabbr.py index d1d94628..a132dad1 100644 --- a/udapi/block/ud/bg/removedotafterabbr.py +++ b/udapi/block/ud/bg/removedotafterabbr.py @@ -7,6 +7,7 @@ """ from udapi.core.block 
import Block + class RemoveDotAfterAbbr(Block): """Block for deleting extra PUNCT nodes after abbreviations. diff --git a/udapi/block/ud/convert1to2.py b/udapi/block/ud/convert1to2.py index e389e476..53529afb 100644 --- a/udapi/block/ud/convert1to2.py +++ b/udapi/block/ud/convert1to2.py @@ -23,9 +23,10 @@ "csubjpass": "csubj:pass", "auxpass": "aux:pass", "name": "flat:name", - "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS + "foreign": "flat", # "flat:foreign" not needed once we have Foreign=Yes in FEATS } + class Convert1to2(Block): """Block for converting UD v1 to UD v2.""" @@ -45,7 +46,7 @@ def __init__(self, skip='', save_stats=True, **kwargs): self.skip = {k for k in skip.split(',')} self.save_stats = save_stats - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches """Apply all the changes on the current tree. This method is automatically called on each tree by Udapi. 
diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 17e0648c..4c203ddc 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -43,6 +43,7 @@ 'shape': 'subtree', } + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" diff --git a/udapi/block/ud/el/addmwt.py b/udapi/block/ud/el/addmwt.py index 81a98836..8381c69f 100644 --- a/udapi/block/ud/el/addmwt.py +++ b/udapi/block/ud/el/addmwt.py @@ -24,8 +24,9 @@ v['xpos'] = 'AsPpSp AtDf' v['deprel'] = 'case det' # The following are the default values - #v['main'] = 0 # which of the two words will inherit the original children (if any) - #v['shape'] = 'siblings', # the newly created nodes will be siblings + # v['main'] = 0 # which of the two words will inherit the original children (if any) + # v['shape'] = 'siblings', # the newly created nodes will be siblings + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" diff --git a/udapi/block/ud/fixpunctchild.py b/udapi/block/ud/fixpunctchild.py index a9f16b8c..07ef3eb3 100644 --- a/udapi/block/ud/fixpunctchild.py +++ b/udapi/block/ud/fixpunctchild.py @@ -1,6 +1,7 @@ """Block ud.FixPunctChild for making sure punctuation nodes have no children.""" from udapi.core.block import Block + class FixPunctChild(Block): """Make sure punct nodes have no children by rehanging the children upwards.""" diff --git a/udapi/block/ud/ga/to2.py b/udapi/block/ud/ga/to2.py index 4d8506e1..dbf093a9 100644 --- a/udapi/block/ud/ga/to2.py +++ b/udapi/block/ud/ga/to2.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class To2(Block): """Block for fixing the remaining cases (after ud.Convert1to2) in UD_Irish.""" diff --git a/udapi/block/ud/gl/to2.py b/udapi/block/ud/gl/to2.py index f5f0f451..81a17c64 100644 --- a/udapi/block/ud/gl/to2.py +++ b/udapi/block/ud/gl/to2.py @@ -12,6 +12,7 @@ 'PUNCT': -10, } + 
class To2(Block): """Block for fixing the remaining cases (before ud.Convert1to2) in UD_Galician.""" diff --git a/udapi/block/ud/goeswithfromtext.py b/udapi/block/ud/goeswithfromtext.py index 64e1d99f..fe419fa2 100644 --- a/udapi/block/ud/goeswithfromtext.py +++ b/udapi/block/ud/goeswithfromtext.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class GoeswithFromText(Block): """Block for splitting nodes and attaching via goeswith according to the the sentence text. @@ -96,6 +97,6 @@ def process_tree(self, root): else: last_node.misc['SpaceAfter'] = 'No' else: - assert False # we have checked the whole sentence already + assert False # we have checked the whole sentence already if text: logging.warning('Extra text "%s" in tree %s', text, root) diff --git a/udapi/block/ud/google2ud.py b/udapi/block/ud/google2ud.py index 510c6b2f..b082c0af 100644 --- a/udapi/block/ud/google2ud.py +++ b/udapi/block/ud/google2ud.py @@ -9,12 +9,12 @@ DEPREL_CHANGE = { "ROOT": "root", "prep": "case", - "ncomp": "case", # TODO ? + "ncomp": "case", # TODO ? "p": "punct", "poss": "nmod:poss", "ps": "case", "num": "nummod", - "number": "nummod", # TODO ? + "number": "nummod", # TODO ? "tmod": "nmod:tmod", "vmod": "acl", "rcmod": "acl:relcl", @@ -24,20 +24,20 @@ "predet": "det:predet", "gmod": "amod", "gobj": "obj", - "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 - "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both + "postneg": "neg", # will be changed to advmod + Polarity=Neg in ud.Convert1to2 + "pronl": "obj", # TODO: or expl? UD_French seems to use a mix of both "redup": "compound:plur", "oblcomp": "obl", - "mes": "dep", # TODO ? - "mwn": "compound:n", # nominal multi-word - "mwa": "compound:a", # adjectival multi-word - "mwv": "compound:v", # verbal multi-word - "asp": "aux", # aspectual particle + "mes": "dep", # TODO ? 
+ "mwn": "compound:n", # nominal multi-word + "mwa": "compound:a", # adjectival multi-word + "mwv": "compound:v", # verbal multi-word + "asp": "aux", # aspectual particle "rcmodrel": "mark:relcl", - "auxcaus": "aux", # redundant with Voice=Cau + "auxcaus": "aux", # redundant with Voice=Cau "topic": "dep", "possessive": "case", - "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words + "quantmod": "det", # TODO UD_Hindi uses "dep" for the same words # TODO: "ref" - in basic dependencies it should be rehanged and relabelled "conjv": "compound:conjv", } @@ -55,7 +55,7 @@ "mood=unsp_m": "", "animacy=unsp_r": "", "aspect=unsp_a": "", - "case=rel": "", # redundant with rcmodrel (mark:relcl) + "case=rel": "", # redundant with rcmodrel (mark:relcl) "reciprocity=non-rcp": "", "reciprocity=rcp": "PronType=Rcp", "aspect=imperf": "Aspect=Imp", @@ -75,9 +75,9 @@ "person_antecedent=3_a": "Person[psor]=3", "definiteness=def": "Definite=Def", "definiteness=indef": "Definite=Ind", - "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? + "mood=sub1": "Mood=Sub", # TODO: what is the difference between sub1 and sub2 in German? "mood=sub2": "Mood=Sub", - "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) + "mood=inter": "PronType=Int", # TODO or keep Mood=Inter (it is used in UD_Chinese) "tense=cnd": "Mood=Cnd", "degree=sup_a": "Degree=Abs", "degree=sup_r": "Degree=Sup", @@ -86,9 +86,10 @@ "animacy=rat": "Animacy=Hum", "animacy=irrat": "Animacy=Nhum", "honorific=hon": "Polite=Form", - "mood=psm": "Tense=Fut", # TODO ? + "mood=psm": "Tense=Fut", # TODO ? 
} + class Google2ud(Convert1to2): """Convert Google Universal Dependency Treebank into UD style.""" @@ -111,11 +112,11 @@ def process_tree(self, root): self.fix_feats(node) self.fix_upos(node) self.fix_deprel(node) - #self.fix_quotes(node) + # self.fix_quotes(node) # This needs to be executed after all other deprels are converted for node in root.descendants: - if node.deprel in ('acomp', 'attr'): # TODO not sure about attr + if node.deprel in ('acomp', 'attr'): # TODO not sure about attr copula = node.parent node.parent = copula.parent node.deprel = copula.deprel @@ -183,6 +184,9 @@ def fix_upos(self, node): self.log(node, 'affix', 'upos=AFFIX deprel=' + node.deprel) node.upos = 'PART' + if node.upos == 'PUNCT' and node.form in ('$', '£'): + node.upos = 'SYM' + def fix_deprel(self, node): """Convert Google dependency relations to UD deprels. @@ -205,7 +209,7 @@ def fix_deprel(self, node): preposition.parent = node # ud.Convert1to2 will change 'nmod' to 'obl' if needed - node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp + node.deprel = 'nmod' if node.deprel == 'pobj' else 'xcomp' # TODO check xcomp # Prepositions should not have any children (except for deprel=fixed/mwe), see # http://universaldependencies.org/u/overview/syntax.html#multiword-function-words. 
diff --git a/udapi/block/ud/he/fixneg.py b/udapi/block/ud/he/fixneg.py index 5062854c..15325990 100644 --- a/udapi/block/ud/he/fixneg.py +++ b/udapi/block/ud/he/fixneg.py @@ -6,6 +6,7 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Hebrew.""" diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index f785f556..6c63e93f 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -28,6 +28,7 @@ 'VERB': 'VerbForm', } + class MarkBugs(Block): """Block for checking suspicious/wrong constructions in UD v2.""" diff --git a/udapi/block/ud/removemwt.py b/udapi/block/ud/removemwt.py index 462e9fbd..99c37b4d 100644 --- a/udapi/block/ud/removemwt.py +++ b/udapi/block/ud/removemwt.py @@ -1,6 +1,7 @@ """Block ud.RemoveMwt for removing multi-word tokens.""" from udapi.core.block import Block + class RemoveMwt(Block): """Substitute MWTs with one word representing the whole MWT.""" @@ -26,7 +27,7 @@ def guess_deprel(words): """DEPREL of the whole MWT""" return words[0].deprel # Alternatively, we could define deprel subtypes - #return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) + # return words[0].deprel + ':' + ','.join([w.deprel for w in words[1:]]) @staticmethod def guess_feats(words): diff --git a/udapi/block/ud/ro/fixneg.py b/udapi/block/ud/ro/fixneg.py index a22131b2..68888aa6 100644 --- a/udapi/block/ud/ro/fixneg.py +++ b/udapi/block/ud/ro/fixneg.py @@ -6,13 +6,14 @@ from udapi.core.block import Block + class FixNeg(Block): """Block for fixing the remaining cases (after ud.Convert1to2) of deprel=neg in UD_Romanian.""" def process_node(self, node): if node.deprel == "neg": if node.upos == "PRON" and node.form == "ne": - node.feats = 'Polarity=Neg' # delete other features + node.feats = 'Polarity=Neg' # delete other features elif node.upos != "ADJ": logging.warning("Strange node %s with deprel=neg", node) node.upos = "ADV" diff 
--git a/udapi/block/ud/ro/setspaceafter.py b/udapi/block/ud/ro/setspaceafter.py index bc18f364..6c4b27e3 100644 --- a/udapi/block/ud/ro/setspaceafter.py +++ b/udapi/block/ud/ro/setspaceafter.py @@ -10,6 +10,7 @@ import udapi.block.ud.setspaceafter + class SetSpaceAfter(udapi.block.ud.setspaceafter.SetSpaceAfter): """Block for heuristic setting of the SpaceAfter=No MISC attribute in Romanian. @@ -37,7 +38,7 @@ def process_tree(self, root): # Mark contractions like -i, -și, -l, -urilor, but not negative numbers like -12,3. # Store SpaceAfter=No to the previous node. - next_form = nodes[i+1].form + next_form = nodes[i + 1].form if re.match('-.*[^0-9,.]', next_form): self.mark_no_space(node) diff --git a/udapi/block/ud/ru/fixremnant.py b/udapi/block/ud/ru/fixremnant.py index d94b0e5c..b41431db 100644 --- a/udapi/block/ud/ru/fixremnant.py +++ b/udapi/block/ud/ru/fixremnant.py @@ -4,6 +4,7 @@ """ from udapi.core.block import Block + class FixRemnant(Block): """ad-hoc fixing the remaining cases (after ud.Convert1to2) of deprel=remnant in UD_Russian.""" diff --git a/udapi/block/ud/setspaceafter.py b/udapi/block/ud/setspaceafter.py index 00193770..e796bf0d 100644 --- a/udapi/block/ud/setspaceafter.py +++ b/udapi/block/ud/setspaceafter.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class SetSpaceAfter(Block): """Block for heuristic setting of the SpaceAfter=No MISC attribute.""" @@ -40,14 +41,14 @@ def process_tree(self, root): not_after += '“' for i, node in enumerate(nodes[:-1]): - next_form = nodes[i+1].form + next_form = nodes[i + 1].form if node.form in self.not_after or next_form in not_before: self.mark_no_space(node) if matching_quotes and node.form == '"': if odd_indexed_quote: self.mark_no_space(node) elif i: - self.mark_no_space(nodes[i-1]) + self.mark_no_space(nodes[i - 1]) odd_indexed_quote = not odd_indexed_quote if matching_quotes and nodes[-1].form == '"': diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py 
index 3dcd12f2..e3a1f90f 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -9,6 +9,7 @@ from udapi.core.block import Block + class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" diff --git a/udapi/block/ud/splitunderscoretokens.py b/udapi/block/ud/splitunderscoretokens.py index 25caeb3b..094f181a 100644 --- a/udapi/block/ud/splitunderscoretokens.py +++ b/udapi/block/ud/splitunderscoretokens.py @@ -8,6 +8,7 @@ import logging from udapi.core.block import Block + class SplitUnderscoreTokens(Block): """Block for spliting tokens with underscores and attaching the new nodes using deprel=flat. diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index d72cbf16..78563abb 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -74,6 +74,7 @@ 'vi': 'models/udpipe/2.0/vietnamese-ud-2.0-conll17-170315.udpipe', } + class Base(Block): """Base class for all UDPipe blocks.""" diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index c5fa04f2..9bde12bf 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -5,10 +5,12 @@ from udapi.core.block import Block -pp = pprint.pprint # pylint: disable=invalid-name +pp = pprint.pprint # pylint: disable=invalid-name # We need exec in this block and the variables this etc. are not unused but provided for the exec # pylint: disable=exec-used,unused-variable + + class Eval(Block): r"""Special block for evaluating code given by parameters. 
@@ -66,7 +68,7 @@ def process_document(self, document): if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: for bundle in doc.bundles: - #TODO if self._should_process_bundle(bundle): + # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) def process_bundle(self, bundle): @@ -83,7 +85,7 @@ def process_bundle(self, bundle): if self.tree or self.node: trees = bundle.trees for tree in trees: - #TODO if not self._should_process_tree(tree): continue + # TODO if not self._should_process_tree(tree): continue self.process_tree(tree) if self.after_bundle: diff --git a/udapi/block/util/filter.py b/udapi/block/util/filter.py index b812fb64..6d4118d6 100644 --- a/udapi/block/util/filter.py +++ b/udapi/block/util/filter.py @@ -1,10 +1,12 @@ """Filter is a special block for keeping/deleting subtrees specified by parameters.""" -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block # We need eval in this block # pylint: disable=eval-used + + class Filter(Block): """Special block for keeping/deleting subtrees specified by parameters. 
@@ -25,8 +27,7 @@ class Filter(Block): udapy -s util.Filter delete_subtree='node.deprel == "remnant"' < in > filtered """ - - def __init__(self, # pylint: disable=too-many-arguments + def __init__(self, # pylint: disable=too-many-arguments delete_tree=None, delete_tree_if_node=None, delete_subtree=None, keep_tree=None, keep_tree_if_node=None, keep_subtree=None, mark=None, **kwargs): @@ -72,7 +73,7 @@ def __init__(self, # pylint: disable=too-many-arguments self.keep_subtree = keep_subtree self.mark = mark - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches root = tree if self.delete_tree is not None: diff --git a/udapi/block/util/findbug.py b/udapi/block/util/findbug.py index d01221b7..e05afe76 100644 --- a/udapi/block/util/findbug.py +++ b/udapi/block/util/findbug.py @@ -16,6 +16,7 @@ from udapi.block.write.conllu import Conllu from udapi.core.run import _parse_block_name + class FindBug(BaseWriter): """Debug another block by finding a minimal testcase conllu file.""" @@ -31,14 +32,14 @@ def process_document(self, document): try: command = "from " + module + " import " + class_name + " as b" logging.debug("Trying to run command: %s", command) - exec(command) # pylint: disable=exec-used + exec(command) # pylint: disable=exec-used except Exception: logging.warning("Error when trying import the block %s", self.block) raise - command = "b()" # TODO params as kwargs + command = "b()" # TODO params as kwargs logging.debug("Trying to evaluate this: %s", command) - new_block = eval(command) # pylint: disable=eval-used + new_block = eval(command) # pylint: disable=eval-used doc_copy = copy.deepcopy(document) writer = Conllu(files=self.orig_files) @@ -48,12 +49,12 @@ def process_document(self, document): self.block, bundle_no, bundle.bundle_id) try: new_block.process_bundle(bundle) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # pylint: 
disable=broad-except logging.warning('util.FindBug found a problem in bundle %d in block %s: %r', bundle_no, self.block, exc) logging.warning('Printing a minimal example to %s', self.orig_files) - for tree in document.bundles[bundle_no-1].trees: + for tree in document.bundles[bundle_no - 1].trees: writer.process_tree(tree) if self.first_error_only: diff --git a/udapi/block/util/mark.py b/udapi/block/util/mark.py index 42052336..02682a92 100644 --- a/udapi/block/util/mark.py +++ b/udapi/block/util/mark.py @@ -1,10 +1,12 @@ """util.Mark is a special block for marking nodes specified by parameters.""" -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block # We need eval in this block # pylint: disable=eval-used + + class Mark(Block): """Mark nodes specified by parameters. @@ -12,6 +14,7 @@ class Mark(Block): # see non-projective trees with non-projective edges highlighted udapy -TM util.Mark node='node.is_nonprojective()' < in | less -R """ + def __init__(self, node, mark=1, **kwargs): """Create the Mark block object. diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index 57cdf81c..aa7131b7 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -38,7 +38,7 @@ This helps to highlight what is special about the matching nodes. 
""" from collections import Counter -import re # may be useful in eval, thus pylint: disable=unused-import +import re # may be useful in eval, thus pylint: disable=unused-import from udapi.core.block import Block @@ -46,6 +46,8 @@ # We need eval in this block # pylint: disable=eval-used + + class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" diff --git a/udapi/block/util/wc.py b/udapi/block/util/wc.py index df9db3f4..403daf5f 100644 --- a/udapi/block/util/wc.py +++ b/udapi/block/util/wc.py @@ -1,6 +1,7 @@ """Wc is a special block for printing statistics (word count etc).""" from udapi.core.block import Block + class Wc(Block): """Special block for printing statistics (word count etc).""" diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index 8c65f0fd..6c2dc314 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -15,7 +15,7 @@ def __init__(self, print_sent_id=True, print_text=True, print_empty_trees=True, self.node_attributes = ["ord", "form", "lemma", "upos", "xpos", "feats", "parent", "deprel", "raw_deps", "misc"] - def process_tree(self, tree): # pylint: disable=too-many-branches + def process_tree(self, tree): # pylint: disable=too-many-branches nodes = tree.descendants # Empty sentences are not allowed in CoNLL-U, so with print_empty_trees==0 diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 85b8dcc9..ec33b0fd 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -73,7 +73,7 @@ def process_document(self, doc): js_t_v = self.path_to_js + '/js-treex-view.js' print('') - print('Udapi viewer') # TODO doc.loaded_from + print('Udapi viewer') # TODO doc.loaded_from for js_file in (jquery, fsaver, js_t_v): print('' % js_file) print('\n') @@ -134,7 +134,7 @@ def print_node(node): multiline_feats = feats.replace('|', r'\n') print(',{{"id":{id_node},"parent":{id_parent},"order":{order},{firstson_str}{rbrother_str}' 
'"data":{{"ord":{order},"form":"{form}","lemma":"{lemma}","upos":"{upos}",' - '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps + '"xpos":"{xpos}","feats":"{feats}","deprel":"{deprel}",' # TODO: deps '"misc":"{misc}","id":"{address}"}},' '"labels":["{form}","#{{#bb0000}}{upos}","#{{#0000bb}}{deprel}"],' '"hint":"lemma={lemma}\\n{multiline_feats}"}}'.format(**locals())) @@ -151,6 +151,7 @@ def _id(node): return 'null' return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + def _esc(string): if string is None: string = '' diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index b5968b3a..d32a5fe1 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -17,6 +17,8 @@ # Too many instance variables, arguments, branches... # I don't see how to fix this while not making the code less readable or more difficult to use. # pylint: disable=R0902,R0912,R0913,R0914 + + class TextModeTrees(BaseWriter): """An ASCII pretty printer of dependency trees. 
@@ -181,17 +183,17 @@ def process_tree(self, root): # Precompute the number of non-projective gaps for each subtree if self.minimize_cross: - self._gaps = [0,] * (1 + len(root.root.descendants)) + self._gaps = [0, ] * (1 + len(root.root.descendants)) self._compute_gaps(root) # Precompute lines for printing - stack = [root,] + stack = [root, ] while stack: node = stack.pop() children = node.children(add_self=1) min_idx, max_idx = self._index_of[children[0].ord], self._index_of[children[-1].ord] - max_length = max([self.lengths[i] for i in range(min_idx, max_idx+1)]) - for idx in range(min_idx, max_idx+1): + max_length = max([self.lengths[i] for i in range(min_idx, max_idx + 1)]) + for idx in range(min_idx, max_idx + 1): idx_node = allnodes[idx] filler = '─' if self._ends(idx, '─╭╰├╪') else ' ' self._add(idx, filler * (max_length - self.lengths[idx])) diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 21bd8e92..4da977a5 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -1,5 +1,5 @@ """An ASCII pretty printer of colored dependency trees in HTML.""" -from html import escape # pylint: disable=no-name-in-module +from html import escape # pylint: disable=no-name-in-module from udapi.block.write.textmodetrees import TextModeTrees @@ -15,6 +15,7 @@ mark {box-shadow: 0px 0px 0px 1px red; font-weight: bold;} ''' + class TextModeTreesHtml(TextModeTrees): """An ASCII pretty printer of colored dependency trees in HTML. @@ -41,7 +42,8 @@ def __init__(self, color=True, title='Udapi visualization', **kwargs): def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, # we need to skip this, but call BaseWriter's method which redirects stdout. 
- super(TextModeTrees, self).before_process_document(document) #pylint: disable=bad-super-call + super(TextModeTrees, self).before_process_document( + document) # pylint: disable=bad-super-call print('\n\n\n') print('' + self.title + '') print('') + print('\n') + + for tree in doc.trees: + self.process_tree(tree) + + print('') + print('') + + def process_tree(self, tree): + mentions = set() + nodes_and_empty = tree.descendants_and_empty + for node in nodes_and_empty: + for m in node.coref_mentions: + mentions.add(m) + + sent_mentions = [] + for mention in mentions: + mspan = mention.span + if ',' not in mspan: + sent_mentions.append(mention) + else: + entity = mention.entity + head_str = str(mention.words.index(mention.head) + 1) + subspans = mspan.split(',') + for idx,subspan in enumerate(subspans, 1): + subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' + subspan_words = span_to_nodes(tree, subspan) + fake_entity = CorefEntity(subspan_eid, entity.etype) + fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) + if mention._other: + fake_mention._other = mention._other + if mention._bridging and idx == 1: + fake_mention._bridging = mention._bridging + sent_mentions.append(fake_mention) + sent_mentions.sort(reverse=True) + + opened = [] + print('

') + for node in nodes_and_empty: + while sent_mentions and sent_mentions[-1].words[0] == node: + m = sent_mentions.pop() + e = m.entity + classes = f'{e.eid} {e.etype or "other"}' + if all(w.is_empty() for w in m.words): + classes += ' empty' + if len(e.mentions) == 1: + classes += ' singleton' + title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' + print(f'', end='') + opened.append(m) + + is_head = self._is_head(node) + if is_head: + print('', end='') + if node.is_empty(): + print('', end='') + print(node.form, end='') + if node.is_empty(): + print('', end='') + if is_head: + print('', end='') + + while opened and opened[-1].words[-1] == node: + print('', end='') + opened.pop() + + if not node.no_space_after: + print(' ', end='') + + print('

') + + def _is_head(self, node): + for mention in node.coref_mentions: + if mention.head == node: + return mention + return None + +# id needs to be a valid DOM querySelector +# so it cannot contain # nor / and it cannot start with a digit +def _id(node): + if node is None: + return 'null' + return '"n%s"' % node.address().replace('#', '-').replace('/', '-') + + +def _esc(string): + if string is None: + string = '' + return string.replace('\\', '\\\\').replace('"', r'\"') From e3ae1c3fb65fa62431e23c2bfff9d8534d458019 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 13:25:49 +0100 Subject: [PATCH 0753/1374] fix visualization of discontinuous mentions introduce CorefMentionSubspan instead of fake mentions (should be used also in store_coref_to_misc() in future) --- udapi/block/write/corefhtml.py | 40 +++++++++++----------------------- udapi/core/coref.py | 39 ++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index fc49dfb4..890b172a 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -44,44 +44,30 @@ def process_tree(self, tree): for m in node.coref_mentions: mentions.add(m) - sent_mentions = [] + subspans = [] for mention in mentions: - mspan = mention.span - if ',' not in mspan: - sent_mentions.append(mention) - else: - entity = mention.entity - head_str = str(mention.words.index(mention.head) + 1) - subspans = mspan.split(',') - for idx,subspan in enumerate(subspans, 1): - subspan_eid = f'{entity.eid}[{idx}/{len(subspans)}]' - subspan_words = span_to_nodes(tree, subspan) - fake_entity = CorefEntity(subspan_eid, entity.etype) - fake_mention = CorefMention(subspan_words, head_str, fake_entity, add_word_backlinks=False) - if mention._other: - fake_mention._other = mention._other - if mention._bridging and idx == 1: - fake_mention._bridging = mention._bridging - sent_mentions.append(fake_mention) - 
sent_mentions.sort(reverse=True) + subspans.extend(mention._subspans()) + subspans.sort(reverse=True) opened = [] print('

') for node in nodes_and_empty: - while sent_mentions and sent_mentions[-1].words[0] == node: - m = sent_mentions.pop() + while subspans and subspans[-1].words[0] == node: + subspan = subspans.pop() + m = subspan.mention e = m.entity classes = f'{e.eid} {e.etype or "other"}' - if all(w.is_empty() for w in m.words): + if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: classes += ' singleton' - title = f'eid={e.eid}\ntype={e.etype}\nhead={m.head.form}' - print(f'', end='') - opened.append(m) + title += f'\n{m.other}' + print(f'', end='') #data-eid="{e.eid}" + + opened.append(subspan) is_head = self._is_head(node) if is_head: diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 3eb76db3..1a6d1f95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -128,6 +128,17 @@ def __init__(self, words, head=None, entity=None, add_word_backlinks=True): new_word._mentions.append(self) new_word._mentions.sort() + def _subspans(self): + mspan = self.span + if ',' not in mspan: + return [CorefMentionSubspan(self._words, self, '')] + root = self._words[0].root + subspans = mspan.split(',') + result = [] + for idx,subspan in enumerate(subspans, 1): + result.append(CorefMentionSubspan(span_to_nodes(root, subspan), self, f'[{idx}/{len(subspans)}]')) + return result + def __lt__(self, another): """Does this mention precedes (word-order wise) `another` mention? 
@@ -247,6 +258,32 @@ def span(self, new_span): self.words = span_to_nodes(self._head.root, new_span) +@functools.total_ordering +class CorefMentionSubspan(object): + """Helper class for representing a continuous subspan of a mention.""" + __slots__ = ['words', 'mention', 'subspan_id'] + + def __init__(self, words, mention, subspan_id): + if not words: + raise ValueError("mention.words must be non-empty") + self.words = sorted(words) + self.mention = mention + self.subspan_id = subspan_id + + def __lt__(self, another): + if self.words[0] is another.words[0]: + if len(self.words) > len(another.words): + return True + if len(self.words) < len(another.words): + return False + assert False + return self.words[0].precedes(another.words[0]) + + @property + def subspan_eid(self): + return self.mention._entity.eid + self.subspan_id + + CHARS_FORBIDDEN_IN_ID = "-=| \t()" @@ -886,7 +923,7 @@ def nodes_to_span(nodes): Note that empty nodes may form gaps in the span, so if a given tree contains an empty node with ord 5.1, but only nodes with ords 3, 4, 5, 6, 7.1 and 7.2 are provided as `nodes`, the resulting string will be "3-5,6,7.1-7.2". - This means that the implementation needs to iterate of all nodes + This means that the implementation needs to iterate over all nodes in a given tree (root.descendants_and_empty) to check for such gaps. 
""" if not nodes: From b78ef7eea0b76c4f41f8408d918092681d9c5fad Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:16:46 +0100 Subject: [PATCH 0754/1374] util.Normalize: sort attributes in FEATS and MISC --- udapi/block/util/normalize.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 udapi/block/util/normalize.py diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py new file mode 100644 index 00000000..5b4270cc --- /dev/null +++ b/udapi/block/util/normalize.py @@ -0,0 +1,40 @@ +"""util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" +from udapi.core.block import Block + +class Normalize(Block): + """Normalize the ordering of attributes in the FEATS and MISC columns. + + The attribute-value pairs in the FEATS column in CoNLL-U files + must be sorted alphabetically (case-insensitive) according to the guidelines + (https://universaldependencies.org/format.html#morphological-annotation). + The same is highly recommended for the MISC column. + It is useful e.g. for comparing two conllu files with diff. + + Udapi does the sorting automatically, but for speed reasons + only when writing into these attributes. + This block thus just forces deserialization of node.feats and node.misc, + so that the Udapi later sorts the attributes during serialization. + It is a bit more efficient than something like + util.Eval node='node.feats["Number"] = node.feats["Number"]' + or + util.Eval node='node.misc["NonExistentAttribute"] = None' + """ + + def __init__(self, feats=True, misc=True, **kwargs): + """ + Args: + `feats`: normalize the ordering of FEATS. Default=True. + `misc`: normalize the ordering of MISC. Default=True. + """ + super().__init__(**kwargs) + self.feats = feats + self.misc = misc + # TODO: normalize also standardized comments like text, sent_id,... 
+ + def process_node(self, node): + if self.feats: + node.feats._deserialize_if_empty() + node.feats._string = None + if self.misc: + node.misc._deserialize_if_empty() + node.misc._string = None From 90f338de077467acb4cb9ebebce68179419a0d77 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 14:29:30 +0100 Subject: [PATCH 0755/1374] allow writing to node.sdeprel, add tests --- udapi/core/node.py | 8 ++++++++ udapi/core/tests/test_node.py | 25 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 63242698..e188e134 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -166,6 +166,14 @@ def sdeprel(self): return parts[1] return '' + @sdeprel.setter + def sdeprel(self, value): + udeprel = self.udeprel + if value is not None and value != '': + self.deprel = udeprel + ':' + value + else: + self.deprel = udeprel + @property def feats(self): """Property for morphological features stored as a `Feats` object. 
diff --git a/udapi/core/tests/test_node.py b/udapi/core/tests/test_node.py index 28a45d85..8bc7f182 100755 --- a/udapi/core/tests/test_node.py +++ b/udapi/core/tests/test_node.py @@ -119,7 +119,7 @@ def test_draw(self): sys.stdout = sys.__stdout__ # pylint: disable=redefined-variable-type def test_feats(self): - """Test the morphological featrues.""" + """Test the morphological features.""" node = Node(root=None) self.assertEqual(str(node.feats), '_') node.feats = '' @@ -145,6 +145,29 @@ def test_feats(self): self.assertEqual(str(node.feats), '_') self.assertEqual(node.feats, {}) + def test_deprel(self): + """Test getting setting the dependency relation.""" + node = Node(root=None, deprel='acl:relcl') + self.assertEqual(node.deprel, 'acl:relcl') + self.assertEqual(node.udeprel, 'acl') + self.assertEqual(node.sdeprel, 'relcl') + node.udeprel = 'advcl' + self.assertEqual(node.deprel, 'advcl:relcl') + node.sdeprel = 'tcl' + self.assertEqual(node.deprel, 'advcl:tcl') + node.sdeprel = '' + self.assertEqual(node.deprel, 'advcl') + self.assertEqual(node.udeprel, 'advcl') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj' + self.assertEqual(node.deprel, 'nsubj') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, '') + node.udeprel = 'nsubj:pass:outer' + self.assertEqual(node.deprel, 'nsubj:pass:outer') + self.assertEqual(node.udeprel, 'nsubj') + self.assertEqual(node.sdeprel, 'pass:outer') + def test_deps_getter(self): """Test enhanced dependencies.""" # Create a path to the test CoNLLU file. 
From 5817af214df034e42cf09ef2c08f0c8d15b3a0d9 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 16:31:50 +0100 Subject: [PATCH 0756/1374] write.CorefHtml marks subspans of discontiuous mentions with a red border --- udapi/block/write/corefhtml.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 890b172a..e0ab830b 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,26 +18,34 @@ def process_document(self, doc): #print('') print('') print('\n') + mention_ids = {} + for entity in doc.coref_entities: + for idx, mention in enumerate(entity.mentions, 1): + mention_ids[mention] = f'{entity.eid}e{idx}' + for tree in doc.trees: - self.process_tree(tree) + self.process_tree(tree, mention_ids) print('') + ' e.stopPropagation();\n});\n' + '$("span").hover(\n' + ' function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");},\n' + ' function(e) {$("span").removeClass("active");}\n' + ');\n') print('') - def process_tree(self, tree): + def process_tree(self, tree, mention_ids): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -56,7 +64,7 @@ def process_tree(self, tree): subspan = subspans.pop() m = subspan.mention e = m.entity - classes = f'{e.eid} {e.etype or "other"}' + classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: From 355e7bdc32ab854827aff1f7277b069f5c5a8bc0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 30 Jan 2023 17:57:48 +0100 Subject: [PATCH 0757/1374] write.CorefHtml shows also crossing mentions using valid (well-nested) html --- udapi/block/write/corefhtml.py | 56 +++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/udapi/block/write/corefhtml.py 
b/udapi/block/write/corefhtml.py index e0ab830b..3efe9793 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -18,7 +18,8 @@ def process_document(self, doc): #print('') print('') @@ -35,15 +74,37 @@ def process_document(self, doc): for tree in doc.trees: self.process_tree(tree, mention_ids) - print('') + print('') print('') def _start_subspan(self, subspan, mention_ids, crossing=False): @@ -74,8 +135,10 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) + if tree.newpar: + print('


') opened = [] - print('

') + print(f'

') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() From 9e11bd515e19fa59c0bdbc50654d29544b13a21b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 1 Feb 2023 18:03:19 +0100 Subject: [PATCH 0764/1374] util.Normalize now normalizes also sent_id --- udapi/block/util/normalize.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 5b4270cc..298bea42 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,16 +20,33 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. + `sent_id`: normalize sent_id so it forms a sequence of integers + `start_sent_id`: the first sent_id number """ super().__init__(**kwargs) self.feats = feats self.misc = misc - # TODO: normalize also standardized comments like text, sent_id,... + self.sent_id = sent_id + self.next_sent_id = start_sent_id + # TODO: normalize also the order of standardized comments like text, sent_id,... 
+ + def process_bundle(self, bundle): + if self.sent_id: + bundle.bundle_id = str(self.next_sent_id) + self.next_sent_id += 1 + + for tree in bundle: + if self._should_process_tree(tree): + self.process_tree(tree) + + def process_tree(self, tree): + for node in tree.descendants: + self.process_node(node) def process_node(self, node): if self.feats: From 4e1b75678dab1f2602cc26b641a31de977a98f14 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 00:47:37 +0100 Subject: [PATCH 0765/1374] sent_id should not be normalized by default Unlike feats and misc ordering, we can lose information this way - the original sent_id, so it is potentially dangerous. --- udapi/block/util/normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index 298bea42..48cd6dc1 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -20,12 +20,12 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=True, start_sent_id=1, **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, start_sent_id=1, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. `misc`: normalize the ordering of MISC. Default=True. - `sent_id`: normalize sent_id so it forms a sequence of integers + `sent_id`: normalize sent_id so it forms a sequence of integers. Default=False. 
`start_sent_id`: the first sent_id number """ super().__init__(**kwargs) From b899af14c12c7ba4c9750ba39bf5f5544783ba59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 09:53:59 +0100 Subject: [PATCH 0766/1374] write.Conllu path=another/directory keeps the file name, but changes the directory --- udapi/core/basewriter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index cdc2c38f..93f6463a 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -1,6 +1,7 @@ """BaseWriter is the base class for all writer blocks.""" import sys import logging +import os import udapi.core.coref from udapi.core.block import Block @@ -11,7 +12,7 @@ class BaseWriter(Block): """Base class for all reader blocks.""" def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding='utf-8', - newline='\n', overwrite=False, **kwargs): + newline='\n', overwrite=False, path=None, **kwargs): super().__init__(**kwargs) self.orig_files = files self.orig_stdout = sys.stdout @@ -29,6 +30,7 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + self.path = path @property def filename(self): @@ -60,9 +62,11 @@ def before_process_document(self, document): sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: logging.warning('docname_as_file=1 but the document contains no docname') - elif self.overwrite: + elif self.overwrite or self.path: docname = document.meta.get('loaded_from', None) if docname is not None: + if self.path: + docname = os.path.join(self.path, os.path.split(docname)[1]) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, newline=self.newline) else: From 
9d183c1d979c50fabff9b3a295a0d8194a09c790 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 10:14:59 +0100 Subject: [PATCH 0767/1374] etype mismatch is stored in mention.other["orig_etype"] which allows easier debugging --- udapi/core/coref.py | 1 + 1 file changed, 1 insertion(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1a13d9fb..12dda239 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -665,6 +665,7 @@ def load_coref_from_misc(doc, strict=True): entity.etype = etype elif etype and entity.etype and entity.etype != etype: logging.warning(f"etype mismatch in {node}: {entity.etype} != {etype}") + other["orig_etype"] = etype # CorefEntity could be created first with "Bridge=" without any type elif etype and entity.etype is None: entity.etype = etype From 5b3ed0268ccf76f5332fcce87ac0da9a42b221b8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:19:33 +0100 Subject: [PATCH 0768/1374] allow using e.g. write.CorefHtml path='html/*.html' --- udapi/core/basewriter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index 93f6463a..e17a64c3 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -66,11 +66,21 @@ def before_process_document(self, document): docname = document.meta.get('loaded_from', None) if docname is not None: if self.path: - docname = os.path.join(self.path, os.path.split(docname)[1]) + old_dir, old_filename = os.path.split(docname) + new_dir, new_filename = os.path.split(self.path) + old_file, old_ext = os.path.splitext(old_filename) + new_file, new_ext = os.path.splitext(new_filename) + if new_dir in ('', '*'): + new_dir = old_dir + if new_file in ('', '*'): + new_file = old_file + if new_ext in ('', '*'): + new_ext = old_ext + docname = os.path.join(new_dir, new_file + new_ext) logging.info('Writing to file %s.', docname) sys.stdout = open(docname, 'wt', encoding=self.encoding, 
newline=self.newline) else: - logging.warning('overwrite=1 but document.meta["loaded_from"] is None') + logging.warning('using overwrite or path but document.meta["loaded_from"] is None') else: sys.stdout = self.orig_stdout else: From 34aa19d7d892790b81b2b79579fc4391c07a23ed Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 2 Feb 2023 14:42:30 +0100 Subject: [PATCH 0769/1374] write.Conllu path=my_dir should be interpreted as path=my_dir/ --- udapi/core/basewriter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/udapi/core/basewriter.py b/udapi/core/basewriter.py index e17a64c3..6e1b7446 100644 --- a/udapi/core/basewriter.py +++ b/udapi/core/basewriter.py @@ -30,6 +30,9 @@ def __init__(self, files='-', filehandle=None, docname_as_file=False, encoding=' raise ValueError("overwrite=1 is not compatible with files=" + files) if overwrite and docname_as_file: raise ValueError("overwrite=1 is not compatible with docname_as_file=1") + # interpret path=my_dir/my_subdir as path=my_dir/my_subdir/ + if path and path[-1] != os.sep and '*' not in path: + path += os.sep self.path = path @property From 301b808082254a9b45a2bd4cfe162719dc02bc23 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 4 Feb 2023 01:36:25 +0100 Subject: [PATCH 0770/1374] corefud.GuessSpan: add empty nodes that are causing gaps --- udapi/block/corefud/guessspan.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/udapi/block/corefud/guessspan.py b/udapi/block/corefud/guessspan.py index 5c3c6c12..d6093ece 100644 --- a/udapi/block/corefud/guessspan.py +++ b/udapi/block/corefud/guessspan.py @@ -4,6 +4,30 @@ class GuessSpan(Block): """Block corefud.GuessSpan heuristically fills mention spans, while keeping mention.head""" def process_coref_mention(self, mention): - mention.words = mention.head.descendants(add_self=True) - # TODO add empty nodes that are causing gaps + mwords = mention.head.descendants(add_self=True) # TODO add heuristics from 
corefud.PrintMentions almost_forest=1 + + # Add empty nodes that are causing gaps. + # A node "within the span" whose enhanced parent is in the mentions + # must be added to the mention as well. + # "within the span" includes also empty nodes "on the boundary". + # However, don't add empty nodes which are in a gap cause by non-empty nodes. + to_add = [] + min_ord = int(mwords[0].ord) if mwords[0].is_empty() else mwords[0].ord - 1 + max_ord = int(mwords[-1].ord) + 1 + root = mention.head.root + for empty in root.empty_nodes: + if empty in mwords: + continue + if empty.ord > max_ord: + break + if empty.ord > min_ord: + if any(enh['parent'] in mwords for enh in empty.deps): + to_add.append(empty) + elif empty.ord > min_ord + 1 and empty.ord < max_ord - 1: + prev_nonempty = root.descendants[int(empty.ord) - 1] + next_nonempty = root.descendants[int(empty.ord)] + if prev_nonempty in mwords and next_nonempty in mwords: + to_add.append(empty) + #else: empty.misc['Mark'] = f'not_in_treelet_of_{mention.entity.eid}' + mention.words = sorted(mwords + to_add) From 2285d27f5e9444d3db7a8a0b8db227b38e5c082b Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 5 Feb 2023 01:06:32 +0100 Subject: [PATCH 0771/1374] write.CorefHtml: distinguish entities using colors, show eid and docname --- udapi/block/write/corefhtml.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 8503854f..0a06b7e5 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -1,19 +1,21 @@ """CorefHtml class is a writer for HTML+JavaScript visualization of coreference.""" from udapi.core.basewriter import BaseWriter from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention +from collections import Counter import udapi.block.write.html ETYPES = 'person place organization animal plant object substance time number abstract event'.split() CSS = ''' 
.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} -.singleton {border-style: dotted;} +.sentence .singleton {border-style: dotted;} .crossing:before {content: "!"; display: block; background: #ffd500;} .active {border: 1px solid red !important;} -.selected {background: red !important;} +.selected {background: red !important; text-shadow: 1px 1px 4px white, -1px 1px 4px white, 1px -1px 4px white, -1px -1px 4px white;} .other {background: hsl(0, 0%, 85%);} ''' @@ -50,9 +52,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, **kwargs): + def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees + self.show_eid = show_eid + self.colors = colors def process_document(self, doc): print('') @@ -63,16 +67,25 @@ def process_document(self, doc): print('') print('\n') mention_ids = {} + entity_colors = {} + entities_of_type = Counter() for entity in doc.coref_entities: + if self.colors: + count = entities_of_type[entity.etype] + entities_of_type[entity.etype] = count + 1 + entity_colors[entity] = f'c{count % self.colors}' for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' for tree in doc.trees: - self.process_tree(tree, mention_ids) + self.process_tree(tree, mention_ids, entity_colors) print('') print('') - def _start_subspan(self, subspan, mention_ids, crossing=False): + def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention e = m.entity classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"}' - title = f'eid={subspan.subspan_eid}\ntype={e.etype}\nhead={m.head.form}' + title = f'eid={subspan.subspan_eid}\ntype={e.etype} 
({entity_colors[e]})\nhead={m.head.form}' + if self.colors: + classes += f' {entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -121,9 +136,11 @@ def _start_subspan(self, subspan, mention_ids, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') #data-eid="{e.eid}" + print(f'', end='') + if self.show_eid: + print(f'{subspan.subspan_eid}', end='') - def process_tree(self, tree, mention_ids): + def process_tree(self, tree, mention_ids, entity_colors): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -135,14 +152,16 @@ def process_tree(self, tree, mention_ids): subspans.extend(mention._subspans()) subspans.sort(reverse=True) - if tree.newpar: + if tree.newdoc: + print(f'


{tree.newdoc if tree.newdoc is not True else ""}


') + elif tree.newpar: print('
') opened = [] print(f'

') for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids) + self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) is_head = self._is_head(node) @@ -180,7 +199,7 @@ def process_tree(self, tree, mention_ids): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, True) + self._start_subspan(broken, mention_ids, entity_colors, True) opened.append(subspan) if not node.no_space_after: From cae7c37efe8548c2e432b108e4aa06df3b778e3a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:07:42 +0100 Subject: [PATCH 0772/1374] `read.Conllu max_docs=3` will load only the first three documents This is nice for debugging coreference files, where we cannot load just first N sentences because there may be Bridge/SplitAnte referring to unknown eid. This way we load whole docs. --- udapi/block/read/conllu.py | 22 ++++++++++++++++++++-- udapi/core/basereader.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bba69696..d5623fba 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -81,8 +81,26 @@ def parse_comment_line(self, line, root): root.comment += line[1:] + "\n" def read_trees(self): - return [self.read_tree_from_lines(s.split('\n')) for s in - self.filehandle.read().split('\n\n') if s] + if not self.max_docs: + return [self.read_tree_from_lines(s.split('\n')) for s in + self.filehandle.read().split('\n\n') if s] + # udapi.core.basereader takes care about the max_docs parameter. + # However, we can make the loading much faster by not reading + # the whole file if the user wants just first N documents. 
+ trees, lines, loaded_docs = [], [], 0 + for line in self.filehandle: + line = line.rstrip() + if line == '': + tree = self.read_tree_from_lines(lines) + lines = [] + if tree.newdoc: + if loaded_docs == self.max_docs: + return trees + loaded_docs += 1 + trees.append(tree) + else: + lines.append(line) + return def read_tree(self): if self.filehandle is None: diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a3b334da..a841bf1b 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -13,7 +13,8 @@ class BaseReader(Block): # pylint: disable=too-many-arguments def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, encoding='utf-8-sig', - sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, **kwargs): + sent_id_filter=None, split_docs=False, ignore_sent_id=False, merge=False, + max_docs=0, **kwargs): super().__init__(**kwargs) if filehandle is not None: files = None @@ -29,6 +30,8 @@ def __init__(self, files='-', filehandle=None, zone='keep', bundles_per_doc=0, e self.split_docs = split_docs self.ignore_sent_id = ignore_sent_id self.merge = merge + self.max_docs = max_docs + self._docs_loaded = 0 # `global.Entity` is a header stored in a comment before the first tree of each document in a given CoNLL-U file. # In Udapi, it is stored in `document.meta['global.Entity']`, but for technical reasons, we need to temporarily store it in here, the reader. 
# The reason is that `read.Conllu` uses a fast loading interface with `read_trees()`, @@ -126,6 +129,11 @@ def try_fast_load(self, document): bundle, last_bundle_id = None, '' for root in trees: + if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return True + self._docs_loaded += 1 add_to_the_last_bundle = False if self.ignore_sent_id: @@ -180,8 +188,10 @@ def process_document(self, document): if root._sent_id is not None: bundle.bundle_id = root._sent_id.split('/', 1)[0] bundle.add_tree(root) - if root.newdoc and root.newdoc is not True: - document.meta["docname"] = root.newdoc + if root.newdoc: + self._docs_loaded += 1 + if root.newdoc is not True: + document.meta["docname"] = root.newdoc document.meta['global.Entity'] = self._global_entity document.meta['loaded_from'] = self.filename @@ -204,6 +214,17 @@ def process_document(self, document): if trees_loaded == 0: document.meta['loaded_from'] = self.filename document.meta['global.Entity'] = self._global_entity + # Parameter max_docs is primarily aimed for counting UD docs, ie. trees with newdoc. + # However, it could be useful even when working with files without the newdoc annotations, + # e.g. when using files='!*.conllu' or bundles_per_doc, in which case we count the Udapi documents + # so even if the first tree in udapi.Document does not have newdoc, we count it as a new document. + # The cases where newdoc is used are checked further below. + if not root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return + self._docs_loaded += 1 + add_to_the_last_bundle = False trees_loaded += 1 @@ -222,6 +243,9 @@ def process_document(self, document): # The `# newdoc` comment in CoNLL-U marks a start of a new document. 
if root.newdoc: + if self.max_docs and self._docs_loaded >= self.max_docs: + self.finished = True + return if not bundle and root.newdoc is not True: document.meta["docname"] = root.newdoc if bundle and self.split_docs: @@ -231,6 +255,7 @@ def process_document(self, document): len(orig_bundles)) self.finished = False return + self._docs_loaded += 1 # assign new/next bundle to `bundle` if needed if not bundle or not add_to_the_last_bundle: From ae34d8024d8ee95db6e1bf39581e44fc08bcbc73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 15:25:50 +0100 Subject: [PATCH 0773/1374] refactor code duplication --- udapi/block/write/corefhtml.py | 29 +++-------------------------- udapi/block/write/html.py | 28 +++++++++++++++------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 0a06b7e5..c7950ce9 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -50,6 +50,8 @@ }); ''' +WRITE_HTML = udapi.block.write.html.Html() + class CorefHtml(BaseWriter): def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): @@ -90,32 +92,7 @@ def process_document(self, doc): print('') print('') diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 148b29ee..48431900 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,16 +79,26 @@ def process_document(self, doc): print('\n') print('

') + + def print_doc_json(self, doc): print('data=[') for (bundle_number, bundle) in enumerate(doc, 1): - # TODO: if not self._should_process_bundle(bundle): continue if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' for tree in bundle.trees: - # TODO: if not self._should_process_tree(tree): continue zone = tree.zone if first_zone: first_zone = False @@ -101,24 +111,16 @@ def process_document(self, doc): print('"labels":["zone=%s","id=%s"]}' % (zone, tree.address())) desc += ',["[%s]","label"],[" ","space"]' % zone for node in tree.descendants: - desc += self.print_node(node) + desc += self.print_node_json(node) desc += r',["\n","newline"]' print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) print('];') - print("$('#treex-view').treexView(data);") - print('''function saveTree() { - var svg_el = jQuery('svg'); - if (svg_el.length) { - var svg = new Blob([svg_el.parent().html()], {type: "image/svg+xml"}); - saveAs(svg, 'tree.svg'); - } - }''') - print('') + @staticmethod - def print_node(node): + def print_node_json(node): """JSON representation of a given node.""" # pylint does not understand `.format(**locals())` and falsely alarms for unused vars # pylint: disable=too-many-locals,unused-variable From ca4d2b7f8240a0faca55f9aad6513d9a94968a08 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Mon, 6 Feb 2023 19:53:25 +0100 Subject: [PATCH 0774/1374] write.CorefHtml: add side panel with an overview of entities --- udapi/block/write/corefhtml.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index c7950ce9..280fc213 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -6,7 +6,25 @@ ETYPES = 'person place organization animal plant object substance time number abstract event'.split() +HEADER = ''' + +Udapi CorefUD viewer + +''' +# I 
use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# so that the width of #overview can be changed by dragging the bottom right corner. +# The following lines would make the whole right border draggable: +# +# +# +#
CSS = ''' +#wrap {display: flex; align-items: flex-start;} +#main {width: 100%; padding: 5px; background: white; z-index:100;} +#overview { position: sticky; top: 0; overflow-y: scroll; height:95vh; resize:horizontal; + display: grid; border-right: double; + padding: 5px; width: 20em; background: #ddd; border-radius: 5px; +} .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} .sentence span .eid {display:block; font-size: 10px;} .showtree {float:left; margin: 5px;} @@ -23,10 +41,16 @@ $("span").click(function(e) { let was_selected = $(this).hasClass("selected"); $("span").removeClass("selected"); - if (!was_selected){$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} + if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); +window.onhashchange = function() { + $("span").removeClass("selected"); + var fragment = window.location.hash.substring(1); + if (fragment) {$("." + fragment).addClass("selected");} +} + $("span").hover( function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} @@ -60,10 +84,18 @@ def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): self.show_eid = show_eid self.colors = colors + def _representative_word(self, entity): + # return the first PROPN or NOUN. Or the most frequent one? + heads = [m.head for m in entity.mentions] + lemma_or_form = lambda n: n.lemma if n.lemma else n.form + for upos in ('PROPN', 'NOUN'): + nodes = [n for n in heads if n.upos == upos] + if nodes: + return lemma_or_form(nodes[0]) + return lemma_or_form(heads[0]) + def process_document(self, doc): - print('') - print('Udapi CorefUD viewer') - print('') + print(HEADER) if self.show_trees: print('') print('') - print('\n') + print('\n\n
') mention_ids = {} entity_colors = {} @@ -86,8 +118,21 @@ def process_document(self, doc): for idx, mention in enumerate(entity.mentions, 1): mention_ids[mention] = f'{entity.eid}e{idx}' + print('
') + print('' + '' + '\n') + for entity in doc.coref_entities: + print(f'' + f'' + f'') + print('
eid#mword
{entity.eid}{len(entity.mentions)}{self._representative_word(entity)}
') + print('
') + + print('
') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) + print('
') print('') - print('') + print('
') def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): m = subspan.mention @@ -113,7 +158,10 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): title += '\ncrossing' if m.other: title += f'\n{m.other}' - print(f'', end='') + span_id = '' + if (subspan.subspan_id == '' or subspan.subspan_id.startswith('[1/')) and e.mentions[0] == m: + span_id = f'id="{e.eid}" ' + print(f'', end='') if self.show_eid: print(f'{subspan.subspan_eid}', end='') From bbd702aa35fcf4e13d2a4ab2d3972a7efd89fcc5 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 16:22:03 +0100 Subject: [PATCH 0775/1374] Python glob.glob does not support {dir1,dir2} anyway --- udapi/core/files.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index 7fcd9149..c6973dad 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -58,14 +58,6 @@ def string_to_filenames(self, string): or commas. For specifying files with spaces or commas in filenames, you need to use wildcard patterns or '@' filelist. (But preferably don't use such filenames.) """ - # "!" means glob pattern which can contain {dir1,dir2} - # so it cannot be combined with separating tokens with comma. - if string[0] == '!': - pattern = string[1:] - filenames = glob.glob(pattern) - if not filenames: - raise RuntimeError('No filenames matched "%s" pattern' % pattern) - return filenames return list(itertools.chain.from_iterable(self._token_to_filenames(tok) for tok in string.replace(',', ' ').split())) From a5acaf43b1edb3468dfc493da6e7ae87f2d99966 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 17:58:45 +0100 Subject: [PATCH 0776/1374] ud.ComplyWithText: use node.misc['CorrectForm'] instead of node.misc['OrigForm'] which was a misleading name because the previous form value is usually not the real original form. 
--- udapi/block/ud/complywithtext.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index cead294a..bacc56a2 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -34,7 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, - **kwargs): + previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -54,24 +54,33 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + previous_form_attr - when changing node.form, we store the previous value + in node.misc[previous_form_attr] (so no information is lost). + Default="CorrectForm" because we expect that the previous value + (i.e. the value of node.form before applying this block) + contained the corrected spelling, while root.text contains + the original spelling with typos as found in the raw text. + CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html + When setting this parameter to an empty string, no values will be stored to node.misc. 
""" super().__init__(**kwargs) self.fix_text = fix_text self.prefer_mwt = prefer_mwt self.allow_goeswith = allow_goeswith self.max_mwt_length = max_mwt_length + self.allow_add_punct = allow_add_punct + self.allow_delete_punct = allow_delete_punct + self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) - @staticmethod - def store_orig_form(node, new_form): - """Store the original form of this node into MISC, unless the change is common&expected.""" - _ = new_form + def store_previous_form(self, node): + """Store the previous form of this node into MISC, unless the change is common&expected.""" if node.form not in ("''", "``"): - node.misc['OrigForm'] = node.form + node.misc[self.previous_form_attr] = node.form def process_tree(self, root): text = root.text @@ -203,7 +212,7 @@ def solve_diff(self, nodes, form): if ' ' in form: if len(nodes) == 1 and node.form == form.replace(' ', ''): if self.allow_space(form): - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form elif self.allow_goeswith: forms = form.split() @@ -235,7 +244,7 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: - self.store_orig_form(node, form) + self.store_previous_form(node) node.form = form From a69c7a158edb91d12d2907f6802c3104d946ee0d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Tue, 7 Feb 2023 18:00:46 +0100 Subject: [PATCH 0777/1374] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U so even if there are diffs which cannot be resolved, and thus we cannot fill SpaceAfter=No in the rest of the sentence, we must execute the "if self.fix_text:..." code, which changes the root.text (instead of changing the annotation of nodes). 
--- udapi/block/ud/complywithtext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index bacc56a2..1a13a4ec 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -121,7 +121,7 @@ def process_tree(self, root): node.misc['SpaceAfter'] = 'No' else: logging.warning('Node %s does not match text "%s"', node, tmp_text[:20]) - return + break # Edit root.text if needed. if self.fix_text: From fde163c32837ccc02a9b89d535be9769d4414340 Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Wed, 8 Feb 2023 14:23:05 +0100 Subject: [PATCH 0778/1374] further adjusted Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 122 ++++++++++++++++++----------- 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 323f60f7..111bceb9 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -29,7 +29,7 @@ def process_node(self, node): af = {} # NOUNS ################################################################ if node.upos == 'NOUN': - if not node.feats['Abbr'] == 'Yes' or node.feats['Case']: # abbreviated or indeclinable nouns + if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { 'Gender': ['Masc', 'Fem', 'Neut'], @@ -37,11 +37,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Dim'], 'Abbr': ['Yes'], - 'Foreign': ['Yes']} + 'Foreign': ['Yes'], + 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Compound'] = ['Yes'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -76,14 +76,12 @@ def process_node(self, node): 'Degree': ['Cmp', 'Sup', 'Abs'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'VerbForm': ['Part']} if self.flavio: - # Flavio does not use Degree=Pos, hence Degree is not required. - # rf = [f for f in rf if f != 'Degree'] # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] - af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] @@ -93,15 +91,16 @@ def process_node(self, node): elif node.upos == 'PRON': rf = ['PronType', 'Case'] af = { - 'PronType': ['Prs', 'Rel', 'Ind', 'Int', 'Rcp'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], + 'Proper': ['Yes'], + 'Compound': ['Yes'], + 'Polarity': ['Neg'] } if node.feats['PronType'] == 'Prs': af['Reflex'] = ['Yes'] if node.feats['Reflex'] == 'Yes': # seipsum, se rf.extend(['Person']) # seipsum has gender and number but se does not, so it is not required - # TODO: seipsum in ITTB, but why lemma seipsum instead of seipse? 
af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] af['Person'] = ['3'] @@ -122,6 +121,19 @@ def process_node(self, node): rf = [f for f in rf if f != 'Case'] af['Gender'] = ['Masc', 'Fem', 'Neut'] af['Number'] = ['Sing', 'Plur'] + # lexical check of PronTypes + af['PronType'] = [] + if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + af['PronType'].append('Prs') + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + af['PronType'].append('Ind') + elif node.lemma in ['inuicem', 'invicem']: + af['PronType'].append('Rcp') + rf.remove('Case') + elif node.lemma in ['quicumque', 'qui', 'quisquis']: + af['PronType'].append('Rel') + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['LatAnom', 'LatPron'] @@ -140,7 +152,9 @@ def process_node(self, node): 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], 'Degree': ['Cmp', 'Abs', 'Sup'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Proper': ['Yes'], + 'PronType': [] } if node.feats['Poss'] == 'Yes': # 'meus', 'tuus', 'suus', 'noster' rf.extend(['Poss', 'Person[psor]']) @@ -152,8 +166,24 @@ def process_node(self, node): if node.feats['Person[psor]'] != '3': rf.append('Number[psor]') af['Number[psor]'] = ['Sing', 'Plur'] - else: - af['PronType'] = ['Dem', 'Rel', 'Ind', 'Int', 'Tot', 'Con'] + if node.feats['PronType'] == 'Ind': + af['NumType'] = ['Card'] + # lexical check of PronTypes + if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: + if not af['PronType'] == ['Prs']: + af['PronType'].append('Prs') + elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 
'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + af['PronType'].append('Ind') + elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: + af['PronType'].append('Tot') + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + af['PronType'].append('Rel') + elif node.lemma in ['qui', 'quantus', 'quot']: + af['PronType'].append('Int') + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + af['PronType'].append('Dem') + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] @@ -170,8 +200,8 @@ def process_node(self, node): rf = ['NumType', 'NumForm'] af = { 'NumType': ['Card'], - 'NumForm': ['Word', 'Roman', 'Digit'] - } + 'NumForm': ['Word', 'Roman', 'Digit'], + 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. if not re.match(r'^(Digit|Roman)$', node.feats['NumForm']): af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -186,40 +216,40 @@ def process_node(self, node): elif re.match(r'^(VERB|AUX)$', node.upos): rf = ['VerbForm', 'Aspect'] af = { - 'VerbForm': ['Inf', 'Fin', 'Part'], + 'VerbForm': ['Inf', 'Fin', 'Part', 'Conv'], 'Aspect': ['Imp', 'Inch', 'Perf', 'Prosp'], - 'Polarity': ['Neg'] + 'Polarity': ['Neg'], + 'Typo': ['Yes'] } - if not re.match(r'^(Ger|Gdv)$', node.feats['VerbForm']): + if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') - af['Tense'] = ['Pres', 'Fut'] - if node.upos == 'VERB': # and not node.lemma.endswith('sum'): # compounds of sum + af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] + if node.upos == 'VERB': rf.append('Voice') af['Voice'] = ['Act', 'Pass'] - # Main verbs have aspect but auxiliaries don't. 
- # TODO: apparently, apparently AUXs have aspect as well - # if node.upos == 'VERB': - # rf.append('Aspect') - # af['Aspect'] = ['Imp', 'Inch', 'Perf', 'Prosp'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive rf.extend(['Mood', 'Person', 'Number']) - af['Tense'].extend(['Past', 'Pqp']) af['Mood'] = ['Ind', 'Sub', 'Imp'] af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] elif node.feats['VerbForm'] == 'Part': rf.extend(['Gender', 'Number', 'Case']) - af['Number'] = ['Sing', 'Plur'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Number'] = ['Sing', 'Plur'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Sing'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] if node.misc['TraditionalMood'] != 'Gerundium' else ['Neut'] af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] af['Degree'] = ['Abs', 'Cmp'] - af['Gender'] = ['Masc', 'Fem', 'Neut'] - af['Tense'].append('Past') - # else: nothing to be added for VerbForm=Inf + if node.misc['TraditionalMood'].startswith('Gerundi'): + af['Voice'] = ['Pass'] + af['Aspect'] = 'Prosp' + elif node.feats['VerbForm'] == 'Conv': + rf.extend(['Case', 'Gender', 'Number']) + af['Case'] = ['Abl', 'Acc'] + af['Gender'] = ['Masc'] + af['Number'] = ['Sing'] + af['Voice'] = ['Act'] + elif node.feats['VerbForm'] == 'Inf': + af['Tense'].remove('Pqp') if self.flavio: - # Flavio has killed Tense in his treebanks. - rf = [f for f in rf if f != 'Tense'] - af['VerbForm'].append('Vnoun') # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] if 'Degree' in af: @@ -228,23 +258,22 @@ def process_node(self, node): af['Degree'] = ['Dim'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] - if re.match(r'^(Part|Vnoun)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO'] - af['VerbForm'].append('Vnoun') + if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## elif node.upos == 'ADV': af = { - 'AdvType': ['Loc', 'Tim'], + 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], - 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], + 'NumType': ['Card', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['NumType'] = ['Card', 'Ord'] # e.g., primum af['VerbForm'] = ['Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) @@ -262,7 +291,8 @@ def process_node(self, node): elif re.match(r'^[CS]CONJ$', node.upos): af = { 'PronType': ['Rel', 'Con'], - 'Polarity': ['Neg']} + 'Polarity': ['Neg'], + 'Compound': ['Yes']} if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] @@ -271,10 +301,14 @@ def process_node(self, node): self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': + rf = ['AdpType'] + af = { + 'AdpType': ['Prep', 'Post'], + 'Abbr': ['Yes'] + } if self.flavio: - af = { - 'VerbForm': ['Part'], - 'Proper': ['Yes']} + af['VerbForm'] = ['Part'], + af['Proper'] = ['Yes'] self.check_allowed_features(node, af) # THE REST: NO FEATURES ################################################ else: From 29fb09caccd678560845ea3d80b2027145231c90 Mon Sep 17 00:00:00 2001 From: 
Martin Popel Date: Wed, 8 Feb 2023 18:04:56 +0100 Subject: [PATCH 0779/1374] improve ud.ComplyWithText for KorKor --- udapi/block/ud/complywithtext.py | 81 ++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 1a13a4ec..02904731 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -24,7 +24,7 @@ """ import difflib import logging -import re +import regex from udapi.core.block import Block from udapi.core.mwt import MWT @@ -34,6 +34,7 @@ class ComplyWithText(Block): """Adapt the nodes to comply with the text.""" def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, + allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, previous_form_attr='CorrectForm', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -54,6 +55,14 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ Default=True (i.e. add the goeswith nodes if applicable). max_mwt_length - Maximum length of newly created multi-word tokens (in syntactic words). Default=4. + allow_add_punct - allow creating punctuation-only nodes + allow_delete_punct - allow deleting extra punctuation-only nodes, + which are not represented in root.text + allow_hyphen_goeswith - if e.g. node.form=="mother-in-law" corresponds to + "mother in law" in root.text, convert it to three nodes: + node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") + node2(form="in", deprel="goeswith", upos="X", parent=node1) + node3(form="law", deprel="goeswith", upos="X", parent=node1). previous_form_attr - when changing node.form, we store the previous value in node.misc[previous_form_attr] (so no information is lost). 
Default="CorrectForm" because we expect that the previous value @@ -62,6 +71,7 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ the original spelling with typos as found in the raw text. CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. + When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. """ super().__init__(**kwargs) self.fix_text = fix_text @@ -70,17 +80,20 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.max_mwt_length = max_mwt_length self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct + self.allow_hyphen_goeswith = allow_hyphen_goeswith self.previous_form_attr = previous_form_attr @staticmethod def allow_space(form): """Is space allowed within this token form?""" - return re.fullmatch('[0-9 ]+([,.][0-9]+)?', form) + return regex.fullmatch('[0-9 ]+([,.][0-9]+)?', form) def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``"): + if node.form not in ("''", "``") and self.previous_form_attr: node.misc[self.previous_form_attr] = node.form + if self.previous_form_attr == 'CorrectForm': + node.feats['Typo'] = 'Yes' def process_tree(self, root): text = root.text @@ -190,18 +203,38 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): for diff in diffs: edit, tree_lo, tree_hi, text_lo, text_hi = diff - # Focus only on edits of type 'replace', log insertions and deletions as failures. if edit == 'equal': - continue - if edit in ('insert', 'delete'): - logging.warning('Unable to solve token-vs-text mismatch\n%s', - _diff2str(diff, tree_chars, text)) - continue - - # Revert the splittng and solve the diff. 
- nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] - form = text[text_lo:text_hi] - self.solve_diff(nodes, form.strip()) + pass + elif edit == 'insert': + forms = text[text_lo:text_hi].split(' ') + if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: + #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') + next_node = char_nodes[tree_lo] + for f in reversed(forms): + new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') + new.shift_before_node(next_node) + new.misc['Added'] = 1 + else: + logging.warning('Unable to insert nodes\n%s', + _diff2str(diff, tree_chars, text)) + elif edit == 'delete': + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + if all(regex.fullmatch('\p{P}+', n.form) for n in nodes): + if self.allow_delete_punct: + for node in nodes: + node.remove(children='rehang') + else: + logging.warning('Unable to delete punctuation nodes (try ud.ComplyWithText allow_delete_punct=1)\n%s', + _diff2str(diff, tree_chars, text)) + else: + logging.warning('Unable to delete non-punctuation nodes\n%s', + _diff2str(diff, tree_chars, text)) + else: + assert edit == 'replace' + # Revert the splittng and solve the diff. + nodes = [n for n in char_nodes[tree_lo:tree_hi] if n is not None] + form = text[text_lo:text_hi] + self.solve_diff(nodes, form.strip()) def solve_diff(self, nodes, form): """Fix a given (minimal) tokens-vs-text inconsistency.""" @@ -210,20 +243,25 @@ def solve_diff(self, nodes, form): # First, solve the cases when the text contains a space. 
if ' ' in form: - if len(nodes) == 1 and node.form == form.replace(' ', ''): + node_form = node.form + if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: + node_form = node_form.replace('-', '') + if len(nodes) == 1 and node_form == form.replace(' ', ''): if self.allow_space(form): self.store_previous_form(node) node.form = form elif self.allow_goeswith: + self.store_previous_form(node) forms = form.split() node.form = forms[0] + node.feats['Typo'] = 'Yes' for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos=node.upos) + new = node.create_child(form=split_form, deprel='goeswith', upos='X') new.shift_after_node(node) else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: - logging.warning('Unable to solve n:m diff:\n%s -> %s', nodes_str, form) + logging.warning(f'Unable to solve {len(nodes)}:{len(form.split(" "))} diff:\n{nodes_str} -> {form}') # Second, solve the cases when multiple nodes match one form (without any spaces). elif len(nodes) > 1: @@ -244,8 +282,13 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. 
else: - self.store_previous_form(node) - node.form = form + if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): + new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + new.shift_after_node(node) + new.misc['Added'] = 1 + else: + self.store_previous_form(node) + node.form = form def _nodes_to_chars(nodes): From d5a1a2a756ef13629984eb40af7b5853dbd8c7a0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:06:45 +0100 Subject: [PATCH 0780/1374] udapy hints when using a wrong block name or parameter name thanks to @michnov for this idea --- udapi/core/block.py | 23 +++++++++++++++++++---- udapi/core/run.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/udapi/core/block.py b/udapi/core/block.py index f039abce..fdcad9fa 100644 --- a/udapi/core/block.py +++ b/udapi/core/block.py @@ -1,5 +1,6 @@ """Block class represents the basic Udapi processing unit.""" import logging +import inspect def not_overridden(method): method.is_not_overridden = True @@ -14,9 +15,23 @@ class Block(object): Possible values are: process (default), skip, skip_warn, fail, delete. """ - def __init__(self, zones='all', if_empty_tree='process'): + def __init__(self, zones='all', if_empty_tree='process', **kwargs): self.zones = zones self.if_empty_tree = if_empty_tree + if kwargs: + params = set() + for cls in type(self).mro()[:-1]: + params.update(inspect.signature(cls.__init__).parameters.keys()) + params -= {'self', 'kwargs'} + raise TypeError(f"Extra parameters {kwargs}.\n" + f"Parameters of {self.block_name()} are:\n" + + '\n'.join(sorted(params))) + + def block_name(self): + module = ".".join(self.__module__.split(".")[:-1]) + if module.startswith('udapi.block.'): + module = module[12:] + return module + "." 
+ self.__class__.__name__ def process_start(self): """A hook method that is executed before processing UD data""" @@ -73,7 +88,7 @@ def process_document(self, document): p_tree = not hasattr(self.process_tree, 'is_not_overridden') p_node = not hasattr(self.process_node, 'is_not_overridden') if not any((p_entity, p_mention, p_bundle, p_tree, p_node)): - raise Exception("No processing activity defined in block " + str(self)) + raise Exception("No processing activity defined in block " + self.block_name()) if p_entity or p_mention: for entity in document.coref_entities: @@ -85,8 +100,8 @@ def process_document(self, document): if p_bundle or p_tree or p_node: for bundle_no, bundle in enumerate(document.bundles, 1): - logging.debug('Block %s processing bundle #%d (id=%s)', - self.__class__.__name__, bundle_no, bundle.bundle_id) + logging.debug(f'Block {self.block_name()} processing ' + f'bundle #{bundle_no} (id={bundle.bundle_id})') if p_bundle: self.process_bundle(bundle) else: diff --git a/udapi/core/run.py b/udapi/core/run.py index a0cc4a9a..418baca6 100644 --- a/udapi/core/run.py +++ b/udapi/core/run.py @@ -67,6 +67,26 @@ def _parse_command_line_arguments(scenario): return block_names, block_args +def _blocks_in_a_package(package_name): + import importlib.util, pkgutil + + if not importlib.util.find_spec(package_name): + return [] + try: + package = __import__(package_name, fromlist="dummy") + submodule_names = [m.name for m in pkgutil.iter_modules(package.__path__)] + pname = package_name + if pname.startswith("udapi.block."): + pname = pname[12:] + blocks = [] + for sname in submodule_names: + module = __import__(f"{package_name}.{sname}", fromlist="dummy") + bname = [c for c in dir(module) if c.lower() == sname][0] + blocks.append(f"{pname}.{bname}") + return blocks + except: + return [] + def _import_blocks(block_names, block_args): """ Parse block names, import particular packages and call the constructor for each object. 
@@ -92,8 +112,17 @@ def _import_blocks(block_names, block_args): command = "from " + module + " import " + class_name + " as b" + str(block_id) logging.debug("Trying to run command: %s", command) exec(command) # pylint: disable=exec-used - except Exception: - logging.warning("Error when trying import the block %s", block_name) + except ModuleNotFoundError as err: + package_name = ".".join(module.split(".")[:-1]) + blocks = _blocks_in_a_package(package_name) + if not blocks: + raise + raise ModuleNotFoundError( + f"Cannot find block {block_name} (i.e. class {module}.{class_name})\n" + f"Available block in {package_name} are:\n" + + "\n".join(_blocks_in_a_package(package_name))) from err + except Exception as ex: + logging.warning(f"Cannot import block {block_name} (i.e. class {module}.{class_name})") raise # Run the imported module. From 49ed44d2e309523cdf3361c599934d5dbf58a2a8 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Wed, 8 Feb 2023 18:23:36 +0100 Subject: [PATCH 0781/1374] read.XY files='!*.conllu' should iterated over sorted files glob.glob() returns files in an arbitrary order (as `ls -U`) --- udapi/core/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/core/files.py b/udapi/core/files.py index c6973dad..be59b2c0 100644 --- a/udapi/core/files.py +++ b/udapi/core/files.py @@ -65,7 +65,7 @@ def string_to_filenames(self, string): def _token_to_filenames(token): if token[0] == '!': pattern = token[1:] - filenames = glob.glob(pattern) + filenames = sorted(glob.glob(pattern)) if not filenames: raise RuntimeError('No filenames matched "%s" pattern' % pattern) elif token[0] == '@': From 1a4241104709e7647cf75ff84dbc68df3428fbe0 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Thu, 9 Feb 2023 23:49:11 +0100 Subject: [PATCH 0782/1374] improve ud.ComplyWithText (for KorKor) --- udapi/block/ud/complywithtext.py | 70 ++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 25 deletions(-) diff --git 
a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 02904731..c850018e 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,8 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_attr='CorrectForm', **kwargs): + previous_form_label='CorrectForm', previous_text_label='CorrectText', + added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. Should we edit the text to match the token forms (as a last resort)? Default=True. @@ -63,8 +64,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ node1(form="mother", feats["Typo"]="Yes", misc["CorrectForm"]="mother-in-law") node2(form="in", deprel="goeswith", upos="X", parent=node1) node3(form="law", deprel="goeswith", upos="X", parent=node1). - previous_form_attr - when changing node.form, we store the previous value - in node.misc[previous_form_attr] (so no information is lost). + previous_form_label - when changing node.form, we store the previous value + in node.misc[previous_form_label] (so no information is lost). Default="CorrectForm" because we expect that the previous value (i.e. the value of node.form before applying this block) contained the corrected spelling, while root.text contains @@ -72,6 +73,12 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ CorrectForm is defined in https://universaldependencies.org/u/overview/typos.html When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
+ previous_text_label - when we are not able to adapt the annotation to match root.text + and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. + Default="CorrectText". When setting this parameter to an empty string, + no values will be stored to root.comment. + added_label - when creating new nodes because allow_add_punct=True, we mark these nodes + as new_node.misc[added_label] = 1. Default="Added". """ super().__init__(**kwargs) self.fix_text = fix_text @@ -81,7 +88,9 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ self.allow_add_punct = allow_add_punct self.allow_delete_punct = allow_delete_punct self.allow_hyphen_goeswith = allow_hyphen_goeswith - self.previous_form_attr = previous_form_attr + self.previous_form_label = previous_form_label + self.previous_text_label = previous_text_label + self.added_label = added_label @staticmethod def allow_space(form): @@ -90,9 +99,9 @@ def allow_space(form): def store_previous_form(self, node): """Store the previous form of this node into MISC, unless the change is common&expected.""" - if node.form not in ("''", "``") and self.previous_form_attr: - node.misc[self.previous_form_attr] = node.form - if self.previous_form_attr == 'CorrectForm': + if node.form not in ("''", "``") and self.previous_form_label: + node.misc[self.previous_form_label] = node.form + if self.previous_form_label == 'CorrectForm': node.feats['Typo'] = 'Yes' def process_tree(self, root): @@ -140,7 +149,8 @@ def process_tree(self, root): if self.fix_text: computed_text = root.compute_text() if text != computed_text: - root.add_comment('ToDoOrigText = ' + root.text) + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') root.text = computed_text def unspace_diffs(self, orig_diffs, tree_chars, text): @@ -152,6 +162,10 @@ def unspace_diffs(self, orig_diffs, tree_chars, text): tree_lo += 1 if tree_chars[tree_hi - 1] == ' ': tree_hi -= 1 + if 
text[text_lo] == ' ': + text_lo += 1 + if text[text_hi - 1] == ' ': + text_hi -= 1 old = tree_chars[tree_lo:tree_hi] new = text[text_lo:text_hi] if old == '' and new == '': @@ -208,12 +222,11 @@ def solve_diffs(self, diffs, tree_chars, char_nodes, text): elif edit == 'insert': forms = text[text_lo:text_hi].split(' ') if all(regex.fullmatch('\p{P}+', f) for f in forms) and self.allow_add_punct: - #logging.info(f'trying to add {forms} before {char_nodes[tree_lo]}') next_node = char_nodes[tree_lo] for f in reversed(forms): new = next_node.create_child(form=f, deprel='punct', upos='PUNCT') new.shift_before_node(next_node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: logging.warning('Unable to insert nodes\n%s', _diff2str(diff, tree_chars, text)) @@ -246,18 +259,26 @@ def solve_diff(self, nodes, form): node_form = node.form if self.allow_hyphen_goeswith and node_form.replace('-', ' ') == form: node_form = node_form.replace('-', '') - if len(nodes) == 1 and node_form == form.replace(' ', ''): - if self.allow_space(form): - self.store_previous_form(node) - node.form = form - elif self.allow_goeswith: - self.store_previous_form(node) - forms = form.split() - node.form = forms[0] - node.feats['Typo'] = 'Yes' - for split_form in reversed(forms[1:]): - new = node.create_child(form=split_form, deprel='goeswith', upos='X') + if len(nodes) == 1: + if node_form == form.replace(' ', ''): + if self.allow_space(form): + self.store_previous_form(node) + node.form = form + elif self.allow_goeswith: + self.store_previous_form(node) + forms = form.split() + node.form = forms[0] + node.feats['Typo'] = 'Yes' + for split_form in reversed(forms[1:]): + new = node.create_child(form=split_form, deprel='goeswith', upos='X') + new.shift_after_node(node) + else: + logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) + elif self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('[ \p{P}]+', form[len(node.form):]): + for punct_form in 
reversed(form[len(node.form):].split()): + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) + new.misc[self.added_label] = 1 else: logging.warning('Unable to solve 1:m diff:\n%s -> %s', nodes_str, form) else: @@ -283,9 +304,10 @@ def solve_diff(self, nodes, form): # Third, solve the 1-1 cases. else: if self.allow_add_punct and form.startswith(node.form) and regex.fullmatch('\p{P}+', form[len(node.form):]): - new = node.create_child(form=form[len(node.form):], deprel='punct', upos='PUNCT') + punct_form = form[len(node.form):] + new = node.create_child(form=punct_form, lemma=punct_form, deprel='punct', upos='PUNCT') new.shift_after_node(node) - new.misc['Added'] = 1 + new.misc[self.added_label] = 1 else: self.store_previous_form(node) node.form = form @@ -313,6 +335,4 @@ def _log_diffs(diffs, tree_chars, text, msg): def _diff2str(diff, tree, text): old = '|' + ''.join(tree[diff[1]:diff[2]]) + '|' new = '|' + ''.join(text[diff[3]:diff[4]]) + '|' - if diff[0] == 'equal': - return '{:7} {!s:>50}'.format(diff[0], old) return '{:7} {!s:>50} --> {!s}'.format(diff[0], old, new) From 3abb76df036f7aa2e8f39437aa7d5b80032ae850 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:08:12 +0100 Subject: [PATCH 0783/1374] ud.ComplyWithText fix_text=1 should always produce valid CoNLL-U even if the raw texts include double spaces or no-break spaces (TODO: alternatively, we could annotate these using SpacesAfter). 
--- udapi/block/ud/complywithtext.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index c850018e..351ebc01 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -109,9 +109,13 @@ def process_tree(self, root): if text is None: raise ValueError('Tree %s has no text, cannot use ud.ComplyWithText' % root) - # Normalize the stored text (double space -> single space) + # Normalize the stored text (e.g. double space or no-break space -> single space) # and skip sentences which are already ok. text = ' '.join(text.split()) + if root.text != text and self.fix_text: + if self.previous_text_label: + root.add_comment(f'{self.previous_text_label} = {root.text}') + root.text = text if text == root.compute_text(): return From 0c6f946802345cc670ece9663fc7007ff05efd73 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 14:09:36 +0100 Subject: [PATCH 0784/1374] corefud.PrintMentions should show Entity annotations in MISC by default --- udapi/block/corefud/printmentions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/printmentions.py b/udapi/block/corefud/printmentions.py index 12db433a..d011f686 100644 --- a/udapi/block/corefud/printmentions.py +++ b/udapi/block/corefud/printmentions.py @@ -12,7 +12,7 @@ def __init__(self, continuous='include', almost_continuous='include', treelet='i empty='include', max_trees=0, html=False, shuffle=True, print_other_forms=5, print_total=True, print_should=True, print_sent_id=True, print_text=True, add_empty_line=True, indent=1, - minimize_cross=True, color=True, attributes='form,upos,deprel', + minimize_cross=True, color=True, attributes='ord,form,upos,deprel,misc', print_undef_as='_', print_doc_meta=True, print_comments=False, mark='(Mark)', hints=True, layout='classic', **kwargs): From f9dd071481e49944fe6c70629bf9d56a90bd86d6 Mon Sep 17 00:00:00 2001 From: Martin 
Popel Date: Fri, 10 Feb 2023 14:27:46 +0100 Subject: [PATCH 0785/1374] keep newdoc and global.Entity when using read.Conllu sent_id_filter=regex The global.Entity comment will be read automatically by read.Conllu and then inserted automatically by write.Conllu, but only for trees with tree.newdoc, so we need to keep this annotation as well (move it to the new first tree in a given document). --- udapi/core/basereader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/udapi/core/basereader.py b/udapi/core/basereader.py index a841bf1b..71d57159 100644 --- a/udapi/core/basereader.py +++ b/udapi/core/basereader.py @@ -97,13 +97,19 @@ def filtered_read_tree(self): tree = self.read_tree() if self.sent_id_filter is None: return tree + + skipped_newdoc = None while True: if tree is None: return None if self.sent_id_filter.match(tree.sent_id) is not None: + if skipped_newdoc and not tree.newdoc: + tree.newdoc = skipped_newdoc return tree logging.debug('Skipping sentence %s as it does not match the sent_id_filter %s.', tree.sent_id, self.sent_id_filter) + if tree.newdoc: + skipped_newdoc = tree.newdoc tree = self.read_tree() def try_fast_load(self, document): From b036d572af97a9f06482ccdcd7e90cfe4f0f5655 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 15:15:48 +0100 Subject: [PATCH 0786/1374] update ord of empty nodes when deleting preceding nonempty nodes TODO: add tests, solve also deleting of empty nodes --- udapi/core/node.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/udapi/core/node.py b/udapi/core/node.py index 618e75eb..8a764498 100644 --- a/udapi/core/node.py +++ b/udapi/core/node.py @@ -516,6 +516,7 @@ def remove(self, children=None): `rehang_warn` means to rehang and warn:-). 
""" self._parent._children.remove(self) + empty_follows = None if children is not None and self._children: if children.startswith('rehang'): for child in self._children: @@ -523,6 +524,16 @@ def remove(self, children=None): self._parent._children.extend(self._children) self._parent._children.sort() self._children.clear() + elif self._root.empty_nodes: + will_be_removed = self.descendants(add_self=1) + prev_nonempty = self._root + empty_follows = {} + for node in self._root.descendants_and_empty: + if node.empty: + empty_follows[node] = prev_nonempty + elif node not in will_be_removed: + prev_nonempty = node + if children.endswith('warn'): logging.warning('%s is being removed by remove(children=%s), ' ' but it has (unexpected) children', self, children) @@ -536,14 +547,29 @@ def remove(self, children=None): self._root._descendants.remove(self) except ValueError: pass # self may be an already deleted node e.g. if n.remove() called twice - for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): - node.ord = new_ord + else: + for (new_ord, node) in enumerate(self._root._descendants[self._ord - 1:], self._ord): + node.ord = new_ord + for empty in self._root.empty_nodes: + if empty > self: + empty.ord = round(empty.ord - 1, 1) else: # TODO nodes_to_remove = self.unordered_descendants() # and mark all nodes as deleted, remove them from MWT and coref mentions self._root._descendants = sorted(self._root.unordered_descendants()) for (new_ord, node) in enumerate(self._root._descendants, 1): node.ord = new_ord + # Decrease ord of empty nodes (keep their fractional part) + # Make sure that e.g. after deleting node with ord=2 + # ords "1 1.1 1.2 2 2.1" will become "1 1.1 1.2 1.3". 
+ if empty_follows: + last_ord = 0 + for empty in self._root.empty_nodes: + prev_nonempty = empty_follows[empty] + new_ord = round(prev_nonempty.ord + (empty.ord % 1), 1) + while new_ord <= last_ord: + new_ord = round(new_ord + 0.1, 1) + last_ord, empty.ord = new_ord, new_ord def _shift_before_ord(self, reference_ord, without_children=False): """Internal method for changing word order.""" From 6c289d3bda8134a683f6362198888ee920520203 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 10 Feb 2023 16:32:51 +0100 Subject: [PATCH 0787/1374] ud.ComplyWithText: the previous root.text value is better described as OrigText Unlike the previous node.form values, it is (usually) the original raw text including typos etc, so the label "CorrectText" was completely misleading. --- udapi/block/ud/complywithtext.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/udapi/block/ud/complywithtext.py b/udapi/block/ud/complywithtext.py index 351ebc01..b36b2512 100644 --- a/udapi/block/ud/complywithtext.py +++ b/udapi/block/ud/complywithtext.py @@ -35,7 +35,7 @@ class ComplyWithText(Block): def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_length=4, allow_add_punct=True, allow_delete_punct=True, allow_hyphen_goeswith=True, - previous_form_label='CorrectForm', previous_text_label='CorrectText', + previous_form_label='CorrectForm', previous_text_label='OrigText', added_label='Added', **kwargs): """Args: fix_text: After all heuristics are applied, the token forms may still not match the text. @@ -74,8 +74,8 @@ def __init__(self, fix_text=True, prefer_mwt=True, allow_goeswith=True, max_mwt_ When setting this parameter to an empty string, no values will be stored to node.misc. When keeping the default name CorrectForm, node.feats["Typo"] = "Yes" will be filled as well. 
previous_text_label - when we are not able to adapt the annotation to match root.text - and fix_text is True, we store the previous root.text in a CoNLL-U comment with this label. - Default="CorrectText". When setting this parameter to an empty string, + and fix_text is True, we store the previous root.text value in a CoNLL-U comment with this label. + Default="OrigText". When setting this parameter to an empty string, no values will be stored to root.comment. added_label - when creating new nodes because allow_add_punct=True, we mark these nodes as new_node.misc[added_label] = 1. Default="Added". From 043f4d73745a0155db76d5f4776d77f7ceeeba8a Mon Sep 17 00:00:00 2001 From: "Federica Gamba (PhD" Date: Fri, 17 Feb 2023 16:47:25 +0100 Subject: [PATCH 0788/1374] minor changes in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index 111bceb9..fde3b0bd 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -27,8 +27,11 @@ def __init__(self, flavio=False, **kwargs): def process_node(self, node): rf = [] af = {} + # PROIEL-specific: greek words without features + if node.lemma == 'greek.expression': + pass # NOUNS ################################################################ - if node.upos == 'NOUN': + elif node.upos == 'NOUN': if node.feats['Case'] and not node.feats['Abbr'] == 'Yes': # abbreviated or indeclinable nouns rf = ['Gender', 'Number', 'Case'] af = { @@ -125,14 +128,14 @@ def process_node(self, node): af['PronType'] = [] if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis']: + elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 
'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - elif node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['quicumque', 'qui', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis']: + if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. @@ -176,7 +179,7 @@ def process_node(self, node): af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') - if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus']: + if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') elif node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') From e84741a6e78acaaf13739945bd17814d569e3601 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:06:56 +0100 Subject: [PATCH 0789/1374] Remove NOCOREF entities e.g. from AnCora. --- udapi/block/corefud/removenocorefentities.py | 21 ++++++++++++++++++++ udapi/core/coref.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 udapi/block/corefud/removenocorefentities.py diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py new file mode 100644 index 00000000..8baba086 --- /dev/null +++ b/udapi/block/corefud/removenocorefentities.py @@ -0,0 +1,21 @@ +from udapi.core.block import Block +import udapi.core.coref +import re +import logging + +class RemoveNoCorefEntities(Block): + """ + Some corpora (e.g., AnCora) include annotation of named entities that are + not annotated for coreference. To distinguish them, their cluster ID starts + with 'NOCOREF' (optionally followed by entity type, so that one cluster + still has just one type). 
We may want to remove such entities from datasets + that are used to train coreference resolves, to prevent the resolvers from + thinking that all members of a NOCOREF cluster are coreferential. That is + what this block does. + """ + + def process_document(self, doc): + entities = doc.coref_entities + if not entities: + return + doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 12dda239..4cd656f1 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -300,7 +300,7 @@ def __init__(self, eid, etype=None): self.split_ante = [] def __lt__(self, another): - """Does this CorefEntity precedes (word-order wise) `another` entity? + """Does this CorefEntity precede (word-order wise) `another` entity? This method defines a total ordering of all entities by the first mention of each entity (see `CorefMention.__lt__`). From 16c3a48ed3eb7861757092649a6ece22b893151c Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 17 Feb 2023 22:27:19 +0100 Subject: [PATCH 0790/1374] Another method of removing entities. 
--- udapi/block/corefud/removenocorefentities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/udapi/block/corefud/removenocorefentities.py b/udapi/block/corefud/removenocorefentities.py index 8baba086..4551873c 100644 --- a/udapi/block/corefud/removenocorefentities.py +++ b/udapi/block/corefud/removenocorefentities.py @@ -18,4 +18,4 @@ def process_document(self, doc): entities = doc.coref_entities if not entities: return - doc.coref_entities = [e for e in entities if not re.match(r'^NOCOREF', e.eid)] + doc._eid_to_entity = {e._eid: e for e in entities if not re.match(r'^NOCOREF', e.eid)} From 8b442889aca3c1b881d7d53896d1eb0547635cfa Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 15:52:18 +0100 Subject: [PATCH 0791/1374] CorefUD: counting sentence sequences with no coref annotation --- udapi/block/corefud/countgaps.py | 67 ++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 udapi/block/corefud/countgaps.py diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py new file mode 100644 index 00000000..c8ee8d76 --- /dev/null +++ b/udapi/block/corefud/countgaps.py @@ -0,0 +1,67 @@ +from udapi.core.block import Block +from collections import Counter + +class CountGaps(Block): + """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" + + def __init__(self, report_per_newdoc=False, report_per_file=True, report_total=True, **kwargs): + super().__init__(**kwargs) + self.report_per_newdoc = report_per_newdoc + self.report_per_file = report_per_file + self.report_total = report_total + self._total_counter = Counter() + + def _report_stats(self, counter=None, header_id=None): + if not counter: + counter = self._total_counter + if header_id: + print(f"============ {header_id} ============") + for key in sorted(counter): + print(f"{key:2d}: {counter[key]}") + + def _count_empty_seqs(self, empty_seqs): + counter = Counter() + for seq in 
empty_seqs: + counter[len(seq)] += 1 + return counter + + def process_document(self, doc): + file_counter = Counter() + empty_seqs = [] + curr_seq = [] + newdoc = None + for i, tree in enumerate(doc.trees): + if tree.newdoc: + if i: + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + newdoc = tree.newdoc + empty_seqs = [] + curr_seq = [] + + has_mention = any(node.coref_mentions for node in tree.descendants) + if not has_mention: + curr_seq.append(tree.sent_id) + elif curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + + if curr_seq: + empty_seqs.append(curr_seq) + newdoc_counter = self._count_empty_seqs(empty_seqs) + file_counter.update(newdoc_counter) + if self.report_per_newdoc: + self._report_stats(newdoc_counter, header_id=newdoc) + + if self.report_per_file: + self._report_stats(file_counter, header_id="FULL DOC") + + self._total_counter.update(file_counter) + + def process_end(self): + if self.report_total: + self._report_stats(header_id="TOTAL") From 716461fe3b67711f71a8cee028668fe34ceffef0 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Tue, 21 Feb 2023 19:22:33 +0100 Subject: [PATCH 0792/1374] besides sequences, counting also paragraphs with no coref mentions --- udapi/block/corefud/countgaps.py | 63 +++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/udapi/block/corefud/countgaps.py b/udapi/block/corefud/countgaps.py index c8ee8d76..fc45540a 100644 --- a/udapi/block/corefud/countgaps.py +++ b/udapi/block/corefud/countgaps.py @@ -1,5 +1,5 @@ from udapi.core.block import Block -from collections import Counter +from collections import defaultdict, Counter class CountGaps(Block): """Block corefud.checkConsistency searches for sentence sequences with no coref annotation.""" @@ -9,15 +9,15 @@ def __init__(self, report_per_newdoc=False, 
report_per_file=True, report_total=T self.report_per_newdoc = report_per_newdoc self.report_per_file = report_per_file self.report_total = report_total - self._total_counter = Counter() + self._total_counter = defaultdict(Counter) - def _report_stats(self, counter=None, header_id=None): - if not counter: - counter = self._total_counter + def _report_stats(self, counter, header_id=None): if header_id: print(f"============ {header_id} ============") for key in sorted(counter): print(f"{key:2d}: {counter[key]}") + print("-------") + print(f"SUM: {sum([k*counter[k] for k in counter])}") def _count_empty_seqs(self, empty_seqs): counter = Counter() @@ -26,42 +26,69 @@ def _count_empty_seqs(self, empty_seqs): return counter def process_document(self, doc): - file_counter = Counter() + file_counters = defaultdict(Counter) empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True newdoc = None for i, tree in enumerate(doc.trees): if tree.newdoc: if i: if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if is_empty_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS in {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS in {newdoc}") newdoc = tree.newdoc empty_seqs = [] + empty_pars = [] curr_seq = [] + curr_par = [] + is_empty_par = True + if tree.newpar: + if not tree.newdoc and is_empty_par: + empty_pars.append(curr_par) + curr_par = [] + is_empty_par = True has_mention = any(node.coref_mentions for node in tree.descendants) if not has_mention: curr_seq.append(tree.sent_id) - elif curr_seq: - 
empty_seqs.append(curr_seq) - curr_seq = [] + curr_par.append(tree.sent_id) + else: + if curr_seq: + empty_seqs.append(curr_seq) + curr_seq = [] + is_empty_par = False if curr_seq: empty_seqs.append(curr_seq) - newdoc_counter = self._count_empty_seqs(empty_seqs) - file_counter.update(newdoc_counter) + newdoc_seq_counter = self._count_empty_seqs(empty_seqs) + file_counters["seq"].update(newdoc_seq_counter) + if curr_par: + empty_pars.append(curr_par) + newdoc_par_counter = self._count_empty_seqs(empty_pars) + file_counters["par"].update(newdoc_par_counter) if self.report_per_newdoc: - self._report_stats(newdoc_counter, header_id=newdoc) + self._report_stats(newdoc_seq_counter, header_id=f"SEQ STATS, {newdoc}") + self._report_stats(newdoc_par_counter, header_id=f"PAR STATS, {newdoc}") if self.report_per_file: - self._report_stats(file_counter, header_id="FULL DOC") + self._report_stats(file_counters["seq"], header_id="SEQ STATS, FILE") + self._report_stats(file_counters["par"], header_id="PAR STATS, FILE") - self._total_counter.update(file_counter) + self._total_counter["seq"].update(file_counters["seq"]) + self._total_counter["par"].update(file_counters["par"]) def process_end(self): if self.report_total: - self._report_stats(header_id="TOTAL") + self._report_stats(self._total_counter["seq"], header_id="SEQ STATS, TOTAL") + self._report_stats(self._total_counter["par"], header_id="PAR STATS, TOTAL") From c147469f5a4a9267902974846c6ff2d804447cdb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 00:25:12 +0100 Subject: [PATCH 0793/1374] write.CorefHtml add visualization menu show: eid, trees, line breaks, paragraphs --- udapi/block/write/corefhtml.py | 39 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 280fc213..20f68291 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,7 +11,7 @@ Udapi CorefUD 
viewer ''' -# I use a pure CSS-3 solution: #overiew {resize: horizontal; overflow: auto;} +# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} # so that the width of #overview can be changed by dragging the bottom right corner. # The following lines would make the whole right border draggable: # @@ -25,9 +25,19 @@ display: grid; border-right: double; padding: 5px; width: 20em; background: #ddd; border-radius: 5px; } +#main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; + padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} +#menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} +.change .b1 {transform: translate(0, 9px) rotate(-45deg);} +.change .b2 {opacity: 0;} +.change .b3 {transform: translate(0, -9px) rotate(45deg);} + .sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.sentence .tree span {border: none; padding: 0; display:inline;} .sentence span .eid {display:block; font-size: 10px;} -.showtree {float:left; margin: 5px;} +.showtree {margin: 5px; user-select: none;} +.display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} .empty {color: gray;} .sentence .singleton {border-style: dotted;} @@ -55,16 +65,22 @@ function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, function(e) {$("span").removeClass("active");} ); + +function menuclick(x) { + x.classList.toggle("change"); + $("#main-menu").toggle(); +} + ''' SCRIPT_SHOWTREE = ''' $(".sentence").each(function(index){ var sent_id = this.id; - $(this).before( + $(this).prepend( $("
') print('
') + print('\n' + '\n') for tree in doc.trees: self.process_tree(tree, mention_ids, entity_colors) print('
') @@ -180,7 +203,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if tree.newdoc: print(f'

{tree.newdoc if tree.newdoc is not True else ""}


') elif tree.newpar: - print('
') + print('
') opened = [] print(f'

') for node in nodes_and_empty: @@ -188,7 +211,7 @@ def process_tree(self, tree, mention_ids, entity_colors): subspan = subspans.pop() self._start_subspan(subspan, mention_ids, entity_colors) opened.append(subspan) - + is_head = self._is_head(node) if is_head: print('', end='') @@ -199,7 +222,7 @@ def process_tree(self, tree, mention_ids, entity_colors): print('', end='') if is_head: print('', end='') - + while opened and opened[-1].words[-1] == node: print('', end='') opened.pop() @@ -229,7 +252,7 @@ def process_tree(self, tree, mention_ids, entity_colors): if not node.no_space_after: print(' ', end='') - + print('

') def _is_head(self, node): From 0b30f5b75ab2a53ed5e0425d536094dee5c56f02 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 25 Feb 2023 02:53:43 +0100 Subject: [PATCH 0794/1374] more visualization options --- udapi/block/write/corefhtml.py | 65 +++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index 20f68291..fd500e7d 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -11,13 +11,7 @@ Udapi CorefUD viewer ''' -# I use a pure CSS-3 solution: #overview {resize: horizontal; overflow: auto;} -# so that the width of #overview can be changed by dragging the bottom right corner. -# The following lines would make the whole right border draggable: -# -# -# -#
+ CSS = ''' #wrap {display: flex; align-items: flex-start;} #main {width: 100%; padding: 5px; background: white; z-index:100;} @@ -27,15 +21,19 @@ } #main-menu {position:fixed; z-index:150; top: 4px; right:4px; display:none; padding: 5px 55px 5px 5px; background-color:gray; border-radius: 5px;} +#main-menu div {display: inline-block;} #menubtn {position: fixed; right: 8px; top: 8px; z-index: 200;} #menubtn div {width: 30px; height: 4px; background-color: black; margin: 5px 0; transition: 0.4s;} .change .b1 {transform: translate(0, 9px) rotate(-45deg);} .change .b2 {opacity: 0;} .change .b3 {transform: translate(0, -9px) rotate(45deg);} -.sentence span {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} -.sentence .tree span {border: none; padding: 0; display:inline;} -.sentence span .eid {display:block; font-size: 10px;} +.m {border: 1px solid black; border-radius: 5px; padding: 2px; display:inline-block;} +.nobox {border:1px solid transparent; padding:0; background: transparent !important; display: inline} +.nobox .labels {display: inline;} +.nocolor {color: black !important;} +.nobold {font-weight: normal;} +.labels {display: block; font-size: 10px;} .showtree {margin: 5px; user-select: none;} .display-inline {display: inline;} .close{float:right; font-weight: 900; font-size: 30px; width: 36px; height: 36px; padding: 2px} @@ -48,22 +46,22 @@ ''' SCRIPT_BASE = ''' -$("span").click(function(e) { +$(".m").click(function(e) { let was_selected = $(this).hasClass("selected"); - $("span").removeClass("selected"); + $(".m").removeClass("selected"); if (!was_selected) {$("."+$(this).attr("class").split(" ")[0]).addClass("selected");} e.stopPropagation(); }); window.onhashchange = function() { - $("span").removeClass("selected"); + $(".m").removeClass("selected"); var fragment = window.location.hash.substring(1); if (fragment) {$("." 
+ fragment).addClass("selected");} } -$("span").hover( - function(e) {$("span").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, - function(e) {$("span").removeClass("active");} +$(".m").hover( + function(e) {$(".m").removeClass("active"); $("."+$(this).attr("class").split(" ")[1]).addClass("active");}, + function(e) {$(".m").removeClass("active");} ); function menuclick(x) { @@ -94,10 +92,11 @@ class CorefHtml(BaseWriter): - def __init__(self, show_trees=True, show_eid=True, colors=7, **kwargs): + def __init__(self, show_trees=True, show_eid=False, show_etype=False, colors=7, **kwargs): super().__init__(**kwargs) self.show_trees = show_trees self.show_eid = show_eid + self.show_etype = show_etype self.colors = colors def _representative_word(self, entity): @@ -120,6 +119,10 @@ def process_document(self, doc): if self.colors: for i in range(self.colors): print(f'.c{i} {{color: hsl({int(i * 360/self.colors)}, 100%, 30%);}}') + if not self.show_eid: + print('.eid {display: none;}') + if not self.show_etype: + print('.etype {display: none;}') print('') print('\n\n
') @@ -146,13 +149,19 @@ def process_document(self, doc): print('
') print('
') - print('\n' '\n') - for tree in doc.trees: - self.process_tree(tree, mention_ids, entity_colors) - print('
') - print('') print('
') - def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): + def _start_subspan(self, subspan, crossing=False): m = subspan.mention e = m.entity - classes = f'{e.eid} {mention_ids[m]} {e.etype or "other"} m' + classes = f'{e.eid} {self._mention_ids[m]} {e.etype or "other"} m' title = f'eid={subspan.subspan_eid}\netype={e.etype}\nhead={m.head.form}' if self.colors: - classes += f' {entity_colors[e]}' + classes += f' {self._entity_colors[e]}' if all(w.is_empty() for w in subspan.words): classes += ' empty' if len(e.mentions) == 1: @@ -252,7 +303,7 @@ def _start_subspan(self, subspan, mention_ids, entity_colors, crossing=False): f'{subspan.subspan_eid}' f' {e.etype}', end='') - def process_tree(self, tree, mention_ids, entity_colors): + def process_tree(self, tree): mentions = set() nodes_and_empty = tree.descendants_and_empty for node in nodes_and_empty: @@ -273,7 +324,7 @@ def process_tree(self, tree, mention_ids, entity_colors): for node in nodes_and_empty: while subspans and subspans[-1].words[0] == node: subspan = subspans.pop() - self._start_subspan(subspan, mention_ids, entity_colors) + self._start_subspan(subspan) opened.append(subspan) is_head = self._is_head(node) @@ -311,7 +362,7 @@ def process_tree(self, tree, mention_ids, entity_colors): opened = new_opened print('' * (len(endings) + len(brokens)), end='') for broken in brokens: - self._start_subspan(broken, mention_ids, entity_colors, True) + self._start_subspan(broken, True) opened.append(subspan) if not node.no_space_after: diff --git a/udapi/block/write/html.py b/udapi/block/write/html.py index 48431900..ae85d43c 100644 --- a/udapi/block/write/html.py +++ b/udapi/block/write/html.py @@ -79,7 +79,9 @@ def process_document(self, doc): print('\n') print('
') def print_doc_json(self, doc): - print('data=[') + print('[') for (bundle_number, bundle) in enumerate(doc, 1): if bundle_number != 1: print(',', end='') print('{"zones":{', end='') first_zone = True desc = '' - for tree in bundle.trees: + try: + trees = bundle.trees + except: + trees = [bundle] # allow to call print_doc_json([tree1, tree2]) + for tree in trees: zone = tree.zone if first_zone: first_zone = False @@ -116,7 +122,7 @@ def print_doc_json(self, doc): print(']}}}') # print desc without the extra starting comma print('},"desc":[%s]}' % desc[1:]) - print('];') + print(']') @staticmethod From 327bb6f9083f6131b4f986dac9b56f2570957f60 Mon Sep 17 00:00:00 2001 From: Federica Gamba Date: Thu, 30 Mar 2023 12:22:27 +0200 Subject: [PATCH 0799/1374] adjustments in Latin feature rules --- udapi/block/ud/la/markfeatsbugs.py | 74 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/udapi/block/ud/la/markfeatsbugs.py b/udapi/block/ud/la/markfeatsbugs.py index fde3b0bd..dce4592d 100644 --- a/udapi/block/ud/la/markfeatsbugs.py +++ b/udapi/block/ud/la/markfeatsbugs.py @@ -28,7 +28,8 @@ def process_node(self, node): rf = [] af = {} # PROIEL-specific: greek words without features - if node.lemma == 'greek.expression': + # LLCT-specific: corrupted nodes + if node.lemma in ['greek.expression', 'missing^token']: pass # NOUNS ################################################################ elif node.upos == 'NOUN': @@ -41,12 +42,14 @@ def process_node(self, node): 'Degree': ['Dim'], 'Abbr': ['Yes'], 'Foreign': ['Yes'], - 'VerbForm': ['Part']} + 'VerbForm': ['Part', 'Vnoun']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Proper'] = ['Yes'] + af['Polarity'] = ['Neg'] af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) self.check_allowed_features(node, af) @@ -61,10 +64,10 @@ def process_node(self, node): 'Abbr': ['Yes'], 'Foreign': ['Yes']} if self.flavio: - af['Compound'] = 'Yes' + af['Compound'] = ['Yes'] + af['Variant'] = ['Greek'] af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] - if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: - af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADJECTIVES ########################################################### @@ -72,7 +75,7 @@ def process_node(self, node): if not node.feats['Abbr'] == 'Yes' and node.feats['Case']: rf = ['Gender', 'Number', 'Case'] af = { - 'NumType': ['Ord', 'Dist'], + 'NumType': ['Dist', 'Mult', 'Ord'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'], @@ -83,9 +86,10 @@ def process_node(self, node): 'VerbForm': ['Part']} if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
- af['InflClass'] = ['IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurE', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] + af['Variant'] = ['Greek'] af['Degree'].append('Dim') af['NameType'] = ['Ast', 'Cal', 'Com', 'Geo', 'Giv', 'Let', 'Lit', 'Met', 'Nat', 'Rel', 'Sur', 'Oth'] self.check_required_features(node, rf) @@ -112,10 +116,10 @@ def process_node(self, node): rf.extend(['Person', 'Number']) af['Person'] = ['1', '2', '3'] af['Number'] = ['Sing', 'Plur'] - # 1st and 2nd person do not have gender + # 3rd person must have gender if node.feats['Person'] == '3': # is, id rf.append('Gender') - af['Gender'] = ['Masc', 'Fem', 'Neut'] + af['Gender'] = ['Masc', 'Fem', 'Neut'] elif re.match(r'^(Rel|Int)$', node.feats['PronType']): rf.extend(['Gender', 'Number']) af['Gender'] = ['Masc', 'Fem', 'Neut'] @@ -126,20 +130,20 @@ def process_node(self, node): af['Number'] = ['Sing', 'Plur'] # lexical check of PronTypes af['PronType'] = [] - if node.lemma in ['is', 'ego', 'tu', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'tumetipse', 'nosmetipse']: + if node.lemma in ['ego', 'tu', 'is', 'sui', 'seipsum', 'nos', 'uos', 'vos', 'egoipse', 'egometipse', 'tumetipse', 'semetipse', 'nosmetipse']: af['PronType'].append('Prs') - elif node.lemma in ['quis', 'aliquis', 'nihil', 'nemo', 'quivis', 'qui']: + elif node.lemma in ['aliquis', 'nemo', 'nihil', 'nihilum', 'qui', 'quis', 'quisquis', 'quiuis', 'quivis']: af['PronType'].append('Ind') elif node.lemma in ['inuicem', 'invicem']: af['PronType'].append('Rcp') rf.remove('Case') - if node.lemma in ['quicumque', 'qui', 'quisquis']: + if node.lemma in ['qui', 'quicumque', 'quisquis']: af['PronType'].append('Rel') - if node.lemma in ['qui', 'quis', 'quisnam', 'ecquis', 'ecqui']: + if node.lemma in [ 'ecquis', 'ecqui', 'numquis', 'qui', 'quis', 'quisnam']: af['PronType'].append('Int') if self.flavio: # Flavio added InflClass but not everywhere, 
so it is not required. - af['InflClass'] = ['LatAnom', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurO', 'IndEurX', 'LatAnom', 'LatPron'] af['Compound'] = ['Yes'] af['Polarity'] = ['Neg'] af['Form'] = ['Emp'] @@ -175,25 +179,26 @@ def process_node(self, node): if node.lemma in ['suus', 'meus', 'noster', 'tuus', 'uester', 'vester', 'voster']: if not af['PronType'] == ['Prs']: af['PronType'].append('Prs') - elif node.lemma in ['aliquot', 'quidam', 'quispiam', 'quivis', 'nullus', 'nonnullus', 'aliqui', 'qui', 'quilibet', 'quantuslibet', 'unus', 'uterque', 'ullus', 'multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: + elif node.lemma in ['aliquantus', 'aliqui', 'aliquot', 'quidam', 'nonnullus', 'nullus', 'quantuscumque', 'quantuslibet', 'qui', 'quilibet', 'quispiam', 'quiuis', 'quivis', 'quotlibet', 'ullus', 'unus', 'uterque','multus', 'quisque', 'paucus', 'complures', 'quamplures', 'quicumque', 'reliquus', 'plerusque', 'aliqualis', 'quisquam', 'qualiscumque']: af['PronType'].append('Ind') elif node.lemma in ['omnis', 'totus', 'ambo', 'cunctus', 'unusquisque', 'uniuersus']: af['PronType'].append('Tot') if node.lemma in ['quantus', 'qualis', 'quicumque', 'quot', 'quotus', 'quotquot']: af['PronType'].append('Rel') - elif node.lemma in ['qui', 'quantus', 'quot']: + if node.lemma in ['qui', 'quantus', 'quot']: af['PronType'].append('Int') - elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot']: + elif node.lemma in ['hic', 'ipse', 'ille', 'tantus', 'talis', 'is', 'iste', 'eiusmodi', 'huiusmodi', 'idem', 'totidem', 'tot', 'praedictus', 'praefatus', 'suprascriptus']: af['PronType'].append('Dem') - elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter']: + elif node.lemma in ['alius', 'alter', 'solus', 'ceterus', 'alteruter', 'neuter', 'uter', 'uterlibet', 'uterque']: 
af['PronType'].append('Con') if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'IndEurX', 'LatPron'] af['Compound'] = ['Yes'] af['Form'] = ['Emp'] af['NumType'] = ['Card'] af['Degree'].append('Dim') + af['PronType'].append('Art') if re.match(r'^(unus|ambo)', node.lemma): af['NumValue'] = ['1', '2'] self.check_required_features(node, rf) @@ -202,7 +207,7 @@ def process_node(self, node): elif node.upos == 'NUM': rf = ['NumType', 'NumForm'] af = { - 'NumType': ['Card'], + 'NumType': ['Card', 'Ord'], 'NumForm': ['Word', 'Roman', 'Digit'], 'Proper': ['Yes']} # Arabic digits and Roman numerals do not have inflection features. @@ -212,7 +217,9 @@ def process_node(self, node): af['Case'] = ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Abl'] if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. # e.g. duodecim - af['InflClass'] = ['IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['InflClass'] = ['Ind', 'IndEurA', 'IndEurI', 'IndEurO', 'LatPron'] + af['NumForm'].append('Reference') + af['Compound'] = ['Yes'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # VERBS AND AUXILIARIES ################################################ @@ -227,7 +234,7 @@ def process_node(self, node): if node.feats['VerbForm'] not in ['Part', 'Conv']: rf.append('Tense') af['Tense'] = ['Past', 'Pqp', 'Pres', 'Fut'] - if node.upos == 'VERB': + if node.upos == 'VERB' or (node.upos == 'AUX' and node.lemma != 'sum'): rf.append('Voice') af['Voice'] = ['Act', 'Pass'] if node.feats['VerbForm'] == 'Fin': # imperative, indicative or subjunctive @@ -255,6 +262,7 @@ def process_node(self, node): if self.flavio: # Flavio added InflClass but not everywhere, so it is not required. 
af['InflClass'] = ['LatA', 'LatAnom', 'LatE', 'LatI', 'LatI2', 'LatX'] + af['VerbType'] = ['Mod'] if 'Degree' in af: af['Degree'].append('Dim') else: @@ -262,7 +270,12 @@ def process_node(self, node): af['Compound'] = ['Yes'] af['Proper'] = ['Yes'] if re.match(r'^(Part|Conv)$', node.feats['VerbForm']): - af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU'] + af['InflClass[nominal]'] = ['IndEurA', 'IndEurI', 'IndEurO', 'IndEurU', 'IndEurX'] + elif node.feats['VerbForm'] == 'Inf': + af['Case'] = ['Nom', 'Acc', 'Abl'] + af['Gender'] = ['Neut'] + af['Number'] = ['Sing'] + af['InflClass[nominal]'] = ['Ind'] self.check_required_features(node, rf) self.check_allowed_features(node, af) # ADVERBS ############################################################## @@ -271,13 +284,13 @@ def process_node(self, node): 'AdvType': ['Loc', 'Tim'], 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg', 'Tot', 'Con'], 'Degree': ['Pos', 'Cmp', 'Sup', 'Abs'], - 'NumType': ['Card', 'Ord'], # e.g., primum + 'NumType': ['Card', 'Mult', 'Ord'], # e.g., primum 'Polarity': ['Neg'] } if self.flavio: af['Compound'] = ['Yes'] af['Form'] = ['Emp'] - af['VerbForm'] = ['Part'] + af['VerbForm'] = ['Fin', 'Part'] af['Degree'].append('Dim') self.check_allowed_features(node, af) # PARTICLES ############################################################ @@ -289,6 +302,7 @@ def process_node(self, node): if self.flavio: af['Form'] = ['Emp'] af['PronType'] = ['Dem'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) # CONJUNCTIONS ######################################################### elif re.match(r'^[CS]CONJ$', node.upos): @@ -301,6 +315,8 @@ def process_node(self, node): af['Form'] = ['Emp'] af['VerbForm'] = ['Fin'] af['NumType'] = ['Card'] + af['ConjType'] = ['Expl'] + af['AdvType'] = ['Loc'] self.check_allowed_features(node, af) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': @@ -310,9 +326,13 @@ def process_node(self, node): 
'Abbr': ['Yes'] } if self.flavio: - af['VerbForm'] = ['Part'], + af['VerbForm'] = ['Part'] af['Proper'] = ['Yes'] + af['Compound'] = ['Yes'] self.check_allowed_features(node, af) + # X ########################################################## + elif node.upos == 'X': + af = {'Abbr': ['Yes']} # THE REST: NO FEATURES ################################################ else: self.check_allowed_features(node, {}) From 1ddfce4aec593e222a0e3d26e8f74acf561d1356 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 31 Mar 2023 19:42:35 +0200 Subject: [PATCH 0800/1374] gzip the docs/* json and html files --- udapi/block/write/corefhtml.py | 49 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/udapi/block/write/corefhtml.py b/udapi/block/write/corefhtml.py index cd0db1e5..6129b335 100644 --- a/udapi/block/write/corefhtml.py +++ b/udapi/block/write/corefhtml.py @@ -17,6 +17,7 @@ from udapi.core.coref import span_to_nodes, CorefEntity, CorefMention from collections import Counter import udapi.block.write.html +import gzip import sys import os @@ -26,6 +27,7 @@ Udapi CorefUD viewer + ''' CSS = ''' @@ -87,21 +89,26 @@ $("#main-menu").toggle(); } -function load_doc(doc_num) { +async function load_doc(doc_num) { loading_now = true; - console.log("loading doc" + doc_num + ".html"); - $.get(docs_dir + "/doc" + doc_num + ".html", function(data){ - $("#main").append(data); - add_mention_listeners($("#doc" + doc_num + " .m")); - $("#doc" + doc_num + " .sentence").each(add_show_tree_button); - loading_now = false; - }).fail(function(){ + let filename = docs_dir + "/doc" + doc_num + ".html.gz" + console.log("loading " + filename); + try { + const res = await fetch(filename); + let raw = await res.arrayBuffer(); + data = pako.inflate(raw, {to: "string"}); + } catch (error){ if (! load_fail_reported) { load_fail_reported = true; - alert("Cannot load " + docs_dir + "/doc" + doc_num - + ".html\\nLocal files do not support lazy loading. 
Run a web server 'python -m http.server'"); + alert("Cannot load " + filename + "\\nLocal files do not support lazy loading." + + " Run a web server 'python -m http.server'\\n" + + "error = " + error); } - }); + } + $("#main").append(data); + add_mention_listeners($("#doc" + doc_num + " .m")); + $("#doc" + doc_num + " .sentence").each(add_show_tree_button); + loading_now = false; } var docs_loaded = 1; @@ -126,7 +133,7 @@ add_show_tree_button = function(index, el){ var sent_id = el.id; $(el).prepend( - $("