-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathmwt.py
More file actions
160 lines (135 loc) · 5.62 KB
/
mwt.py
File metadata and controls
160 lines (135 loc) · 5.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""MWT class represents a multi-word token."""
from udapi.core.dualdict import DualDict
from udapi.core.feats import Feats
class MWT(object):
    """Class for representing multi-word tokens in UD trees.

    Attributes:
        words: list of the word nodes covered by this multi-word token.
        form: surface form of the whole multi-word token.
        root: object owning this token; `remove()` uses its `multiword_tokens`
            list and `address()` uses its `address`.
        _feats, _misc: lazily created storage behind the `feats` and `misc`
            properties (None until a value is set or the property is read).
    """

    __slots__ = ['words', 'form', '_feats', '_misc', 'root']

    def __init__(self, words=None, form=None, feats=None, misc=None, root=None):
        """Create a multi-word token and register it in each of its words.

        Args:
            words: list of word nodes (default: a new empty list).
            form: surface form of the token.
            feats: initial FEATS value; a false value or '_' means "empty".
            misc: initial MISC value; a false value or '_' means "empty".
            root: the owning root object.
        """
        self.words = words if words is not None else []
        self.form = form
        # Empty FEATS/MISC are kept as None; the containers are created on demand
        # by the `feats` and `misc` properties.
        self._feats = Feats(feats) if feats and feats != '_' else None
        self._misc = DualDict(misc) if misc and misc != '_' else None
        self.root = root
        # Back-reference from every word to the token it belongs to.
        for word in self.words:
            word._mwt = self  # pylint: disable=W0212

    @property
    def feats(self):
        """Property `feats` in MWT should be used only for `Typo=Yes`.

        See https://universaldependencies.org/changes.html#typos-in-multiword-tokens
        However, Udapi does not enforce this restriction and mwt.feats works
        exactly the same as node.feats.
        """
        if self._feats is None:
            self._feats = Feats()
        return self._feats

    @feats.setter
    def feats(self, value):
        # Reuse the existing container when there is one, so that references
        # obtained earlier via `mwt.feats` see the new content.
        if self._feats is None:
            self._feats = Feats(value)
        else:
            self._feats.set_mapping(value)

    @property
    def misc(self):
        """Property for MISC attributes stored as a `DualDict` object.

        See `udapi.core.node.Node` for details.
        """
        if self._misc is None:
            self._misc = DualDict()
        return self._misc

    @misc.setter
    def misc(self, value):
        # Same in-place update strategy as in the `feats` setter.
        if self._misc is None:
            self._misc = DualDict(value)
        else:
            self._misc.set_mapping(value)

    @property
    def ord_range(self):
        """Return a string suitable for the first column of CoNLL-U.

        Note that `self.words` is sorted in place as a side effect.
        Raises IndexError if the token has no words.
        """
        self.words.sort()
        return "%d-%d" % (self.words[0].ord, self.words[-1].ord)

    def remove(self):
        """Delete this multi-word token (but keep its words)."""
        for word in self.words:
            word._mwt = None  # pylint: disable=W0212
        self.root.multiword_tokens.remove(self)

    def address(self):
        """Full (document-wide) id of the multi-word token.

        Returns:
            The address of the root, followed by '#' and `ord_range`.
        """
        root_address = self.root.address
        # NOTE(review): `address` is a method on this class; if the root follows the
        # same convention, it must be called before it can be concatenated.
        # Both a plain attribute and a callable are accepted here - confirm against the root class.
        if callable(root_address):
            root_address = root_address()
        return root_address + '#' + self.ord_range

    @staticmethod
    def is_mwt():
        """Is this a multi-word token?

        Returns always True.
        False is returned only by instances of the Node class.
        """
        return True

    @property
    def no_space_after(self):
        """Boolean property as a shortcut for `mwt.misc["SpaceAfter"] == "No"`."""
        # Read the private slot directly: going through the `misc` property would
        # allocate an empty DualDict just to answer a read-only question.
        return self._misc is not None and self._misc["SpaceAfter"] == "No"

    @staticmethod
    def is_empty():
        """Is this an Empty node?

        Returns always False because multi-word tokens cannot be empty nodes.
        """
        return False

    @staticmethod
    def is_leaf():
        """Is this a node/mwt without any children?

        Returns always True because multi-word tokens cannot have children.
        """
        return True

    def _get_attr(self, name):  # pylint: disable=too-many-return-statements
        """Return the value of a single attribute or pseudo-attribute called `name`.

        Names not handled below yield the placeholder string '<mwt>'.
        """
        if name == 'form':
            return self.form
        if name == 'ord':
            return self.ord_range
        if name in ('edge', 'children', 'siblings', 'depth'):
            return 0
        if name == 'feats_split':
            return str(self.feats).split('|')
        if name == 'misc_split':
            return str(self.misc).split('|')
        if name.startswith('feats['):
            # Strip the leading "feats[" and the trailing "]".
            return self.feats[name[6:-1]]
        if name.startswith('misc['):
            # Strip the leading "misc[" and the trailing "]".
            return self.misc[name[5:-1]]
        return '<mwt>'

    def get_attrs(self, attrs, undefs=None, stringify=True):
        """Return multiple attributes or pseudo-attributes, possibly substituting empty ones.

        MWTs do not have children nor parents nor prev/next nodes,
        so the pseudo-attributes: p_xy, c_xy, l_xy and r_xy are irrelevant (and return nothing).
        Other pseudo-attributes (e.g. dir) return always the string "<mwt>".
        The only relevant pseudo-attributes are
        feats_split and misc_split: a list of name=value formatted strings.
        The `ord` attribute returns actually `mwt.ord_range`.

        Args:
            attrs: A list of attribute names, e.g. ``['form', 'ord', 'feats_split']``.
            undefs: A value to be used instead of None for empty (undefined) values.
            stringify: Apply `str()` on each value (except for None)

        Returns:
            A list of values. It may be shorter or longer than `attrs`,
            because prefixed names are skipped and `*_split` names are expanded.
        """
        values = []
        for name in attrs:
            # Any "<letter>_" prefix refers to related nodes, which an MWT does not have.
            # The slice (rather than name[1]) keeps names shorter than
            # two characters from raising IndexError.
            if name[1:2] == '_':
                continue
            if name in ('feats_split', 'misc_split'):
                values.extend(self._get_attr(name))
            else:
                values.append(self._get_attr(name))

        if undefs is not None:
            values = [x if x is not None else undefs for x in values]
        if stringify:
            values = [str(x) if x is not None else None for x in values]
        return values

    @property
    def _ord(self):
        """Return the `_ord` of the first word (sorting `self.words` in place first)."""
        self.words.sort()
        return self.words[0]._ord
# TODO: node.remove() should check if the node is not part of any MWT
# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported
# TODO: Make mwt._words private and provide a setter
# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().)
# TODO: words.setter and node.shift* should check if the MWT does not contain gaps
# and is still multi-word
# TODO: Make sure mwt.words are always sorted (even after node.shift*).
# TODO: Check if one word is not included in multiple multi-word tokens.