-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathblock.py
More file actions
166 lines (145 loc) · 6.69 KB
/
block.py
File metadata and controls
166 lines (145 loc) · 6.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""Block class represents the basic Udapi processing unit."""
import logging
import inspect
def not_overridden(method):
    """Mark *method* as a default implementation and return it unchanged.

    The decorator only sets the attribute ``is_not_overridden`` on the
    function object; code elsewhere can test for that attribute with
    ``hasattr`` to find out whether a subclass replaced the method.
    """
    setattr(method, 'is_not_overridden', True)
    return method
class Block(object):
    """The smallest processing unit for processing Universal Dependencies data.

    Subclasses implement a block by overriding one or more of the
    ``process_*`` methods; ``process_document`` detects which of them were
    overridden and dispatches only to those.

    Parameters:
    zones: which zone to process (default="all"),
        otherwise a comma-separated list of zone names
    if_empty_tree: what to do when encountering a tree with no nodes.
        Possible values are: process (default), skip, skip_warn, fail, delete.
    """

    def __init__(self, zones='all', if_empty_tree='process', **kwargs):
        """Store the common parameters and reject any unknown ones.

        Raises:
            TypeError: if any keyword argument is left in ``kwargs``.
                The message lists the parameter names accepted by the
                ``__init__`` methods found along the class hierarchy.
        """
        self.zones = zones
        self.if_empty_tree = if_empty_tree
        if kwargs:
            # Collect the parameter names of every __init__ in the MRO
            # (the last MRO entry, `object`, is skipped) for the error message.
            params = set()
            for cls in type(self).mro()[:-1]:
                params.update(inspect.signature(cls.__init__).parameters.keys())
            params -= {'self', 'kwargs'}
            raise TypeError(f"Extra parameters {kwargs}.\n"
                            f"Parameters of {self.block_name()} are:\n"
                            + '\n'.join(sorted(params)))

    def block_name(self):
        """Return a short name of this block: parent package plus class name.

        The last component of the module path is dropped and a leading
        ``udapi.block.`` prefix is stripped from the rest. If nothing remains
        (the class lives in a top-level module), only the class name is
        returned, so the result never starts with a stray dot.
        """
        prefix = 'udapi.block.'
        module = ".".join(self.__module__.split(".")[:-1])
        if module.startswith(prefix):
            module = module[len(prefix):]
        if not module:
            return self.__class__.__name__
        return module + "." + self.__class__.__name__

    def process_start(self):
        """A hook method that is executed before processing UD data"""
        pass

    def process_end(self):
        """A hook method that is executed after processing all UD data"""
        pass

    @not_overridden
    def process_node(self, _):
        """Process a UD node"""
        pass

    @not_overridden
    def process_empty_node(self, _):
        """Process an empty node (in enhanced dependencies)"""
        pass

    @not_overridden
    def process_tree(self, tree):
        """Process a UD tree

        The default implementation calls ``process_node`` on each node
        of ``tree.descendants``.
        """
        # tree.descendants is slightly slower than tree._descendants
        # (0.05s per iterating over 700k words),
        # but it seems safer to iterate over a copy of the list of nodes.
        # If a user calls parent.create_child().shift_before_node(parent)
        # in process_node, it may end up in an endless cycle (because the same
        # node is processed again - the Python for loop remembers the position).
        for node in tree.descendants:
            self.process_node(node)

    @not_overridden
    def process_bundle(self, bundle):
        """Process a UD bundle

        The default implementation calls ``process_tree`` on each tree of
        the bundle accepted by ``_should_process_tree``.
        """
        for tree in bundle:
            if self._should_process_tree(tree):
                self.process_tree(tree)

    def run(self, document):
        """Call ``process_start``, apply the block on `document`, call ``process_end``."""
        self.process_start()
        self.apply_on_document(document)
        self.process_end()

    def apply_on_document(self, document):
        """Call ``process_document`` wrapped in its before/after hooks."""
        self.before_process_document(document)
        self.process_document(document)
        self.after_process_document(document)

    def process_document(self, document):
        """Process a UD document

        Dispatches to whichever ``process_*`` methods the subclass has
        overridden. ``process_coref_mention`` is called directly only when
        ``process_coref_entity`` is not overridden; likewise the tree/node
        level methods are called directly only when no higher-level method
        (bundle, resp. tree) is overridden.

        Raises:
            Exception: if none of the ``process_*`` methods is overridden.
        """
        # Calling document.coref_entities is expensive because
        # it needs to deserialize coref_entities from the MISC attributes.
        # If no block in a scenario needs to process coreference entities/mentions,
        # the deserialization does not need to be done.
        # So we need to detect if any of the methods process_coref_entity and process_coref_mention
        # has been overridden (without calling them, which could have adverse side effects).
        # Let's use method annotations for this.
        p_entity = not hasattr(self.process_coref_entity, 'is_not_overridden')
        p_mention = not hasattr(self.process_coref_mention, 'is_not_overridden')
        p_bundle = not hasattr(self.process_bundle, 'is_not_overridden')
        p_tree = not hasattr(self.process_tree, 'is_not_overridden')
        p_node = not hasattr(self.process_node, 'is_not_overridden')
        p_empty_node = not hasattr(self.process_empty_node, 'is_not_overridden')
        if not any((p_entity, p_mention, p_bundle, p_tree, p_node, p_empty_node)):
            raise Exception("No processing activity defined in block " + self.block_name())

        if p_entity or p_mention:
            for entity in document.coref_entities:
                if p_entity:
                    self.process_coref_entity(entity)
                else:
                    for mention in entity.mentions:
                        self.process_coref_mention(mention)

        if p_bundle or p_tree or p_node or p_empty_node:
            # The name does not change between bundles, so compute it once;
            # lazy %-args avoid formatting the message when DEBUG is disabled.
            block_name = self.block_name()
            for bundle_no, bundle in enumerate(document.bundles, 1):
                logging.debug('Block %s processing bundle #%s (id=%s)',
                              block_name, bundle_no, bundle.bundle_id)
                if p_bundle:
                    self.process_bundle(bundle)
                else:
                    for tree in bundle:
                        if self._should_process_tree(tree):
                            if p_tree:
                                self.process_tree(tree)
                            else:
                                if p_node:
                                    for node in tree.descendants:
                                        self.process_node(node)
                                if p_empty_node:
                                    for empty_node in tree.empty_nodes:
                                        self.process_empty_node(empty_node)

    @not_overridden
    def process_coref_entity(self, entity):
        """This method is called on each coreference entity in the document.

        The default implementation calls ``process_coref_mention`` on each
        mention of the entity.
        """
        for mention in entity.mentions:
            self.process_coref_mention(mention)

    @not_overridden
    def process_coref_mention(self, mention):
        """This method is called on each coreference mention in the document."""
        pass

    def before_process_document(self, document):
        """This method is called before each process_document."""
        pass

    def after_process_document(self, document):
        """This method is called after each process_document."""
        pass

    def _should_process_tree(self, tree):
        """Decide whether `tree` should be processed by this block.

        First the ``if_empty_tree`` policy is applied to trees without
        descendants (``delete`` calls ``tree.remove()`` as a side effect),
        then the tree's zone is matched against ``self.zones``.

        Returns:
            True if the tree should be processed, False otherwise.

        Raises:
            Exception: if the tree is empty and ``if_empty_tree`` is "fail".
            ValueError: if the tree is empty and ``if_empty_tree`` has
                an unknown value.
        """
        if self.if_empty_tree != 'process' and not tree.descendants:
            if self.if_empty_tree == 'skip':
                return False
            elif self.if_empty_tree == 'delete':
                tree.remove()
                return False
            elif self.if_empty_tree == 'skip_warn':
                logging.warning("Tree %s is empty", tree)
                return False
            elif self.if_empty_tree == 'fail':
                raise Exception("Tree %s is empty" % tree)
            else:
                # An f-string (rather than `+`) keeps this a ValueError
                # even when the offending value is not a string.
                raise ValueError(
                    f"Unknown value for if_empty_tree: {self.if_empty_tree}")
        if self.zones == 'all':
            return True
        if self.zones == '' and tree.zone == '':
            return True
        if tree.zone in self.zones.split(','):
            return True
        return False