Skip to content

Commit a8f9681

Browse files
committed
UDPipeOnline: do not change LF to CRLF
Prevent `SpacesAfter=\r\n\r\n` when the input contained an empty line consisting of just `\n\n`.
1 parent 4f996fd commit a8f9681

File tree

1 file changed

+34
-4
lines changed

1 file changed

+34
-4
lines changed

udapi/tool/udpipeonline.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import os
99
import sys
1010
import urllib.error
11+
import urllib.parse
1112
import urllib.request
1213

1314
from udapi.block.read.conllu import Conllu as ConlluReader
@@ -62,6 +63,35 @@ def perform_request(self, params, method="process"):
6263

6364
return response["result"]
6465

66+
def perform_request_urlencoded(self, params, method="process"):
    """Send a form-urlencoded POST request to the UDPipe REST service.

    The parameters are serialized as application/x-www-form-urlencoded
    rather than as a MIME multipart body. The MIME serializer of the email
    package normalizes line endings to CRLF; URL-encoding keeps the LF
    characters of the 'data' field exactly as provided.

    Args:
        params: mapping of request parameter names to values.
        method: name of the REST method, appended to ``self.server``.

    Returns:
        The value stored under the "result" key of the JSON response.

    Raises:
        urllib.error.HTTPError: the service answered with an HTTP error;
            its body is printed to stderr before the error is re-raised.
        json.JSONDecodeError: the response body is not valid JSON.
        ValueError: the response is not a JSON object containing both
            the "model" and the "result" keys.
    """
    request = urllib.request.Request(
        url=f"{self.server}/{method}",
        data=urllib.parse.urlencode(params).encode("utf-8"),
        headers={"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"},
    )

    try:
        with urllib.request.urlopen(request) as http_response:
            response = json.loads(http_response.read())
    except urllib.error.HTTPError as e:
        # Reporting must never mask the HTTP error itself: tolerate a missing
        # body and bytes that are not valid UTF-8.
        raw_body = e.fp.read() if e.fp is not None else b""
        print("An exception was raised during UDPipe '{}' REST request.\n"
              "The service returned the following error:\n"
              "  {}".format(method, raw_body.decode("utf-8", errors="replace")), file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
              "  {}".format(method, e.msg), file=sys.stderr)
        raise

    # A valid JSON document need not be an object; guard the key lookups so
    # that any malformed response is reported uniformly as ValueError.
    if not isinstance(response, dict) or "model" not in response or "result" not in response:
        raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method))

    return response["result"]
94+
6595
def tag_parse_tree(self, root, tag=True, parse=True):
6696
"""Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
6797
if not tag and not parse:
@@ -76,7 +106,7 @@ def tag_parse_tree(self, root, tag=True, parse=True):
76106
params["parser"] = ""
77107
attrs.append('deprel')
78108

79-
out_data = self.perform_request(params=params)
109+
out_data = self.perform_request_urlencoded(params=params)
80110
conllu_reader = ConlluReader(empty_parent="ignore")
81111
conllu_reader.files.filehandle = io.StringIO(out_data)
82112
parsed_root = conllu_reader.read_tree()
@@ -108,7 +138,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r
108138
params["parser"] = ""
109139
if ranges:
110140
params["tokenizer"] = "presegmented;ranges" if resegment else "ranges"
111-
out_data = self.perform_request(params=params)
141+
out_data = self.perform_request_urlencoded(params=params)
112142
conllu_reader = ConlluReader(empty_parent="ignore")
113143
conllu_reader.files.filehandle = io.StringIO(out_data)
114144
trees = conllu_reader.read_trees()
@@ -126,7 +156,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r
126156
def segment_text(self, text):
127157
"""Segment the provided text into sentences returned as a Python list."""
128158
params = {"model": self.model, "data": text, "tokenizer":"", "output": "plaintext=normalized_spaces"}
129-
return self.perform_request(params=params).rstrip().split("\n")
159+
return self.perform_request_urlencoded(params=params).rstrip().split("\n")
130160

131161
def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=False, ranges=False):
132162
"""Delete all existing bundles and substitute them with those parsed by UDPipe."""
@@ -152,7 +182,7 @@ def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=F
152182
params["input"] = "horizontal"
153183
params["data"] = "\n".join(" ".join([n.form for n in root.descendants]) for root in doc.trees) + "\n"
154184

155-
out_data = self.perform_request(params=params)
185+
out_data = self.perform_request_urlencoded(params=params)
156186
conllu_reader = ConlluReader(empty_parent="ignore")
157187
conllu_reader.files.filehandle = io.StringIO(out_data)
158188
trees = conllu_reader.read_trees()

0 commit comments

Comments
 (0)