88import os
99import sys
1010import urllib .error
11+ import urllib .parse
1112import urllib .request
1213
1314from udapi .block .read .conllu import Conllu as ConlluReader
@@ -62,6 +63,35 @@ def perform_request(self, params, method="process"):
6263
6364 return response ["result" ]
6465
def perform_request_urlencoded(self, params, method="process"):
    """Send ``params`` to the UDPipe REST service as a URL-encoded form.

    The request body is ``application/x-www-form-urlencoded`` (UTF-8).
    Per the original author's note, this encoding is used so that line
    feeds (LF) in the ``data`` field reach the service exactly as
    provided, instead of being normalized to CRLF by a MIME serializer.

    Args:
        params: Mapping of form fields accepted by ``urllib.parse.urlencode``.
        method: Name of the REST endpoint appended to ``self.server``.

    Returns:
        The value stored under ``"result"`` in the JSON response.

    Raises:
        urllib.error.HTTPError: The service answered with an HTTP error;
            its body is printed to stderr before re-raising.
        json.JSONDecodeError: The response body is not valid JSON.
        ValueError: The response is not a JSON object containing both
            ``"model"`` and ``"result"``.
    """
    request = urllib.request.Request(
        url=f"{self.server}/{method}",
        headers={"Content-Type": "application/x-www-form-urlencoded; charset=utf-8"},
        data=urllib.parse.urlencode(params).encode("utf-8"),
    )

    try:
        with urllib.request.urlopen(request) as reply:
            response = json.loads(reply.read())
    except urllib.error.HTTPError as e:
        # The error body may be absent or not valid UTF-8; reporting it must
        # never raise a secondary exception that would mask the HTTPError.
        error_stream = getattr(e, "fp", None)
        details = ""
        if error_stream is not None:
            details = error_stream.read().decode("utf-8", errors="replace")
        print("An exception was raised during UDPipe '{}' REST request.\n"
              "The service returned the following error:\n"
              "  {}".format(method, details), file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
              "  {}".format(method, e.msg), file=sys.stderr)
        raise

    # A JSON list or scalar would pass or crash the ``in`` checks in
    # confusing ways, so require an object before looking up keys.
    if not isinstance(response, dict) or "model" not in response or "result" not in response:
        raise ValueError("Cannot parse the UDPipe '{}' REST request response.".format(method))

    return response["result"]
94+
6595 def tag_parse_tree (self , root , tag = True , parse = True ):
6696 """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
6797 if not tag and not parse :
@@ -76,7 +106,7 @@ def tag_parse_tree(self, root, tag=True, parse=True):
76106 params ["parser" ] = ""
77107 attrs .append ('deprel' )
78108
79- out_data = self .perform_request (params = params )
109+ out_data = self .perform_request_urlencoded (params = params )
80110 conllu_reader = ConlluReader (empty_parent = "ignore" )
81111 conllu_reader .files .filehandle = io .StringIO (out_data )
82112 parsed_root = conllu_reader .read_tree ()
@@ -108,7 +138,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r
108138 params ["parser" ] = ""
109139 if ranges :
110140 params ["tokenizer" ] = "presegmented;ranges" if resegment else "ranges"
111- out_data = self .perform_request (params = params )
141+ out_data = self .perform_request_urlencoded (params = params )
112142 conllu_reader = ConlluReader (empty_parent = "ignore" )
113143 conllu_reader .files .filehandle = io .StringIO (out_data )
114144 trees = conllu_reader .read_trees ()
@@ -126,7 +156,7 @@ def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True, r
126156 def segment_text (self , text ):
127157 """Segment the provided text into sentences returned as a Python list."""
128158 params = {"model" : self .model , "data" : text , "tokenizer" :"" , "output" : "plaintext=normalized_spaces" }
129- return self .perform_request (params = params ).rstrip ().split ("\n " )
159+ return self .perform_request_urlencoded (params = params ).rstrip ().split ("\n " )
130160
131161 def process_document (self , doc , tokenize = True , tag = True , parse = True , resegment = False , ranges = False ):
132162 """Delete all existing bundles and substitute them with those parsed by UDPipe."""
@@ -152,7 +182,7 @@ def process_document(self, doc, tokenize=True, tag=True, parse=True, resegment=F
152182 params ["input" ] = "horizontal"
153183 params ["data" ] = "\n " .join (" " .join ([n .form for n in root .descendants ]) for root in doc .trees ) + "\n "
154184
155- out_data = self .perform_request (params = params )
185+ out_data = self .perform_request_urlencoded (params = params )
156186 conllu_reader = ConlluReader (empty_parent = "ignore" )
157187 conllu_reader .files .filehandle = io .StringIO (out_data )
158188 trees = conllu_reader .read_trees ()
0 commit comments