forked from PolMine/RcppCWB
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcwb_utils.html
More file actions
332 lines (282 loc) · 32.8 KB
/
cwb_utils.html
File metadata and controls
332 lines (282 loc) · 32.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>CWB Tools for Creating Corpora — cwb_makeall • RcppCWB</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css"><script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" 
integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet"><script src="../pkgdown.js"></script><meta property="og:title" content="CWB Tools for Creating Corpora — cwb_makeall"><meta property="og:description" content="Wrappers for the CWB tools (cwb-makeall, cwb-huffcode,
cwb-compress-rdx). Unlike the 'original' command line tools, these
wrappers will always perform a specific indexing/compression step on one
positional attribute, and produce all components."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--></head><body data-spy="scroll" data-target="#toc">
<div class="container template-reference-topic">
<header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">RcppCWB</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.6.0</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav"><li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu"><li>
<a href="../articles/vignette.html">Writing performance code with RcppCWB</a>
</li>
</ul></li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
</ul><ul class="nav navbar-nav navbar-right"><li>
<a href="https://github.com/PolMine/RcppCWB/" class="external-link" aria-label="GitHub repository">
<span class="fab fa-github fa-lg" aria-hidden="true"></span>
</a>
</li>
</ul></div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
</header><div class="row">
<div class="col-md-9 contents">
<div class="page-header">
<h1>CWB Tools for Creating Corpora</h1>
<small class="dont-index">Source: <a href="https://github.com/PolMine/RcppCWB/blob/HEAD/R/cwb.R" class="external-link"><code>R/cwb.R</code></a></small>
<div class="hidden name"><code>cwb_utils.Rd</code></div>
</div>
<div class="ref-description">
<p>Wrappers for the CWB tools (<code>cwb-makeall</code>, <code>cwb-huffcode</code>,
<code>cwb-compress-rdx</code>). Unlike the 'original' command line tools, these
wrappers will always perform a specific indexing/compression step on one
positional attribute, and produce all components.</p>
</div>
<div id="ref-usage">
<div class="sourceCode"><pre class="sourceCode r"><code><span><span class="fu">cwb_makeall</span><span class="op">(</span></span>
<span> <span class="va">corpus</span>,</span>
<span> <span class="va">p_attribute</span>,</span>
<span> registry <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>,</span>
<span> quietly <span class="op">=</span> <span class="cn">FALSE</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="fu">cwb_huffcode</span><span class="op">(</span></span>
<span> <span class="va">corpus</span>,</span>
<span> <span class="va">p_attribute</span>,</span>
<span> registry <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>,</span>
<span> quietly <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> delete <span class="op">=</span> <span class="cn">TRUE</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="fu">cwb_compress_rdx</span><span class="op">(</span></span>
<span> <span class="va">corpus</span>,</span>
<span> <span class="va">p_attribute</span>,</span>
<span> registry <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>,</span>
<span> quietly <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> delete <span class="op">=</span> <span class="cn">TRUE</span></span>
<span><span class="op">)</span></span>
<span></span>
<span><span class="fu">cwb_encode</span><span class="op">(</span></span>
<span> <span class="va">corpus</span>,</span>
<span> registry <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>,</span>
<span> <span class="va">data_dir</span>,</span>
<span> <span class="va">vrt_dir</span>,</span>
<span> encoding <span class="op">=</span> <span class="st">"utf8"</span>,</span>
<span> p_attributes <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"word"</span>, <span class="st">"pos"</span>, <span class="st">"lemma"</span><span class="op">)</span>,</span>
<span> <span class="va">s_attributes</span>,</span>
<span> skip_blank_lines <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> strip_whitespace <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> xml <span class="op">=</span> <span class="cn">TRUE</span>,</span>
<span> quietly <span class="op">=</span> <span class="cn">FALSE</span>,</span>
<span> verbose <span class="op">=</span> <span class="cn">FALSE</span></span>
<span><span class="op">)</span></span></code></pre></div>
</div>
<div id="arguments">
<h2>Arguments</h2>
<dl><dt>corpus</dt>
<dd><p>name of a CWB corpus (upper case)</p></dd>
<dt>p_attribute</dt>
<dd><p>name of the p-attribute</p></dd>
<dt>registry</dt>
<dd><p>path to the registry directory, defaults to the value of the
environment variable CORPUS_REGISTRY</p></dd>
<dt>quietly</dt>
<dd><p>A <code>logical</code> value, whether to turn off messages (including
warnings).</p></dd>
<dt>delete</dt>
<dd><p>A <code>logical</code> value, whether to remove redundant files after
compression.</p></dd>
<dt>data_dir</dt>
<dd><p>The data directory where <code>cwb_encode</code> will save the binary
files of the indexed corpus. Tilde expansion is performed on <code>data_dir</code>
using <code><a href="https://rdrr.io/r/base/path.expand.html" class="external-link">path.expand()</a></code> to avoid a crash.</p></dd>
<dt>vrt_dir</dt>
<dd><p>Directory with input corpus files (verticalised format / file
ending *.vrt). Tilde expansion is performed on <code>vrt_dir</code> using
<code><a href="https://rdrr.io/r/base/path.expand.html" class="external-link">path.expand()</a></code> to avoid a crash.</p></dd>
<dt>encoding</dt>
<dd><p>The encoding of the files to be encoded. Needs to be an
encoding supported by CWB, see <code><a href="cwb_charsets.html">cwb_charsets()</a></code>. "UTF-8" is taken as
"utf8". Defaults to "utf8" (recommended charset).</p></dd>
<dt>p_attributes</dt>
<dd><p>Positional attributes (p-attributes) to be declared.</p></dd>
<dt>s_attributes</dt>
<dd><p>A <code>list</code> of named <code>character</code> vectors to declare
structural attributes that shall be encoded. The names of the list are the
XML elements present in the corpus. Character vectors making up the list
declare the attributes that include the metadata of regions. To declare a
structural attribute without annotations, provide a zero-length character
vector using <code><a href="https://rdrr.io/r/base/character.html" class="external-link">character()</a></code> - see examples.</p></dd>
<dt>skip_blank_lines</dt>
<dd><p>A <code>logical</code> value, whether to skip blank lines in the
input.</p></dd>
<dt>strip_whitespace</dt>
<dd><p>A <code>logical</code> value, whether to strip whitespace from
tokens.</p></dd>
<dt>xml</dt>
<dd><p>A <code>logical</code> value, whether input is XML.</p></dd>
<dt>verbose</dt>
<dd><p>A <code>logical</code> value, whether to show progress information
(counter of tokens processed).</p></dd>
</dl></div>
<div id="ref-examples">
<h2>Examples</h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="co"># The package includes an 'unfinished' corpus of debates in the UN General </span></span></span>
<span class="r-in"><span><span class="co"># Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is</span></span></span>
<span class="r-in"><span><span class="co"># not compressed.</span></span></span>
<span class="r-in"><span><span class="co">#</span></span></span>
<span class="r-in"><span><span class="co"># The first step in the following example is to copy the raw</span></span></span>
<span class="r-in"><span><span class="co"># corpus to a temporary place.</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">home_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/system.file.html" class="external-link">system.file</a></span><span class="op">(</span>package <span class="op">=</span> <span class="st">"RcppCWB"</span>, <span class="st">"extdata"</span>, <span class="st">"cwb"</span>, <span class="st">"indexed_corpora"</span>, <span class="st">"unga"</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">tmp_data_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempdir</a></span><span class="op">(</span><span class="op">)</span>, <span class="st">"indexed_corpora"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">tmp_unga_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="va">tmp_data_dir</span>, <span class="st">"unga2"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.exists</a></span><span class="op">(</span><span class="va">tmp_data_dir</span><span class="op">)</span><span class="op">)</span> <span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tmp_data_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="kw">if</span> <span class="op">(</span><span class="op">!</span><span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.exists</a></span><span class="op">(</span><span class="va">tmp_unga_dir</span><span class="op">)</span><span class="op">)</span><span class="op">{</span></span></span>
<span class="r-in"><span> <span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">tmp_unga_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span> <span class="kw">else</span> <span class="op">{</span></span></span>
<span class="r-in"><span> <span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.remove</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/list.files.html" class="external-link">list.files</a></span><span class="op">(</span><span class="va">tmp_unga_dir</span>, full.names <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">regfile</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/readLines.html" class="external-link">readLines</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> <span class="fu"><a href="https://rdrr.io/r/base/system.file.html" class="external-link">system.file</a></span><span class="op">(</span>package <span class="op">=</span> <span class="st">"RcppCWB"</span>, <span class="st">"extdata"</span>, <span class="st">"cwb"</span>, <span class="st">"registry"</span>, <span class="st">"unga"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">regfile</span><span class="op">[</span><span class="fu"><a href="https://rdrr.io/r/base/grep.html" class="external-link">grep</a></span><span class="op">(</span><span class="st">"^HOME"</span>, <span class="va">regfile</span><span class="op">)</span><span class="op">]</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/sprintf.html" class="external-link">sprintf</a></span><span class="op">(</span><span class="st">'HOME "%s"'</span>, <span class="va">tmp_unga_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">regfile</span><span class="op">[</span><span class="fu"><a href="https://rdrr.io/r/base/grep.html" class="external-link">grep</a></span><span class="op">(</span><span class="st">"^ID"</span>, <span class="va">regfile</span><span class="op">)</span><span class="op">]</span> <span class="op"><-</span> <span class="st">"ID unga2"</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/writeLines.html" class="external-link">writeLines</a></span><span class="op">(</span>text <span class="op">=</span> <span class="va">regfile</span>, con <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span>, <span class="st">"unga2"</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="kw">for</span> <span class="op">(</span><span class="va">x</span> <span class="kw">in</span> <span class="fu"><a href="https://rdrr.io/r/base/list.files.html" class="external-link">list.files</a></span><span class="op">(</span><span class="va">home_dir</span>, full.names <span class="op">=</span> <span class="cn">TRUE</span><span class="op">)</span><span class="op">)</span><span class="op">{</span></span></span>
<span class="r-in"><span> <span class="fu"><a href="https://rdrr.io/r/base/files.html" class="external-link">file.copy</a></span><span class="op">(</span>from <span class="op">=</span> <span class="va">x</span>, to <span class="op">=</span> <span class="va">tmp_unga_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">}</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># perform cwb_makeall (equivalent to cwb-makeall command line utility)</span></span></span>
<span class="r-in"><span><span class="fu">cwb_makeall</span><span class="op">(</span>corpus <span class="op">=</span> <span class="st">"UNGA2"</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> === Makeall: processing corpus UNGA2 ===</span>
<span class="r-out co"><span class="r-pr">#></span> Registry directory: /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T/Rtmpa5P52q/registry_tmp</span>
<span class="r-out co"><span class="r-pr">#></span> ATTRIBUTE word</span>
<span class="r-out co"><span class="r-pr">#></span> + creating LEXSRT ... OK</span>
<span class="r-out co"><span class="r-pr">#></span> - lexicon OK</span>
<span class="r-out co"><span class="r-pr">#></span> + creating FREQS ... OK</span>
<span class="r-out co"><span class="r-pr">#></span> - frequencies OK</span>
<span class="r-out co"><span class="r-pr">#></span> - token stream OK</span>
<span class="r-out co"><span class="r-pr">#></span> + creating REVCIDX ... OK</span>
<span class="r-out co"><span class="r-pr">#></span> + creating REVCORP ... OK</span>
<span class="r-out co"><span class="r-pr">#></span> ? validating REVCORP ... OK</span>
<span class="r-out co"><span class="r-pr">#></span> - index OK</span>
<span class="r-out co"><span class="r-pr">#></span> ========================================</span>
<span class="r-out co"><span class="r-pr">#></span> [1] 0</span>
<span class="r-in"><span><span class="fu"><a href="cl_load_corpus.html">cl_load_corpus</a></span><span class="op">(</span><span class="st">"UNGA2"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> [1] TRUE</span>
<span class="r-in"><span><span class="fu"><a href="cqp_initialize.html">cqp_load_corpus</a></span><span class="op">(</span><span class="st">"UNGA2"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> [1] TRUE</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># see whether it works</span></span></span>
<span class="r-in"><span><span class="va">ids_sentence_1</span> <span class="op"><-</span> <span class="fu"><a href="p_attributes.html">cl_cpos2id</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> corpus <span class="op">=</span> <span class="st">"UNGA2"</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span>,</span></span>
<span class="r-in"><span> cpos <span class="op">=</span> <span class="fl">0</span><span class="op">:</span><span class="fl">83</span></span></span>
<span class="r-in"><span> <span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">tokens_sentence_1</span> <span class="op"><-</span> <span class="fu"><a href="p_attributes.html">cl_id2str</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> corpus <span class="op">=</span> <span class="st">"UNGA2"</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>,</span></span>
<span class="r-in"><span> registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span>, id <span class="op">=</span> <span class="va">ids_sentence_1</span></span></span>
<span class="r-in"><span> <span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">sentence</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/grep.html" class="external-link">gsub</a></span><span class="op">(</span><span class="st">"\\s+([\\.,])"</span>, <span class="st">"\\1"</span>, <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span><span class="va">tokens_sentence_1</span>, collapse <span class="op">=</span> <span class="st">" "</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># perform cwb_huffcode (equivalent to cwb-huffcode command line utility)</span></span></span>
<span class="r-in"><span><span class="fu">cwb_huffcode</span><span class="op">(</span></span></span>
<span class="r-in"><span> corpus <span class="op">=</span> <span class="st">"UNGA2"</span>,</span></span>
<span class="r-in"><span> p_attribute <span class="op">=</span> <span class="st">"word"</span>,</span></span>
<span class="r-in"><span> registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> COMPRESSING TOKEN STREAM of (null).word</span>
<span class="r-out co"><span class="r-pr">#></span> - writing code descriptor block to /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.hcd</span>
<span class="r-out co"><span class="r-pr">#></span> - writing compressed item sequence to /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.huf</span>
<span class="r-out co"><span class="r-pr">#></span> - writing sync (every 128 tokens) to /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.huf.syn</span>
<span class="r-out co"><span class="r-pr">#></span> VALIDATING UNGA2.word</span>
<span class="r-out co"><span class="r-pr">#></span> - reading code descriptor block from /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.hcd</span>
<span class="r-out co"><span class="r-pr">#></span> - reading compressed item sequence from /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.huf</span>
<span class="r-out co"><span class="r-pr">#></span> - reading sync (mod 128) from /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.huf.syn</span>
<span class="r-out co"><span class="r-pr">#></span> !! You can delete the file </var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.corpus> now.</span>
<span class="r-msg co"><span class="r-pr">#></span> redundant file deleted: /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T/Rtmpa5P52q/indexed_corpora/unga2/word.corpus</span>
<span class="r-out co"><span class="r-pr">#></span> [1] 0</span>
<span class="r-in"><span><span class="fu">cwb_compress_rdx</span><span class="op">(</span></span></span>
<span class="r-in"><span> corpus <span class="op">=</span> <span class="st">"UNGA2"</span>,</span></span>
<span class="r-in"><span> p_attribute <span class="op">=</span> <span class="st">"word"</span>,</span></span>
<span class="r-in"><span> registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> COMPRESSING INDEX of UNGA2.word</span>
<span class="r-out co"><span class="r-pr">#></span> - writing compressed index to /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.crc</span>
<span class="r-out co"><span class="r-pr">#></span> - writing compressed index offsets to /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.crx</span>
<span class="r-out co"><span class="r-pr">#></span> VALIDATING UNGA2.word</span>
<span class="r-out co"><span class="r-pr">#></span> - reading compressed index from /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.crc</span>
<span class="r-out co"><span class="r-pr">#></span> - reading compressed index offsets from /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.crx</span>
<span class="r-out co"><span class="r-pr">#></span> !! You can delete the file </var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.corpus.rev> now.</span>
<span class="r-out co"><span class="r-pr">#></span> !! You can delete the file </var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T//Rtmpa5P52q/indexed_corpora/unga2/word.corpus.rdx> now.</span>
<span class="r-msg co"><span class="r-pr">#></span> redundant file deleted: /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T/Rtmpa5P52q/indexed_corpora/unga2/word.corpus.rev</span>
<span class="r-msg co"><span class="r-pr">#></span> redundant file deleted: /var/folders/fw/qwt11pjx1qs83dl2jwltcvmr0000gn/T/Rtmpa5P52q/indexed_corpora/unga2/word.corpus.rdx</span>
<span class="r-out co"><span class="r-pr">#></span> [1] 0</span>
<span class="r-in"><span><span class="va">data_dir</span> <span class="op"><-</span> <span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/tempfile.html" class="external-link">tempdir</a></span><span class="op">(</span><span class="op">)</span>, <span class="st">"bt_data_dir"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/files2.html" class="external-link">dir.create</a></span><span class="op">(</span><span class="va">data_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="fu">cwb_encode</span><span class="op">(</span></span></span>
<span class="r-in"><span> corpus <span class="op">=</span> <span class="st">"BTMIN"</span>,</span></span>
<span class="r-in"><span> registry <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>,</span></span>
<span class="r-in"><span> vrt_dir <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/system.file.html" class="external-link">system.file</a></span><span class="op">(</span>package <span class="op">=</span> <span class="st">"RcppCWB"</span>, <span class="st">"extdata"</span>, <span class="st">"vrt"</span><span class="op">)</span>,</span></span>
<span class="r-in"><span> data_dir <span class="op">=</span> <span class="va">data_dir</span>,</span></span>
<span class="r-in"><span> p_attributes <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span><span class="st">"word"</span>, <span class="st">"pos"</span>, <span class="st">"lemma"</span><span class="op">)</span>,</span></span>
<span class="r-in"><span> s_attributes <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/list.html" class="external-link">list</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> plenary_protocol <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> <span class="st">"lp"</span>, <span class="st">"protocol_no"</span>, <span class="st">"date"</span>, <span class="st">"year"</span>, <span class="st">"birthday"</span>, <span class="st">"version"</span>,</span></span>
<span class="r-in"><span> <span class="st">"url"</span>, <span class="st">"filetype"</span></span></span>
<span class="r-in"><span> <span class="op">)</span>,</span></span>
<span class="r-in"><span> speaker <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/c.html" class="external-link">c</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> <span class="st">"id"</span>, <span class="st">"type"</span>, <span class="st">"lp"</span>, <span class="st">"protocol_no"</span>, <span class="st">"date"</span>, <span class="st">"year"</span>, <span class="st">"ai_no"</span>, <span class="st">"ai_id"</span>,</span></span>
<span class="r-in"><span> <span class="st">"ai_type"</span>, <span class="st">"who"</span>, <span class="st">"name"</span>, <span class="st">"parliamentary_group"</span>, <span class="st">"party"</span>, <span class="st">"role"</span></span></span>
<span class="r-in"><span> <span class="op">)</span>,</span></span>
<span class="r-in"><span> p <span class="op">=</span> <span class="fu"><a href="https://rdrr.io/r/base/character.html" class="external-link">character</a></span><span class="op">(</span><span class="op">)</span></span></span>
<span class="r-in"><span> <span class="op">)</span></span></span>
<span class="r-in"><span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> [1] 0</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/unlink.html" class="external-link">unlink</a></span><span class="op">(</span><span class="va">data_dir</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/unlink.html" class="external-link">unlink</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/file.path.html" class="external-link">file.path</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/Sys.getenv.html" class="external-link">Sys.getenv</a></span><span class="op">(</span><span class="st">"CORPUS_REGISTRY"</span><span class="op">)</span>, <span class="st">"btmin"</span><span class="op">)</span><span class="op">)</span></span></span>
</code></pre></div>
</div>
</div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
<nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
</nav></div>
</div>
<footer><div class="copyright">
<p></p><p>Developed by Andreas Blaette, Bernard Desgraupes, Sylvain Loiseau.</p>
</div>
<div class="pkgdown">
<p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
</div>
</footer></div>
</body></html>