forked from PolMine/RcppCWB
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRcppCWB-packge.html
More file actions
229 lines (191 loc) · 17 KB
/
RcppCWB-packge.html
File metadata and controls
229 lines (191 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
<!DOCTYPE html>
<!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Rcpp Bindings for the Corpus Workbench (CWB). — RcppCWB-package • RcppCWB</title><!-- jquery --><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script><!-- Bootstrap --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha256-bZLfwXAP04zRMK2BjiO8iu9pf4FbLqX6zitd+tIvLhE=" crossorigin="anonymous"><script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.4.1/js/bootstrap.min.js" integrity="sha256-nuL8/2cJ5NDSSwnKD8VqreErSWHtnEP9E7AySL+1ev4=" crossorigin="anonymous"></script><!-- bootstrap-toc --><link rel="stylesheet" href="../bootstrap-toc.css"><script src="../bootstrap-toc.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script 
src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- pkgdown --><link href="../pkgdown.css" rel="stylesheet"><script src="../pkgdown.js"></script><meta property="og:title" content="Rcpp Bindings for the Corpus Workbench (CWB). — RcppCWB-package"><meta property="og:description" content="The RcppCWB package is a wrapper library to expose core functions of
the Open Corpus Workbench (CWB). This includes the low-level
functionality of the Corpus Library (CL) as well as capacities to use
the query syntax of the Corpus Query Processor (CQP)."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]--></head><body data-spy="scroll" data-target="#toc">
<div class="container template-reference-topic">
<header><div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">RcppCWB</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="">0.6.0</span>
</span>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav"><li>
<a href="../reference/index.html">Reference</a>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" data-bs-toggle="dropdown" aria-expanded="false">
Articles
<span class="caret"></span>
</a>
<ul class="dropdown-menu" role="menu"><li>
<a href="../articles/vignette.html">Writing performance code with RcppCWB</a>
</li>
</ul></li>
<li>
<a href="../news/index.html">Changelog</a>
</li>
</ul><ul class="nav navbar-nav navbar-right"><li>
<a href="https://github.com/PolMine/RcppCWB/" class="external-link">
<span class="fab fa-github fa-lg"></span>
</a>
</li>
</ul></div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
</header><div class="row">
<div class="col-md-9 contents">
<div class="page-header">
<h1>Rcpp Bindings for the Corpus Workbench (CWB).</h1>
<small class="dont-index">Source: <a href="https://github.com/PolMine/RcppCWB/blob/HEAD/R/RcppCWB_package.R" class="external-link"><code>R/RcppCWB_package.R</code></a></small>
<div class="hidden name"><code>RcppCWB-packge.Rd</code></div>
</div>
<div class="ref-description">
<p>The <code>RcppCWB</code> package is a wrapper library to expose core functions of
the <code>Open Corpus Workbench</code> (CWB). This includes the low-level
functionality of the <code>Corpus Library</code> (CL) as well as capacities to use
the query syntax of the <code>Corpus Query Processor</code> (CQP).</p>
</div>
<div id="the-idea-behind-rcppcwb">
<h2>The Idea Behind RcppCWB</h2>
<p>The <code>Open Corpus Workbench</code> (CWB) is an indexing and querying engine
popular in corpus-assisted research. Its core aim is to support working
efficiently with large, structurally and linguistically annotated corpora.
First of all, the CWB includes tools to index and compress corpora. Second,
the <code>Corpus Library</code> (CL) offers low-level functionality to retrieve
information from CWB indexed corpora. Third, the <code>Corpus Query
Processor</code> (CQP) offers a syntax that allows users to perform anything from
simple to complex queries, using different annotation layers of corpora.</p>
<p>The CWB is a classical tool which has inspired a set of developments. A
persisting advantage of the CWB is its mature, open source code base that
is actively maintained by a community of developers. It is used as a robust
and efficient backend for widely used tools such as
TXM (<a href="https://txm.gitpages.huma-num.fr/textometrie/" class="external-link">https://txm.gitpages.huma-num.fr/textometrie/</a>) or CQPweb
(<a href="https://cwb.sourceforge.io/cqpweb.php" class="external-link">https://cwb.sourceforge.io/cqpweb.php</a>). Its uncompromising C
implementation guarantees speed and makes it well suited to be integrated
with R at the same time.</p>
<p>The package <code>RcppCWB</code> is a follow-up to the <code>rcqp</code> package that
pioneered exposing CWB functionality from within R. Indeed, the
<code>rcqp</code> package, published on CRAN in 2015, offers robust access to CWB
functionality. However, the "pure C" implementation of the <code>rcqp</code>
package makes it difficult to port the package to Windows. The
primary purpose of the <code>RcppCWB</code> package is to reimplement a wrapper
library for the CWB using a design that makes it easier to achieve
cross-platform portability.</p>
<p>Even though <code>RcppCWB</code> functions may be used directly, the package is
designed to serve as an interface to CWB indexed corpora in packages with
higher-level functionality. In this regard, <code>RcppCWB</code> is the backend
of the <code>polmineR</code> package. It is deliberately open to be used in other
contexts. The package may stimulate using linguistically annotated, indexed
and compressed corpora on all platforms. The paradigm of working with text
as linguistic data may benefit from <code>RcppCWB</code>.</p>
</div>
<div id="implementation">
<h2>Implementation</h2>
<p>When building the package, the first step is to compile the relevant parts
of the CWB on Linux and macOS machines. On Windows, cross-compiled binaries
are downloaded from a GitHub repository of the PolMine Project
(<a href="https://github.com/PolMine/libcl" class="external-link">https://github.com/PolMine/libcl</a>). Second, <code>Rcpp</code> wrappers are
compiled and make the relevant functions of the Corpus Library and CQP
accessible. In addition to genuine CWB functions, <code>RcppCWB</code> offers a
set of higher level functions implemented using <code>Rcpp</code> for common
performance critical tasks.</p>
</div>
<div id="getting-started-with-rcppcwb">
<h2>Getting Started with RcppCWB</h2>
<p>To understand the data storage model of the CWB, in particular the notions
of positional and structural attributes (p- and s-attributes), the vignette
of the <code>rcqp</code> package is a very good starting point (see references).</p>
<p>The CWB 'Corpus Encoding Tutorial' explains how to create your own corpus,
the 'CQP Query Language Tutorial' introduces the syntax of CQP (see
references).</p>
<p>The <code>RcppCWB</code> package includes a sample corpus (REUTERS, the data also
included in the <code>tm</code> package). The examples in the documentation
of the functions may be a good starting point to understand how to use
<code>RcppCWB</code>.</p>
</div>
<div id="digging-deeper">
<h2>Digging Deeper</h2>
<p>The original paper of Christ (1994) explains the design choices of the CWB.
The indexing and compression techniques of the CWB (Huffman coding) are
explained in Witten et al. (1999).</p>
</div>
<div id="acknowledgements">
<h2>Acknowledgements</h2>
<p>The work of all the developers of the CWB is gratefully acknowledged. There
is a particular intellectual debt to Bernard Desgraupes and Sylvain
Loiseau, and the <code>rcqp</code> package they developed as the original R
wrapper to expose the functionality of the CWB.</p>
</div>
<div id="references">
<h2>References</h2>
<p>Christ, O. 1994. "A modular and flexible architecture for an integrated
corpus query system", in: Proceedings of COMPLEX '94, pp. 23-32. Budapest.
Available online at <a href="https://cwb.sourceforge.io/files/Christ1994.pdf" class="external-link">https://cwb.sourceforge.io/files/Christ1994.pdf</a></p>
<p>Desgraupes, B.; Loiseau, S. 2012. Introduction to the rcqp package.
Vignette of the rcqp package. Available at the CRAN archive at
<a href="https://cran.r-project.org/src/contrib/Archive/rcqp/" class="external-link">https://cran.r-project.org/src/contrib/Archive/rcqp/</a></p>
<p>Evert, S. 2005. The CQP Query Language Tutorial. Available online at
<a href="https://cwb.sourceforge.io/files/CQP_Tutorial.pdf" class="external-link">https://cwb.sourceforge.io/files/CQP_Tutorial.pdf</a></p>
<p>Evert, S. 2005. The IMS Open Corpus Workbench (CWB). Corpus Encoding
Tutorial. Available online at
<a href="https://cwb.sourceforge.io/files/CWB_Encoding_Tutorial.pdf" class="external-link">https://cwb.sourceforge.io/files/CWB_Encoding_Tutorial.pdf</a></p>
<p>Open Corpus Workbench (<a href="https://cwb.sourceforge.io" class="external-link">https://cwb.sourceforge.io</a>)</p>
<p>Witten, I.H.; Moffat, A.; Bell, T.C. (1999). Managing Gigabytes. Morgan
Kaufmann Publishing, San Francisco, 2nd edition.</p>
</div>
<div id="author">
<h2>Author</h2>
<p>Andreas Blaette (andreas.blaette@uni-due.de)</p>
</div>
<div id="ref-examples">
<h2>Examples</h2>
<div class="sourceCode"><pre class="sourceCode r"><code><span class="r-in"><span><span class="co"># functions of the corpus library (starting with cl) expose the low-level</span></span></span>
<span class="r-in"><span><span class="co"># access to the CWB corpus library (CL)</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="va">ids</span> <span class="op">&lt;-</span> <span class="fu"><a href="p_attributes.html">cl_cpos2id</a></span><span class="op">(</span><span class="st">"REUTERS"</span>, cpos <span class="op">=</span> <span class="fl">1</span><span class="op">:</span><span class="fl">20</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">tokens</span> <span class="op">&lt;-</span> <span class="fu"><a href="p_attributes.html">cl_id2str</a></span><span class="op">(</span><span class="st">"REUTERS"</span>, id <span class="op">=</span> <span class="va">ids</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, registry <span class="op">=</span> <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="fu"><a href="https://rdrr.io/r/base/print.html" class="external-link">print</a></span><span class="op">(</span><span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span><span class="va">tokens</span>, collapse <span class="op">=</span> <span class="st">" "</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#></span> [1] "Shamrock Corp said that effective today it had cut its contract prices for crude oil by 1.50 dlrs a barrel"</span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="co"># To use the corpus query processor (CQP) and its syntax, it is necessary first</span></span></span>
<span class="r-in"><span><span class="co"># to initialize CQP (example: get concordances of 'oil')</span></span></span>
<span class="r-in"><span></span></span>
<span class="r-in"><span><span class="fu"><a href="cqp_query.html">cqp_query</a></span><span class="op">(</span><span class="st">"REUTERS"</span>, query <span class="op">=</span> <span class="st">'[]{5} "oil" []{5}'</span><span class="op">)</span></span></span>
<span class="r-out co"><span class="r-pr">#&gt;</span> &lt;pointer: 0x6000011da3a0&gt;</span>
<span class="r-in"><span><span class="va">cpos_matrix</span> <span class="op">&lt;-</span> <span class="fu"><a href="cqp_query.html">cqp_dump_subcorpus</a></span><span class="op">(</span><span class="st">"REUTERS"</span><span class="op">)</span></span></span>
<span class="r-in"><span><span class="va">concordances_oil</span> <span class="op">&lt;-</span> <span class="fu"><a href="https://rdrr.io/r/base/apply.html" class="external-link">apply</a></span><span class="op">(</span></span></span>
<span class="r-in"><span> <span class="va">cpos_matrix</span>, <span class="fl">1</span>,</span></span>
<span class="r-in"><span> <span class="kw">function</span><span class="op">(</span><span class="va">row</span><span class="op">)</span><span class="op">{</span></span></span>
<span class="r-in"><span> <span class="va">ids</span> <span class="op">&lt;-</span> <span class="fu"><a href="p_attributes.html">cl_cpos2id</a></span><span class="op">(</span><span class="st">"REUTERS"</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, cpos <span class="op">=</span> <span class="va">row</span><span class="op">[</span><span class="fl">1</span><span class="op">]</span><span class="op">:</span><span class="va">row</span><span class="op">[</span><span class="fl">2</span><span class="op">]</span>, <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span> <span class="va">tokens</span> <span class="op">&lt;-</span> <span class="fu"><a href="p_attributes.html">cl_id2str</a></span><span class="op">(</span><span class="st">"REUTERS"</span>, p_attribute <span class="op">=</span> <span class="st">"word"</span>, id <span class="op">=</span> <span class="va">ids</span>, <span class="fu"><a href="tmp_registry.html">get_tmp_registry</a></span><span class="op">(</span><span class="op">)</span><span class="op">)</span></span></span>
<span class="r-in"><span> <span class="fu"><a href="https://rdrr.io/r/base/paste.html" class="external-link">paste</a></span><span class="op">(</span><span class="va">tokens</span>, collapse <span class="op">=</span> <span class="st">" "</span><span class="op">)</span></span></span>
<span class="r-in"><span> <span class="op">}</span></span></span>
<span class="r-in"><span> <span class="op">)</span></span></span>
</code></pre></div>
</div>
</div>
<div class="col-md-3 hidden-xs hidden-sm" id="pkgdown-sidebar">
<nav id="toc" data-toggle="toc" class="sticky-top"><h2 data-toc-skip>Contents</h2>
</nav></div>
</div>
<footer><div class="copyright">
<p></p><p>Developed by Andreas Blaette, Bernard Desgraupes, Sylvain Loiseau.</p>
</div>
<div class="pkgdown">
<p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
</div>
</footer></div>
</body></html>