forked from PolMine/RcppCWB
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathp_attributes.Rd
More file actions
113 lines (95 loc) · 3.11 KB
/
p_attributes.Rd
File metadata and controls
113 lines (95 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cl.R
\name{CL: p_attributes}
\alias{CL: p_attributes}
\alias{cl_cpos2str}
\alias{cl_cpos2id}
\alias{cl_id2str}
\alias{cl_regex2id}
\alias{cl_str2id}
\alias{cl_id2freq}
\alias{cl_id2cpos}
\title{Using Positional Attributes.}
\usage{
cl_cpos2str(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
cpos
)
cl_cpos2id(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), cpos)
cl_id2str(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), id)
cl_regex2id(
corpus,
p_attribute,
regex,
registry = Sys.getenv("CORPUS_REGISTRY")
)
cl_str2id(corpus, p_attribute, str, registry = Sys.getenv("CORPUS_REGISTRY"))
cl_id2freq(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY"))
cl_id2cpos(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY"))
}
\arguments{
\item{corpus}{name of a CWB corpus (upper case)}
\item{p_attribute}{a p-attribute (positional attribute)}
\item{registry}{path to the registry directory, defaults to the value of the
environment variable CORPUS_REGISTRY}
\item{cpos}{corpus positions (integer vector)}
\item{id}{id of a token}
\item{regex}{a regular expression}
\item{str}{a character string}
}
\description{
CWB indexed corpora store the text of a corpus as numbers: Every token
in the token stream of the corpus is identified by a unique corpus
position, and the string value of every token is identified by a unique
integer id. The corpus library (CL) offers a set of functions to convert
between corpus positions, token ids, and the string values of tokens.
}
\examples{
# the registry directory and cpos_total are needed throughout the examples
registry_dir <- get_tmp_registry()
cpos_total <- cl_attribute_size(
  corpus = "REUTERS", attribute = "word",
  attribute_type = "p", registry = registry_dir
)

# all corpus positions, running from 0 to cpos_total - 1; seq_len() keeps
# the sequence empty (rather than counting backwards) if the size is 0
cpos_all <- seq_len(cpos_total) - 1L

# decode the token stream of the corpus (the quick way)
token_stream_str <- cl_cpos2str(
  corpus = "REUTERS", p_attribute = "word",
  cpos = cpos_all, registry = registry_dir
)

# decode the token stream (cpos2id first, then id2str)
token_stream_ids <- cl_cpos2id(
  corpus = "REUTERS", p_attribute = "word",
  cpos = cpos_all, registry = registry_dir
)
token_stream_str <- cl_id2str(
  corpus = "REUTERS", p_attribute = "word",
  id = token_stream_ids, registry = registry_dir
)

# get corpus positions of a token
token_to_get <- "oil"
id_oil <- cl_str2id(
  corpus = "REUTERS", p_attribute = "word",
  str = token_to_get, registry = registry_dir
)
# assign the result to cpos_oil only: a chained assignment to a variable
# named cl_id2cpos would mask the function of the same name
cpos_oil <- cl_id2cpos(
  corpus = "REUTERS", p_attribute = "word",
  id = id_oil, registry = registry_dir
)

# get frequency of token
oil_freq <- cl_id2freq(
  corpus = "REUTERS", p_attribute = "word",
  id = id_oil, registry = registry_dir
)
length(cpos_oil) # needs to be the same as oil_freq

# use regular expressions
ids <- cl_regex2id(
  corpus = "REUTERS", p_attribute = "word",
  regex = "M.*", registry = registry_dir
)
m_words <- cl_id2str(
  corpus = "REUTERS", p_attribute = "word",
  id = ids, registry = registry_dir
)
}