cwb_encode {RcppCWB} | R Documentation |
CWB Tools for Creating Corpora
Description
Wrappers for the CWB tools cwb-makeall, cwb-huffcode and cwb-compress-rdx.
Unlike the 'original' command line tools, these wrappers will always perform
a specific indexing/compression step on one positional attribute, and
produce all components.
Usage
cwb_encode(
corpus,
registry = Sys.getenv("CORPUS_REGISTRY"),
data_dir,
vrt_dir,
encoding = "utf8",
p_attributes = c("word", "pos", "lemma"),
s_attributes = list(),
skip_blank_lines = TRUE,
strip_whitespace = TRUE,
xml = TRUE,
quietly = FALSE,
verbose = FALSE
)
cwb_makeall(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
logfile
)
cwb_huffcode(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
logfile,
delete = TRUE
)
cwb_compress_rdx(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
logfile,
delete = TRUE
)
Arguments
corpus |
Name of a CWB corpus (upper case). |
registry |
Path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY. |
data_dir |
The data directory where the binary files of the encoded corpus will be stored. |
vrt_dir |
Directory with input corpus files (verticalised format / file
ending *.vrt). Tilde expansion is performed on vrt_dir. |
encoding |
The encoding of the files to be encoded. Needs to be an
encoding supported by CWB, see |
p_attributes |
Positional attributes (p-attributes) to be declared. |
s_attributes |
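A named list of character vectors declaring the structural attributes (s-attributes) to be encoded: each name is an XML element, and its character vector lists the attributes of that element. Use character() for an element without attributes (see Examples). |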
skip_blank_lines |
A logical value, whether to skip blank lines in the input. |
strip_whitespace |
A logical value, whether to strip whitespace from the input. |
xml |
A logical value, whether the input is XML. |
quietly |
A logical value, whether to suppress output messages. |
verbose |
A logical value, whether to show progress information. |
p_attribute |
Name of p-attribute. |
logfile |
Redirect messages of |
delete |
A logical value, whether to remove files that have become redundant after compression. |
Details
Running cwb_huffcode() and cwb_compress_rdx() is optional: corpora can be
fully used without compression. Compression is recommended when reducing the
size of corpus data has relevant benefits, e.g. for sharing data. On Windows,
compression is not stable and not recommended; a corresponding warning is
issued when running cwb_huffcode() or cwb_compress_rdx() on Windows.
Examples
# Encode the sample VRT files shipped with the package as corpus "BTMIN".
# A scratch directory below tempdir() is passed as `data_dir`.
data_dir <- file.path(tempdir(), "bt_data_dir")
# Guard the creation so that re-running the example in the same session
# does not raise an "already exists" warning.
if (!dir.exists(data_dir)) dir.create(data_dir)

cwb_encode(
  corpus = "BTMIN",
  registry = Sys.getenv("CORPUS_REGISTRY"),
  vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"),
  data_dir = data_dir,
  p_attributes = c("word", "pos", "lemma"),
  # One list element per XML element; the character vector names the
  # attributes of that element. `character()` declares an element that
  # carries no attributes.
  s_attributes = list(
    plenary_protocol = c(
      "lp", "protocol_no", "date", "year", "birthday", "version",
      "url", "filetype"
    ),
    speaker = c(
      "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id",
      "ai_type", "who", "name", "parliamentary_group", "party", "role"
    ),
    p = character()
  )
)

# Clean up. unlink() does not delete directories (not even empty ones)
# unless recursive = TRUE, so it has to be set for the data directory.
unlink(data_dir, recursive = TRUE)
# Presumably the registry entry written for the corpus by cwb_encode().
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))
# The package includes an 'unfinished' corpus of debates in the UN General
# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it
# is not compressed.
#
# The first step in the following example is to copy the raw
# corpus to a temporary place.
home_dir <- system.file(
  package = "RcppCWB",
  "extdata", "cwb", "indexed_corpora", "unga"
)
tmp_data_dir <- file.path(tempdir(), "indexed_corpora")
tmp_unga_dir <- file.path(tmp_data_dir, "unga2")

if (!dir.exists(tmp_data_dir)) dir.create(tmp_data_dir)
if (!dir.exists(tmp_unga_dir)) {
  dir.create(tmp_unga_dir)
} else {
  # Start from an empty directory when a previous run left files behind.
  file.remove(list.files(tmp_unga_dir, full.names = TRUE))
}

# Derive a registry file for the copy: point HOME to the temporary data
# directory and rename the corpus to "unga2".
regfile <- readLines(
  system.file(package = "RcppCWB", "extdata", "cwb", "registry", "unga")
)
regfile[grep("^HOME", regfile)] <- sprintf('HOME "%s"', tmp_unga_dir)
regfile[grep("^ID", regfile)] <- "ID unga2"
writeLines(text = regfile, con = file.path(get_tmp_registry(), "unga2"))

# file.copy() is vectorised over `from` when `to` is an existing directory,
# so no explicit loop is needed.
file.copy(from = list.files(home_dir, full.names = TRUE), to = tmp_unga_dir)

# perform cwb_makeall (equivalent to cwb-makeall command line utility)
cwb_makeall(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)
cl_load_corpus("UNGA2", registry = get_tmp_registry())
cqp_load_corpus("UNGA2", registry = get_tmp_registry())

# see whether it works
ids_sentence_1 <- cl_cpos2id(
  corpus = "UNGA2", p_attribute = "word", registry = get_tmp_registry(),
  cpos = 0:83
)
tokens_sentence_1 <- cl_id2str(
  corpus = "UNGA2", p_attribute = "word",
  registry = get_tmp_registry(), id = ids_sentence_1
)
# Concatenate the tokens and remove the whitespace that paste() puts in
# front of full stops and commas.
sentence <- gsub(
  "\\s+([\\.,])",
  "\\1",
  paste(tokens_sentence_1, collapse = " ")
)

# perform cwb_huffcode (equivalent to cwb-huffcode command line utility)
cwb_huffcode(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)

# perform cwb_compress_rdx (equivalent to cwb-compress-rdx command line
# utility)
cwb_compress_rdx(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)