R: CWB Tools for Creating Corpora

cwb_encode {RcppCWB}

R Documentation

CWB Tools for Creating Corpora

Description

Wrappers for the CWB tools cwb-encode, cwb-makeall, cwb-huffcode and cwb-compress-rdx. Unlike the 'original' command line tools, the wrappers for cwb-makeall, cwb-huffcode and cwb-compress-rdx will always perform a specific indexing/compression step on one positional attribute, and produce all components.

Usage

cwb_encode(
  corpus,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  data_dir,
  vrt_dir,
  encoding = "utf8",
  p_attributes = c("word", "pos", "lemma"),
  s_attributes = list(),
  skip_blank_lines = TRUE,
  strip_whitespace = TRUE,
  xml = TRUE,
  quietly = FALSE,
  verbose = FALSE
)

cwb_makeall(
  corpus,
  p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  quietly = FALSE,
  logfile
)

cwb_huffcode(
  corpus,
  p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  quietly = FALSE,
  logfile,
  delete = TRUE
)

cwb_compress_rdx(
  corpus,
  p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  quietly = FALSE,
  logfile,
  delete = TRUE
)

Arguments

`corpus`	Name of a CWB corpus (upper case).
`registry`	Path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY.
`data_dir`	The data directory where `cwb_encode` will save the binary files of the indexed corpus. Tilde expansion is performed on `data_dir` using `path.expand()` to avoid a crash.
`vrt_dir`	Directory with input corpus files (verticalised format / file ending *.vrt). Tilde expansion is performed on `vrt_dir` using `path.expand()` to avoid a crash.
`encoding`	The encoding of the files to be encoded. Needs to be an encoding supported by CWB, see `cwb_charsets()`. "UTF-8" is taken as "utf8". Defaults to "utf8" (recommended charset).
`p_attributes`	Positional attributes (p-attributes) to be declared.
`s_attributes`	A `list` of named `character` vectors to declare structural attributes that shall be encoded. The names of the list are the XML elements present in the corpus. Character vectors making up the list declare the attributes that include the metadata of regions. To declare a structural attribute without annotations, provide a zero-length character vector using `character()` - see examples.
`skip_blank_lines`	A `logical` value, whether to skip blank lines in the input.
`strip_whitespace`	A `logical` value, whether to strip whitespace from tokens.
`xml`	A `logical` value, whether input is XML.
`quietly`	A `logical` value, whether to turn off messages (including warnings).
`verbose`	A `logical` value, whether to show progress information (counter of tokens processed).
`p_attribute`	Name of p-attribute.
`logfile`	Redirect messages of `cwb_makeall()`, `cwb_huffcode()` or `cwb_compress_rdx()` to this file. Requires that quietly is `TRUE`.
`delete`	A `logical` value, whether to remove redundant file (p_attribute).corpus after compression.

Details

Running cwb_huffcode() and cwb_compress_rdx() is optional. Corpora can be fully used without compression. It is recommended when reducing the size of corpus data has relevant benefits, e.g. for sharing data. On Windows, compression is not stable and not recommended. A respective warning is issued when running cwb_huffcode() and cwb_compress_rdx() on Windows.

Examples

# Directory that will receive the binary files of the indexed corpus.
data_dir <- file.path(tempdir(), "bt_data_dir")
dir.create(data_dir)

# Encode the verticalised input files shipped with the package as corpus
# "BTMIN".
cwb_encode(
  corpus = "BTMIN",
  p_attributes = c("word", "pos", "lemma"),
  # Names of the list are XML elements, the character vectors declare the
  # attributes holding the metadata of regions; character() declares a
  # structural attribute without annotations.
  s_attributes = list(
    plenary_protocol = c(
      "lp", "protocol_no", "date", "year",
      "birthday", "version", "url", "filetype"
    ),
    speaker = c(
      "id", "type", "lp", "protocol_no", "date", "year", "ai_no",
      "ai_id", "ai_type", "who", "name", "parliamentary_group",
      "party", "role"
    ),
    p = character()
  ),
  vrt_dir = system.file("extdata", "vrt", package = "RcppCWB"),
  data_dir = data_dir,
  registry = Sys.getenv("CORPUS_REGISTRY")
)

# Clean up after the example. unlink() does not delete directories unless
# recursive = TRUE, so the flag is needed to actually remove the data
# directory and the binary files inside it.
unlink(data_dir, recursive = TRUE)
# Remove the "btmin" entry from the registry directory (presumably the
# registry file written by cwb_encode() - confirm).
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))
# The package includes an 'unfinished' corpus of debates in the UN General
# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it 
# is not compressed.
#
# The first step in the following example is to copy the raw
# corpus to a temporary place.

# Location of the raw (not yet indexed) UNGA corpus data within the package.
home_dir <- system.file(
  "extdata", "cwb", "indexed_corpora", "unga",
  package = "RcppCWB"
)

# Temporary data directory for the copy of the corpus ("unga2"). Create it
# if it is missing, otherwise remove the files left over from an earlier run.
tmp_data_dir <- file.path(tempdir(), "indexed_corpora")
tmp_unga_dir <- file.path(tmp_data_dir, "unga2")
if (!file.exists(tmp_data_dir)) {
  dir.create(tmp_data_dir)
}
if (file.exists(tmp_unga_dir)) {
  file.remove(list.files(tmp_unga_dir, full.names = TRUE))
} else {
  dir.create(tmp_unga_dir)
}

# Derive the registry file for "unga2" from the one shipped for "unga":
# point HOME to the temporary data directory and adjust the corpus ID.
regfile <- readLines(
  system.file("extdata", "cwb", "registry", "unga", package = "RcppCWB")
)
regfile[grepl("^HOME", regfile)] <- sprintf('HOME "%s"', tmp_unga_dir)
regfile[grepl("^ID", regfile)] <- "ID unga2"
writeLines(regfile, con = file.path(get_tmp_registry(), "unga2"))

# Copy all corpus files into the temporary data directory in one call;
# invisible() keeps the status vector from being printed.
invisible(
  file.copy(from = list.files(home_dir, full.names = TRUE), to = tmp_unga_dir)
)

# Indexing step for p-attribute "word", equivalent to running the
# cwb-makeall command line utility.
cwb_makeall("UNGA2", p_attribute = "word", registry = get_tmp_registry())

# Load the corpus described in the temporary registry.
# NOTE(review): presumably makes UNGA2 available to the cl_*() and cqp_*()
# functions - inferred from the function names.
cl_load_corpus("UNGA2", registry = get_tmp_registry())
cqp_load_corpus("UNGA2", registry = get_tmp_registry())

# Sanity check: decode corpus positions 0 to 83 of p-attribute "word".
# First map corpus positions to token ids ...
ids_sentence_1 <- cl_cpos2id(
  corpus = "UNGA2",
  p_attribute = "word",
  cpos = seq.int(from = 0L, to = 83L),
  registry = get_tmp_registry()
)
# ... then map the ids to their string values.
tokens_sentence_1 <- cl_id2str(
  corpus = "UNGA2",
  p_attribute = "word",
  id = ids_sentence_1,
  registry = get_tmp_registry()
)
# Join the tokens with blanks and drop the whitespace in front of full
# stops and commas.
sentence <- gsub(
  pattern = "\\s+([\\.,])",
  replacement = "\\1",
  x = paste(tokens_sentence_1, collapse = " ")
)

# Optional compression of p-attribute "word". Corpora can be fully used
# without it; on Windows compression is not stable and a warning is issued.

# cwb_huffcode() is the equivalent of the cwb-huffcode command line utility.
cwb_huffcode("UNGA2", p_attribute = "word", registry = get_tmp_registry())

# cwb_compress_rdx() is the equivalent of the cwb-compress-rdx command line
# utility.
cwb_compress_rdx("UNGA2", p_attribute = "word", registry = get_tmp_registry())

[Package RcppCWB version 0.6.4 Index]