encode {cwbtools} | R Documentation |
Encode CWB Corpus.
Description
Usage
encode(x, ...)
## S4 method for signature 'data.frame'
encode(
x,
corpus,
s_attributes = NULL,
encoding = "utf8",
registry_dir = fs::path(tempdir(), "cwb_registry"),
data_dir = fs::path(tempdir(), "cwb_data_dir", tolower(corpus)),
properties = c(),
method = c("R", "CWB"),
verbose = TRUE,
compress = FALSE,
reload = TRUE,
quietly = TRUE
)
Arguments
x |
A `data.frame` (the class this method is defined for, see 'Usage'). |
... |
Further arguments (unused). |
corpus |
ID of the CWB corpus to create. |
s_attributes |
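A `data.frame` describing structural attributes (the example below passes the columns `cpos_left` and `cpos_right` plus one attribute column), or `NULL` (the default). |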
encoding |
Encoding as defined in the charset corpus property of the registry file for the corpus ('latin1' to 'latin9', and 'utf8'). |
registry_dir |
Registry directory. |
data_dir |
The data directory for the binary files of the corpus. |
properties |
A named `character` vector of corpus properties, e.g. `c(lang = "en")`. |
method |
Either 'CWB' or 'R', defaults to 'R'. See section 'Details'. |
verbose |
A `logical` value (defaults to `TRUE`). |
compress |
A `logical` value (defaults to `FALSE`). |
reload |
A logical value, whether to reload the corpus to make it immediately available. |
quietly |
A `logical` value (defaults to `TRUE`). |
Examples
# Conditional example: dplyr, tidytext and quanteda are only suggested
# packages, so the body runs only when all three are installed.
# `quietly = TRUE` keeps the availability checks from printing
# "Loading required namespace" messages.
dplyr_available <- requireNamespace("dplyr", quietly = TRUE)
tidytext_available <- requireNamespace("tidytext", quietly = TRUE)
quanteda_available <- requireNamespace("quanteda", quietly = TRUE)

if (dplyr_available && tidytext_available && quanteda_available) {

  library(dplyr) # pipe would not be available otherwise
  library(tidytext)

  # Temporary registry directory; this is the same path as the default
  # value of the `registry_dir` argument of encode().
  registry_tmp <- fs::path(tempdir(), "cwb_registry")
  # Guarded so that re-running the example within one session does not
  # raise an "already exists" warning.
  if (!dir.exists(registry_tmp)) {
    dir.create(registry_tmp)
  }

  # One row per document: document name in `party`, raw text in `text`.
  tidydata <- quanteda::data_char_ukimmig2010 %>%
    as.data.frame() %>%
    as_tibble(rownames = "party") %>%
    rename(`text` = ".")

  # One row per token, numbered with a zero-based `cpos` column.
  # seq_len() instead of `0L:(n - 1L)` so an empty table yields integer(0).
  tokenstream <- tidydata %>%
    unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
    mutate(cpos = seq_len(nrow(.)) - 1L)

  # First and last `cpos` value of every `party` group.
  metadata <- tokenstream %>%
    group_by(party) %>%
    summarise(cpos_left = min(cpos), cpos_right = max(cpos))

  # Drop the helper columns and encode the remaining token column(s).
  tokenstream %>%
    select(-cpos, -party) %>%
    encode(
      corpus = "UKIMMIG2010",
      s_attributes = metadata,
      registry_dir = registry_tmp,
      properties = c(lang = "en")
    )
}