| document_term_matrix {udpipe} | R Documentation |
Create a document/term matrix
Description
Create a document/term matrix from either
a data.frame with 1 row per document/term as returned by document_term_frequencies
a list of tokens, e.g. from package sentencepiece, tokenizers.bpe or just by using strsplit
an object of class DocumentTermMatrix or TermDocumentMatrix from the tm package
an object of class simple_triplet_matrix from the slam package
a regular dense matrix
Usage
document_term_matrix(x, vocabulary, weight = "freq", ...)
## S3 method for class 'data.frame'
document_term_matrix(x, vocabulary, weight = "freq", ...)
## S3 method for class 'matrix'
document_term_matrix(x, ...)
## S3 method for class 'integer'
document_term_matrix(x, ...)
## S3 method for class 'numeric'
document_term_matrix(x, ...)
## Default S3 method:
document_term_matrix(x, vocabulary, ...)
## S3 method for class 'DocumentTermMatrix'
document_term_matrix(x, ...)
## S3 method for class 'TermDocumentMatrix'
document_term_matrix(x, ...)
## S3 method for class 'simple_triplet_matrix'
document_term_matrix(x, ...)
Arguments
x |
a data.frame with columns doc_id, term and freq indicating how many times a term occurred in that specific document. This is what document_term_frequencies returns. |
vocabulary |
a character vector of terms which should be present in the document term matrix even if they did not occur in x |
weight |
a column of x which is used to fill the cells of the matrix. Defaults to "freq". |
... |
further arguments currently not used |
Value
a sparse object of class dgCMatrix with in the rows the documents and in the columns the terms containing the frequencies
provided in x extended with terms which were not in x but were provided in vocabulary.
The rownames of this resulting object contain the doc_id from x
Methods (by class)
-
data.frame: Construct a document term matrix from a data.frame with columns doc_id, term, freq -
matrix: Construct a sparse document term matrix from a matrix -
integer: Construct a sparse document term matrix from a named integer vector -
numeric: Construct a sparse document term matrix from a named numeric vector -
default: Construct a document term matrix from a list of tokens -
DocumentTermMatrix: Convert an object of class DocumentTermMatrix from the tm package to a sparseMatrix -
TermDocumentMatrix: Convert an object of class TermDocumentMatrix from the tm package to a sparseMatrix with the documents in the rows and the terms in the columns -
simple_triplet_matrix: Convert an object of class simple_triplet_matrix from the slam package to a sparseMatrix
See Also
sparseMatrix, document_term_frequencies
Examples
## Basic example: x has 1 row per document/term combination with columns doc_id, term and freq
x <- data.frame(doc_id = c(1, 1, 2, 3, 4),
term = c("A", "C", "Z", "X", "G"),
freq = c(1, 5, 7, 10, 0))
## document/term matrix built from x
document_term_matrix(x)
## same, but also passing all letters in LETTERS as vocabulary,
## including letters which do not occur as a term in x
document_term_matrix(x, vocabulary = LETTERS)
## Example on larger dataset
data(brussels_reviews_anno)
## frequencies of each lemma per document, turned into a document/term matrix
x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
dtm <- document_term_matrix(x)
dim(dtm)
## the weight argument names the column of x which is used to fill the matrix
## (presumably document_term_frequencies_statistics adds columns such as tf_idf and bm25,
## see its help page to confirm)
x <- document_term_frequencies(brussels_reviews_anno[, c("doc_id", "lemma")])
x <- document_term_frequencies_statistics(x)
dtm <- document_term_matrix(x)
dtm <- document_term_matrix(x, weight = "freq")
dtm <- document_term_matrix(x, weight = "tf_idf")
dtm <- document_term_matrix(x, weight = "bm25")
## a list of tokens: 1 list element per doc_id containing the lemmas of that document
x <- split(brussels_reviews_anno$lemma, brussels_reviews_anno$doc_id)
dtm <- document_term_matrix(x)
## Example showing the vocabulary argument,
## which makes sure that terms which are not in the data are still provided in the resulting dtm.
## x is a list of tokens per document at this point, so the full set of terms is collected
## with unlist(); x$term would look up a list element named "term" instead of a column of terms
allterms <- unique(unlist(x, use.names = FALSE))
dtm <- document_term_matrix(head(x, 1000), vocabulary = allterms)
## example for a list of tokens
## the list contains some edge cases: an empty string, an NA token and documents without any tokens
x <- list(doc1 = c("aa", "bb", "cc", "aa", "b"),
doc2 = c("bb", "bb", "dd", ""),
doc3 = character(),
doc4 = c("cc", NA),
doc5 = character())
document_term_matrix(x)
## the vocabulary contains "a", a term which does not occur in any of the documents
dtm <- document_term_matrix(x, vocabulary = c("a", "bb", "cc"))
## dtm_conform is called with a row ("doc7") which is not among the documents in x, see ?dtm_conform
dtm <- dtm_conform(dtm, rows = c("doc1", "doc2", "doc7"), columns = c("a", "bb", "cc"))
## list of tokens obtained by splitting the raw feedback text on runs of spaces,
## the list elements are named by the review id
data(brussels_reviews)
x <- strsplit(setNames(brussels_reviews$feedback, brussels_reviews$id), split = " +")
x <- document_term_matrix(x)
##
## Example adding bigrams/trigrams to the document term matrix
## Note that this can also be done using ?dtm_cbind
##
library(data.table)
x <- as.data.table(brussels_reviews_anno)
## the n-grams are computed by group of doc_id and sentence_id
x <- x[, token_bigram := txt_nextgram(token, n = 2), by = list(doc_id, sentence_id)]
x <- x[, token_trigram := txt_nextgram(token, n = 3), by = list(doc_id, sentence_id)]
## frequencies per doc_id over the 3 term columns: token, token_bigram and token_trigram
x <- document_term_frequencies(x = x,
document = "doc_id",
term = c("token", "token_bigram", "token_trigram"))
dtm <- document_term_matrix(x)
##
## Convert dense matrix to sparse matrix
##
## numeric matrix with 2 rows containing zeros and a missing value
x <- matrix(c(0, 0, 0, 1, NA, 3, 4, 5, 6, 7), nrow = 2)
x
dtm <- document_term_matrix(x)
dtm
## same layout with non-integer values
x <- matrix(c(0, 0, 0, 0.1, NA, 0.3, 0.4, 0.5, 0.6, 0.7), nrow = 2)
x
dtm <- document_term_matrix(x)
dtm
## named logical vector containing an NA, turned into a 1-column matrix by as.matrix
x <- setNames(c(TRUE, NA, FALSE, FALSE), c("a", "b", "c", "d"))
x <- as.matrix(x)
dtm <- document_term_matrix(x)
dtm
##
## Convert vectors to sparse matrices
##
## named integer vector: -3:3 has 7 elements (including a 0), so 7 names are supplied,
## one per value; with fewer names the remaining elements get NA as name
x <- setNames(-3:3, c("a", "b", "c", "d", "e", "f", "g"))
dtm <- document_term_matrix(x)
dtm
## named numeric (double) vector
x <- setNames(runif(6), c("a", "b", "c", "d", "e", "f"))
dtm <- document_term_matrix(x)
dtm
##
## Convert lists to sparse matrices
##
## list with a mix of elements: character tokens, NA values, an empty character vector and a number
x <- list(a = c("some", "set", "of", "words"),
b1 = NA,
b2 = NA,
c1 = character(),
c2 = 0,
d = c("words", "words", "words"))
dtm <- document_term_matrix(x)
dtm