as.tdm {qdap} | R Documentation |
tm Package Compatibility Tools: Apply to or Convert to/from Term Document Matrix or Document Term Matrix
Description
as.tdm
- Create term document matrices from raw text or
wfm
for use with other text analysis packages.
as.TermDocumentMatrix
- Create term document matrices from raw text or
wfm
for use with other text analysis packages.
as.dtm
- Create document term matrices from raw text or
wfm
for use with other text analysis packages.
as.DocumentTermMatrix
- Create document term matrices from raw text or
wfm
for use with other text analysis packages.
as.data.frame
- Convert a tm package Corpus
to
a qdap data.frame
.
as.Corpus
- Attempts to convert its argument into a tm package
Corpus
.
apply_as_tm
- Apply functions intended to be used on the tm
package's TermDocumentMatrix
to a wfm
object.
apply_as_df
- Apply a tm Corpus
as a qdap
dataframe.
apply_as_df
- Apply functions intended to be used on the qdap
package's data.frame
+ sentSplit
to
a tm Corpus
object.
Usage
as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
as.TermDocumentMatrix(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
as.DocumentTermMatrix(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'Corpus'
as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## Default S3 method:
as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'character'
as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'Corpus'
as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## Default S3 method:
as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'character'
as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'wfm'
as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'wfm'
as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)
## S3 method for class 'Corpus'
as.data.frame(
x,
row.names,
optional,
...,
doc = "doc_id",
text = "text",
sent.split = FALSE
)
as.Corpus(text.var, grouping.var = NULL, demographic.vars, ...)
## S3 method for class 'sent_split'
as.Corpus(text.var, grouping.var = NULL, demographic.vars, ...)
## Default S3 method:
as.Corpus(text.var, grouping.var = NULL, demographic.vars, ...)
apply_as_tm(wfm.obj, tmfun, ..., to.qdap = TRUE)
apply_as_df(
tm.corpus,
qdapfun,
...,
stopwords = NULL,
min = 1,
max = Inf,
count.apostrophe = TRUE,
ignore.case = TRUE
)
## S3 method for class 'TermDocumentMatrix'
as.Corpus(text.var, ...)
## S3 method for class 'DocumentTermMatrix'
as.Corpus(text.var, ...)
## S3 method for class 'wfm'
as.Corpus(text.var, ...)
Arguments
text.var |
The text variable or a |
grouping.var |
The grouping variables. Default |
vowel.check |
logical. Should terms without vowels be removed? |
x |
A |
row.names |
|
optional |
logical. If |
doc |
Name for |
text |
Name for |
sent.split |
logical. If |
demographic.vars |
Additional demographic information about the grouping
variables. This is a data.frame, list of equal length vectors, or a single
vector corresponding to the grouping variable/text variable. This
information will be mapped to the DMetaData in the |
wfm.obj |
A |
tmfun |
A function applied to a |
to.qdap |
logical. If |
tm.corpus |
A |
qdapfun |
A qdap function that is usually used on text.variable ~ grouping variable. |
stopwords |
A character vector of words to remove from the text. qdap
has a number of data sets that can be used as stop words including:
|
min |
Minimum word length. |
max |
Maximum word length. |
count.apostrophe |
logical. If |
ignore.case |
logical. If |
... |
Function dependent: |
Details
Produces output that is identical to the tm
package's
TermDocumentMatrix
, DocumentTermMatrix
,
Corpus
or allows a convenient interface between the qdap and
tm packages.
Value
as.tdm
- Returns a TermDocumentMatrix
.
as.TermDocumentMatrix
- Returns a
TermDocumentMatrix
.
as.dtm
- Returns a DocumentTermMatrix
.
as.DocumentTermMatrix
- Returns a
DocumentTermMatrix
.
as.data.frame
- Converts a Corpus
and returns
a qdap oriented data.frame
.
as.Corpus
- Converts a qdap oriented dataframe and returns
a Corpus
.
apply_as_tm
- Applies a tm oriented function to a
wfm
and attempts to simplify back to a
wfm
or weight
format.
apply_as_df
- Returns the output typical of the applied
qdap function.
Note
apply_as_df
coerces to a dataframe with two columns, one named 'docs' and
the other named 'text'.
See Also
DocumentTermMatrix
,
Corpus
,
TermDocumentMatrix
,
as.wfm
Examples
## Not run:
as.dtm(DATA$state, DATA$person)
as.tdm(DATA$state, DATA$person)
x <- wfm(DATA$state, DATA$person)
as.tdm(x)
as.dtm(x)
library(tm)
plot(as.tdm(x))
pres <- as.tdm(pres_debates2012$dialogue, pres_debates2012$person)
plot(pres, corThreshold = 0.8)
pres
(pres2 <- removeSparseTerms(pres, .3))
plot(pres2, corThreshold = 0.95)
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
pres_debates2012)[,1]) < 4]
SW <- c(shorts, qdapDictionaries::contractions[, 1],
qdapDictionaries::Top200Words,
"governor", "president", "mister", "obama","romney")
DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
(DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,])
plot(DocTermMat2)
## Correspondence Analysis
library(ca)
dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]
speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top25Words))
fit <- ca(as.matrix(mytable1))
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)
mytable2 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top200Words))
fit2 <- ca(as.matrix(mytable2))
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)
## Topic Models
# Example 1 #
library(topicmodels); library(tm)
# Generate stop words based on short words, frequent words and contractions
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
pres_debates2012)[,1]) < 4]
SW <- c(shorts, qdapDictionaries::contractions[, 1],
qdapDictionaries::Top200Words,
"governor", "president", "mister", "obama","romney")
DocTermMat <- with(pres_debates2012, as.dtm(dialogue, person, stopwords = SW))
DocTermMat <- removeSparseTerms(DocTermMat,0.999)
DocTermMat <- DocTermMat[rowSums(as.matrix(DocTermMat))> 0,]
lda.model <- LDA(DocTermMat, 5)
(topics <- posterior(lda.model, DocTermMat)$topics)
terms(lda.model,20)
# Plot the Topics Per Person
topic.dat <- matrix2df(topics, "Person")
colnames(topic.dat)[-1] <- paste2(t(terms(lda.model,20)), sep=", ")
library(reshape2)
mtopic <- melt(topic.dat, variable="Topic", value.name="Proportion")
ggplot(mtopic, aes(weight=Proportion, x=Topic, fill=Topic)) +
geom_bar() +
coord_flip() +
facet_grid(Person~.) +
guides(fill=FALSE)
# Example 2 #
DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]
lda.model2 <- LDA(DocTermMat2, 6)
(topics2 <- posterior(lda.model2, DocTermMat2)$topics)
terms(lda.model2,20)
qheat(topics2, high="blue", low="yellow", by.col=FALSE)
# Example 3 #
lda.model3 <- LDA(DocTermMat2, 10)
(topics3 <- posterior(lda.model3, DocTermMat2)$topics)
terms(lda.model3, 20)
qheat(topics3, high="blue", low="yellow", by.col=FALSE)
# Plot the Topics Per Person
topic.dat3 <- matrix2df(topics3, "Person&Time")
colnames(topic.dat3)[-1] <- paste2(t(terms(lda.model3, 10)), sep=", ")
topic.dat3 <- colsplit2df(topic.dat3)
library(reshape2)
library(scales)
mtopic3 <- melt(topic.dat3, variable="Topic", value.name="Proportion")
(p1 <- ggplot(mtopic3, aes(weight=Proportion, x=Topic, fill=Topic)) +
geom_bar() +
coord_flip() +
facet_grid(Person~Time) +
guides(fill=FALSE) +
scale_y_continuous(labels = percent) +
theme(plot.margin = unit(c(1, 0, 0.5, .5), "lines")) +
ylab("Proportion"))
mtopic3.b <- mtopic3
mtopic3.b[, "Topic"] <- factor(as.numeric(mtopic3.b[, "Topic"]), levels = 1:10)
mtopic3.b[, "Time"] <- factor(gsub("time ", "", mtopic3.b[, "Time"]))
p2 <- ggplot(mtopic3.b, aes(x=Time, y=Topic, fill=Proportion)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "grey70", high = "red") +
facet_grid(Person~Time, scales = "free") +
theme(axis.title.y = element_blank(),
axis.text.x= element_text(colour="white"),
axis.ticks.x= element_line(colour="white"),
axis.ticks.y = element_blank(),
axis.text.y= element_blank(),
plot.margin = unit(c(1, -.5, .5, -.9), "lines")
)
library(gridExtra)
grid.arrange(p1, p2, nrow=1, widths = grid::unit(c(.85, .15), "native"))
## tm Matrices to wfm
library(tm)
data(crude)
## A Term Document Matrix Conversion
(tm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
converted <- as.wfm(tm_in)
head(converted)
summary(converted)
## A Document Term Matrix Conversion
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
summary(as.wfm(dtm_in))
## `apply_as_tm` Examples
## Create a wfm
a <- with(DATA, wfm(state, list(sex, adult)))
summary(a)
## Apply functions meant for a tm TermDocumentMatrix
out <- apply_as_tm(a, tm:::removeSparseTerms, sparse=0.6)
summary(out)
apply_as_tm(a, tm:::findAssocs, "computer", .8)
apply_as_tm(a, tm:::findFreqTerms, 2, 3)
apply_as_tm(a, tm:::Zipf_plot)
apply_as_tm(a, tm:::Heaps_plot)
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)
library(proxy)
apply_as_tm(a, tm:::weightBin)
apply_as_tm(a, tm:::weightBin, to.qdap = FALSE)
apply_as_tm(a, tm:::weightSMART)
apply_as_tm(a, tm:::weightTfIdf)
## Convert tm Corpus to Dataframe
## A tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
readerControl = list(reader = readReut21578XML))
## Convert to dataframe
corp_df <- as.data.frame(reuters)
htruncdf(corp_df)
z <- as.Corpus(DATA$state, DATA$person,
demographic=DATA[, qcv(sex, adult, code)])
as.data.frame(z)
## Apply a qdap function
out <- formality(corp_df$text, corp_df$docs)
plot(out)
## Convert a qdap dataframe to tm package Corpus
(x <- with(DATA2, as.Corpus(state, list(person, class, day))))
library(tm)
inspect(x)
inspect_text(x)
class(x)
(y <- with(pres_debates2012, as.Corpus(dialogue, list(person, time))))
## Add demographic info to DMetaData of Corpus
z <- as.Corpus(DATA$state, DATA$person,
demographic=DATA[, qcv(sex, adult, code)])
lview(z)
lview(as.Corpus(DATA$state, DATA$person,
demographic=DATA$sex))
lview(as.Corpus(DATA$state, DATA$person,
demographic=list(DATA$sex, DATA$adult)))
## Apply qdap functions meant for dataframes from sentSplit to tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
readerControl = list(reader = readReut21578XML))
matches <- list(
oil = qcv(oil, crude),
money = c("economic", "money")
)
apply_as_df(reuters, word_stats)
apply_as_df(reuters, formality)
apply_as_df(reuters, word_list)
apply_as_df(reuters, polarity)
apply_as_df(reuters, Dissimilarity)
apply_as_df(reuters, diversity)
apply_as_df(reuters, pos_by)
apply_as_df(reuters, flesch_kincaid)
apply_as_df(reuters, trans_venn)
apply_as_df(reuters, gantt_plot)
apply_as_df(reuters, rank_freq_mplot)
apply_as_df(reuters, character_table)
(termco_out <- apply_as_df(reuters, termco, match.list = matches))
plot(termco_out, values = TRUE, high="red")
(wordcor_out <- apply_as_df(reuters, word_cor, word = unlist(matches)))
plot(wordcor_out)
(f_terms <- apply_as_df(reuters, freq_terms, at.least = 3))
plot(f_terms)
apply_as_df(reuters, trans_cloud)
## To use "all" rather than "docs" as "grouping.var"...
apply_as_df(reuters, trans_cloud, grouping.var=NULL,
target.words=matches, cloud.colors = c("red", "blue", "grey75"))
finds <- apply_as_df(reuters, freq_terms, at.least = 5,
top = 5, stopwords = Top100Words)
apply_as_df(reuters, dispersion_plot, match.terms = finds[, 1],
total.color = NULL)
## Filter for Term Document Matrix/Document Term Matrix
library(tm)
data(crude)
(tdm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
Filter(tdm_in, 5)
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
Filter(dtm_in, 5)
## Filter particular words based on max/min values
Filter(dtm_in, 5, 7)
Filter(dtm_in, 4, 4)
Filter(tdm_in, 3, 4)
Filter(tdm_in, 3, 4, stopwords = Top200Words)
## SPECIAL REMOVAL OF TERMS (more flexible consideration of words than wfm)
dat <- data.frame(
person = paste0("person_", 1:5),
tweets = c("test one two", "two apples","hashtag #apple",
"#apple #tree", "http://microsoft.com")
)
## remove specialty items
dat[[2]] <- rm_default(dat[[2]], pattern=pastex("@rm_url", "#apple\\b"))
myCorp <- tm::tm_map(crude, tm::removeWords, Top200Words)
myCorp %>% as.dtm() %>% tm::inspect()
## End(Not run)