When faced with a new corpus of social media text whose characteristics are unknown, it’s a good idea to start by conducting some descriptive analysis to understand how documents are similar or different.
Let’s learn some of those techniques with our previous example containing tweets by the four leading candidates in the 2016 Presidential primaries.
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# Load the candidates' tweets, keeping string columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
tweets <- read.csv("~/data/candidate-tweets.csv", stringsAsFactors = FALSE)
# extract month data (first 7 characters of datetime) and subset only data
# during the campaign, i.e. months that sort after "2015-06"
tweets$month <- substr(tweets$datetime, 1, 7)
# which() drops rows whose month is NA; a bare logical index would instead
# return rows filled with NA for them
tweets <- tweets[which(tweets$month > "2015-06"), ]
# create a tweet-level corpus; screen_name and month are used further down
# as document variables for grouping and subsetting
twcorpus <- corpus(tweets)
Keyness is a measure of the extent to which some features are specific to a document (or group of documents) in comparison to the rest of the corpus, taking into account that some features may be too rare.
# Helper: first `n` rows of the chi-squared keyness table of `x` for `target`.
# `target` is either a document name or a logical vector over the documents.
top_keyness <- function(x, target, n = 20) {
  head(textstat_keyness(x, target = target, measure = "chi2"), n = n)
}

# Features dropped from the cleaned DFMs: English stopwords plus rt, u and s
drop_terms <- c(stopwords("english"), "rt", "u", "s")

# 1. Each candidate against the other three, with no feature filtering
twdfm <- dfm(twcorpus, groups = "screen_name", verbose = TRUE)
top_keyness(twdfm, "realDonaldTrump")
top_keyness(twdfm, "HillaryClinton")
top_keyness(twdfm, "tedcruz")
top_keyness(twdfm, "BernieSanders")

# 2. Same comparison after removing punctuation and the terms listed above
twdfm <- dfm(twcorpus, groups = "screen_name", remove_punct = TRUE,
             remove = drop_terms, verbose = TRUE)
top_keyness(twdfm, "realDonaldTrump")
top_keyness(twdfm, "HillaryClinton")
top_keyness(twdfm, "tedcruz")
top_keyness(twdfm, "BernieSanders")

# 3. Trump only: tweets before 2016-01, then tweets after 2016-03, each
#    compared with the rest of his tweets
trump <- corpus_subset(twcorpus, screen_name == "realDonaldTrump")
twdfm <- dfm(trump, remove_punct = TRUE, remove = drop_terms, verbose = TRUE)
top_keyness(twdfm, docvars(twdfm)$month < "2016-01")
top_keyness(twdfm, docvars(twdfm)$month > "2016-03")

# 4. Clinton only: same two time windows
clinton <- corpus_subset(twcorpus, screen_name == "HillaryClinton")
twdfm <- dfm(clinton, remove_punct = TRUE, remove = drop_terms, verbose = TRUE)
top_keyness(twdfm, docvars(twdfm)$month < "2016-01")
top_keyness(twdfm, docvars(twdfm)$month > "2016-03")
We can use `textplot_xray` to visualize where some words appear in the corpus.
# Collapse every candidate's tweets into a single text, joined by spaces
agg <- aggregate(text ~ screen_name, data = tweets, FUN = paste, collapse = " ")
# Rebuild the corpus with one document per candidate, named by screen name
twcorpus <- corpus(agg)
docnames(twcorpus) <- agg$screen_name

# Helper: x-ray plot of the keyword-in-context matches of `pattern` in `x`,
# drawn with the "relative" scale
xray <- function(x, pattern) {
  textplot_xray(kwic(x, pattern), scale = "relative")
}

# Names of the candidates
xray(twcorpus, "hillary")
xray(twcorpus, "trump")
xray(twcorpus, "sanders")
# Other campaign terms (a trailing * is a wildcard in the pattern)
xray(twcorpus, "crooked")
xray(twcorpus, "mexic*")
xray(twcorpus, "fake")
xray(twcorpus, "immigr*")
xray(twcorpus, "muslim*")
xray(twcorpus, "gun*")
We can identify documents that are similar to one another based on the frequency of words, using `textstat_simil`. There are different metrics to compute similarity. Here we explore two of them: Jaccard similarity and cosine similarity.
# Document-feature matrix with one row per candidate (grouped by screen_name)
twdfm <- dfm(twcorpus, groups=c("screen_name"), verbose=TRUE)
## Creating a dfm from a corpus input...
## ... lowercasing
## ... found 4 documents, 36,423 features
## ... grouping texts
## ... created a 4 x 36,423 sparse dfm
## ... complete.
## Elapsed time: 1.56 seconds.
# Check the document names: one per candidate account
docnames(twdfm)
## [1] "BernieSanders" "HillaryClinton" "realDonaldTrump" "tedcruz"
# Pairwise Jaccard similarity between candidates
# (highest pair in the output below: Sanders-Clinton, 0.20)
textstat_simil(twdfm, margin="documents", method="jaccard")
## BernieSanders HillaryClinton realDonaldTrump
## HillaryClinton 0.2007540
## realDonaldTrump 0.1704245 0.1587010
## tedcruz 0.1455431 0.1393852 0.1584593
# Pairwise cosine similarity between candidates
# (highest pair in the output below: Clinton-Cruz, 0.93)
textstat_simil(twdfm, margin="documents", method="cosine")
## BernieSanders HillaryClinton realDonaldTrump
## HillaryClinton 0.9115943
## realDonaldTrump 0.8754789 0.8391635
## tedcruz 0.8782221 0.9307062 0.7829703
And the converse: we can compute the similarity between terms, based on the frequency with which they appear across documents:
# Ungrouped DFM. NOTE: twcorpus holds one document per candidate at this point
# (the log below reports 4 documents), so each feature is compared on a vector
# of only 4 counts, which is why the similarities below are all close to 1.
twdfm <- dfm(twcorpus, verbose = TRUE)
## Creating a dfm from a corpus input...
## ... lowercasing
## ... found 4 documents, 36,423 features
## ... created a 4 x 36,423 sparse dfm
## ... complete.
## Elapsed time: 1.44 seconds.
# term similarities: cosine similarity of every feature to "america"
simils <- textstat_simil(twdfm, "america", margin = "features", method = "cosine")
# most similar features
# (FALSE rather than F: `F` is an ordinary variable and can be reassigned)
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors = FALSE
)
# sort by decreasing similarity and show the top 5 (the term itself comes first)
head(df[order(simils, decreasing = TRUE), ], n = 5)
## featname simil
## 225 america 1.0000000
## 1865 statement 0.9987363
## 891 speech 0.9978940
## 2474 biggest 0.9978315
## 7497 declared 0.9974024
# another example: cosine similarity of every feature to "immigration"
simils <- textstat_simil(twdfm, "immigration", margin = "features", method = "cosine")
# most similar features
# (FALSE rather than F: `F` is an ordinary variable and can be reassigned)
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors = FALSE
)
# sort by decreasing similarity and show the top 5 (the term itself comes first)
head(df[order(simils, decreasing = TRUE), ], n = 5)
## featname simil
## 964 immigration 1.0000000
## 1485 11 0.9992747
## 7629 hitting 0.9988065
## 364 majority 0.9985681
## 774 hear 0.9985197
We can then use these distances to create a dendrogram that can help us cluster documents.
# Document-feature matrix with one row per candidate (grouped by screen_name)
twdfm <- dfm(twcorpus, groups = "screen_name", verbose = TRUE)
## Creating a dfm from a corpus input...
## ... lowercasing
## ... found 4 documents, 36,423 features
## ... grouping texts
## ... created a 4 x 36,423 sparse dfm
## ... complete.
## Elapsed time: 0.834 seconds.
# compute distances between candidates
# NOTE(review): the DFM holds raw counts, so these distances presumably reflect
# differences in tweet volume as well as vocabulary; confirm whether the counts
# should be normalised first
distances <- textstat_dist(twdfm, margin = "documents")
as.matrix(distances)
## BernieSanders HillaryClinton realDonaldTrump tedcruz
## BernieSanders 0.000 8643.537 7253.074 23081.70
## HillaryClinton 8643.537 0.000 9546.685 17224.74
## realDonaldTrump 7253.074 9546.685 0.000 22754.95
## tedcruz 23081.702 17224.742 22754.952 0.00
# clustering
# hclust() needs a base "dist" object. as.dist() leaves such an object unchanged
# and converts other distance representations, so this call does not depend on
# the exact class that textstat_dist() returns.
cluster <- hclust(as.dist(distances))
plot(cluster)