Exploring large-scale text datasets

When faced with a new corpus of social media text whose characteristics are unknown, it’s a good idea to start by conducting some descriptive analysis to understand how documents are similar or different.

Let’s apply some of these techniques to our previous example: tweets sent by the four leading candidates in the 2016 U.S. presidential primaries.

library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.4
## Package version: 1.3.0
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
tweets <- read.csv("~/data/candidate-tweets.csv", stringsAsFactors=F)
# extract month data and subset only data during campaign
tweets$month <- substr(tweets$datetime, 1, 7)
tweets <- tweets[tweets$month>"2015-06",]
# create corpus object (we will group it into DFMs by candidate below)
twcorpus <- corpus(tweets)
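
Before computing any statistics, it is worth taking a quick look at the corpus we just created. A minimal sanity check using quanteda's ndoc() and summary() (the exact output will depend on your local copy of the data):

# number of documents (tweets) and a summary of the first few
ndoc(twcorpus)
summary(twcorpus, n = 5)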

Identifying the most distinctive features of documents

Keyness measures the extent to which certain features are specific to a document (or group of documents) relative to the rest of the corpus, while accounting for the fact that some features may be too rare to be informative.

# group all tweets by candidate into a single document-feature matrix
twdfm <- dfm(twcorpus, groups=c("screen_name"), verbose=TRUE)

head(textstat_keyness(twdfm, target="realDonaldTrump",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="HillaryClinton",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="tedcruz",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="BernieSanders",
                      measure="chi2"), n=20)

We can sharpen these comparisons by removing punctuation, English stopwords, and a few Twitter-specific tokens ('rt', 'u', 's'):

twdfm <- dfm(twcorpus, groups=c("screen_name"), remove_punct=TRUE,
             remove=c(stopwords("english"), 'rt', 'u', 's'), verbose=TRUE)
head(textstat_keyness(twdfm, target="realDonaldTrump",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="HillaryClinton",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="tedcruz",
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target="BernieSanders",
                      measure="chi2"), n=20)
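
These keyness results can also be visualized. A minimal sketch using quanteda's textplot_keyness(), reusing the grouped dfm without stopwords created above, to plot the features that most distinguish one candidate's tweets from the rest:

# plot the 20 features most overrepresented in Trump's tweets vs. the other candidates
key <- textstat_keyness(twdfm, target="realDonaldTrump", measure="chi2")
textplot_keyness(key, n=20)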

Keyness can also be computed within the tweets of a single candidate; here we compare Trump's tweets from the early months of the campaign with those from the later months:

trump <- corpus_subset(twcorpus, screen_name=="realDonaldTrump")
twdfm <- dfm(trump, remove_punct=TRUE,
             remove=c(stopwords("english"), 'rt', 'u', 's'), verbose=TRUE)
# words overrepresented in tweets sent before January 2016
head(textstat_keyness(twdfm, target=docvars(twdfm)$month<"2016-01", 
                      measure="chi2"), n=20)
# words overrepresented in tweets sent after March 2016
head(textstat_keyness(twdfm, target=docvars(twdfm)$month>"2016-03", 
                      measure="chi2"), n=20)

# the same over-time comparison, now for Clinton's tweets
clinton <- corpus_subset(twcorpus, screen_name=="HillaryClinton")
twdfm <- dfm(clinton, remove_punct=TRUE,
             remove=c(stopwords("english"), 'rt', 'u', 's'), verbose=TRUE)
head(textstat_keyness(twdfm, target=docvars(twdfm)$month<"2016-01", 
                      measure="chi2"), n=20)
head(textstat_keyness(twdfm, target=docvars(twdfm)$month>"2016-03", 
                      measure="chi2"), n=20)

We can use kwic() (keywords in context) together with textplot_xray() to visualize where specific words appear across the corpus.

# collapse all tweets from each candidate into one long document per candidate,
# so that each x-ray plot shows where a term appears along that candidate's timeline
agg <- aggregate(text ~ screen_name, data=tweets, FUN=paste, collapse=" ")
twcorpus <- corpus(agg)
docnames(twcorpus) <- agg$screen_name
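
Before plotting, it can be useful to inspect a few of the kwic() matches directly; a minimal sketch:

# a few keyword-in-context matches for "hillary"
head(kwic(twcorpus, "hillary", window=5))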

textplot_xray(kwic(twcorpus, "hillary"), scale="relative")

textplot_xray(kwic(twcorpus, "trump"), scale="relative")

textplot_xray(kwic(twcorpus, "sanders"), scale="relative")

textplot_xray(kwic(twcorpus, "crooked"), scale="relative")

textplot_xray(kwic(twcorpus, "mexic*"), scale="relative")

textplot_xray(kwic(twcorpus, "fake"), scale="relative")

textplot_xray(kwic(twcorpus, "immigr*"), scale="relative")

textplot_xray(kwic(twcorpus, "muslim*"), scale="relative")

textplot_xray(kwic(twcorpus, "gun*"), scale="relative")

Clustering documents and features

We can identify documents that are similar to one another based on their word frequencies, using textstat_simil(). There are several metrics for computing similarity; here we explore two of them: Jaccard similarity and cosine similarity.

twdfm <- dfm(twcorpus, groups=c("screen_name"), verbose=TRUE)
## Creating a dfm from a corpus input...
##    ... lowercasing
##    ... found 4 documents, 36,423 features
##    ... grouping texts
##    ... created a 4 x 36,423 sparse dfm
##    ... complete. 
## Elapsed time: 1.56 seconds.
docnames(twdfm)
## [1] "BernieSanders"   "HillaryClinton"  "realDonaldTrump" "tedcruz"
textstat_simil(twdfm, margin="documents", method="jaccard")
##                 BernieSanders HillaryClinton realDonaldTrump
## HillaryClinton      0.2007540                               
## realDonaldTrump     0.1704245      0.1587010                
## tedcruz             0.1455431      0.1393852       0.1584593
textstat_simil(twdfm, margin="documents", method="cosine")
##                 BernieSanders HillaryClinton realDonaldTrump
## HillaryClinton      0.9115943                               
## realDonaldTrump     0.8754789      0.8391635                
## tedcruz             0.8782221      0.9307062       0.7829703
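
The similarity object prints as a lower triangle; if a full symmetric matrix is easier to work with, it can typically be coerced with as.matrix(), just as we do with the textstat_dist() output further below. A minimal sketch:

# full pairwise cosine similarity matrix between the four candidates
as.matrix(textstat_simil(twdfm, margin="documents", method="cosine"))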

We can also do the opposite: compute the similarity between terms based on the frequency with which they appear across documents:

twdfm <- dfm(twcorpus, verbose=TRUE)
## Creating a dfm from a corpus input...
##    ... lowercasing
##    ... found 4 documents, 36,423 features
##    ... created a 4 x 36,423 sparse dfm
##    ... complete. 
## Elapsed time: 1.44 seconds.
# term similarities
simils <- textstat_simil(twdfm, "america", margin="features", method="cosine")
# most similar features
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
head(df[order(simils, decreasing=TRUE),], n=5)
##       featname     simil
## 225    america 1.0000000
## 1865 statement 0.9987363
## 891     speech 0.9978940
## 2474   biggest 0.9978315
## 7497  declared 0.9974024
# another example
simils <- textstat_simil(twdfm, "immigration", margin="features", method="cosine")
# most similar features
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
head(df[order(simils, decreasing=TRUE),], n=5)
##         featname     simil
## 964  immigration 1.0000000
## 1485          11 0.9992747
## 7629     hitting 0.9988065
## 364     majority 0.9985681
## 774         hear 0.9985197
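
Since we repeat the same steps for each query term, we could wrap them in a small helper. Note that top_similar() below is a hypothetical convenience function built from the code above, not part of quanteda:

# hypothetical helper: return the n features most similar to a query term
top_similar <- function(x, term, n=5, method="cosine"){
  simils <- textstat_simil(x, term, margin="features", method=method)
  df <- data.frame(
    featname = rownames(simils),
    simil = as.numeric(simils),
    stringsAsFactors=FALSE
  )
  head(df[order(df$simil, decreasing=TRUE),], n=n)
}
# should reproduce the "america" example above
top_similar(twdfm, "america")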

We can then use these distances to create a dendrogram that can help us cluster documents.

twdfm <- dfm(twcorpus, groups=c("screen_name"), verbose=TRUE)
## Creating a dfm from a corpus input...
##    ... lowercasing
##    ... found 4 documents, 36,423 features
##    ... grouping texts
##    ... created a 4 x 36,423 sparse dfm
##    ... complete. 
## Elapsed time: 0.834 seconds.
# compute distances
distances <- textstat_dist(twdfm, margin="documents")
as.matrix(distances)
##                 BernieSanders HillaryClinton realDonaldTrump  tedcruz
## BernieSanders           0.000       8643.537        7253.074 23081.70
## HillaryClinton       8643.537          0.000        9546.685 17224.74
## realDonaldTrump      7253.074       9546.685           0.000 22754.95
## tedcruz             23081.702      17224.742       22754.952     0.00
# clustering
cluster <- hclust(distances)
plot(cluster)
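
The resulting hclust object can be manipulated with the usual base R tools; for example, cutree() assigns each document to one of k clusters. A minimal sketch:

# cut the dendrogram into two groups and inspect cluster membership
cutree(cluster, k=2)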