Exploring large-scale text datasets

A common type of analysis to understand the content of a corpus is to extract collocations – combinations of words that appear together more often than would be expected from their individual frequencies in the corpus. There are different significance tests to identify whether a combination of words is a collocation or not (see the help file).
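
For a two-word combination, one common approach is to take the 2x2 contingency table of the bigram (first word present/absent vs. second word present/absent) and compute its log odds ratio together with a Wald z-statistic. The lambda and z columns in the output below are statistics of this general flavor, although quanteda's exact implementation may differ; the following toy sketch (with made-up counts) is only meant to illustrate the idea.

# toy 2x2 table of bigram counts (made-up numbers, for illustration only)
n11 <- 150     # word1 followed by word2
n10 <- 20      # word1 followed by some other word
n01 <- 180     # some other word followed by word2
n00 <- 120000  # all remaining bigrams
lambda <- log((n11 * n00) / (n10 * n01))   # log odds ratio
se <- sqrt(1/n11 + 1/n10 + 1/n01 + 1/n00)  # its asymptotic standard error
z <- lambda / se                           # Wald z-statistic
c(lambda = lambda, z = z)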

# reading data and computing additional variables
library(readtext)
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.4.2
## quanteda version 0.99.9
## Using 3 of 4 threads for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
inaug <- readtext(file='../data/inaugural/*.txt')
inaug$year <- stringr::str_sub(inaug$doc_id, 1, 4)

# creating corpus object
inaug <- corpus(inaug)

# collocations (currently under development)
inaugtoks <- removeFeatures(tokens(tolower(inaug), remove_numbers=TRUE, remove_punct=TRUE), stopwords("english"))
colls <- textstat_collocations(inaugtoks, size = 2)
head(colls)
##          collocation count length   lambda        z
## 1      united states   157      2 7.136319 38.69725
## 2             let us   101      2 5.772343 32.67503
## 3    fellow citizens    78      2 7.255403 30.43226
## 4          years ago    26      2 6.858419 20.59246
## 5    american people    41      2 3.692929 20.00313
## 6 federal government    32      2 4.591766 19.06968
head(colls[order(colls$count, decreasing=TRUE),], n=10)
##            collocation count length   lambda        z
## 1        united states   157      2 7.136319 38.69725
## 2               let us   101      2 5.772343 32.67503
## 3      fellow citizens    78      2 7.255403 30.43226
## 5      american people    41      2 3.692929 20.00313
## 6   federal government    32      2 4.591766 19.06968
## 18           men women    28      2 8.194445 15.88195
## 4            years ago    26      2 6.858419 20.59246
## 12          four years    26      2 7.894667 16.52531
## 11  general government    25      2 4.090299 16.57651
## 125            upon us    25      2 2.331845 11.07716
# now with 3-word collocations
colls <- textstat_collocations(inaugtoks, size = 3)
head(colls)
##                  collocation count length   lambda        z
## 1          nation one people     3      3 4.064590 3.477918
## 2               every one us     3      3 5.714997 3.375338
## 3 principles upon government     2      3 4.026849 3.028604
## 4         upon rights states     2      3 5.224460 2.875000
## 5        government can must     2      3 3.065317 2.862611
## 6              make war upon     2      3 5.221545 2.798089
head(colls[order(colls$count, decreasing=TRUE),], n=10)
##                    collocation count length       lambda            z
## 325 constitution united states    20      3  0.006118655  0.004085192
## 490       people united states    12      3 -1.060787005 -1.340759624
## 169    preserve protect defend    10      3  1.909088887  0.863086869
## 113           mr chief justice     9      3  2.746007979  1.280289892
## 255    president united states     8      3  0.832583720  0.407991175
## 537             four years ago     8      3 -5.047292324 -3.203509004
## 132 defend constitution united     7      3  1.914251398  1.116003527
## 293             let us resolve     7      3  0.304114478  0.147702444
## 211      united states america     6      3  1.343262018  0.655933758
## 286   government united states     6      3  0.268132195  0.172638083
# FROM THE DOCUMENTATION:
# extracting multi-part proper nouns (capitalized terms)
toks2 <- tokens(inaug)
toks2 <- tokens_remove(toks2, stopwords("english"), padding = TRUE)
toks2 <- tokens_select(toks2, "^([A-Z][a-z\\-]{2,})", valuetype = "regex", 
                       case_insensitive = FALSE, padding = TRUE)
seqs <- textstat_collocations(toks2, size = 3, tolower = FALSE)
head(seqs, 10)
##              collocation count length     lambda         z
## 1 United States Congress     2      3  -2.152316 -1.014581
## 2    Vice President Bush     2      3 -11.582731 -4.471091

Readability and lexical diversity

A text document can also be characterized by its readability and lexical diversity, which capture different aspects of its complexity. There are many indices that measure each of these; the code below uses one of each, and a short sketch afterwards shows how to compute several at once. Note that the two functions are applied to different types of objects: textstat_readability to a corpus and textstat_lexdiv to a dfm.

# readability
fk <- textstat_readability(inaug, "Flesch.Kincaid")
plot(aggregate(fk ~ unlist(inaug[["year"]]), FUN=mean), type="l")

# lexical diversity
inaugdfm <- dfm(inaug, remove_numbers=TRUE, remove_punct=TRUE, verbose=TRUE,
              remove=stopwords("english"))
## Creating a dfm from a corpus input...
##    ... lowercasing
##    ... found 58 documents, 9,281 features
## ...
## removed 136 features
## 
##    ... created a 58 x 9,145 sparse dfm
##    ... complete. 
## Elapsed time: 0.413 seconds.
ld <- textstat_lexdiv(inaugdfm, "TTR")
plot(aggregate(ld ~ unlist(inaug[["year"]]), FUN=mean), type="l")
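
Since there are many indices, both functions also accept a vector of measure names, so several can be computed at once. A quick sketch (the measure names are assumed to be among those listed in the respective help files):

# several readability indices at once
head(textstat_readability(inaug, measure = c("Flesch", "Flesch.Kincaid", "FOG")))
# several lexical diversity indices at once
head(textstat_lexdiv(inaugdfm, measure = c("TTR", "C", "R")))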

Identifying most unique features of documents

One approach is to use TF-IDF weights instead of just token counts in the DFM:

rew <- tfidf(inaugdfm)
# now most frequent features are different
topfeatures(inaugdfm)
##     people government         us        can       upon       must 
##        574        564        478        471        372        366 
##      great        may     states      shall 
##        340        338        333        314
topfeatures(rew)
##      america        union constitution     congress      freedom 
##     52.39413     51.14846     40.21661     39.13390     38.31822 
##      revenue       public         laws       states      federal 
##     34.11779     33.74429     33.57614     33.52316     32.83026
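
To make the weighting concrete, the same idea can be approximated by hand: each count is multiplied by the (log) inverse document frequency of its feature. A minimal sketch in base R, assuming a simple count * idf scheme (the defaults used by tfidf() may differ slightly; see its help file):

counts <- as.matrix(inaugdfm)               # dense matrix of raw counts
docfreq <- colSums(counts > 0)              # number of documents containing each feature
idf <- log10(ndoc(inaugdfm) / docfreq)      # inverse document frequency
manual_tfidf <- sweep(counts, 2, idf, "*")  # weight each count by its feature's idf
sort(colSums(manual_tfidf), decreasing=TRUE)[1:10]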

Keyness measures the extent to which some features are specific to a document (or group of documents) in comparison to the rest of the corpus, taking into account that some features may be too rare to be informative.
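
The intuition is a 2x2 table contrasting a feature's frequency in the target document(s) with its frequency in the rest of the corpus. A toy illustration for a single feature (this only sketches the idea; quanteda's chi2 may apply corrections):

# occurrences of "america" inside and outside the target speech
target_w    <- sum(inaugdfm["2017-Trump.txt", "america"])
reference_w <- sum(inaugdfm[, "america"]) - target_w
# occurrences of all other features inside and outside the target speech
target_other    <- sum(inaugdfm["2017-Trump.txt", ]) - target_w
reference_other <- sum(inaugdfm) - sum(inaugdfm["2017-Trump.txt", ]) - reference_w
chisq.test(matrix(c(target_w, target_other, reference_w, reference_other), nrow=2))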

head(textstat_keyness(inaugdfm, target="2017-Trump.txt",
                      measure="chi2"), n=20)
head(textstat_keyness(inaugdfm,  target=docnames(inaugdfm)=="2017-Trump.txt", 
                      measure="lr"), n=20)
head(textstat_keyness(inaugdfm, target=docnames(inaugdfm)=="2009-Obama.txt",
                      measure="chi2"), n=20)
head(textstat_keyness(inaugdfm, target=docnames(inaugdfm)=="2009-Obama.txt",
                      measure="lr"), n=20)
head(textstat_keyness(inaugdfm, target=docvars(inaugdfm)$year>1990,
                      measure="chi2"), n=20)
head(textstat_keyness(inaugdfm, target=docvars(inaugdfm)$year>1990,
                      measure="lr"), n=20)

We can use textplot_xray to visualize where some words appear in the corpus.

textplot_xray(kwic(inaug, "america"))

textplot_xray(kwic(inaug, "immigration"))

textplot_xray(kwic(inaug, "god"))

Clustering documents and features

We can identify documents that are similar to one another based on the frequencies of the words they contain, using textstat_simil. There are different metrics to compute similarity; here we explore two of them: Jaccard similarity and cosine similarity.
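
To fix intuition about what these measures compute, here is a minimal sketch for a single pair of documents, using the binary Jaccard coefficient and cosine similarity on the raw counts (quanteda's exact formulas may differ slightly):

x <- as.numeric(as.matrix(inaugdfm["2017-Trump.txt", ]))
y <- as.numeric(as.matrix(inaugdfm["2013-Obama.txt", ]))
# Jaccard: shared vocabulary over the union of the two vocabularies
sum(x > 0 & y > 0) / sum(x > 0 | y > 0)
# cosine: cosine of the angle between the two count vectors
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))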

# document similarities
simils <- textstat_simil(inaugdfm, "2017-Trump.txt", margin="documents", method="jaccard")
# most similar documents
df <- data.frame(
  docname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
tail(df[order(simils),])
##             docname     simil
## 47   1969-Nixon.txt 0.1558164
## 53 1993-Clinton.txt 0.1572104
## 58   2013-Obama.txt 0.1592292
## 54 1997-Clinton.txt 0.1748927
## 52    1989-Bush.txt 0.1792956
## 1    2017-Trump.txt 1.0000000
head(df[order(simils),])
##                docname      simil
## 3  1793-Washington.txt 0.03585657
## 12    1829-Jackson.txt 0.06420927
## 8     1813-Madison.txt 0.07500000
## 21    1865-Lincoln.txt 0.08047690
## 4       1797-Adams.txt 0.08395522
## 15   1841-Harrison.txt 0.08429503
# another example
simils <- textstat_simil(inaugdfm, "2013-Obama.txt", margin="documents", method="jaccard")
# most similar documents
df <- data.frame(
  docname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
tail(df[order(simils),])
##             docname     simil
## 53 1993-Clinton.txt 0.1920398
## 56    2005-Bush.txt 0.1971067
## 54 1997-Clinton.txt 0.2010969
## 57   2009-Obama.txt 0.2058584
## 55    2001-Bush.txt 0.2069317
## 1    2013-Obama.txt 1.0000000
head(df[order(simils),])
##                docname      simil
## 3  1793-Washington.txt 0.02781641
## 21    1865-Lincoln.txt 0.07639681
## 12    1829-Jackson.txt 0.08471276
## 2  1789-Washington.txt 0.08675799
## 8     1813-Madison.txt 0.08720930
## 17     1849-Taylor.txt 0.09290709

And the reverse: term similarity, based on the frequency with which terms appear across documents:

# term similarities
simils <- textstat_simil(inaugdfm, "unemployment", margin="features", method="cosine")
# most similar features
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
head(df[order(simils, decreasing=TRUE),], n=10)
##          featname     simil
## 1    unemployment 1.0000000
## 6975   inflations 1.0000000
## 6976    upheavals 1.0000000
## 6992      miracle 1.0000000
## 6925      lighten 0.8164966
## 6981       levels 0.8164966
## 7024        costs 0.8164966
## 6923       normal 0.7715167
## 6827       noting 0.7071068
## 6828   ruggedness 0.7071068
# another example...
simils <- textstat_simil(inaugdfm, "america", margin="features", method="cosine")
# most similar features
df <- data.frame(
  featname = rownames(simils),
  simil = as.numeric(simils),
  stringsAsFactors=F
)
head(df[order(simils, decreasing=TRUE),], n=10)
##        featname     simil
## 1       america 1.0000000
## 5627      today 0.8258359
## 326       world 0.7684245
## 241         new 0.7476929
## 6604  americans 0.7419876
## 1428       work 0.7050669
## 7636 challenges 0.7028413
## 4975     across 0.6982767
## 196      nation 0.6862938
## 1162      heart 0.6843764

Similarity or distance matrices like these can then be used to cluster documents:

# keep only the 13 most recent inaugural speeches (1969 onwards)
recent <- inaugdfm[46:58,]
# compute distances
distances <- textstat_dist(recent, margin="documents")
as.matrix(distances)[1:5, 1:5]
##                 1969-Nixon.txt 1973-Nixon.txt 1977-Carter.txt
## 1969-Nixon.txt         0.00000       56.06247        52.15362
## 1973-Nixon.txt        56.06247        0.00000        58.57474
## 1977-Carter.txt       52.15362       58.57474         0.00000
## 1981-Reagan.txt       57.79273       62.42596        52.91503
## 1985-Reagan.txt       58.42089       61.51423        60.09160
##                 1981-Reagan.txt 1985-Reagan.txt
## 1969-Nixon.txt         57.79273        58.42089
## 1973-Nixon.txt         62.42596        61.51423
## 1977-Carter.txt        52.91503        60.09160
## 1981-Reagan.txt         0.00000        52.35456
## 1985-Reagan.txt        52.35456         0.00000
# clustering
cluster <- hclust(distances)
plot(cluster)
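
If we want discrete groups rather than a dendrogram, base R's cutree() can slice the tree at a given number of clusters (the choice of k below is arbitrary):

# assign each speech to one of, say, three clusters
groups <- cutree(cluster, k = 3)
split(docnames(recent), groups)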

A related technique is principal component analysis, which reduces the dimensionality of the data rather than clustering it. It identifies a set of uncorrelated variables (the principal components) that capture most of the variance in the document-feature matrix. The first component always captures the largest share of the variance, the second the second largest, and so on. Looking at the proportion of the variance captured by the first component relative to the rest (computed explicitly after the plot below) tells us to what extent the dataset can be reduced to a single dimension.

# Principal components analysis
pca <- prcomp(t(as.matrix(inaugdfm))) 
plot(pca) # first PC captures most of the variance
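
The share of variance captured by each component can also be computed directly from the standard deviations stored by prcomp:

# proportion of total variance explained by each principal component
prop_var <- pca$sdev^2 / sum(pca$sdev^2)
round(head(prop_var), 3)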

# plot documents on the first two principal components
plot(pca$rotation[,1], pca$rotation[,2], type="n")
text(pca$rotation[,1], pca$rotation[,2], labels=docvars(inaugdfm)$year)

# looking at features for each PC
df <- data.frame(
  featname = featnames(inaugdfm),
  dim1 = pca$x[,1],
  dim2 = pca$x[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##              featname      dim1        dim2
## government government -91.99866 -17.7912072
## people         people -87.00006   0.1198702
## can               can -68.13895  13.8976886
## upon             upon -63.85988 -16.6943217
## states         states -58.80276 -27.4779196
## may               may -55.79085 -16.7114701
tail(df[order(df$dim1),])
##                featname     dim1         dim2
## manger           manger 1.035344  0.002379649
## emerson         emerson 1.035344  0.002379649
## stout             stout 1.035344  0.002379649
## anguished     anguished 1.035344  0.002379649
## arrive           arrive 1.050707 -0.037749736
## upbraidings upbraidings 1.050707 -0.037749736
head(df[order(df$dim2),])
##                  featname      dim1      dim2
## states             states -58.80276 -27.47792
## constitution constitution -40.01074 -26.34374
## power               power -43.46149 -17.97093
## government     government -91.99866 -17.79121
## union               union -32.00204 -17.00467
## may                   may -55.79085 -16.71147
tail(df[order(df$dim2),])
##         featname      dim1     dim2
## let          let -15.09276 20.09351
## must        must -50.45358 23.71271
## america  america -19.14462 28.03981
## new          new -28.91289 30.88054
## world      world -35.69570 39.69103
## us            us -55.42694 47.55002

A similar dimensionality reduction technique is correspondence analysis. We'll see it in more detail when we get to networks, but note that the intuition and results are similar.

out <- textmodel_ca(inaugdfm)

# documents
df <- data.frame(
  docname = docnames(inaugdfm),
  year = docvars(inaugdfm)$year,
  dim1 = out$rowcoord[,1],
  dim2 = out$rowcoord[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##                                 docname year       dim1      dim2
## 1829-Jackson.txt       1829-Jackson.txt 1829 -1.0983660 0.7387520
## 1821-Monroe.txt         1821-Monroe.txt 1821 -1.0564043 0.7853386
## 1841-Harrison.txt     1841-Harrison.txt 1841 -1.0014370 1.1728084
## 1845-Polk.txt             1845-Polk.txt 1845 -0.9933083 0.6255691
## 1789-Washington.txt 1789-Washington.txt 1789 -0.9636421 1.5124776
## 1809-Madison.txt       1809-Madison.txt 1809 -0.9335471 1.0926989
tail(df[order(df$dim1),])
##                           docname year     dim1      dim2
## 2013-Obama.txt     2013-Obama.txt 2013 1.639234 0.8218771
## 1989-Bush.txt       1989-Bush.txt 1989 1.685393 0.5951564
## 1997-Clinton.txt 1997-Clinton.txt 1997 1.718459 0.6411029
## 2017-Trump.txt     2017-Trump.txt 2017 1.730343 1.0616905
## 1969-Nixon.txt     1969-Nixon.txt 1969 1.850139 0.5225313
## 1993-Clinton.txt 1993-Clinton.txt 1993 1.854807 0.7156803
head(df[order(df$dim2),])
##                               docname year         dim1      dim2
## 1921-Harding.txt     1921-Harding.txt 1921  0.564138453 -2.792178
## 1909-Taft.txt           1909-Taft.txt 1909 -0.772554288 -2.035984
## 1913-Wilson.txt       1913-Wilson.txt 1913  0.719416867 -1.556485
## 1893-Cleveland.txt 1893-Cleveland.txt 1893 -0.591715630 -1.516937
## 1929-Hoover.txt       1929-Hoover.txt 1929 -0.007755423 -1.452698
## 1889-Harrison.txt   1889-Harrison.txt 1889 -0.684028877 -1.193640
tail(df[order(df$dim2),])
##                                 docname year       dim1     dim2
## 2017-Trump.txt           2017-Trump.txt 2017  1.7303426 1.061690
## 2009-Obama.txt           2009-Obama.txt 2009  1.6267898 1.066331
## 1809-Madison.txt       1809-Madison.txt 1809 -0.9335471 1.092699
## 1841-Harrison.txt     1841-Harrison.txt 1841 -1.0014370 1.172808
## 1813-Madison.txt       1813-Madison.txt 1813 -0.7318243 1.200349
## 1789-Washington.txt 1789-Washington.txt 1789 -0.9636421 1.512478
plot(df$dim1, df$dim2, type="n")
text(df$dim1, df$dim2, labels=df$year)

# features
df <- data.frame(
  featname = featnames(inaugdfm),
  dim1 = out$colcoord[,1],
  dim2 = out$colcoord[,2],
  stringsAsFactors=FALSE
)

head(df[order(df$dim1),])
##                  featname      dim1     dim2
## customary       customary -1.809336 1.501415
## convinces       convinces -1.809336 1.501415
## admonishes     admonishes -1.809336 1.501415
## superintend   superintend -1.809336 1.501415
## confederate   confederate -1.809336 1.501415
## transcending transcending -1.809336 1.501415
tail(df[order(df$dim1),])
##                featname    dim1     dim2
## myriad           myriad 3.05542 1.454525
## well-doing   well-doing 3.05542 1.454525
## reap               reap 3.05542 1.454525
## faint             faint 3.05542 1.454525
## joyful           joyful 3.05542 1.454525
## mountaintop mountaintop 3.05542 1.454525
head(df[order(df$dim2),])
##              featname      dim1      dim2
## noting         noting 0.9293042 -5.674729
## ruggedness ruggedness 0.9293042 -5.674729
## withstood   withstood 0.9293042 -5.674729
## breathes     breathes 0.9293042 -5.674729
## clarified   clarified 0.9293042 -5.674729
## atmosphere atmosphere 0.9293042 -5.674729
tail(df[order(df$dim2),])
##                    featname      dim1     dim2
## renounce           renounce -1.587406 3.073909
## inapplicable   inapplicable -1.587406 3.073909
## emoluments       emoluments -1.587406 3.073909
## indispensably indispensably -1.587406 3.073909
## supplication   supplication -1.587406 3.073909
## deliberating   deliberating -1.587406 3.073909