library(quanteda)
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textstats)
library(quanteda.textplots)

Lexical diversity

textstat_lexdiv() calculates various measures of lexical diversity based on the number of unique token types and the length of a document. It is useful for analyzing speakers’ or writers’ linguistic skill, or the complexity of the ideas expressed in their documents.

# creating DFM for corpus of inaugural speeches
dfmat_inaug <- dfm(tokens(data_corpus_inaugural), 
                   verbose=TRUE)
## Creating a dfm from a tokens input...
##  ...lowercasing
##  ...found 59 documents, 9,439 features
##  ...complete, elapsed time: 0.264 seconds.
## Finished constructing a 59 x 9,439 sparse dfm.
# removing stopwords
dfmat_inaug <- dfm_remove(dfmat_inaug, stopwords('english'))
tstat_lexdiv <- textstat_lexdiv(dfmat_inaug)
tail(tstat_lexdiv, 5)
##      document       TTR
## 55  2005-Bush 0.6176753
## 56 2009-Obama 0.6828645
## 57 2013-Obama 0.6605238
## 58 2017-Trump 0.6409537
## 59 2021-Biden 0.5572316
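
The TTR (type-token ratio) is simply the number of unique types divided by the total number of tokens. As a quick sanity check, we can compute it by hand for one speech (assuming, as above, that the ratio is calculated on the stopword-removed dfm):

# types / tokens for the 2021 Biden speech -- should match the TTR value in the table above
ntype(dfmat_inaug)["2021-Biden"] / ntoken(dfmat_inaug)["2021-Biden"]
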
plot(tstat_lexdiv$TTR, type = 'l', xaxt = 'n',
     xlab = "", ylab = "TTR")
grid()
axis(1, at = seq_len(nrow(tstat_lexdiv)), labels = docvars(dfmat_inaug, 'President'))

Readability

textstat_readability() computes indices of document complexity based on characteristics of the text, such as the number of words, sentence length, and the number of syllables per word.

stat_read <- textstat_readability(data_corpus_inaugural,
                     measure = c("Flesch.Kincaid", "FOG"))
plot(stat_read$Flesch.Kincaid, type = 'l', xaxt = 'n', xlab = "", ylab = "Flesch.Kincaid")
grid()
axis(1, at = seq_len(nrow(stat_read)), 
     labels = docvars(dfmat_inaug, 'President'))
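
For reference, the Flesch-Kincaid grade level combines average sentence length and average syllables per word: 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59. Below is a rough sketch of that calculation; the syllable count is a crude placeholder, since textstat_readability() counts syllables properly under the hood.

# word and sentence counts from quanteda; syllables are approximated here
# only to illustrate the formula (placeholder assumption: 1.5 syllables per word)
n_words <- ntoken(tokens(data_corpus_inaugural, remove_punct = TRUE))
n_sents <- nsentence(data_corpus_inaugural)
n_syll  <- 1.5 * n_words
fk_sketch <- 0.39 * (n_words / n_sents) + 11.8 * (n_syll / n_words) - 15.59
head(fk_sketch)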

Distance and similarity metrics

textstat_dist() and textstat_simil() compute matrices of distances and similarities, respectively, between documents. They are useful for comparing feature use across different documents.

Euclidean distance:

docs <- c("this is document one", "this is document two")
(doc_dfm <- dfm(tokens(docs)))
## Document-feature matrix of: 2 documents, 5 features (20.00% sparse) and 0 docvars.
##        features
## docs    this is document one two
##   text1    1  1        1   1   0
##   text2    1  1        1   0   1
textstat_dist(doc_dfm, method="euclidean")
## textstat_dist object; method = "euclidean"
##       text1 text2
## text1     0  1.41
## text2  1.41     0
# let's do the math...
(d1 <- as.numeric(doc_dfm[1,]))
## [1] 1 1 1 1 0
(d2 <- as.numeric(doc_dfm[2,]))
## [1] 1 1 1 0 1
sqrt(sum((d1 - d2)^2))
## [1] 1.414214
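
textstat_dist() supports other distance methods as well; for example, the Manhattan (city-block) distance sums the absolute differences across features:

textstat_dist(doc_dfm, method = "manhattan")
# by hand: the two documents differ only on "one" and "two", so the distance is 2
sum(abs(d1 - d2))
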

Cosine similarity:

textstat_simil(doc_dfm, method="cosine")
## textstat_simil object; method = "cosine"
##       text1 text2
## text1  1.00  0.75
## text2  0.75  1.00
# some more math...
sum(d1 * d2) / ( sqrt(sum(d1^2)) *  sqrt(sum(d2^2)) )
## [1] 0.75

Note that these two metrics measure opposite things: Euclidean distance measures how different two documents are, whereas cosine similarity measures how similar they are. For similarity measures bounded between 0 and 1, such as cosine, converting between the two is easy: distance = 1 - similarity.
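
For example, the cosine distance between the two toy documents above is just one minus their cosine similarity:

cos_sim <- sum(d1 * d2) / (sqrt(sum(d1^2)) * sqrt(sum(d2^2)))
1 - cos_sim  # cosine distance = 1 - 0.75 = 0.25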

And here’s an example of how we might apply these metrics in practice. Let’s say I want to build a recommendation engine for Taylor Swift songs, using lyrics obtained from this source.

library(readr)
ts <- read_csv("../data/taylor_swift_lyrics.csv")
## Rows: 132 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Artist, Album, Title, Lyrics
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# now we find a song I like
song <- which(ts$Title=="You Need To Calm Down")
ts[song,]
## # A tibble: 1 × 4
##   Artist       Album Title                 Lyrics                               
##   <chr>        <chr> <chr>                 <chr>                                
## 1 Taylor Swift Lover You Need To Calm Down "You are somebody that I don't know\…
# pre-process the data
corp <- corpus(ts, text_field = "Lyrics")
docnames(corp) <- docvars(corp)$Title
toks <- tokens(corp, remove_punct=TRUE, remove_numbers=TRUE)
tdfm <- dfm(toks, verbose=TRUE, remove=stopwords("english"))
## Creating a dfm from a tokens input...
##  ...lowercasing
##  ...found 132 documents, 3,052 features
## Warning: 'remove' is deprecated; use dfm_remove() instead
##  ...removed 161 features
##  ...complete, elapsed time: 0.036 seconds.
## Finished constructing a 132 x 2,891 sparse dfm.
# and I will compute cosine similarity with respect to all other songs
simil <- textstat_simil(x=tdfm, 
                        y=tdfm[song,], method="cosine")
simil <- simil[order(simil, decreasing=TRUE),]
head(simil, n=5)
##                   You Need To Calm Down We Are Never Ever Getting Back Together 
##                               1.0000000                               0.3860419 
##                          State of Grace                       Come Back Be Here 
##                               0.2801466                               0.2408928 
##                               Invisible 
##                               0.2376642
# and we can read the lyrics of the most similar song
message(ts$Lyrics[ts$Title == names(simil)[2]])
## I remember when we broke up, the first time
## Saying, "This is it, I've had enough"
## 'Cause like we hadn't seen each other in a month
## When you, said you, needed space (what?)
## Then you come around again and say
## "Baby, I miss you and I swear I'm gonna change, trust me"
## Remember how that lasted for a day?
## I say, I hate you, we break up, you call me, I love you
## Ooh, oh-oh, oh-oh
## We called it off again last night
## But ooh, oh-oh, oh-oh
## This time I'm telling you, I'm telling you
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me
## But we are never ever, ever, ever getting back together
## Like, ever
## I'm really gonna miss you picking fights
## And me, falling for it screaming that I'm right
## And you, would hide away and find your peace of mind
## With some indie record that's much cooler than mine
## Ooh, oh-oh, oh-oh
## You called me up again tonight
## But ooh, oh-oh, oh-oh
## This time I'm telling you, I'm telling you
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me (talk to me)
## But, we are never ever, ever, ever getting back together
## Ooh, oh-oh, oh-oh yeah
## Ooh, oh-oh, oh-oh-oh yeah
## Ooh, oh-oh, oh-oh yeah
## Oh-oh-oh
## I used to think that we were forever ever, ever
## And I used to say, "Never say never"
## "Huh, so he calls me up and he's like, I still love you
## And I'm like I just I mean this is exhausting, you know
## Like we are never getting back together like, ever"
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me
## But we are never ever ever ever getting back together
## (We) ooh, oh-oh, oh-oh
## Ooh, oh-oh, oh-oh (getting back together)
## (We) ooh, oh-oh, oh-oh
## Ooh, oh-oh, oh-oh (getting back together)
## You go talk to your friends, talk to my friends, talk to me (talk to me)
## But we are never ever, ever, ever getting back together
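
We could wrap these steps in a small helper so it is easy to get recommendations for any song. This is a hypothetical convenience function, not part of quanteda:

recommend_similar <- function(x, title, n = 5) {
  # cosine similarity between the chosen song and every song in the dfm
  sim <- textstat_simil(x = x, y = x[title, ], method = "cosine")
  sim <- sim[order(sim, decreasing = TRUE), ]
  # drop the song itself and keep the top n matches
  head(sim[names(sim) != title], n)
}
recommend_similar(tdfm, "You Need To Calm Down")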

Identifying most unique features of documents

Keyness measures the extent to which some features are specific to a document (or group of documents) in comparison with the rest of the corpus, while taking into account that some features may be too rare to tell us much.

# Donald Trump
head(textstat_keyness(dfmat_inaug, target="2017-Trump",
                      measure="chi2"), n=20)
##         feature      chi2            p n_target n_reference
## 1         obama 175.47131 0.000000e+00        3           0
## 2       america  98.56898 0.000000e+00       18         184
## 3   politicians  94.45425 0.000000e+00        2           0
## 4         stops  94.45425 0.000000e+00        2           0
## 5  transferring  94.45425 0.000000e+00        2           0
## 6     trillions  94.45425 0.000000e+00        2           0
## 7      everyone  89.53885 0.000000e+00        4           7
## 8        dreams  88.59008 0.000000e+00        5          13
## 9          jobs  68.89993 1.110223e-16        4          10
## 10         back  68.70331 1.110223e-16        6          27
## 11    protected  67.46086 2.220446e-16        5          18
## 12     mountain  61.97813 3.441691e-15        2           1
## 13      winning  61.97813 3.441691e-15        2           1
## 14    factories  55.21629 1.080247e-13        3           6
## 15       across  51.74990 6.303846e-13        5          24
## 16      borders  49.21176 2.297718e-12        3           7
## 17       breath  45.74596 1.346268e-11        2           2
## 18    victories  45.74596 1.346268e-11        2           2
## 19     american  36.39465 1.611451e-09       11         161
## 20    righteous  36.01137 1.961690e-09        2           3
textplot_keyness(textstat_keyness(dfmat_inaug, target="2017-Trump",
                      measure="chi2"))

# Joe Biden
head(textstat_keyness(dfmat_inaug, target="2021-Biden",
                      measure="chi2"), n=20)
##         feature      chi2            p n_target n_reference
## 1         virus 155.77280 0.000000e+00        4           0
## 2         story 150.17410 0.000000e+00        9          14
## 3             . 136.22694 0.000000e+00      210        4945
## 4       uniting 105.72925 0.000000e+00        3           0
## 5           cry  78.05765 0.000000e+00        3           1
## 6         unity  66.28899 4.440892e-16        8          29
## 7  disagreement  56.79464 4.840572e-14        2           0
## 8     extremism  56.79464 4.840572e-14        2           0
## 9        harris  56.79464 4.840572e-14        2           0
## 10     pandemic  56.79464 4.840572e-14        2           0
## 11       versus  56.79464 4.840572e-14        2           0
## 12    democracy  52.56633 4.160006e-13       10          58
## 13      america  48.99512 2.565947e-12       18         184
## 14         days  46.78806 7.909340e-12        6          22
## 15         soul  43.22394 4.881962e-11        4           9
## 16          get  42.51329 7.020373e-11        3           4
## 17    prevailed  42.51329 7.020373e-11        3           4
## 18      another  40.52155 1.944591e-10        9          59
## 19       racism  36.87656 1.258497e-09        2           1
## 20         lost  33.88868 5.835730e-09        4          12
textplot_keyness(textstat_keyness(dfmat_inaug, target="2021-Biden",
                      measure="chi2"))

Another example using the song lyrics, this time taking an album as the target group:

head(textstat_keyness(tdfm,
                      target=docvars(tdfm, "Album")=="reputation"), n=20)
##       feature      chi2            p n_target n_reference
## 1          ha 156.72632 0.000000e+00       38          14
## 2        made 139.89533 0.000000e+00       51          40
## 3     getaway 126.14359 0.000000e+00       21           0
## 4     endgame  94.39328 0.000000e+00       16           0
## 5    delicate  87.03835 0.000000e+00       16           1
## 6         ooh  70.48511 0.000000e+00       36          42
## 7    gorgeous  69.02227 1.110223e-16       12           0
## 8        usin  69.02227 1.110223e-16       12           0
## 9        look  67.25179 2.220446e-16       46          70
## 10      hands  58.25555 2.298162e-14       23          20
## 11 reputation  55.74006 8.271162e-14       11           1
## 12       tied  54.92990 1.249001e-13       15           6
## 13       call  51.41185 7.488454e-13       26          30
## 14      begin  46.21898 1.057476e-11       12           4
## 15       rest  43.87590 3.498701e-11       10           2
## 16        car  43.63204 3.962974e-11       21          23
## 17       doin  43.35604 4.563261e-11        9           1
## 18       lord  43.35604 4.563261e-11        9           1
## 19      games  40.68419 1.789292e-10       13           7
## 20     bought  39.11303 3.999664e-10       10           3
head(textstat_keyness(tdfm,
                      target=docvars(tdfm, "Album")=="folklore"), n=20)
##      feature     chi2            p n_target n_reference
## 1      woman 88.90694 0.000000e+00       11           0
## 2       gave 67.24075 2.220446e-16       17          13
## 3     pulled 62.24728 2.997602e-15        9           1
## 4     showed 59.75524 1.076916e-14       11           4
## 5      signs 57.23415 3.863576e-14       10           3
## 6       film 44.93604 2.035760e-11        7           1
## 7     august 44.27899 2.847556e-11        6           0
## 8   breathin 44.27899 2.847556e-11        6           0
## 9       seen 40.71022 1.765617e-10       12          11
## 10   warning 36.41510 1.594633e-09        6           1
## 11      give 35.83813 2.144101e-09       12          13
## 12     betty 35.40615 2.676408e-09        5           0
## 13   ruining 35.40615 2.676408e-09        5           0
## 14  would've 32.91234 9.640922e-09        9           7
## 15 marvelous 29.32065 6.133858e-08        7           4
## 16 someone's 28.04325 1.186342e-07        5           1
## 17     angry 26.58024 2.528159e-07        4           0
## 18    assume 26.58024 2.528159e-07        4           0
## 19  cardigan 26.58024 2.528159e-07        4           0
## 20   cursing 26.58024 2.528159e-07        4           0
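
As with the inaugural speeches, these keyness results can be visualized with textplot_keyness():

textplot_keyness(textstat_keyness(tdfm,
                      target = docvars(tdfm, "Album") == "folklore"))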