library(quanteda)
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textstats)
library(quanteda.textplots)
textstat_lexdiv()
calculates various measures of lexical
diversity based on the number of unique types of tokens and the length
of a document. It is useful for analyzing speakers’ or writers’
linguistic skill, or complexity of ideas expressed in documents.
# creating DFM for corpus of inaugural speeches
# NOTE(review): tokens() is called without remove_punct = TRUE, so
# punctuation marks survive as features (e.g. "." later appears among
# the top keyness features for 2021-Biden) — confirm this is intended
dfmat_inaug <- dfm(tokens(data_corpus_inaugural),
verbose=TRUE)
## Creating a dfm from a tokens input...
## ...lowercasing
## ...found 59 documents, 9,439 features
## ...complete, elapsed time: 0.264 seconds.
## Finished constructing a 59 x 9,439 sparse dfm.
# removing stopwords (dfm_remove() drops the listed features entirely)
dfmat_inaug <- dfm_remove(dfmat_inaug, stopwords('english'))
# compute lexical diversity (TTR, the default measure) per speech
tstat_lexdiv <- textstat_lexdiv(dfmat_inaug)
tail(tstat_lexdiv, 5)
## document TTR
## 55 2005-Bush 0.6176753
## 56 2009-Obama 0.6828645
## 57 2013-Obama 0.6605238
## 58 2017-Trump 0.6409537
## 59 2021-Biden 0.5572316
# plot TTR over time; xaxt = 'n' suppresses the default numeric axis,
# and xlab = "" (not NULL, which would fall back to the default
# "Index" label in base plot()) leaves the x-axis title blank so the
# president names drawn below are the only annotation
plot(tstat_lexdiv$TTR, type = 'l', xaxt = 'n',
     xlab = "", ylab = "TTR")
grid()
# label each point with the president who gave the speech
axis(1, at = seq_len(nrow(tstat_lexdiv)),
     labels = docvars(dfmat_inaug, 'President'))
textstat_readability()
computes metrics of document
complexity based on characteristics of the text such as the number of
words, sentence length, number of syllables, etc.
# compute two readability metrics directly on the raw corpus texts
# (readability needs sentence/word structure, so it takes the corpus,
# not a dfm)
stat_read <- textstat_readability(data_corpus_inaugural,
                                  measure = c("Flesch.Kincaid", "FOG"))
# xlab = "" (rather than NULL, which yields the default "Index" label)
# keeps the x-axis title blank
plot(stat_read$Flesch.Kincaid, type = 'l', xaxt = 'n',
     xlab = "", ylab = "Flesch.Kincaid")
grid()
# take the president labels from the same corpus the scores were
# computed on, so rows and labels are guaranteed to line up
axis(1, at = seq_len(nrow(stat_read)),
     labels = docvars(data_corpus_inaugural, 'President'))
textstat_simil()
(and its distance counterpart, textstat_dist()) compute matrices of
similarities and distances between documents. They are useful for
comparing feature use across different documents.
Euclidean distance:
# build a tiny two-document example so we can check the math by hand
docs <- c("this is document one", "this is document two")
(doc_dfm <- dfm(tokens(docs)))
## Document-feature matrix of: 2 documents, 5 features (20.00% sparse) and 0 docvars.
## features
## docs this is document one two
## text1 1 1 1 1 0
## text2 1 1 1 0 1
# Euclidean distance between the two document vectors
textstat_dist(doc_dfm, method="euclidean")
## textstat_dist object; method = "euclidean"
## text1 text2
## text1 0 1.41
## text2 1.41 0
# let's do the math... (replicate textstat_dist by hand)
# extract each document's row of feature counts as a plain vector
(d1 <- as.numeric(doc_dfm[1,]))
## [1] 1 1 1 1 0
(d2 <- as.numeric(doc_dfm[2,]))
## [1] 1 1 1 0 1
# Euclidean distance = square root of the summed squared differences
sqrt(sum((d1 - d2)^2))
## [1] 1.414214
Cosine similarity:
# cosine similarity between the same two documents
textstat_simil(doc_dfm, method="cosine")
## textstat_simil object; method = "cosine"
## text1 text2
## text1 1.00 0.75
## text2 0.75 1.00
# some more math... cosine = dot product divided by the product of
# the two vector norms
sum(d1 * d2) / ( sqrt(sum(d1^2)) * sqrt(sum(d2^2)) )
## [1] 0.75
Note that these two metrics measure opposite things: Euclidean distance measures how different documents are, whereas cosine similarity measures how similar documents are. For bounded, normalized measures the two are easy to interconvert — for example, cosine distance = (1 - cosine similarity) — though this simple relationship does not hold for unbounded distances such as Euclidean distance.
And here’s an example of how we would apply these metrics in practice. Let’s say I want to build a recommendation engine for Taylor Swift songs, obtained from this source.
library(readr)
# read the song lyrics dataset (132 rows, one per song)
# NOTE(review): path is relative to the working directory — confirm
# the data file lives at ../data/ when this is run
ts <- read_csv("../data/taylor_swift_lyrics.csv")
## Rows: 132 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Artist, Album, Title, Lyrics
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# now we find a song I like (its row index in the data)
song <- which(ts$Title == "You Need To Calm Down")
ts[song, ]
## # A tibble: 1 × 4
## Artist Album Title Lyrics
## <chr> <chr> <chr> <chr>
## 1 Taylor Swift Lover You Need To Calm Down "You are somebody that I don't know\…
# pre-process the data: one corpus document per song, named by title
corp <- corpus(ts, text_field = "Lyrics")
docnames(corp) <- docvars(corp)$Title
toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE)
# the dfm() 'remove' argument is deprecated (it triggers a warning);
# build the dfm first, then drop stopwords with dfm_remove()
tdfm <- dfm(toks, verbose = TRUE)
## Creating a dfm from a tokens input...
## ...lowercasing
## ...found 132 documents, 3,052 features
## ...complete, elapsed time: 0.036 seconds.
## Finished constructing a 132 x 3,052 sparse dfm.
tdfm <- dfm_remove(tdfm, stopwords("english"))
# after stopword removal: 132 documents, 2,891 features
# and I will compute cosine similarity of the chosen song with
# respect to all other songs
simil <- textstat_simil(x=tdfm,
y=tdfm[song,], method="cosine")
# sort from most to least similar; the query song itself ranks first
# with similarity 1
simil <- simil[order(simil, decreasing=TRUE),]
head(simil, n=5)
## You Need To Calm Down We Are Never Ever Getting Back Together
## 1.0000000 0.3860419
## State of Grace Come Back Be Here
## 0.2801466 0.2408928
## Invisible
## 0.2376642
# and we can read the lyrics of the closest match (position 2, right
# after the query song itself)
message(ts$Lyrics[ts$Title == names(simil)[2]])
## I remember when we broke up, the first time
## Saying, "This is it, I've had enough"
## 'Cause like we hadn't seen each other in a month
## When you, said you, needed space (what?)
## Then you come around again and say
## "Baby, I miss you and I swear I'm gonna change, trust me"
## Remember how that lasted for a day?
## I say, I hate you, we break up, you call me, I love you
## Ooh, oh-oh, oh-oh
## We called it off again last night
## But ooh, oh-oh, oh-oh
## This time I'm telling you, I'm telling you
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me
## But we are never ever, ever, ever getting back together
## Like, ever
## I'm really gonna miss you picking fights
## And me, falling for it screaming that I'm right
## And you, would hide away and find your peace of mind
## With some indie record that's much cooler than mine
## Ooh, oh-oh, oh-oh
## You called me up again tonight
## But ooh, oh-oh, oh-oh
## This time I'm telling you, I'm telling you
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me (talk to me)
## But, we are never ever, ever, ever getting back together
## Ooh, oh-oh, oh-oh yeah
## Ooh, oh-oh, oh-oh-oh yeah
## Ooh, oh-oh, oh-oh yeah
## Oh-oh-oh
## I used to think that we were forever ever, ever
## And I used to say, "Never say never"
## "Huh, so he calls me up and he's like, I still love you
## And I'm like I just I mean this is exhausting, you know
## Like we are never getting back together like, ever"
## We are never ever, ever getting back together
## We are never ever, ever getting back together
## You go talk to your friends, talk to my friends, talk to me
## But we are never ever ever ever getting back together
## (We) ooh, oh-oh, oh-oh
## Ooh, oh-oh, oh-oh (getting back together)
## (We) ooh, oh-oh, oh-oh
## Ooh, oh-oh, oh-oh (getting back together)
## You go talk to your friends, talk to my friends, talk to me (talk to me)
## But we are never ever, ever, ever getting back together
Keyness is a measure of the extent to which some features are specific to a document (or group of documents) in comparison to the rest of the corpus, while taking into account that some features may simply be too rare.
# Donald Trump: features most distinctive of the 2017 speech relative
# to all other inaugural speeches, ranked by chi-squared keyness
head(textstat_keyness(dfmat_inaug, target="2017-Trump",
measure="chi2"), n=20)
## feature chi2 p n_target n_reference
## 1 obama 175.47131 0.000000e+00 3 0
## 2 america 98.56898 0.000000e+00 18 184
## 3 politicians 94.45425 0.000000e+00 2 0
## 4 stops 94.45425 0.000000e+00 2 0
## 5 transferring 94.45425 0.000000e+00 2 0
## 6 trillions 94.45425 0.000000e+00 2 0
## 7 everyone 89.53885 0.000000e+00 4 7
## 8 dreams 88.59008 0.000000e+00 5 13
## 9 jobs 68.89993 1.110223e-16 4 10
## 10 back 68.70331 1.110223e-16 6 27
## 11 protected 67.46086 2.220446e-16 5 18
## 12 mountain 61.97813 3.441691e-15 2 1
## 13 winning 61.97813 3.441691e-15 2 1
## 14 factories 55.21629 1.080247e-13 3 6
## 15 across 51.74990 6.303846e-13 5 24
## 16 borders 49.21176 2.297718e-12 3 7
## 17 breath 45.74596 1.346268e-11 2 2
## 18 victories 45.74596 1.346268e-11 2 2
## 19 american 36.39465 1.611451e-09 11 161
## 20 righteous 36.01137 1.961690e-09 2 3
# visualize the same keyness statistics as a diverging bar chart
textplot_keyness(textstat_keyness(dfmat_inaug, target="2017-Trump",
measure="chi2"))
# Joe Biden: the same chi-squared keyness analysis for the 2021 speech
# NOTE(review): "." ranks third below because punctuation was never
# removed when dfmat_inaug was built — confirm whether that is intended
head(textstat_keyness(dfmat_inaug, target="2021-Biden",
measure="chi2"), n=20)
## feature chi2 p n_target n_reference
## 1 virus 155.77280 0.000000e+00 4 0
## 2 story 150.17410 0.000000e+00 9 14
## 3 . 136.22694 0.000000e+00 210 4945
## 4 uniting 105.72925 0.000000e+00 3 0
## 5 cry 78.05765 0.000000e+00 3 1
## 6 unity 66.28899 4.440892e-16 8 29
## 7 disagreement 56.79464 4.840572e-14 2 0
## 8 extremism 56.79464 4.840572e-14 2 0
## 9 harris 56.79464 4.840572e-14 2 0
## 10 pandemic 56.79464 4.840572e-14 2 0
## 11 versus 56.79464 4.840572e-14 2 0
## 12 democracy 52.56633 4.160006e-13 10 58
## 13 america 48.99512 2.565947e-12 18 184
## 14 days 46.78806 7.909340e-12 6 22
## 15 soul 43.22394 4.881962e-11 4 9
## 16 get 42.51329 7.020373e-11 3 4
## 17 prevailed 42.51329 7.020373e-11 3 4
## 18 another 40.52155 1.944591e-10 9 59
## 19 racism 36.87656 1.258497e-09 2 1
## 20 lost 33.88868 5.835730e-09 4 12
# and the corresponding keyness plot
textplot_keyness(textstat_keyness(dfmat_inaug, target="2021-Biden",
measure="chi2"))
Another example using song lyrics:
# keyness of the "reputation" album versus all other albums; here the
# target is a logical vector selecting the album's documents
head(textstat_keyness(tdfm,
target=docvars(tdfm, "Album")=="reputation"), n=20)
## feature chi2 p n_target n_reference
## 1 ha 156.72632 0.000000e+00 38 14
## 2 made 139.89533 0.000000e+00 51 40
## 3 getaway 126.14359 0.000000e+00 21 0
## 4 endgame 94.39328 0.000000e+00 16 0
## 5 delicate 87.03835 0.000000e+00 16 1
## 6 ooh 70.48511 0.000000e+00 36 42
## 7 gorgeous 69.02227 1.110223e-16 12 0
## 8 usin 69.02227 1.110223e-16 12 0
## 9 look 67.25179 2.220446e-16 46 70
## 10 hands 58.25555 2.298162e-14 23 20
## 11 reputation 55.74006 8.271162e-14 11 1
## 12 tied 54.92990 1.249001e-13 15 6
## 13 call 51.41185 7.488454e-13 26 30
## 14 begin 46.21898 1.057476e-11 12 4
## 15 rest 43.87590 3.498701e-11 10 2
## 16 car 43.63204 3.962974e-11 21 23
## 17 doin 43.35604 4.563261e-11 9 1
## 18 lord 43.35604 4.563261e-11 9 1
## 19 games 40.68419 1.789292e-10 13 7
## 20 bought 39.11303 3.999664e-10 10 3
# the same comparison for the "folklore" album
head(textstat_keyness(tdfm,
target=docvars(tdfm, "Album")=="folklore"), n=20)
## feature chi2 p n_target n_reference
## 1 woman 88.90694 0.000000e+00 11 0
## 2 gave 67.24075 2.220446e-16 17 13
## 3 pulled 62.24728 2.997602e-15 9 1
## 4 showed 59.75524 1.076916e-14 11 4
## 5 signs 57.23415 3.863576e-14 10 3
## 6 film 44.93604 2.035760e-11 7 1
## 7 august 44.27899 2.847556e-11 6 0
## 8 breathin 44.27899 2.847556e-11 6 0
## 9 seen 40.71022 1.765617e-10 12 11
## 10 warning 36.41510 1.594633e-09 6 1
## 11 give 35.83813 2.144101e-09 12 13
## 12 betty 35.40615 2.676408e-09 5 0
## 13 ruining 35.40615 2.676408e-09 5 0
## 14 would've 32.91234 9.640922e-09 9 7
## 15 marvelous 29.32065 6.133858e-08 7 4
## 16 someone's 28.04325 1.186342e-07 5 1
## 17 angry 26.58024 2.528159e-07 4 0
## 18 assume 26.58024 2.528159e-07 4 0
## 19 cardigan 26.58024 2.528159e-07 4 0
## 20 cursing 26.58024 2.528159e-07 4 0