Beyond this type of exploratory analysis, word embeddings can be very useful in analyses of large-scale text corpora in two ways: to expand existing dictionaries, and to build features for a supervised learning classifier.
The code below shows how to expand a dictionary of uncivil words. By looking for other words with high semantic similarity to each of these terms, we can identify words we may not have thought of in the first place, either because they’re slang, new words, or just misspellings of existing words.
Here we will use a different set of pre-trained word embeddings, computed on a large corpus of public Facebook posts on the pages of US Members of Congress, which we collected from the Graph API.
#library(devtools)
#install_github("mukul13/rword2vec")
library(rword2vec)
library(lsa)
## Loading required package: SnowballC
distance(file_name = "../data/FBvec.bin",
         search_word = "liberal",
         num = 10)
## Entered word or sentence: liberal
##
## Word: liberal Position in vocabulary: 428
## word dist
## 1 leftist 0.875029563903809
## 2 lefty 0.808053195476532
## 3 lib 0.774020493030548
## 4 rightwing 0.768333077430725
## 5 progressive 0.766966998577118
## 6 left-wing 0.74224179983139
## 7 statist 0.741962492465973
## 8 right-wing 0.740352988243103
## 9 far-left 0.733825862407684
## 10 leftwing 0.715518414974213
distance(file_name = "../data/FBvec.bin",
         search_word = "crooked",
         num = 10)
## Entered word or sentence: crooked
##
## Word: crooked Position in vocabulary: 2225
## word dist
## 1 corrupt 0.782054841518402
## 2 thieving 0.683514535427094
## 3 slimy 0.675886511802673
## 4 teflon 0.669225692749023
## 5 crook 0.660020768642426
## 6 corupt 0.651829242706299
## 7 dishonest 0.645328283309937
## 8 conniving 0.636701285839081
## 9 corporatist 0.629674255847931
## 10 untrustworthy 0.623017013072968
distance(file_name = "../data/FBvec.bin",
         search_word = "libtard",
         num = 10)
## Entered word or sentence: libtard
##
## Word: libtard Position in vocabulary: 5753
## word dist
## 1 lib 0.798957586288452
## 2 lefty 0.771853387355804
## 3 libturd 0.762575328350067
## 4 teabagger 0.744283258914948
## 5 teabilly 0.715277075767517
## 6 liberal 0.709996342658997
## 7 retard 0.690707504749298
## 8 dumbass 0.690422177314758
## 9 rwnj 0.684058785438538
## 10 republitard 0.678197801113129
distance(file_name = "../data/FBvec.bin",
         search_word = "douchebag",
         num = 10)
## Entered word or sentence: douchebag
##
## Word: douchebag Position in vocabulary: 9781
## word dist
## 1 scumbag 0.808189928531647
## 2 moron 0.80128538608551
## 3 hypocrite 0.787607729434967
## 4 jackass 0.783857941627502
## 5 shitbag 0.773443937301636
## 6 pos 0.76619291305542
## 7 dipshit 0.757693469524384
## 8 loser 0.756536900997162
## 9 coward 0.755453526973724
## 10 poser 0.750370919704437
distance(file_name = "../data/FBvec.bin",
         search_word = "idiot",
         num = 10)
## Entered word or sentence: idiot
##
## Word: idiot Position in vocabulary: 646
## word dist
## 1 imbecile 0.867565214633942
## 2 asshole 0.848560094833374
## 3 moron 0.781079053878784
## 4 asshat 0.772150039672852
## 5 a-hole 0.765781462192535
## 6 ahole 0.760824918746948
## 7 asswipe 0.742586553096771
## 8 ignoramus 0.735219776630402
## 9 arsehole 0.732272684574127
## 10 idoit 0.720151424407959
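Rather than inspecting each seed word separately, we can pool the neighbors into a single candidate list for the expanded dictionary. Below is a minimal sketch of that step; the seed list, the num = 20 search depth, and the 0.70 similarity cutoff are all arbitrary choices of ours, and the resulting candidates should still be reviewed by hand before use.
# a sketch: pool the nearest neighbors of several seed words into one
# candidate list; the cutoff and search depth are arbitrary choices
seeds <- c("liberal", "crooked", "libtard", "douchebag", "idiot")
neighbors <- lapply(seeds, function(w){
  res <- distance(file_name = "../data/FBvec.bin", search_word = w, num = 20)
  res$word[as.numeric(res$dist) > 0.70]  # keep only close neighbors
})
uncivil_candidates <- unique(unlist(neighbors))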
We can also use the embeddings themselves as word-level features and aggregate them to the document level, as an alternative or complement to bag-of-words approaches.
Let’s see how this works by training a supervised learning classifier to predict incivility in a corpus of public Facebook comments.
library(quanteda)
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
fb <- read.csv("../data/incivility.csv", stringsAsFactors = FALSE)
fb %>%
  corpus(text_field="comment_message") %>%
  tokens(remove_url=TRUE) %>%
  dfm() %>%
  dfm_wordstem() %>%
  # note: stemming runs before stopword removal here, so stopwords whose
  # stemmed form differs from the original (e.g. "becaus") are not removed
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_docfreq = 2, verbose=TRUE) -> fbdfm
## Removing features occurring:
## - in fewer than 2 documents: 4,055
## Total features removed: 4,055 (49.5%).
To get a sense of how easy or difficult this task is, first let’s run a bag-of-words lasso classifier:
set.seed(777)
# 80% random sample for training; the remaining 20% is held out as a test set
training <- sample(1:nrow(fb), floor(.80 * nrow(fb)))
test <- setdiff(1:nrow(fb), training)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
require(doMC)
## Loading required package: doMC
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores=3)
lasso <- cv.glmnet(fbdfm[training,], fb$attacks[training],
                   family="binomial", alpha=1, nfolds=5, parallel=TRUE, intercept=TRUE)
# function to compute performance metrics
performance <- function(ypred, y){
  tab <- table(ypred, y) # rows = predicted class, columns = actual class
  accuracy <- (tab[1,1]+tab[2,2])/sum(tab)
  precision <- (tab[2,2])/(tab[2,1]+tab[2,2]) # TP / (TP + FP)
  recall <- tab[2,2]/(tab[1,2]+tab[2,2])      # TP / (TP + FN)
  message("Accuracy = ", round(accuracy, 2), "\n",
          "Precision = ", round(precision, 2), "\n",
          "Recall = ", round(recall, 2))
}
# computing predicted values
preds <- predict(lasso, fbdfm[test,], type="class")
# confusion matrix
table(preds, fb$attacks[test])
##
## preds 0 1
## 0 66 26
## 1 174 343
# performance metrics
performance(preds, fb$attacks[test])
## Accuracy = 0.67
## Precision = 0.66
## Recall = 0.93
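To see where these numbers come from, we can recompute them by hand from the confusion matrix above:
# recomputing the positive-class metrics from the confusion matrix by hand
(66 + 343) / (66 + 26 + 174 + 343) # accuracy: 409/609 = 0.67
343 / (174 + 343) # precision: true positives / predicted positives = 0.66
343 / (26 + 343)  # recall: true positives / actual positives = 0.93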
# same metrics, but treating civil comments (class 0) as the positive class
performance(preds==0, fb$attacks[test]==0)
## Accuracy = 0.67
## Precision = 0.72
## Recall = 0.28
These numbers reveal a rather lopsided classifier: recall is 0.93 for attacks but only 0.28 for civil comments, which means the model classifies most comments as attacks. Now let’s try adding word embeddings as features. To do so, first we will convert the word embeddings to a data frame, and then match the features of each document with their corresponding embeddings.
# convert the binary embeddings file to plain text so it can be read into R
bin_to_txt("../data/FBvec.bin", "../data/FBvector.txt")
## $rfile_name
## [1] "../data/FBvec.bin"
##
## $routput_file
## [1] "../data/FBvector.txt"
# extracting word embeddings for words in corpus
w2v <- readr::read_delim("../data/FBvector.txt",
                         skip=1, delim=" ", quote="",
                         col_names=c("word", paste0("V", 1:100)))
## Rows: 107384 Columns: 101
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): word
## dbl (100): V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# keeping only embeddings for words in corpus
fb %>%
  corpus(text_field="comment_message") %>%
  tokens(remove_url=TRUE, remove_punct=TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) -> fbdfm
w2v <- w2v[w2v$word %in% featnames(fbdfm),]
# let's do one comment as an example
fb$comment_message[3] # raw text
## [1] "Unbelievable level of incompetence!"
# bag-of-words DFM
vec <- as.numeric(fbdfm[3,])
# which words are not 0s?
(doc_words <- featnames(fbdfm)[vec>0])
## [1] "unbelievable" "level" "incompetence"
# let's extract the embeddings for those words
embed_vec <- w2v[w2v$word %in% doc_words, 2:101]
# a glimpse into the data
embed_vec[1:3, 1:10]
## # A tibble: 3 × 10
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -3.33 1.31 1.36 1.27 1.17 -1.36 -0.391 -0.106 0.525 -0.218
## 2 -0.471 0.485 -1.28 0.139 0.979 -0.643 -0.825 0.374 -1.64 -0.241
## 3 -2.34 -0.631 -0.368 0.0512 -1.20 -1.74 1.86 -1.91 0.460 0.353
# and now we aggregate to the comment level
embed <- colMeans(embed_vec)
# instead of feature counts, now this is how we represent the comment:
round(embed,2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## -2.04 0.39 -0.09 0.49 0.32 -1.25 0.22 -0.55 -0.22 -0.04 0.72 -0.25 -0.96
## V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## -2.15 -1.46 0.10 -0.40 -0.32 0.70 0.64 -1.72 0.54 -1.70 -0.23 -0.31 0.29
## V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39
## -0.41 2.29 -0.20 -0.90 -0.33 0.89 0.12 -0.12 -0.64 1.67 -0.68 1.09 2.99
## V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52
## 0.55 -1.92 0.94 -0.11 -0.07 -1.44 0.28 0.35 0.06 -0.25 -0.33 0.25 0.57
## V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64 V65
## -0.82 0.47 -1.49 1.15 0.67 0.19 0.44 0.33 1.21 -0.70 2.16 2.38 -1.72
## V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78
## 0.08 1.24 1.85 -0.70 0.96 -1.80 -0.14 -0.32 0.46 1.56 -0.35 -0.63 1.18
## V79 V80 V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91
## -0.95 -0.92 -0.47 0.26 -0.54 0.74 1.60 0.86 -0.38 0.00 -0.70 -1.09 2.00
## V92 V93 V94 V95 V96 V97 V98 V99 V100
## -0.55 -1.15 -0.59 1.24 -1.36 1.09 0.96 -0.59 -0.62
# now the same thing, but for all comments:
# creating new feature matrix for embeddings
embed <- matrix(NA, nrow=ndoc(fbdfm), ncol=100)
for (i in 1:ndoc(fbdfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(fbdfm))
  # extract word counts for this document
  vec <- as.numeric(fbdfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(fbdfm)[vec>0]
  # extract the embeddings for those words
  embed_vec <- w2v[w2v$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by averaging
  embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
  # if none of the document's words has an embedding, set the vector to 0
  if (nrow(embed_vec)==0) embed[i,] <- 0
}
## 100/3043
## 200/3043
## 300/3043
## 400/3043
## 500/3043
## 600/3043
## 700/3043
## 800/3043
## 900/3043
## 1000/3043
## 1100/3043
## 1200/3043
## 1300/3043
## 1400/3043
## 1500/3043
## 1600/3043
## 1700/3043
## 1800/3043
## 1900/3043
## 2000/3043
## 2100/3043
## 2200/3043
## 2300/3043
## 2400/3043
## 2500/3043
## 2600/3043
## 2700/3043
## 2800/3043
## 2900/3043
## 3000/3043
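The loop is easy to follow but slow for large corpora. Below is a sketch of a vectorized equivalent; this is our addition, not part of the original code, and w2v_mat, bin, and embed_alt are hypothetical names. The boolean weighting makes each word type count once, reproducing the loop's unweighted average.
# a sketch of a vectorized equivalent: one sparse matrix product instead of
# a loop; boolean weights count each word type once, as in the loop above
w2v_mat <- as.matrix(w2v[, 2:101])
bin <- as(dfm_weight(dfm_match(fbdfm, features = w2v$word), scheme = "boolean"),
          "dgCMatrix")
embed_alt <- as.matrix(bin %*% w2v_mat) / pmax(rowSums(bin), 1)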
Let’s now re-estimate the same lasso classifier, this time using the 100 embedding dimensions as the feature set.
library(glmnet)
require(doMC)
registerDoMC(cores=3)
lasso <- cv.glmnet(embed[training,], fb$attacks[training],
                   family="binomial", alpha=1, nfolds=5, parallel=TRUE, intercept=TRUE)
# computing predicted values
preds <- predict(lasso, embed[test,], type="class")
# confusion matrix
table(preds, fb$attacks[test])
##
## preds 0 1
## 0 146 74
## 1 94 295
# performance metrics
performance(preds, fb$attacks[test])
## Accuracy = 0.72
## Precision = 0.76
## Recall = 0.8
# same metrics, but treating civil comments (class 0) as the positive class
performance(preds==0, fb$attacks[test]==0)
## Accuracy = 0.72
## Precision = 0.66
## Recall = 0.61
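Since embeddings were framed above as an alternative or complement to bag-of-words features, a natural extension is to concatenate both feature sets and let the lasso choose among them. The sketch below is our addition (results not shown; lasso_comb and preds_comb are hypothetical names):
# a sketch: combine the sparse bag-of-words matrix with the dense embedding
# features and refit the lasso on the concatenated feature set
library(Matrix)
combined <- cbind2(as(fbdfm, "dgCMatrix"), Matrix(embed, sparse = TRUE))
lasso_comb <- cv.glmnet(combined[training,], fb$attacks[training],
                        family="binomial", alpha=1, nfolds=5, parallel=TRUE)
preds_comb <- predict(lasso_comb, combined[test,], type="class")
performance(preds_comb, fb$attacks[test])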
Even on their own, the embeddings deliver quite good performance with a much smaller set of features: accuracy rises from 0.67 to 0.72 relative to the bag-of-words model, and precision and recall are far more balanced across the two classes. Of course, one downside of this approach is that the coefficients we get from the lasso regression are very hard to interpret:
# coefficients at the lambda selected by the one-standard-error rule
best.lambda <- which(lasso$lambda==lasso$lambda.1se)
beta <- lasso$glmnet.fit$beta[,best.lambda]
head(beta)
## V1 V2 V3 V4 V5 V6
## -0.009800617 -0.008686552 -0.107554636 0.000000000 -0.138759662 0.000000000
# identifying predictive features
df <- data.frame(coef = as.numeric(beta),
                 word = names(beta), stringsAsFactors=FALSE)
# dimensions with the largest negative coefficients (predictive of civil comments)
df <- df[order(df$coef),]
head(df[,c("coef", "word")], n=30)
## coef word
## 16 -0.198572472 V16
## 51 -0.157944923 V51
## 32 -0.146243445 V32
## 54 -0.142586548 V54
## 5 -0.138759662 V5
## 94 -0.124538221 V94
## 14 -0.122831676 V14
## 3 -0.107554636 V3
## 45 -0.091359423 V45
## 91 -0.091077579 V91
## 38 -0.078903086 V38
## 49 -0.069730874 V49
## 26 -0.063569656 V26
## 40 -0.056534605 V40
## 29 -0.050364729 V29
## 98 -0.023280724 V98
## 89 -0.013449683 V89
## 1 -0.009800617 V1
## 2 -0.008686552 V2
## 4 0.000000000 V4
## 6 0.000000000 V6
## 8 0.000000000 V8
## 9 0.000000000 V9
## 11 0.000000000 V11
## 18 0.000000000 V18
## 19 0.000000000 V19
## 22 0.000000000 V22
## 23 0.000000000 V23
## 24 0.000000000 V24
## 25 0.000000000 V25
# dimensions with the largest positive coefficients (predictive of attacks)
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "word")], n=30)
## coef word
## 83 0.4678749311 V83
## 36 0.3168821630 V36
## 39 0.2638583579 V39
## 86 0.1919064570 V86
## 28 0.1659097625 V28
## 82 0.1559398616 V82
## 20 0.1522701495 V20
## 30 0.1464040583 V30
## 12 0.1317909466 V12
## 47 0.1257574480 V47
## 10 0.1180020632 V10
## 21 0.0960462444 V21
## 97 0.0862319763 V97
## 77 0.0760793242 V77
## 43 0.0657338714 V43
## 17 0.0655732201 V17
## 88 0.0529407499 V88
## 78 0.0445873022 V78
## 7 0.0391439210 V7
## 13 0.0355189129 V13
## 81 0.0341516364 V81
## 76 0.0219520409 V76
## 37 0.0198285131 V37
## 99 0.0029831180 V99
## 58 0.0011690824 V58
## 15 0.0006897521 V15
## 4 0.0000000000 V4
## 6 0.0000000000 V6
## 8 0.0000000000 V8
## 9 0.0000000000 V9
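The dimensions themselves have no intrinsic meaning, but we can get a rough sense of what a predictive dimension captures by listing the words that score most extremely on it. As a sketch (our addition, not part of the original analysis), take V83, the dimension with the largest positive coefficient above:
# words with the most extreme values on embedding dimension V83, the
# strongest positive predictor of incivility in the lasso above
head(w2v$word[order(w2v$V83, decreasing = TRUE)], n = 15)
head(w2v$word[order(w2v$V83)], n = 15)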