Beyond this type of exploratory analysis, word embeddings can be very useful in analyses of large-scale text corpora in two ways: to expand existing dictionaries, and to build features for a supervised learning classifier.
The code below shows how to expand a dictionary of uncivil words. By looking for other words with high semantic similarity to each of these terms, we can identify words we may not have thought of in the first place, either because they’re slang, new words, or just misspellings of existing words.
Here we will use a different set of pre-trained word embeddings, computed on a large corpus of public Facebook posts on the pages of US Members of Congress, which we collected from the Graph API.
#library(devtools)
#install_github("mukul13/rword2vec")
library(rword2vec)
library(lsa)
## Loading required package: SnowballC
distance(file_name = "../data/FBvec.bin",
         search_word = "liberal",
         num = 10)
## Entered word or sentence: liberal
##
## Word: liberal Position in vocabulary: 428
## word dist
## 1 leftist 0.875029563903809
## 2 lefty 0.808053195476532
## 3 lib 0.774020493030548
## 4 rightwing 0.768333077430725
## 5 progressive 0.766966998577118
## 6 left-wing 0.74224179983139
## 7 statist 0.741962492465973
## 8 right-wing 0.740352988243103
## 9 far-left 0.733825862407684
## 10 leftwing 0.715518414974213
distance(file_name = "../data/FBvec.bin",
         search_word = "crooked",
         num = 10)
## Entered word or sentence: crooked
##
## Word: crooked Position in vocabulary: 2225
## word dist
## 1 corrupt 0.782054841518402
## 2 thieving 0.683514535427094
## 3 slimy 0.675886511802673
## 4 teflon 0.669225692749023
## 5 crook 0.660020768642426
## 6 corupt 0.651829242706299
## 7 dishonest 0.645328283309937
## 8 conniving 0.636701285839081
## 9 corporatist 0.629674255847931
## 10 untrustworthy 0.623017013072968
distance(file_name = "../data/FBvec.bin",
         search_word = "libtard",
         num = 10)
## Entered word or sentence: libtard
##
## Word: libtard Position in vocabulary: 5753
## word dist
## 1 lib 0.798957586288452
## 2 lefty 0.771853387355804
## 3 libturd 0.762575328350067
## 4 teabagger 0.744283258914948
## 5 teabilly 0.715277075767517
## 6 liberal 0.709996342658997
## 7 retard 0.690707504749298
## 8 dumbass 0.690422177314758
## 9 rwnj 0.684058785438538
## 10 republitard 0.678197801113129
distance(file_name = "../data/FBvec.bin",
         search_word = "douchebag",
         num = 10)
## Entered word or sentence: douchebag
##
## Word: douchebag Position in vocabulary: 9781
## word dist
## 1 scumbag 0.808189928531647
## 2 moron 0.80128538608551
## 3 hypocrite 0.787607729434967
## 4 jackass 0.783857941627502
## 5 shitbag 0.773443937301636
## 6 pos 0.76619291305542
## 7 dipshit 0.757693469524384
## 8 loser 0.756536900997162
## 9 coward 0.755453526973724
## 10 poser 0.750370919704437
distance(file_name = "../data/FBvec.bin",
         search_word = "idiot",
         num = 10)
## Entered word or sentence: idiot
##
## Word: idiot Position in vocabulary: 646
## word dist
## 1 imbecile 0.867565214633942
## 2 asshole 0.848560094833374
## 3 moron 0.781079053878784
## 4 asshat 0.772150039672852
## 5 a-hole 0.765781462192535
## 6 ahole 0.760824918746948
## 7 asswipe 0.742586553096771
## 8 ignoramus 0.735219776630402
## 9 arsehole 0.732272684574127
## 10 idoit 0.720151424407959
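Rather than inspecting each seed word separately, we can pool the neighbors into a single candidate list for the expanded dictionary. Below is a minimal sketch of that step; the seed list, the num = 20 search depth, and the 0.70 similarity cutoff are all arbitrary choices of ours, and the resulting candidates should still be reviewed by hand before use.
# a sketch: pool the nearest neighbors of several seed words into one
# candidate list; the cutoff and search depth are arbitrary choices
seeds <- c("liberal", "crooked", "libtard", "douchebag", "idiot")
neighbors <- lapply(seeds, function(w){
  res <- distance(file_name = "../data/FBvec.bin", search_word = w, num = 20)
  res$word[as.numeric(res$dist) > 0.70]  # keep only close neighbors
})
uncivil_candidates <- unique(unlist(neighbors))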
We can also use the embeddings themselves as word-level features and aggregate them to the document level, as an alternative or complement to bag-of-words approaches.
Let’s see how this works by training a supervised learning classifier to predict incivility in a corpus of public Facebook comments.
library(quanteda)
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
fb <- read.csv("../data/incivility.csv", stringsAsFactors = FALSE)
fb %>%
  corpus(text_field="comment_message") %>%
  tokens(remove_url=TRUE) %>%
  dfm() %>%
  dfm_wordstem() %>%
  # note: stemming runs before stopword removal here, so stopwords whose
  # stemmed form differs from the original (e.g. "becaus") are not removed
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_docfreq = 2, verbose=TRUE) -> fbdfm
## Removing features occurring:
## - in fewer than 2 documents: 4,055
## Total features removed: 4,055 (49.5%).
To get a sense of how easy or difficult this task is, first let’s run a bag-of-words lasso classifier:
set.seed(777)
# 80% random sample for training; the remaining 20% is held out as a test set
training <- sample(1:nrow(fb), floor(.80 * nrow(fb)))
test <- setdiff(1:nrow(fb), training)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
require(doMC)
## Loading required package: doMC
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores=3)
lasso <- cv.glmnet(fbdfm[training,], fb$attacks[training],
                   family="binomial", alpha=1, nfolds=5, parallel=TRUE, intercept=TRUE)
# function to compute performance metrics
performance <- function(ypred, y){
  tab <- table(ypred, y) # rows = predicted class, columns = actual class
  accuracy <- (tab[1,1]+tab[2,2])/sum(tab)
  precision <- (tab[2,2])/(tab[2,1]+tab[2,2]) # TP / (TP + FP)
  recall <- tab[2,2]/(tab[1,2]+tab[2,2])      # TP / (TP + FN)
  message("Accuracy = ", round(accuracy, 2), "\n",
          "Precision = ", round(precision, 2), "\n",
          "Recall = ", round(recall, 2))
}
# computing predicted values
preds <- predict(lasso, fbdfm[test,], type="class")
# confusion matrix
table(preds, fb$attacks[test])
##
## preds 0 1
## 0 66 26
## 1 174 343
# performance metrics
performance(preds, fb$attacks[test])
## Accuracy = 0.67
## Precision = 0.66
## Recall = 0.93
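To see where these numbers come from, we can recompute them by hand from the confusion matrix above:
# recomputing the positive-class metrics from the confusion matrix by hand
(66 + 343) / (66 + 26 + 174 + 343) # accuracy: 409/609 = 0.67
343 / (174 + 343) # precision: true positives / predicted positives = 0.66
343 / (26 + 343)  # recall: true positives / actual positives = 0.93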
# same metrics, but treating civil comments (class 0) as the positive class
performance(preds==0, fb$attacks[test]==0)
## Accuracy = 0.67
## Precision = 0.72
## Recall = 0.28
These numbers reveal a rather lopsided classifier: recall is 0.93 for attacks but only 0.28 for civil comments, which means the model classifies most comments as attacks. Now let’s try adding word embeddings as features. To do so, first we will convert the word embeddings to a data frame, and then match the features of each document with their corresponding embeddings.
# convert the binary embeddings file to plain text so it can be read into R
bin_to_txt("../data/FBvec.bin", "../data/FBvector.txt")
## $rfile_name
## [1] "../data/FBvec.bin"
##
## $routput_file
## [1] "../data/FBvector.txt"
# extracting word embeddings for words in corpus
w2v <- readr::read_delim("../data/FBvector.txt",
                         skip=1, delim=" ", quote="",
                         col_names=c("word", paste0("V", 1:100)))
## Rows: 107384 Columns: 101
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): word
## dbl (100): V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# keeping only embeddings for words in corpus
fb %>%
  corpus(text_field="comment_message") %>%
  tokens(remove_url=TRUE, remove_punct=TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) -> fbdfm
w2v <- w2v[w2v$word %in% featnames(fbdfm),]
# let's do one comment as an example
fb$comment_message[3] # raw text
## [1] "Unbelievable level of incompetence!"
# bag-of-words DFM
vec <- as.numeric(fbdfm[3,])
# which words are not 0s?
(doc_words <- featnames(fbdfm)[vec>0])
## [1] "unbelievable" "level" "incompetence"
# let's extract the embeddings for those words
embed_vec <- w2v[w2v$word %in% doc_words, 2:101]
# a glimpse into the data
embed_vec[1:3, 1:10]
## # A tibble: 3 × 10
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -3.33 1.31 1.36 1.27 1.17 -1.36 -0.391 -0.106 0.525 -0.218
## 2 -0.471 0.485 -1.28 0.139 0.979 -0.643 -0.825 0.374 -1.64 -0.241
## 3 -2.34 -0.631 -0.368 0.0512 -1.20 -1.74 1.86 -1.91 0.460 0.353
# and now we aggregate to the comment level
embed <- colMeans(embed_vec)
# instead of feature counts, now this is how we represent the comment:
round(embed,2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## -2.04 0.39 -0.09 0.49 0.32 -1.25 0.22 -0.55 -0.22 -0.04 0.72 -0.25 -0.96
## V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## -2.15 -1.46 0.10 -0.40 -0.32 0.70 0.64 -1.72 0.54 -1.70 -0.23 -0.31 0.29
## V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39
## -0.41 2.29 -0.20 -0.90 -0.33 0.89 0.12 -0.12 -0.64 1.67 -0.68 1.09 2.99
## V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52
## 0.55 -1.92 0.94 -0.11 -0.07 -1.44 0.28 0.35 0.06 -0.25 -0.33 0.25 0.57
## V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64 V65
## -0.82 0.47 -1.49 1.15 0.67 0.19 0.44 0.33 1.21 -0.70 2.16 2.38 -1.72
## V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78
## 0.08 1.24 1.85 -0.70 0.96 -1.80 -0.14 -0.32 0.46 1.56 -0.35 -0.63 1.18
## V79 V80 V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91
## -0.95 -0.92 -0.47 0.26 -0.54 0.74 1.60 0.86 -0.38 0.00 -0.70 -1.09 2.00
## V92 V93 V94 V95 V96 V97 V98 V99 V100
## -0.55 -1.15 -0.59 1.24 -1.36 1.09 0.96 -0.59 -0.62
# now the same thing, but for all comments:
# creating new feature matrix for embeddings
embed <- matrix(NA, nrow=ndoc(fbdfm), ncol=100)
for (i in 1:ndoc(fbdfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(fbdfm))
  # extract word counts for this document
  vec <- as.numeric(fbdfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(fbdfm)[vec>0]
  # extract the embeddings for those words
  embed_vec <- w2v[w2v$word %in% doc_words, 2:101]
  # aggregate from word- to document-level embeddings by averaging
  embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
  # if none of the document's words has an embedding, set the vector to 0
  if (nrow(embed_vec)==0) embed[i,] <- 0
}
## 100/3043
## 200/3043
## 300/3043
## 400/3043
## 500/3043
## 600/3043
## 700/3043
## 800/3043
## 900/3043
## 1000/3043
## 1100/3043
## 1200/3043
## 1300/3043
## 1400/3043
## 1500/3043
## 1600/3043
## 1700/3043
## 1800/3043
## 1900/3043
## 2000/3043
## 2100/3043
## 2200/3043
## 2300/3043
## 2400/3043
## 2500/3043
## 2600/3043
## 2700/3043
## 2800/3043
## 2900/3043
## 3000/3043
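The loop is easy to follow but slow for large corpora. Below is a sketch of a vectorized equivalent; this is our addition, not part of the original code, and w2v_mat, bin, and embed_alt are hypothetical names. The boolean weighting makes each word type count once, reproducing the loop's unweighted average.
# a sketch of a vectorized equivalent: one sparse matrix product instead of
# a loop; boolean weights count each word type once, as in the loop above
w2v_mat <- as.matrix(w2v[, 2:101])
bin <- as(dfm_weight(dfm_match(fbdfm, features = w2v$word), scheme = "boolean"),
          "dgCMatrix")
embed_alt <- as.matrix(bin %*% w2v_mat) / pmax(rowSums(bin), 1)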
Let’s now re-estimate the same lasso classifier, this time using the 100 embedding dimensions as the feature set.
library(glmnet)
require(doMC)
registerDoMC(cores=3)
lasso <- cv.glmnet(embed[training,], fb$attacks[training],
                   family="binomial", alpha=1, nfolds=5, parallel=TRUE, intercept=TRUE)
# computing predicted values
preds <- predict(lasso, embed[test,], type="class")
# confusion matrix
table(preds, fb$attacks[test])
##
## preds 0 1
## 0 146 74
## 1 94 295
# performance metrics
performance(preds, fb$attacks[test])
## Accuracy = 0.72
## Precision = 0.76
## Recall = 0.8
# same metrics, but treating civil comments (class 0) as the positive class
performance(preds==0, fb$attacks[test]==0)
## Accuracy = 0.72
## Precision = 0.66
## Recall = 0.61
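Since embeddings were framed above as an alternative or complement to bag-of-words features, a natural extension is to concatenate both feature sets and let the lasso choose among them. The sketch below is our addition (results not shown; lasso_comb and preds_comb are hypothetical names):
# a sketch: combine the sparse bag-of-words matrix with the dense embedding
# features and refit the lasso on the concatenated feature set
library(Matrix)
combined <- cbind2(as(fbdfm, "dgCMatrix"), Matrix(embed, sparse = TRUE))
lasso_comb <- cv.glmnet(combined[training,], fb$attacks[training],
                        family="binomial", alpha=1, nfolds=5, parallel=TRUE)
preds_comb <- predict(lasso_comb, combined[test,], type="class")
performance(preds_comb, fb$attacks[test])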
Even on their own, the embeddings deliver quite good performance with a much smaller set of features: accuracy rises from 0.67 to 0.72 relative to the bag-of-words model, and precision and recall are far more balanced across the two classes. Of course, one downside of this approach is that the coefficients we get from the lasso regression are very hard to interpret:
# coefficients at the lambda selected by the one-standard-error rule
best.lambda <- which(lasso$lambda==lasso$lambda.1se)
beta <- lasso$glmnet.fit$beta[,best.lambda]
head(beta)
## V1 V2 V3 V4 V5 V6
## -0.009800617 -0.008686552 -0.107554636 0.000000000 -0.138759662 0.000000000
# identifying predictive features
df <- data.frame(coef = as.numeric(beta),
                 word = names(beta), stringsAsFactors=FALSE)
# dimensions with the largest negative coefficients (predictive of civil comments)
df <- df[order(df$coef),]
head(df[,c("coef", "word")], n=30)
## coef word
## 16 -0.198572472 V16
## 51 -0.157944923 V51
## 32 -0.146243445 V32
## 54 -0.142586548 V54
## 5 -0.138759662 V5
## 94 -0.124538221 V94
## 14 -0.122831676 V14
## 3 -0.107554636 V3
## 45 -0.091359423 V45
## 91 -0.091077579 V91
## 38 -0.078903086 V38
## 49 -0.069730874 V49
## 26 -0.063569656 V26
## 40 -0.056534605 V40
## 29 -0.050364729 V29
## 98 -0.023280724 V98
## 89 -0.013449683 V89
## 1 -0.009800617 V1
## 2 -0.008686552 V2
## 4 0.000000000 V4
## 6 0.000000000 V6
## 8 0.000000000 V8
## 9 0.000000000 V9
## 11 0.000000000 V11
## 18 0.000000000 V18
## 19 0.000000000 V19
## 22 0.000000000 V22
## 23 0.000000000 V23
## 24 0.000000000 V24
## 25 0.000000000 V25
# dimensions with the largest positive coefficients (predictive of attacks)
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "word")], n=30)
## coef word
## 83 0.4678749311 V83
## 36 0.3168821630 V36
## 39 0.2638583579 V39
## 86 0.1919064570 V86
## 28 0.1659097625 V28
## 82 0.1559398616 V82
## 20 0.1522701495 V20
## 30 0.1464040583 V30
## 12 0.1317909466 V12
## 47 0.1257574480 V47
## 10 0.1180020632 V10
## 21 0.0960462444 V21
## 97 0.0862319763 V97
## 77 0.0760793242 V77
## 43 0.0657338714 V43
## 17 0.0655732201 V17
## 88 0.0529407499 V88
## 78 0.0445873022 V78
## 7 0.0391439210 V7
## 13 0.0355189129 V13
## 81 0.0341516364 V81
## 76 0.0219520409 V76
## 37 0.0198285131 V37
## 99 0.0029831180 V99
## 58 0.0011690824 V58
## 15 0.0006897521 V15
## 4 0.0000000000 V4
## 6 0.0000000000 V6
## 8 0.0000000000 V8
## 9 0.0000000000 V9
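The dimensions themselves have no intrinsic meaning, but we can get a rough sense of what a predictive dimension captures by listing the words that score most extremely on it. As a sketch (our addition, not part of the original analysis), take V83, the dimension with the largest positive coefficient above:
# words with the most extreme values on embedding dimension V83, the
# strongest positive predictor of incivility in the lasso above
head(w2v$word[order(w2v$V83, decreasing = TRUE)], n = 15)
head(w2v$word[order(w2v$V83)], n = 15)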