word2vec

Word embeddings are a way to transform text into features. Instead of using vectors of word counts, each word is represented as a position in a latent multidimensional space. These positions are weights from an underlying deep learning model in which the use of a word is predicted from the words contiguous to it. The idea is that words with similar weights are likely to appear surrounded by the same words.
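
As a toy illustration (with made-up numbers, not values from any real model), each word simply becomes a numeric vector, and words used in similar contexts end up pointing in similar directions:

# toy example: three words as hand-made 3-dimensional vectors
toy <- rbind(
  king  = c( 0.9,  0.1,  0.3),
  queen = c( 0.8,  0.2,  0.4),
  apple = c(-0.5,  0.7,  0.0))
# words used in similar contexts ("king", "queen") have similar vectors;
# an unrelated word ("apple") does not
toy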

word2vec is a method to compute word embeddings developed by Google. There are others (e.g. GloVe), but word2vec is quite popular and we can use pre-trained models to speed up our analysis.

Let’s see what we can do with it using the rword2vec package in R. The examples here are based on the package materials, available on its GitHub repository.

library(devtools)
install_github("mukul13/rword2vec") # the package is installed from GitHub
install.packages("lsa")             # provides the cosine() function used below
install.packages("readr")           # used later to read the exported vectors
library(rword2vec)
library(lsa)
## Loading required package: SnowballC

This is how you would train the model. Note that this chunk of code will take a LONG time and may not work on your computer, so don’t run it. There are different ways to train the model (see ?word2vec for details).

model <- word2vec(
    train_file = "text8",    # plain-text training corpus
    output_file = "vec.bin", # file where the trained vectors are saved
    binary = 1,              # save the output in binary format
    num_threads = 3,         # number of parallel threads
    debug_mode = 1)          # print progress information while training
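
The training corpus used here, text8, is a standard sample of cleaned English Wikipedia text (about 100MB). If you do want to experiment with training, here is a sketch of how you might fetch it (assuming the usual mirror at mattmahoney.net is still available):

# download and unzip the text8 corpus (skip this if you just use the
# pre-trained vectors provided below)
download.file("http://mattmahoney.net/dc/text8.zip", "text8.zip")
unzip("text8.zip")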

To speed up the process, I’m providing a pre-trained model, available in data/vec.bin. We can now use it to run some analyses.

We’ll start by computing the most similar words to a given word, where similar means how close the two words are in the latent multidimensional space.

distance(file_name = "../data/vec.bin",
        search_word = "princess",
        num = 10)
## Entered word or sentence: princess
## 
## Word: princess  Position in vocabulary: 3419
##        word              dist
## 1   consort 0.734738826751709
## 2   heiress 0.718510031700134
## 3   duchess 0.715823769569397
## 4    prince 0.703364968299866
## 5   empress 0.690687596797943
## 6   matilda 0.688317775726318
## 7     queen 0.682406425476074
## 8  isabella 0.668479681015015
## 9  countess 0.665310502052307
## 10  dowager 0.662643551826477
distance(file_name = "../data/vec.bin",
    search_word = "terrible",
    num = 10)
## Entered word or sentence: terrible
## 
## Word: terrible  Position in vocabulary: 8301
##           word              dist
## 1       sorrow 0.621069073677063
## 2     ruthless 0.616687178611755
## 3        cruel 0.611717998981476
## 4  devastating 0.606187760829926
## 5     horrific 0.599025368690491
## 6      scourge 0.595880687236786
## 7        weary 0.586524903774261
## 8   pestilence 0.584030032157898
## 9       doomed 0.584006071090698
## 10   crippling 0.581335961818695
distance(file_name = "../data/vec.bin",
    search_word = "los angeles",
    num = 10)
## Entered word or sentence: los angeles
## 
## Word: los  Position in vocabulary: 1403
## 
## Word: angeles  Position in vocabulary: 1818
##          word              dist
## 1       altos 0.665398597717285
## 2    glendale 0.583160161972046
## 3    pasadena 0.581867218017578
## 4  california 0.576719701290131
## 5       vegas 0.568398952484131
## 6     burbank 0.567363500595093
## 7      alamos 0.566120028495789
## 8       cabos 0.560705065727234
## 9      llanos 0.545829594135284
## 10    anaheim 0.544156968593597
distance(file_name = "../data/vec.bin",
    search_word = "catalonia",
    num = 10)
## Entered word or sentence: catalonia
## 
## Word: catalonia  Position in vocabulary: 8318
##         word              dist
## 1     basque 0.734335660934448
## 2    galicia 0.732944011688232
## 3    andorra 0.717443227767944
## 4      ceuta 0.713729858398438
## 5    melilla  0.71345841884613
## 6     girona 0.698964536190033
## 7  andalusia 0.698002099990845
## 8   corsican 0.694753408432007
## 9    corsica 0.693952023983002
## 10   catalan 0.691003561019897
distance(file_name = "../data/vec.bin",
    search_word = "usc",
    num = 10)
## Entered word or sentence: usc
## 
## Word: usc  Position in vocabulary: 21407
##         word              dist
## 1       boca 0.570119798183441
## 2   grinnell 0.558408856391907
## 3   brandeis 0.528159260749817
## 4     pomona 0.524164497852325
## 5    hampton  0.52258312702179
## 6         uc 0.522114217281342
## 7     harvey 0.520093083381653
## 8     irvine 0.518377542495728
## 9  pensacola 0.517914593219757
## 10    vernon 0.517660856246948

Where do these similarities come from? Let’s extract the underlying word vectors.

# Extracting word vectors
bin_to_txt("../data/vec.bin", "../data/vector.txt")
## $rfile_name
## [1] "../data/vec.bin"
## 
## $routput_file
## [1] "../data/vector.txt"
library(readr)
data <- read_delim("../data/vector.txt", 
    skip=1, delim=" ",
    col_names=c("word", paste0("V", 1:100)))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   word = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 2)
## Warning: 1 parsing failure.
## # A tibble: 1 x 5
##     row   col    expected     actual                 file
##   <int> <chr>       <chr>      <chr>                <chr>
## 1 71238  <NA> 101 columns 68 columns '../data/vector.txt'
data[1:10, 1:6]
## # A tibble: 10 x 6
##     word        V1        V2        V3        V4        V5
##    <chr>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
##  1  </s>  0.004003  0.004419 -0.003830 -0.003278  0.001367
##  2   the  0.778298  1.076689 -0.004922  0.435960 -1.732718
##  3    of -0.249496 -0.099338 -0.684512  1.558972 -1.297621
##  4   and -0.735102  1.056327  0.603875  0.072289 -0.628802
##  5   one  1.334065  0.060792 -0.384789 -0.502668  0.064564
##  6    in -0.947346  0.844855 -0.979201  1.702073  0.190915
##  7     a  1.417365  1.266602 -1.660653  0.623197 -2.009520
##  8    to  1.352469 -0.377193 -2.087587  1.163166  1.247414
##  9  zero  0.540899 -1.070553  0.714852  0.217904  0.046385
## 10  nine  2.055000  0.017030 -0.602494 -1.583842  0.456879

That’s the value of each word on each of the first five (of 100) dimensions. We can plot some of these to get a better sense of what we’re working with:

plot_words <- function(words, data){
  # empty plot
  plot(0, 0, xlim=c(-2.5, 2.5), ylim=c(-2.5,2.5), type="n",
       xlab="First dimension", ylab="Second dimension")
  for (word in words){
    # extract first two dimensions
    vector <- as.numeric(data[data$word==word,2:3])
    # add to plot
    text(vector[1], vector[2], labels=word)
  }
}

plot_words(c("good", "better", "bad", "worse"), data)

plot_words(c("microsoft", "yahoo", "apple", "mango", "peach"), data)

plot_words(c("atheist", "agnostic", "catholic", "buddhist", "protestant", "christian"), data)

plot_words(c("microsoft", "yahoo", "apple", "mango", "peach"), data)

Once we have the vectors for each word, we can compute the similarity between a pair of words:

similarity <- function(word1, word2){
    # cosine similarity between the 100-dimensional vectors of two words
    lsa::cosine(
        x = as.numeric(data[data$word == word1, 2:101]),
        y = as.numeric(data[data$word == word2, 2:101]))
}

similarity("australia", "england")
##           [,1]
## [1,] 0.6319489
similarity("australia", "canada")
##           [,1]
## [1,] 0.6800522
similarity("australia", "apple")
##           [,1]
## [1,] 0.0300495
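
As a sanity check, the similarities computed from the extracted vectors should line up (up to rounding) with what distance() reported earlier; for example, this should be close to the 0.73 we saw for “princess” and “consort”:

# cosine similarity is scale-invariant, so it should not matter that the
# exported vectors are not normalized
similarity("princess", "consort")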

The final function provided by the package is word_analogy, which helps us find regularities in the word vector space:

word_analogy(file_name = "../data/vec.bin",
    search_words = "king queen man",
    num = 1)
## 
## Word: king  Position in vocabulary: 187
## 
## Word: queen  Position in vocabulary: 903
## 
## Word: man  Position in vocabulary: 243
##    word              dist
## 1 woman 0.670807123184204
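
Under the hood, the analogy is just arithmetic on the word vectors: take the vector for the second word, subtract the first, add the third, and look for the words closest to the result. Here is a rough sketch of that idea using the vectors we extracted earlier (analogy_sketch is a hypothetical helper, not the package’s implementation, which searches the binary file directly, but it should recover similar answers):

# hypothetical helper: nearest words to vec(b) - vec(a) + vec(c)
analogy_sketch <- function(a, b, c, data, num = 5){
  get_vec <- function(w) as.numeric(data[data$word == w, 2:101])
  target <- get_vec(b) - get_vec(a) + get_vec(c)
  # cosine similarity between the target vector and every word in the vocabulary
  sims <- apply(data[, 2:101], 1,
                function(v) lsa::cosine(target, as.numeric(v)))
  res <- data.frame(word = data$word, dist = as.numeric(sims))
  res <- res[!(res$word %in% c(a, b, c)), ]  # drop the query words themselves
  head(res[order(res$dist, decreasing = TRUE), ], num)
}

analogy_sketch("king", "queen", "man", data)

The packaged word_analogy() does this search much faster on the binary file; here are a few more examples:
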
word_analogy(file_name = "../data/vec.bin",
    search_words = "paris france berlin",
    num = 1)
## 
## Word: paris  Position in vocabulary: 1055
## 
## Word: france  Position in vocabulary: 303
## 
## Word: berlin  Position in vocabulary: 1360
##      word              dist
## 1 germany 0.818466305732727
word_analogy(file_name = "../data/vec.bin",
    search_words = "man woman uncle",
    num = 2)
## 
## Word: man  Position in vocabulary: 243
## 
## Word: woman  Position in vocabulary: 1012
## 
## Word: uncle  Position in vocabulary: 4206
##    word              dist
## 1 niece 0.729662358760834
## 2  aunt 0.729258477687836
word_analogy(file_name = "../data/vec.bin",
    search_words = "man actor woman",
    num = 5)
## 
## Word: man  Position in vocabulary: 243
## 
## Word: actor  Position in vocabulary: 461
## 
## Word: woman  Position in vocabulary: 1012
##          word              dist
## 1     actress 0.815776824951172
## 2      singer 0.705898344516754
## 3  comedienne 0.665390908718109
## 4  playwright 0.655908346176147
## 5 entertainer 0.655762135982513
word_analogy(file_name = "../data/vec.bin",
    search_words = "france paris california",
    num = 1)
## 
## Word: france  Position in vocabulary: 303
## 
## Word: paris  Position in vocabulary: 1055
## 
## Word: california  Position in vocabulary: 722
##       word              dist
## 1 glendale 0.635997474193573
word_analogy(file_name = "../data/vec.bin",
    search_words = "usc ucla berkeley",
    num = 5)
## 
## Word: usc  Position in vocabulary: 21407
## 
## Word: ucla  Position in vocabulary: 14954
## 
## Word: berkeley  Position in vocabulary: 2723
##           word              dist
## 1          mit 0.543582797050476
## 2           uc 0.529780268669128
## 3 paleontology 0.526954293251038
## 4       leiden 0.522942841053009
## 5     stanford 0.520716428756714

And we can see some examples of algorithmic bias (but really, bias in the training data):

word_analogy(file_name = "../data/vec.bin",
    search_words = "man woman professor",
    num = 1)
## 
## Word: man  Position in vocabulary: 243
## 
## Word: woman  Position in vocabulary: 1012
## 
## Word: professor  Position in vocabulary: 1750
##       word              dist
## 1 lecturer 0.671598970890045
word_analogy(file_name = "../data/vec.bin",
    search_words = "man doctor woman",
    num = 5)
## 
## Word: man  Position in vocabulary: 243
## 
## Word: doctor  Position in vocabulary: 1907
## 
## Word: woman  Position in vocabulary: 1012
##         word              dist
## 1      nurse 0.520112752914429
## 2      child 0.513970971107483
## 3 prostitute 0.485813647508621
## 4   divorcee 0.485542565584183
## 5    surgeon 0.468920856714249
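
One rough way to quantify this, using the similarity() helper defined above, is to compare how close words like “doctor” and “nurse” are to “man” versus “woman” (the exact numbers depend on the training corpus, but the asymmetry is the point):

# compare gendered associations learned from the training data
similarity("man", "doctor") - similarity("woman", "doctor")
similarity("man", "nurse")  - similarity("woman", "nurse")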