Exercise: Supervised machine learning

Now let’s work with a dataset that contains all the tweets sent by Donald Trump, Ted Cruz, Hillary Clinton, and Bernie Sanders during the 2016 primary election campaign. Let’s pick Donald Trump and try to build a classifier to predict whether a tweet was published by him (from an Android device) or his campaign team (from an iPhone).

tweets <- read.csv('data/candidate-tweets.csv', stringsAsFactors=F)
# keeping only Donald Trump's tweets
tweets <- tweets[tweets$screen_name=="realDonaldTrump",]
# subsetting tweets sent in 2016
tweets$datetime <- as.POSIXct(tweets$datetime)
tweets <- tweets[tweets$datetime > as.POSIXct("2016-01-01 00:00:00"),]
# indicator for whether the tweet was sent from an Android device
tweets$android <- ifelse(grepl("Android", tweets$source), 1, 0)
prop.table(table(tweets$android))
## 
##         0         1 
## 0.4975705 0.5024295
# removing URLs and collapsing user handles to '@'
tweets$text <- gsub('https?://t.co/[A-Za-z0-9]+', '', tweets$text)
tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)

Create a training set and a test set, with 80% and 20% of the tweets, respectively.

set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
test <- (1:nrow(tweets))[-training] # all rows not sampled into the training set

Construct the document-feature matrix (DFM). You may want to experiment with different preprocessing choices to see whether they improve performance; one possible variation is sketched after the code below.

library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
twcorpus <- corpus(tweets$text)
twdfm <- dfm(twcorpus, remove_punct=TRUE, remove_numbers=TRUE, remove=c(
  stopwords("english"), "t.co", "https", "rt", "amp", "http", "t.c", "can"))
#twdfm <- dfm_trim(twdfm, min_docfreq = 2)
textplot_wordcloud(twdfm, rot.per=0, scale=c(3.5, .75), max.words=100)
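
One possible variation, as an example of the experimentation suggested above (a sketch, not part of the original solution): stem words and drop very rare features before adding any other columns. This assumes your quanteda version provides dfm_wordstem and dfm_trim; re-running the rest of the script with this matrix would change the output shown below.

# sketch: a stemmed, trimmed version of the dfm for comparison
twdfm2 <- dfm_wordstem(twdfm)               # collapse inflected forms, e.g. "winning" -> "win"
twdfm2 <- dfm_trim(twdfm2, min_docfreq = 2) # keep features that appear in at least 2 tweets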

# adding time of day as features
tweets$datetime[1]
## [1] "2016-01-01 00:11:35 GMT"
substr(tweets$datetime[1], 12, 13)
## [1] "00"
tweets$hour <- substr(tweets$datetime, 12, 13)
newdfm <- dfm(corpus(tweets$hour), verbose=TRUE)
## Creating a dfm from a corpus ...
##    ... tokenizing texts
##    ... lowercasing
##    ... found 2,058 documents, 24 features
##    ... created a 2,058 x 24 sparse dfm
##    ... complete. 
## Elapsed time: 0.001 seconds.
twdfm <- cbind(twdfm, newdfm)

Now run the classifier: a regularized (ridge) logistic regression, with the penalty chosen by cross-validation. Then compute accuracy, precision, and recall on the test set.

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
require(doMC)
## Loading required package: doMC
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores=3)
ridge <- cv.glmnet(twdfm[training,], tweets$android[training], 
    family="binomial", alpha=0, nfolds=5, parallel=TRUE,
    type.measure="deviance")
plot(ridge)
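
Before computing performance, it can be useful to check which values of the regularization parameter cross-validation selected (a quick look, not part of the original exercise); lambda.min is used again further below when inspecting the coefficients.

# regularization parameters chosen by cross-validation
ridge$lambda.min  # lambda minimizing cross-validated deviance
ridge$lambda.1se  # largest lambda within one standard error of the minimum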

# function to compute accuracy
accuracy <- function(ypred, y){
    tab <- table(ypred, y) # rows = predicted labels, columns = actual labels
    return(sum(diag(tab))/sum(tab))
}
# function to compute precision (class 1, Android, is the positive class)
precision <- function(ypred, y){
    tab <- table(ypred, y)
    return((tab[2,2])/(tab[2,1]+tab[2,2]))
}
# function to compute recall (class 1, Android, is the positive class)
recall <- function(ypred, y){
    tab <- table(ypred, y)
    return(tab[2,2]/(tab[1,2]+tab[2,2]))
}
# computing predicted values, using the share of Android tweets in the test set as the threshold
preds <- predict(ridge, twdfm[test,], type="response") > mean(tweets$android[test])
# confusion matrix
table(preds, tweets$android[test])
##        
## preds     0   1
##   FALSE 150  33
##   TRUE   63 166
# performance metrics
accuracy(preds, tweets$android[test])
## [1] 0.7669903
precision(preds, tweets$android[test])
## [1] 0.7248908
recall(preds, tweets$android[test])
## [1] 0.8341709
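
As a point of comparison (a sketch, not part of the original exercise), the same pipeline can be run with a lasso penalty (alpha=1) instead of ridge, keeping everything else identical, to check whether a sparser model performs comparably.

# sketch: same setup with a lasso penalty instead of ridge
lasso <- cv.glmnet(twdfm[training,], tweets$android[training],
    family="binomial", alpha=1, nfolds=5, parallel=TRUE,
    type.measure="deviance")
preds.lasso <- predict(lasso, twdfm[test,], type="response") > mean(tweets$android[test])
accuracy(preds.lasso, tweets$android[test])
precision(preds.lasso, tweets$android[test])
recall(preds.lasso, tweets$android[test])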

Identify the features that best predict whether a tweet was sent by the candidate himself (Android) or by his campaign (iPhone). What do you learn?

# from the different values of lambda, let's pick the best one
best.lambda <- which(ridge$lambda==ridge$lambda.min)
beta <- ridge$glmnet.fit$beta[,best.lambda]
head(beta)
##     believe       state  department         new      year's         eve 
##  0.11024005  0.07061251  0.19544650 -0.06800241  0.19544495  0.07059568
# looking at times of day (the 24 hour-of-day features were added last, so they are the last 24 coefficients)
times <- tail(beta, n=24)
df <- data.frame(coef = as.numeric(times),
                time = as.numeric(names(times)), stringsAsFactors=F)
# shifting hours from GMT to (approximately) US Eastern time
df$time <- df$time - 4
df <- df[order(df$time),]
plot(df$time, df$coef)
lines(df$time, df$coef)

## identifying predictive features
df <- data.frame(coef = as.numeric(beta),
                word = names(beta), stringsAsFactors=F)

df <- df[order(df$coef),]
head(df[,c("coef", "word")], n=30)
##            coef                  word
## 1354 -0.3192295              powerful
## 3132 -0.3158853               develop
## 2091 -0.3007428                  150m
## 3620 -0.3003854            completely
## 1260 -0.2974106             listeners
## 1903 -0.2973349           #commoncore
## 2975 -0.2948349               devotes
## 1519 -0.2913559                 greta
## 3398 -0.2894576             scratched
## 1940 -0.2879420                  cred
## 396  -0.2874535                  cams
## 1386 -0.2863421                   owe
## 1888 -0.2855663                   9am
## 2212 -0.2855495             insurance
## 2845 -0.2846090             disrupted
## 2019 -0.2830376               cheater
## 1731 -0.2825024                barely
## 1417 -0.2802746                trough
## 3609 -0.2777939                fading
## 3610 -0.2777930             guarantee
## 2633 -0.2773057                beaten
## 966  -0.2763225              ultimate
## 859  -0.2750178                 tiger
## 3314 -0.2748297 makeamericagreatagain
## 2167 -0.2739679            mouthpiece
## 3633 -0.2719811      #happymothersday
## 2470 -0.2702062                   dry
## 572  -0.2690262                   25k
## 1587 -0.2684620            particular
## 1490 -0.2661460            domination
paste(df$word[1:30], collapse=", ")
## [1] "powerful, develop, 150m, completely, listeners, #commoncore, devotes, greta, scratched, cred, cams, owe, 9am, insurance, disrupted, cheater, barely, trough, fading, guarantee, beaten, ultimate, tiger, makeamericagreatagain, mouthpiece, #happymothersday, dry, 25k, particular, domination"
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "word")], n=30)
##           coef                  word
## 2107 0.3289978               forties
## 1859 0.3116942                   led
## 306  0.3033276                #iowan
## 2012 0.3009381              thankyou
## 1316 0.2959314          introduction
## 2061 0.2934836 #southcarolinaprimary
## 3540 0.2931940                mother
## 1017 0.2920383              comments
## 1841 0.2909955              #hannity
## 3062 0.2897049               corrupt
## 2945 0.2891977                    ty
## 3341 0.2887989             postcards
## 1964 0.2882788             hesitates
## 870  0.2867488             pensacola
## 3515 0.2864294             highlight
## 1356 0.2856473            presidents
## 2839 0.2851345                dayton
## 3194 0.2803789                easter
## 3374 0.2800565             rochester
## 406  0.2789853      #trump4president
## 2097 0.2789834               crushed
## 526  0.2786570                advice
## 3395 0.2784396              listened
## 2462 0.2783732                  plus
## 1525 0.2778416            corruption
## 2621 0.2760796                 saved
## 89   0.2741340              huckabee
## 759  0.2737709           appreciated
## 755  0.2737707              harrison
## 2796 0.2728532               #donald
paste(df$word[1:30], collapse=", ")
## [1] "forties, led, #iowan, thankyou, introduction, #southcarolinaprimary, mother, comments, #hannity, corrupt, ty, postcards, hesitates, pensacola, highlight, presidents, dayton, easter, rochester, #trump4president, crushed, advice, listened, plus, corruption, saved, huckabee, appreciated, harrison, #donald"

Now let's try a different classifier: gradient boosted decision trees, using xgboost.

library(xgboost)
# converting the dfm to a sparse Matrix object that xgboost accepts
X <- as(twdfm, "dgCMatrix")
# running xgboost: grid search over learning rate (eta) and tree depth
tryEta <- c(1, 2)
tryDepths <- c(1, 2, 4)
bestEta <- NA
bestDepth <- NA
bestAcc <- 0

for (eta in tryEta){
  for (dp in tryDepths){
    bst <- xgb.cv(data = X[training,],
        label = tweets$android[training],
        max.depth = dp,
        eta = eta,
        nthread = 4,
        nround = 500,
        nfold = 5,
        print_every_n = 100L,
        objective = "binary:logistic")
    # cross-validated accuracy, averaged over the last few rounds
    acc <- 1 - mean(tail(bst$evaluation_log$test_error_mean))
    cat("Results for eta=", eta, " and depth=", dp, " : ",
        acc, " accuracy.\n", sep="")
    if (acc > bestAcc){
      bestEta <- eta
      bestAcc <- acc
      bestDepth <- dp
    }
  }
}
## [1]  train-error:0.380923+0.004654   test-error:0.381522+0.019081 
## [101]    train-error:0.145657+0.004986   test-error:0.230870+0.013304 
## [201]    train-error:0.123481+0.004432   test-error:0.230265+0.010951 
## [301]    train-error:0.111633+0.003370   test-error:0.235728+0.011407 
## [401]    train-error:0.107229+0.002443   test-error:0.239974+0.012574 
## [500]    train-error:0.105255+0.002264   test-error:0.240588+0.014248 
## Results for eta=1 and depth=1 : 0.7595125 accuracy.
## [1]  train-error:0.338397+0.003534   test-error:0.341444+0.013915 
## [101]    train-error:0.101458+0.006612   test-error:0.225386+0.020369 
## [201]    train-error:0.075638+0.006506   test-error:0.232068+0.030892 
## [301]    train-error:0.060145+0.005704   test-error:0.241787+0.029654 
## [401]    train-error:0.050121+0.005102   test-error:0.244207+0.028989 
## [500]    train-error:0.041160+0.004692   test-error:0.239961+0.022329 
## Results for eta=1 and depth=2 : 0.7596323 accuracy.
## [1]  train-error:0.287515+0.005548   test-error:0.301337+0.025027 
## [101]    train-error:0.059234+0.004844   test-error:0.233284+0.016773 
## [201]    train-error:0.029617+0.001856   test-error:0.232062+0.022037 
## [301]    train-error:0.017922+0.001833   test-error:0.232665+0.023195 
## [401]    train-error:0.013518+0.001306   test-error:0.233886+0.022333 
## [500]    train-error:0.010784+0.001310   test-error:0.233890+0.022706 
## Results for eta=1 and depth=4 : 0.765909 accuracy.
## [1]  train-error:0.380922+0.004714   test-error:0.381509+0.019930 
## [101]    train-error:0.470373+0.032332   test-error:0.470241+0.009927 
## [201]    train-error:0.470373+0.032332   test-error:0.470241+0.009927 
## [301]    train-error:0.470373+0.032332   test-error:0.470241+0.009927 
## [401]    train-error:0.470373+0.032332   test-error:0.470241+0.009927 
## [500]    train-error:0.470373+0.032332   test-error:0.470241+0.009927 
## Results for eta=2 and depth=1 : 0.5297594 accuracy.
## [1]  train-error:0.338394+0.006524   test-error:0.340794+0.029435 
## [101]    train-error:0.455956+0.037992   test-error:0.444056+0.044872 
## [201]    train-error:0.465379+0.045655   test-error:0.431329+0.023306 
## [301]    train-error:0.465379+0.045655   test-error:0.431329+0.023306 
## [401]    train-error:0.465379+0.045655   test-error:0.431329+0.023306 
## [500]    train-error:0.465379+0.045655   test-error:0.431329+0.023306 
## Results for eta=2 and depth=2 : 0.568671 accuracy.
## [1]  train-error:0.286757+0.004916   test-error:0.305602+0.020300 
## [101]    train-error:0.489513+0.031601   test-error:0.479346+0.026067 
## [201]    train-error:0.489513+0.031601   test-error:0.479346+0.026067 
## [301]    train-error:0.489513+0.031601   test-error:0.479346+0.026067 
## [401]    train-error:0.489513+0.031601   test-error:0.479346+0.026067 
## [500]    train-error:0.489513+0.031601   test-error:0.479346+0.026067 
## Results for eta=2 and depth=4 : 0.5206538 accuracy.
cat("Best model has eta=",bestEta," and depth=", bestDepth, " : ",
    bestAcc," accuracy.\n",sep="")
## Best model has eta=1 and depth=4 : 0.765909 accuracy.
# running best model
rf <- xgboost(data = X[training,],
    label = tweets$android[training],
    max.depth = bestDepth,
    eta = bestEta,
    nthread = 4,
    nround = 1000,
    print_every_n = 100L,
    objective = "binary:logistic")
## [1]  train-error:0.288578 
## [101]    train-error:0.066221 
## [201]    train-error:0.036452 
## [301]    train-error:0.024301 
## [401]    train-error:0.015796 
## [501]    train-error:0.012151 
## [601]    train-error:0.011543 
## [701]    train-error:0.009113 
## [801]    train-error:0.009113 
## [901]    train-error:0.008505 
## [1000]   train-error:0.008505
# out-of-sample accuracy
preds <- predict(rf, X[test,])
cat("\nAccuracy on test set=", round(accuracy(preds>.50, tweets$android[test]),3))
## 
## Accuracy on test set= 0.762
cat("\nPrecision on test set=", round(precision(preds>.50, tweets$android[test]),3))
## 
## Precision on test set= 0.739
cat("\nRecall on test set=", round(recall(preds>.50, tweets$android[test]),3))
## 
## Recall on test set= 0.784
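
The 0.50 cutoff above is only a convention; as a quick sketch (reusing the predictions and the metric functions defined earlier), you can check how precision and recall trade off as the threshold moves.

# sketch: precision/recall trade-off across classification thresholds
for (cutoff in seq(0.3, 0.7, by=0.1)){
    cat("cutoff=", cutoff,
        " precision=", round(precision(preds > cutoff, tweets$android[test]), 3),
        " recall=", round(recall(preds > cutoff, tweets$android[test]), 3), "\n")
}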
# feature importance
labels <- dimnames(X)[[2]]
importance <- xgb.importance(labels, model = rf, data=X, label=tweets$android)
## Warning in `[.data.table`(result, , `:=`("RealCover", as.numeric(vec)), :
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.
importance <- importance[order(importance$Gain, decreasing=TRUE),]
head(importance, n=20)
##                    Feature        Split        Gain        Cover
##  1:                  thank          1.5 0.092877123 0.0057196530
##  2:                      u           20 0.037401798 0.0033220662
##  3:                     12 -9.53674e-07 0.035671053 0.0030841464
##  4: #makeamericagreatagain -9.53674e-07 0.035616359 0.0116694675
##  5:                     13 -9.53674e-07 0.026439178 0.0042178725
##  6:                     14 -9.53674e-07 0.023764903 0.0060561334
##  7:                     04 -9.53674e-07 0.022778167 0.0079180063
##  8:                   join -9.53674e-07 0.022765320 0.0012606874
##  9:                     15 -9.53674e-07 0.018111213 0.0115206824
## 10:                     11 -9.53674e-07 0.017749135 0.0022013085
## 11:             #votetrump -9.53674e-07 0.011454005 0.0100246841
## 12:                     17 -9.53674e-07 0.011290095 0.0099398695
## 13:                     19 -9.53674e-07 0.010888909 0.0089646516
## 14:                    job            2 0.010593182 0.0007286837
## 15:                  trump          1.5 0.010454531 0.0043770919
## 16:                  trump            4 0.010186134 0.0075704737
## 17:                  video          1.5 0.009648655 0.0026244868
## 18:                     20 -9.53674e-07 0.009555074 0.0106000105
## 19:                     18 -9.53674e-07 0.008393567 0.0119837285
## 20:                   much          1.5 0.008362693 0.0044849840
##        Frequency RealCover RealCover %
##  1: 0.0064355408        31 0.029980658
##  2: 0.0029063733         9 0.008704062
##  3: 0.0014531866        44 0.042553191
##  4: 0.0141166701         9 0.008704062
##  5: 0.0026987752        57 0.055125725
##  6: 0.0068507370        44 0.042553191
##  7: 0.0091343160        39 0.037717602
##  8: 0.0004151962         1 0.000967118
##  9: 0.0128710816        42 0.040618956
## 10: 0.0008303924        15 0.014506770
## 11: 0.0089267179         3 0.002901354
## 12: 0.0101723064        20 0.019342360
## 13: 0.0085115217        12 0.011605416
## 14: 0.0008303924         6 0.005802708
## 15: 0.0083039236        81 0.078336557
## 16: 0.0114178950        91 0.088007737
## 17: 0.0012455885         1 0.000967118
## 18: 0.0091343160        11 0.010638298
## 19: 0.0110026988         9 0.008704062
## 20: 0.0026987752        15 0.014506770
# adding sign: which class (0 = iPhone/campaign, 1 = Android/Trump) uses each feature more often
sums <- list()
for (v in 0:1){
    sums[[v+1]] <- colSums(X[tweets[,"android"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
    
df <- data.frame(
    Feature = labels, 
    sign = sign-1,
    stringsAsFactors=F)
importance <- merge(importance, df, by="Feature")
    
## best predictors
for (v in 0:1){
    cat("\n\n")
    cat("value==", v)
    importance <- importance[order(importance$Gain, decreasing=TRUE),]
    print(head(importance[importance$sign==v,], n=50))
    cat("\n")
    cat(paste(unique(head(importance$Feature[importance$sign==v], n=50)), collapse=", "))
}
## 
## 
## value== 0                   Feature        Split        Gain        Cover
##  1:                  thank          1.5 0.092877123 5.719653e-03
##  2:                      u           20 0.037401798 3.322066e-03
##  3: #makeamericagreatagain -9.53674e-07 0.035616359 1.166947e-02
##  4:                   join -9.53674e-07 0.022765320 1.260687e-03
##  5:             #votetrump -9.53674e-07 0.011454005 1.002468e-02
##  6:                     17 -9.53674e-07 0.011290095 9.939869e-03
##  7:                     19 -9.53674e-07 0.010888909 8.964652e-03
##  8:                  video          1.5 0.009648655 2.624487e-03
##  9:                     20 -9.53674e-07 0.009555074 1.060001e-02
## 10:                     18 -9.53674e-07 0.008393567 1.198373e-02
## 11:                     22 -9.53674e-07 0.008118831 1.128236e-02
## 12:                     23 -9.53674e-07 0.007378553 7.782440e-03
## 13:                     21 -9.53674e-07 0.006812569 8.269242e-03
## 14:               tomorrow -9.53674e-07 0.006386310 4.554853e-03
## 15:                  thank            2 0.005838967 1.790418e-03
## 16:                      w            4 0.005771816 2.203474e-03
## 17:                    via -9.53674e-07 0.005421917 6.324302e-04
## 18:                     00 -9.53674e-07 0.005226998 9.963289e-03
## 19:               national          1.5 0.005016697 3.267454e-03
## 20:                   soon -9.53674e-07 0.004640665 2.824576e-03
## 21:           pennsylvania -9.53674e-07 0.003906521 6.338297e-04
## 22:                  #fitn -9.53674e-07 0.003850067 1.088315e-03
## 23:                  times -9.53674e-07 0.003808673 2.505758e-03
## 24:                     06 -9.53674e-07 0.003459074 1.270131e-02
## 25:                   york            4 0.003230675 1.615261e-03
## 26:                massive -9.53674e-07 0.003222589 4.239371e-03
## 27:              dishonest -9.53674e-07 0.003043879 1.660732e-03
## 28:             #gopdebate -9.53674e-07 0.002850391 1.320380e-03
## 29:                      t            4 0.002741548 1.497644e-03
## 30:                    est -9.53674e-07 0.002736103 6.295781e-04
## 31:                     02 -9.53674e-07 0.002635378 7.877000e-03
## 32:                   poll            4 0.002615047 4.156703e-03
## 33:                     us          1.5 0.002540507 5.170648e-03
## 34:             supporters -9.53674e-07 0.002533213 1.013577e-03
## 35:                     01 -9.53674e-07 0.002503560 6.196699e-03
## 36:                support -9.53674e-07 0.002478543 1.510386e-03
## 37:                     07 -9.53674e-07 0.002426384 5.197319e-03
## 38:                forward -9.53674e-07 0.002398556 2.589300e-03
## 39:                  event -9.53674e-07 0.002296520 1.017616e-03
## 40:              zuckerman -9.53674e-07 0.002267034 1.018324e-03
## 41:          #supertuesday -9.53674e-07 0.002189334 1.036561e-03
## 42:             university -9.53674e-07 0.002099887 1.012314e-03
## 43:              elizabeth -9.53674e-07 0.002047364 6.001713e-04
## 44:                      u          3.5 0.001993550 4.991130e-03
## 45:                     us            4 0.001928396 5.954970e-03
## 46:                goldman -9.53674e-07 0.001895827 5.541513e-05
## 47:                    new            2 0.001753643 9.252950e-05
## 48:                    new            4 0.001738081 4.795232e-03
## 49:                tuesday -9.53674e-07 0.001726909 1.305572e-03
## 50:                      u          4.5 0.001593165 7.859458e-03
##                    Feature        Split        Gain        Cover
##        Frequency RealCover RealCover % sign
##  1: 0.0064355408        31 0.029980658    0
##  2: 0.0029063733         9 0.008704062    0
##  3: 0.0141166701         9 0.008704062    0
##  4: 0.0004151962         1 0.000967118    0
##  5: 0.0089267179         3 0.002901354    0
##  6: 0.0101723064        20 0.019342360    0
##  7: 0.0085115217        12 0.011605416    0
##  8: 0.0012455885         1 0.000967118    0
##  9: 0.0091343160        11 0.010638298    0
## 10: 0.0110026988         9 0.008704062    0
## 11: 0.0122482873        20 0.019342360    0
## 12: 0.0074735312         9 0.008704062    0
## 13: 0.0074735312        11 0.010638298    0
## 14: 0.0039443637         3 0.002901354    0
## 15: 0.0062279427        29 0.028046422    0
## 16: 0.0014531866         3 0.002901354    0
## 17: 0.0004151962         1 0.000967118    0
## 18: 0.0107951007        16 0.015473888    0
## 19: 0.0020759809         3 0.002901354    0
## 20: 0.0016607847         6 0.005802708    0
## 21: 0.0002075981         0 0.000000000    0
## 22: 0.0004151962         0 0.000000000    0
## 23: 0.0012455885         2 0.001934236    0
## 24: 0.0116254930        10 0.009671180    0
## 25: 0.0012455885         6 0.005802708    0
## 26: 0.0029063733         2 0.001934236    0
## 27: 0.0008303924         5 0.004835590    0
## 28: 0.0006227943         3 0.002901354    0
## 29: 0.0006227943         0 0.000000000    0
## 30: 0.0002075981         0 0.000000000    0
## 31: 0.0095495121        27 0.026112186    0
## 32: 0.0039443637        14 0.013539652    0
## 33: 0.0041519618         9 0.008704062    0
## 34: 0.0004151962         3 0.002901354    0
## 35: 0.0058127465         6 0.005802708    0
## 36: 0.0010379905         8 0.007736944    0
## 37: 0.0041519618         4 0.003868472    0
## 38: 0.0016607847         3 0.002901354    0
## 39: 0.0004151962         3 0.002901354    0
## 40: 0.0004151962         0 0.000000000    0
## 41: 0.0004151962         1 0.000967118    0
## 42: 0.0004151962         2 0.001934236    0
## 43: 0.0002075981         0 0.000000000    0
## 44: 0.0049823542        12 0.011605416    0
## 45: 0.0047747561         9 0.008704062    0
## 46: 0.0002075981         1 0.000967118    0
## 47: 0.0004151962        22 0.021276596    0
## 48: 0.0062279427        23 0.022243714    0
## 49: 0.0006227943         1 0.000967118    0
## 50: 0.0074735312         8 0.007736944    0
##        Frequency RealCover RealCover % sign
## 
## thank, u, #makeamericagreatagain, join, #votetrump, 17, 19, video, 20, 18, 22, 23, 21, tomorrow, w, via, 00, national, soon, pennsylvania, #fitn, times, 06, york, massive, dishonest, #gopdebate, t, est, 02, poll, us, supporters, 01, support, 07, forward, event, zuckerman, #supertuesday, university, elizabeth, goldman, new, tuesday
## 
## value== 1        Feature        Split        Gain        Cover    Frequency
##  1:          12 -9.53674e-07 0.035671053 0.0030841464 0.0014531866
##  2:          13 -9.53674e-07 0.026439178 0.0042178725 0.0026987752
##  3:          14 -9.53674e-07 0.023764903 0.0060561334 0.0068507370
##  4:          04 -9.53674e-07 0.022778167 0.0079180063 0.0091343160
##  5:          15 -9.53674e-07 0.018111213 0.0115206824 0.0128710816
##  6:          11 -9.53674e-07 0.017749135 0.0022013085 0.0008303924
##  7:         job            2 0.010593182 0.0007286837 0.0008303924
##  8:       trump          1.5 0.010454531 0.0043770919 0.0083039236
##  9:       trump            4 0.010186134 0.0075704737 0.0114178950
## 10:        much          1.5 0.008362693 0.0044849840 0.0026987752
## 11:         win            4 0.008085170 0.0045746190 0.0039443637
## 12:         wow -9.53674e-07 0.007057593 0.0079298144 0.0070583351
## 13:        just            2 0.007028191 0.0002922437 0.0026987752
## 14: donaldtrump -9.53674e-07 0.006725871 0.0012135612 0.0004151962
## 15:         big            6 0.006155326 0.0050688491 0.0045671580
## 16:        said -9.53674e-07 0.005159194 0.0159992828 0.0155698568
## 17:        will          1.5 0.005128423 0.0026561235 0.0141166701
## 18:         p.m -9.53674e-07 0.005075584 0.0015805046 0.0006227943
## 19:         one -9.53674e-07 0.005063292 0.0101493802 0.0087191198
## 20:        cruz          1.5 0.004893105 0.0049469326 0.0066431389
## 21:     hillary            4 0.004793922 0.0043360535 0.0041519618
## 22:         got          1.5 0.004772025 0.0027929498 0.0016607847
## 23:        best -9.53674e-07 0.004388935 0.0061054452 0.0043595599
## 24:          05 -9.53674e-07 0.004352847 0.0078121825 0.0078887274
## 25:          03 -9.53674e-07 0.004344592 0.0111161729 0.0130786797
## 26:        will            4 0.004342077 0.0006710725 0.0101723064
## 27:  candidates            4 0.003944141 0.0010757651 0.0004151962
## 28:       women            4 0.003813570 0.0009857106 0.0004151962
## 29:         u.s -9.53674e-07 0.003744277 0.0009874371 0.0004151962
## 30:     america            2 0.003710449 0.0002708185 0.0024911771
## 31:        will            2 0.003697765 0.0006765452 0.0105875026
## 32:        cruz            2 0.003473759 0.0003713714 0.0037367656
## 33:      voting -9.53674e-07 0.003427715 0.0018093515 0.0008303924
## 34:       great          1.5 0.003293033 0.0036453397 0.0099647083
## 35:    millions -9.53674e-07 0.003228836 0.0010363900 0.0004151962
## 36:        many            4 0.003225302 0.0037570719 0.0033215694
## 37:         job            4 0.003117017 0.0028094398 0.0020759809
## 38:          16 -9.53674e-07 0.003102981 0.0114184405 0.0114178950
## 39:    carolina -9.53674e-07 0.003099213 0.0051447175 0.0039443637
## 40:         won -9.53674e-07 0.003021040 0.0026760425 0.0018683828
## 41:       media -9.53674e-07 0.003005130 0.0024866458 0.0014531866
## 42:       rally -9.53674e-07 0.002840934 0.0049259275 0.0039443637
## 43:       happy -9.53674e-07 0.002687876 0.0014763303 0.0008303924
## 44:   president            6 0.002623698 0.0031940929 0.0029063733
## 45:     nothing -9.53674e-07 0.002579016 0.0083539180 0.0072659332
## 46:        know -9.53674e-07 0.002523419 0.0020380226 0.0010379905
## 47:     looking -9.53674e-07 0.002486268 0.0027605050 0.0014531866
## 48:       great            2 0.002446437 0.0007049091 0.0095495121
## 49: lightweight -9.53674e-07 0.002421978 0.0024289129 0.0020759809
## 50:        bill            4 0.002389521 0.0009176661 0.0004151962
##         Feature        Split        Gain        Cover    Frequency
##     RealCover RealCover % sign
##  1:        44 0.042553191    1
##  2:        57 0.055125725    1
##  3:        44 0.042553191    1
##  4:        39 0.037717602    1
##  5:        42 0.040618956    1
##  6:        15 0.014506770    1
##  7:         6 0.005802708    1
##  8:        81 0.078336557    1
##  9:        91 0.088007737    1
## 10:        15 0.014506770    1
## 11:        19 0.018375242    1
## 12:        24 0.023210832    1
## 13:        50 0.048355899    1
## 14:         6 0.005802708    1
## 15:        31 0.029980658    1
## 16:        24 0.023210832    1
## 17:       100 0.096711799    1
## 18:         4 0.003868472    1
## 19:        21 0.020309478    1
## 20:        69 0.066731141    1
## 21:        41 0.039651838    1
## 22:         9 0.008704062    1
## 23:         6 0.005802708    1
## 24:        22 0.021276596    1
## 25:        28 0.027079304    1
## 26:       103 0.099613153    1
## 27:         8 0.007736944    1
## 28:         7 0.006769826    1
## 29:        15 0.014506770    1
## 30:        38 0.036750484    1
## 31:        99 0.095744681    1
## 32:        62 0.059961315    1
## 33:         8 0.007736944    1
## 34:        78 0.075435203    1
## 35:        10 0.009671180    1
## 36:        22 0.021276596    1
## 37:        10 0.009671180    1
## 38:        27 0.026112186    1
## 39:        14 0.013539652    1
## 40:        12 0.011605416    1
## 41:        14 0.013539652    1
## 42:         7 0.006769826    1
## 43:         3 0.002901354    1
## 44:        16 0.015473888    1
## 45:         8 0.007736944    1
## 46:         6 0.005802708    1
## 47:        11 0.010638298    1
## 48:        87 0.084139265    1
## 49:         6 0.005802708    1
## 50:         1 0.000967118    1
##     RealCover RealCover % sign
## 
## 12, 13, 14, 04, 15, 11, job, trump, much, win, wow, just, donaldtrump, big, said, will, p.m, one, cruz, hillary, got, best, 05, 03, candidates, women, u.s, america, voting, great, millions, many, 16, carolina, won, media, rally, happy, president, nothing, know, looking, lightweight, bill