Now let’s work with a dataset that contains all the tweets sent by Donald Trump, Ted Cruz, Hillary Clinton, and Bernie Sanders during the 2016 primary election campaign. Let’s pick Donald Trump and try to build a classifier to predict whether a tweet was published by him (from an Android device) or his campaign team (from an iPhone).
tweets <- read.csv('data/candidate-tweets.csv', stringsAsFactors=F)
tweets <- tweets[tweets$screen_name=="realDonaldTrump",]
# subsetting tweets in 2016
tweets$datetime <- as.POSIXct(tweets$datetime)
tweets <- tweets[tweets$datetime > as.POSIXct("2016-01-01 00:00:00"),]
# variable measuring if tweet is coming from an Android device
tweets$android <- ifelse(grepl("Android", tweets$source), 1, 0)
prop.table(table(tweets$android))
##
## 0 1
## 0.4975705 0.5024295
# removing URLs and handles
tweets$text <- gsub('https?://t.co/[A-Za-z0-9]+', '', tweets$text)
tweets$text <- gsub('@[0-9_A-Za-z]+', '@', tweets$text)
Create a training set and a test set, with 80% and 20% of the tweets, respectively.
set.seed(123)
training <- sample(1:nrow(tweets), floor(.80 * nrow(tweets)))
test <- setdiff(1:nrow(tweets), training)
Construct the document-feature matrix (DFM). You may want to experiment with different preprocessing techniques to see whether they improve performance; a couple of variations are sketched after the word cloud below.
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
twcorpus <- corpus(tweets$text)
twdfm <- dfm(twcorpus, remove_punct=TRUE, remove_numbers=TRUE,
             remove=c(stopwords("english"), "t.co", "https", "rt", "amp", "http", "t.c", "can"))
#twdfm <- dfm_trim(twdfm, min_docfreq = 2)
textplot_wordcloud(twdfm, rot.per=0, scale=c(3.5, .75), max.words=100)
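The commented-out dfm_trim() call above is one such preprocessing variation (dropping very rare features); stemming is another. A minimal sketch of both, assuming your quanteda version includes dfm_wordstem():
# stem words so that e.g. "winning" and "wins" collapse into a single feature
twdfm_stemmed <- dfm_wordstem(twdfm)
# drop features that appear in fewer than 2 tweets
twdfm_trimmed <- dfm_trim(twdfm, min_docfreq = 2)
dim(twdfm); dim(twdfm_stemmed); dim(twdfm_trimmed)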
# adding time of day as features
tweets$datetime[1]
## [1] "2016-01-01 00:11:35 GMT"
substr(tweets$datetime[1], 12, 13)
## [1] "00"
tweets$hour <- substr(tweets$datetime, 12, 13)
newdfm <- dfm(corpus(tweets$hour), verbose=TRUE)
## Creating a dfm from a corpus ...
## ... tokenizing texts
## ... lowercasing
## ... found 2,058 documents, 24 features
## ... created a 2,058 x 24 sparse dfm
## ... complete.
## Elapsed time: 0.001 seconds.
twdfm <- cbind(twdfm, newdfm)
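As a quick sanity check, the last 24 columns of the combined dfm should now be the hour-of-day indicators; the step later on that extracts the last 24 coefficients relies on this ordering. A minimal sketch, assuming featnames() is available in your quanteda version:
dim(twdfm)
tail(featnames(twdfm), n=24)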
Now run the classifier. Then, compute its performance on the test set: accuracy, precision, and recall.
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
require(doMC)
## Loading required package: doMC
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores=3)
ridge <- cv.glmnet(twdfm[training,], tweets$android[training],
                   family="binomial", alpha=0, nfolds=5, parallel=TRUE,
                   type.measure="deviance")
plot(ridge)
# function to compute accuracy
accuracy <- function(ypred, y){
  tab <- table(ypred, y)
  return(sum(diag(tab))/sum(tab))
}
# function to compute precision
precision <- function(ypred, y){
  tab <- table(ypred, y)
  return((tab[2,2])/(tab[2,1]+tab[2,2]))
}
# function to compute recall
recall <- function(ypred, y){
  tab <- table(ypred, y)
  return(tab[2,2]/(tab[1,2]+tab[2,2]))
}
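If you want a single number that balances precision and recall, you could add an F1 helper in the same style (a minimal sketch, built on the two functions above):
# function to compute F1 score (harmonic mean of precision and recall)
f1 <- function(ypred, y){
  pr <- precision(ypred, y)
  re <- recall(ypred, y)
  return(2 * pr * re / (pr + re))
}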
# computing predicted values (thresholding at the test-set share of Android tweets rather than at 0.50)
preds <- predict(ridge, twdfm[test,], type="response") > mean(tweets$android[test])
# confusion matrix
table(preds, tweets$android[test])
##
## preds 0 1
## FALSE 150 33
## TRUE 63 166
# performance metrics
accuracy(preds, tweets$android[test])
## [1] 0.7669903
precision(preds, tweets$android[test])
## [1] 0.7248908
recall(preds, tweets$android[test])
## [1] 0.8341709
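To put these numbers in context, it helps to compare them with the baseline of always predicting the most common class in the test set (a minimal sketch):
# baseline accuracy: always predict the modal class
max(prop.table(table(tweets$android[test])))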
Identify the features that best predict whether tweets were sent by the candidate himself. What do you learn?
# from the different values of lambda, let's pick the best one
best.lambda <- which(ridge$lambda==ridge$lambda.min)
beta <- ridge$glmnet.fit$beta[,best.lambda]
head(beta)
##     believe       state  department         new      year's         eve
##  0.11024005  0.07061251  0.19544650 -0.06800241  0.19544495  0.07059568
# looking at times of day
times <- tail(beta, n=24)
df <- data.frame(coef = as.numeric(times),
                 time = as.numeric(names(times)), stringsAsFactors=F)
# shifting hours back by 4 (tweets are time-stamped in GMT)
df$time <- df$time - 4
df <- df[order(df$time),]
plot(df$time, df$coef)
lines(df$time, df$coef)
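A slightly more readable version of the same plot, with the axes spelled out (a sketch; the labels are my own):
plot(df$time, df$coef, type="l",
     xlab="hour of the day (GMT-4)",
     ylab="ridge coefficient (positive = more likely Android)")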
# identifying predictive features
df <- data.frame(coef = as.numeric(beta),
                 word = names(beta), stringsAsFactors=F)
df <- df[order(df$coef),]
head(df[,c("coef", "word")], n=30)
## coef word
## 1354 -0.3192295 powerful
## 3132 -0.3158853 develop
## 2091 -0.3007428 150m
## 3620 -0.3003854 completely
## 1260 -0.2974106 listeners
## 1903 -0.2973349 #commoncore
## 2975 -0.2948349 devotes
## 1519 -0.2913559 greta
## 3398 -0.2894576 scratched
## 1940 -0.2879420 cred
## 396 -0.2874535 cams
## 1386 -0.2863421 owe
## 1888 -0.2855663 9am
## 2212 -0.2855495 insurance
## 2845 -0.2846090 disrupted
## 2019 -0.2830376 cheater
## 1731 -0.2825024 barely
## 1417 -0.2802746 trough
## 3609 -0.2777939 fading
## 3610 -0.2777930 guarantee
## 2633 -0.2773057 beaten
## 966 -0.2763225 ultimate
## 859 -0.2750178 tiger
## 3314 -0.2748297 makeamericagreatagain
## 2167 -0.2739679 mouthpiece
## 3633 -0.2719811 #happymothersday
## 2470 -0.2702062 dry
## 572 -0.2690262 25k
## 1587 -0.2684620 particular
## 1490 -0.2661460 domination
paste(df$word[1:30], collapse=", ")
## [1] "powerful, develop, 150m, completely, listeners, #commoncore, devotes, greta, scratched, cred, cams, owe, 9am, insurance, disrupted, cheater, barely, trough, fading, guarantee, beaten, ultimate, tiger, makeamericagreatagain, mouthpiece, #happymothersday, dry, 25k, particular, domination"
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "word")], n=30)
## coef word
## 2107 0.3289978 forties
## 1859 0.3116942 led
## 306 0.3033276 #iowan
## 2012 0.3009381 thankyou
## 1316 0.2959314 introduction
## 2061 0.2934836 #southcarolinaprimary
## 3540 0.2931940 mother
## 1017 0.2920383 comments
## 1841 0.2909955 #hannity
## 3062 0.2897049 corrupt
## 2945 0.2891977 ty
## 3341 0.2887989 postcards
## 1964 0.2882788 hesitates
## 870 0.2867488 pensacola
## 3515 0.2864294 highlight
## 1356 0.2856473 presidents
## 2839 0.2851345 dayton
## 3194 0.2803789 easter
## 3374 0.2800565 rochester
## 406 0.2789853 #trump4president
## 2097 0.2789834 crushed
## 526 0.2786570 advice
## 3395 0.2784396 listened
## 2462 0.2783732 plus
## 1525 0.2778416 corruption
## 2621 0.2760796 saved
## 89 0.2741340 huckabee
## 759 0.2737709 appreciated
## 755 0.2737707 harrison
## 2796 0.2728532 #donald
paste(df$word[1:30], collapse=", ")
## [1] "forties, led, #iowan, thankyou, introduction, #southcarolinaprimary, mother, comments, #hannity, corrupt, ty, postcards, hesitates, pensacola, highlight, presidents, dayton, easter, rochester, #trump4president, crushed, advice, listened, plus, corruption, saved, huckabee, appreciated, harrison, #donald"
Now let's try a different classifier: gradient boosted decision trees, as implemented in the xgboost package.
library(xgboost)
# converting the dfm into a sparse matrix class (dgCMatrix) that xgboost accepts
X <- as(twdfm, "dgCMatrix")
# grid search over learning rate (eta) and tree depth, with 5-fold cross-validation
# note: eta values above 1 are unusually aggressive; see how they perform below
tryEta <- c(1, 2)
tryDepths <- c(1, 2, 4)
bestEta <- NA
bestDepth <- NA
bestAcc <- 0
for (eta in tryEta){
  for (dp in tryDepths){
    bst <- xgb.cv(data = X[training,],
                  label = tweets$android[training],
                  max.depth = dp,
                  eta = eta,
                  nthread = 4,
                  nround = 500,
                  nfold = 5,
                  print_every_n = 100L,
                  objective = "binary:logistic")
    # cross-validated accuracy
    acc <- 1 - mean(tail(bst$evaluation_log$test_error_mean))
    cat("Results for eta=", eta, " and depth=", dp, " : ",
        acc, " accuracy.\n", sep="")
    if (acc > bestAcc){
      bestEta <- eta
      bestAcc <- acc
      bestDepth <- dp
    }
  }
}
## [1] train-error:0.380923+0.004654 test-error:0.381522+0.019081
## [101] train-error:0.145657+0.004986 test-error:0.230870+0.013304
## [201] train-error:0.123481+0.004432 test-error:0.230265+0.010951
## [301] train-error:0.111633+0.003370 test-error:0.235728+0.011407
## [401] train-error:0.107229+0.002443 test-error:0.239974+0.012574
## [500] train-error:0.105255+0.002264 test-error:0.240588+0.014248
## Results for eta=1 and depth=1 : 0.7595125 accuracy.
## [1] train-error:0.338397+0.003534 test-error:0.341444+0.013915
## [101] train-error:0.101458+0.006612 test-error:0.225386+0.020369
## [201] train-error:0.075638+0.006506 test-error:0.232068+0.030892
## [301] train-error:0.060145+0.005704 test-error:0.241787+0.029654
## [401] train-error:0.050121+0.005102 test-error:0.244207+0.028989
## [500] train-error:0.041160+0.004692 test-error:0.239961+0.022329
## Results for eta=1 and depth=2 : 0.7596323 accuracy.
## [1] train-error:0.287515+0.005548 test-error:0.301337+0.025027
## [101] train-error:0.059234+0.004844 test-error:0.233284+0.016773
## [201] train-error:0.029617+0.001856 test-error:0.232062+0.022037
## [301] train-error:0.017922+0.001833 test-error:0.232665+0.023195
## [401] train-error:0.013518+0.001306 test-error:0.233886+0.022333
## [500] train-error:0.010784+0.001310 test-error:0.233890+0.022706
## Results for eta=1 and depth=4 : 0.765909 accuracy.
## [1] train-error:0.380922+0.004714 test-error:0.381509+0.019930
## [101] train-error:0.470373+0.032332 test-error:0.470241+0.009927
## [201] train-error:0.470373+0.032332 test-error:0.470241+0.009927
## [301] train-error:0.470373+0.032332 test-error:0.470241+0.009927
## [401] train-error:0.470373+0.032332 test-error:0.470241+0.009927
## [500] train-error:0.470373+0.032332 test-error:0.470241+0.009927
## Results for eta=2 and depth=1 : 0.5297594 accuracy.
## [1] train-error:0.338394+0.006524 test-error:0.340794+0.029435
## [101] train-error:0.455956+0.037992 test-error:0.444056+0.044872
## [201] train-error:0.465379+0.045655 test-error:0.431329+0.023306
## [301] train-error:0.465379+0.045655 test-error:0.431329+0.023306
## [401] train-error:0.465379+0.045655 test-error:0.431329+0.023306
## [500] train-error:0.465379+0.045655 test-error:0.431329+0.023306
## Results for eta=2 and depth=2 : 0.568671 accuracy.
## [1] train-error:0.286757+0.004916 test-error:0.305602+0.020300
## [101] train-error:0.489513+0.031601 test-error:0.479346+0.026067
## [201] train-error:0.489513+0.031601 test-error:0.479346+0.026067
## [301] train-error:0.489513+0.031601 test-error:0.479346+0.026067
## [401] train-error:0.489513+0.031601 test-error:0.479346+0.026067
## [500] train-error:0.489513+0.031601 test-error:0.479346+0.026067
## Results for eta=2 and depth=4 : 0.5206538 accuracy.
cat("Best model has eta=",bestEta," and depth=", bestDepth, " : ",
bestAcc," accuracy.\n",sep="")
## Best model has eta=1 and depth=4 : 0.765909 accuracy.
# refitting the best configuration on the full training set
rf <- xgboost(data = X[training,],
              label = tweets$android[training],
              max.depth = bestDepth,
              eta = bestEta,
              nthread = 4,
              nround = 1000,
              print_every_n = 100L,
              objective = "binary:logistic")
## [1] train-error:0.288578
## [101] train-error:0.066221
## [201] train-error:0.036452
## [301] train-error:0.024301
## [401] train-error:0.015796
## [501] train-error:0.012151
## [601] train-error:0.011543
## [701] train-error:0.009113
## [801] train-error:0.009113
## [901] train-error:0.008505
## [1000] train-error:0.008505
# out-of-sample accuracy
preds <- predict(rf, X[test,])
cat("\nAccuracy on test set=", round(accuracy(preds>.50, tweets$android[test]),3))
##
## Accuracy on test set= 0.762
cat("\nPrecision on test set=", round(precision(preds>.50, tweets$android[test]),3))
##
## Precision on test set= 0.739
cat("\nRecall on test set=", round(recall(preds>.50, tweets$android[test]),3))
##
## Recall on test set= 0.784
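As with the ridge model, we can also look at the full confusion matrix for the boosted model's test-set predictions:
# confusion matrix for the xgboost predictions
table(preds > .50, tweets$android[test])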
# feature importance
labels <- dimnames(X)[[2]]
importance <- xgb.importance(labels, model = rf, data=X, label=tweets$android)
## Warning in `[.data.table`(result, , `:=`("RealCover", as.numeric(vec)), :
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.
importance <- importance[order(importance$Gain, decreasing=TRUE),]
head(importance, n=20)
## Feature Split Gain Cover
## 1: thank 1.5 0.092877123 0.0057196530
## 2: u 20 0.037401798 0.0033220662
## 3: 12 -9.53674e-07 0.035671053 0.0030841464
## 4: #makeamericagreatagain -9.53674e-07 0.035616359 0.0116694675
## 5: 13 -9.53674e-07 0.026439178 0.0042178725
## 6: 14 -9.53674e-07 0.023764903 0.0060561334
## 7: 04 -9.53674e-07 0.022778167 0.0079180063
## 8: join -9.53674e-07 0.022765320 0.0012606874
## 9: 15 -9.53674e-07 0.018111213 0.0115206824
## 10: 11 -9.53674e-07 0.017749135 0.0022013085
## 11: #votetrump -9.53674e-07 0.011454005 0.0100246841
## 12: 17 -9.53674e-07 0.011290095 0.0099398695
## 13: 19 -9.53674e-07 0.010888909 0.0089646516
## 14: job 2 0.010593182 0.0007286837
## 15: trump 1.5 0.010454531 0.0043770919
## 16: trump 4 0.010186134 0.0075704737
## 17: video 1.5 0.009648655 0.0026244868
## 18: 20 -9.53674e-07 0.009555074 0.0106000105
## 19: 18 -9.53674e-07 0.008393567 0.0119837285
## 20: much 1.5 0.008362693 0.0044849840
## Frequency RealCover RealCover %
## 1: 0.0064355408 31 0.029980658
## 2: 0.0029063733 9 0.008704062
## 3: 0.0014531866 44 0.042553191
## 4: 0.0141166701 9 0.008704062
## 5: 0.0026987752 57 0.055125725
## 6: 0.0068507370 44 0.042553191
## 7: 0.0091343160 39 0.037717602
## 8: 0.0004151962 1 0.000967118
## 9: 0.0128710816 42 0.040618956
## 10: 0.0008303924 15 0.014506770
## 11: 0.0089267179 3 0.002901354
## 12: 0.0101723064 20 0.019342360
## 13: 0.0085115217 12 0.011605416
## 14: 0.0008303924 6 0.005802708
## 15: 0.0083039236 81 0.078336557
## 16: 0.0114178950 91 0.088007737
## 17: 0.0012455885 1 0.000967118
## 18: 0.0091343160 11 0.010638298
## 19: 0.0110026988 9 0.008704062
## 20: 0.0026987752 15 0.014506770
# adding sign: which class (0 = not Android, 1 = Android) uses each feature more often
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(X[tweets[,"android"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
df <- data.frame(
  Feature = labels,
  sign = sign-1,
  stringsAsFactors=F)
importance <- merge(importance, df, by="Feature")
# best predictors for each class
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$Gain, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=50))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=50)), collapse=", "))
}
##
##
## value== 0 Feature Split Gain Cover
## 1: thank 1.5 0.092877123 5.719653e-03
## 2: u 20 0.037401798 3.322066e-03
## 3: #makeamericagreatagain -9.53674e-07 0.035616359 1.166947e-02
## 4: join -9.53674e-07 0.022765320 1.260687e-03
## 5: #votetrump -9.53674e-07 0.011454005 1.002468e-02
## 6: 17 -9.53674e-07 0.011290095 9.939869e-03
## 7: 19 -9.53674e-07 0.010888909 8.964652e-03
## 8: video 1.5 0.009648655 2.624487e-03
## 9: 20 -9.53674e-07 0.009555074 1.060001e-02
## 10: 18 -9.53674e-07 0.008393567 1.198373e-02
## 11: 22 -9.53674e-07 0.008118831 1.128236e-02
## 12: 23 -9.53674e-07 0.007378553 7.782440e-03
## 13: 21 -9.53674e-07 0.006812569 8.269242e-03
## 14: tomorrow -9.53674e-07 0.006386310 4.554853e-03
## 15: thank 2 0.005838967 1.790418e-03
## 16: w 4 0.005771816 2.203474e-03
## 17: via -9.53674e-07 0.005421917 6.324302e-04
## 18: 00 -9.53674e-07 0.005226998 9.963289e-03
## 19: national 1.5 0.005016697 3.267454e-03
## 20: soon -9.53674e-07 0.004640665 2.824576e-03
## 21: pennsylvania -9.53674e-07 0.003906521 6.338297e-04
## 22: #fitn -9.53674e-07 0.003850067 1.088315e-03
## 23: times -9.53674e-07 0.003808673 2.505758e-03
## 24: 06 -9.53674e-07 0.003459074 1.270131e-02
## 25: york 4 0.003230675 1.615261e-03
## 26: massive -9.53674e-07 0.003222589 4.239371e-03
## 27: dishonest -9.53674e-07 0.003043879 1.660732e-03
## 28: #gopdebate -9.53674e-07 0.002850391 1.320380e-03
## 29: t 4 0.002741548 1.497644e-03
## 30: est -9.53674e-07 0.002736103 6.295781e-04
## 31: 02 -9.53674e-07 0.002635378 7.877000e-03
## 32: poll 4 0.002615047 4.156703e-03
## 33: us 1.5 0.002540507 5.170648e-03
## 34: supporters -9.53674e-07 0.002533213 1.013577e-03
## 35: 01 -9.53674e-07 0.002503560 6.196699e-03
## 36: support -9.53674e-07 0.002478543 1.510386e-03
## 37: 07 -9.53674e-07 0.002426384 5.197319e-03
## 38: forward -9.53674e-07 0.002398556 2.589300e-03
## 39: event -9.53674e-07 0.002296520 1.017616e-03
## 40: zuckerman -9.53674e-07 0.002267034 1.018324e-03
## 41: #supertuesday -9.53674e-07 0.002189334 1.036561e-03
## 42: university -9.53674e-07 0.002099887 1.012314e-03
## 43: elizabeth -9.53674e-07 0.002047364 6.001713e-04
## 44: u 3.5 0.001993550 4.991130e-03
## 45: us 4 0.001928396 5.954970e-03
## 46: goldman -9.53674e-07 0.001895827 5.541513e-05
## 47: new 2 0.001753643 9.252950e-05
## 48: new 4 0.001738081 4.795232e-03
## 49: tuesday -9.53674e-07 0.001726909 1.305572e-03
## 50: u 4.5 0.001593165 7.859458e-03
## Feature Split Gain Cover
## Frequency RealCover RealCover % sign
## 1: 0.0064355408 31 0.029980658 0
## 2: 0.0029063733 9 0.008704062 0
## 3: 0.0141166701 9 0.008704062 0
## 4: 0.0004151962 1 0.000967118 0
## 5: 0.0089267179 3 0.002901354 0
## 6: 0.0101723064 20 0.019342360 0
## 7: 0.0085115217 12 0.011605416 0
## 8: 0.0012455885 1 0.000967118 0
## 9: 0.0091343160 11 0.010638298 0
## 10: 0.0110026988 9 0.008704062 0
## 11: 0.0122482873 20 0.019342360 0
## 12: 0.0074735312 9 0.008704062 0
## 13: 0.0074735312 11 0.010638298 0
## 14: 0.0039443637 3 0.002901354 0
## 15: 0.0062279427 29 0.028046422 0
## 16: 0.0014531866 3 0.002901354 0
## 17: 0.0004151962 1 0.000967118 0
## 18: 0.0107951007 16 0.015473888 0
## 19: 0.0020759809 3 0.002901354 0
## 20: 0.0016607847 6 0.005802708 0
## 21: 0.0002075981 0 0.000000000 0
## 22: 0.0004151962 0 0.000000000 0
## 23: 0.0012455885 2 0.001934236 0
## 24: 0.0116254930 10 0.009671180 0
## 25: 0.0012455885 6 0.005802708 0
## 26: 0.0029063733 2 0.001934236 0
## 27: 0.0008303924 5 0.004835590 0
## 28: 0.0006227943 3 0.002901354 0
## 29: 0.0006227943 0 0.000000000 0
## 30: 0.0002075981 0 0.000000000 0
## 31: 0.0095495121 27 0.026112186 0
## 32: 0.0039443637 14 0.013539652 0
## 33: 0.0041519618 9 0.008704062 0
## 34: 0.0004151962 3 0.002901354 0
## 35: 0.0058127465 6 0.005802708 0
## 36: 0.0010379905 8 0.007736944 0
## 37: 0.0041519618 4 0.003868472 0
## 38: 0.0016607847 3 0.002901354 0
## 39: 0.0004151962 3 0.002901354 0
## 40: 0.0004151962 0 0.000000000 0
## 41: 0.0004151962 1 0.000967118 0
## 42: 0.0004151962 2 0.001934236 0
## 43: 0.0002075981 0 0.000000000 0
## 44: 0.0049823542 12 0.011605416 0
## 45: 0.0047747561 9 0.008704062 0
## 46: 0.0002075981 1 0.000967118 0
## 47: 0.0004151962 22 0.021276596 0
## 48: 0.0062279427 23 0.022243714 0
## 49: 0.0006227943 1 0.000967118 0
## 50: 0.0074735312 8 0.007736944 0
## Frequency RealCover RealCover % sign
##
## thank, u, #makeamericagreatagain, join, #votetrump, 17, 19, video, 20, 18, 22, 23, 21, tomorrow, w, via, 00, national, soon, pennsylvania, #fitn, times, 06, york, massive, dishonest, #gopdebate, t, est, 02, poll, us, supporters, 01, support, 07, forward, event, zuckerman, #supertuesday, university, elizabeth, goldman, new, tuesday
##
## value== 1 Feature Split Gain Cover Frequency
## 1: 12 -9.53674e-07 0.035671053 0.0030841464 0.0014531866
## 2: 13 -9.53674e-07 0.026439178 0.0042178725 0.0026987752
## 3: 14 -9.53674e-07 0.023764903 0.0060561334 0.0068507370
## 4: 04 -9.53674e-07 0.022778167 0.0079180063 0.0091343160
## 5: 15 -9.53674e-07 0.018111213 0.0115206824 0.0128710816
## 6: 11 -9.53674e-07 0.017749135 0.0022013085 0.0008303924
## 7: job 2 0.010593182 0.0007286837 0.0008303924
## 8: trump 1.5 0.010454531 0.0043770919 0.0083039236
## 9: trump 4 0.010186134 0.0075704737 0.0114178950
## 10: much 1.5 0.008362693 0.0044849840 0.0026987752
## 11: win 4 0.008085170 0.0045746190 0.0039443637
## 12: wow -9.53674e-07 0.007057593 0.0079298144 0.0070583351
## 13: just 2 0.007028191 0.0002922437 0.0026987752
## 14: donaldtrump -9.53674e-07 0.006725871 0.0012135612 0.0004151962
## 15: big 6 0.006155326 0.0050688491 0.0045671580
## 16: said -9.53674e-07 0.005159194 0.0159992828 0.0155698568
## 17: will 1.5 0.005128423 0.0026561235 0.0141166701
## 18: p.m -9.53674e-07 0.005075584 0.0015805046 0.0006227943
## 19: one -9.53674e-07 0.005063292 0.0101493802 0.0087191198
## 20: cruz 1.5 0.004893105 0.0049469326 0.0066431389
## 21: hillary 4 0.004793922 0.0043360535 0.0041519618
## 22: got 1.5 0.004772025 0.0027929498 0.0016607847
## 23: best -9.53674e-07 0.004388935 0.0061054452 0.0043595599
## 24: 05 -9.53674e-07 0.004352847 0.0078121825 0.0078887274
## 25: 03 -9.53674e-07 0.004344592 0.0111161729 0.0130786797
## 26: will 4 0.004342077 0.0006710725 0.0101723064
## 27: candidates 4 0.003944141 0.0010757651 0.0004151962
## 28: women 4 0.003813570 0.0009857106 0.0004151962
## 29: u.s -9.53674e-07 0.003744277 0.0009874371 0.0004151962
## 30: america 2 0.003710449 0.0002708185 0.0024911771
## 31: will 2 0.003697765 0.0006765452 0.0105875026
## 32: cruz 2 0.003473759 0.0003713714 0.0037367656
## 33: voting -9.53674e-07 0.003427715 0.0018093515 0.0008303924
## 34: great 1.5 0.003293033 0.0036453397 0.0099647083
## 35: millions -9.53674e-07 0.003228836 0.0010363900 0.0004151962
## 36: many 4 0.003225302 0.0037570719 0.0033215694
## 37: job 4 0.003117017 0.0028094398 0.0020759809
## 38: 16 -9.53674e-07 0.003102981 0.0114184405 0.0114178950
## 39: carolina -9.53674e-07 0.003099213 0.0051447175 0.0039443637
## 40: won -9.53674e-07 0.003021040 0.0026760425 0.0018683828
## 41: media -9.53674e-07 0.003005130 0.0024866458 0.0014531866
## 42: rally -9.53674e-07 0.002840934 0.0049259275 0.0039443637
## 43: happy -9.53674e-07 0.002687876 0.0014763303 0.0008303924
## 44: president 6 0.002623698 0.0031940929 0.0029063733
## 45: nothing -9.53674e-07 0.002579016 0.0083539180 0.0072659332
## 46: know -9.53674e-07 0.002523419 0.0020380226 0.0010379905
## 47: looking -9.53674e-07 0.002486268 0.0027605050 0.0014531866
## 48: great 2 0.002446437 0.0007049091 0.0095495121
## 49: lightweight -9.53674e-07 0.002421978 0.0024289129 0.0020759809
## 50: bill 4 0.002389521 0.0009176661 0.0004151962
## Feature Split Gain Cover Frequency
## RealCover RealCover % sign
## 1: 44 0.042553191 1
## 2: 57 0.055125725 1
## 3: 44 0.042553191 1
## 4: 39 0.037717602 1
## 5: 42 0.040618956 1
## 6: 15 0.014506770 1
## 7: 6 0.005802708 1
## 8: 81 0.078336557 1
## 9: 91 0.088007737 1
## 10: 15 0.014506770 1
## 11: 19 0.018375242 1
## 12: 24 0.023210832 1
## 13: 50 0.048355899 1
## 14: 6 0.005802708 1
## 15: 31 0.029980658 1
## 16: 24 0.023210832 1
## 17: 100 0.096711799 1
## 18: 4 0.003868472 1
## 19: 21 0.020309478 1
## 20: 69 0.066731141 1
## 21: 41 0.039651838 1
## 22: 9 0.008704062 1
## 23: 6 0.005802708 1
## 24: 22 0.021276596 1
## 25: 28 0.027079304 1
## 26: 103 0.099613153 1
## 27: 8 0.007736944 1
## 28: 7 0.006769826 1
## 29: 15 0.014506770 1
## 30: 38 0.036750484 1
## 31: 99 0.095744681 1
## 32: 62 0.059961315 1
## 33: 8 0.007736944 1
## 34: 78 0.075435203 1
## 35: 10 0.009671180 1
## 36: 22 0.021276596 1
## 37: 10 0.009671180 1
## 38: 27 0.026112186 1
## 39: 14 0.013539652 1
## 40: 12 0.011605416 1
## 41: 14 0.013539652 1
## 42: 7 0.006769826 1
## 43: 3 0.002901354 1
## 44: 16 0.015473888 1
## 45: 8 0.007736944 1
## 46: 6 0.005802708 1
## 47: 11 0.010638298 1
## 48: 87 0.084139265 1
## 49: 6 0.005802708 1
## 50: 1 0.000967118 1
## RealCover RealCover % sign
##
## 12, 13, 14, 04, 15, 11, job, trump, much, win, wow, just, donaldtrump, big, said, will, p.m, one, cruz, hillary, got, best, 05, 03, candidates, women, u.s, america, voting, great, millions, many, 16, carolina, won, media, rally, happy, president, nothing, know, looking, lightweight, bill