The file data/congress-facebook.rdata contains a data frame with the Facebook posts of all current Members of Congress, with each row corresponding to a different document: all the Facebook posts from a single member, collapsed into one character string.

We’re going to run LDA to explore the different topics that Members of Congress discuss on Facebook, and whether there is variation across different characteristics, such as party and chamber.

You can skip to the end of this script if you want to see how I collected and cleaned the data, but for now let’s jump straight to the topic modeling part:

load("data/congress-facebook.rdata")
nrow(posts)
## [1] 502
str(posts)
## 'data.frame':    502 obs. of  6 variables:
##  $ text   : chr  "Enjoyed seeing all the TX-02 constituents up in D.C. this week! Lots of young Texans have been coming by the of"| __truncated__ "A foreign adversary launched an attack on our democracy.  That is a threat which every American, regardless of "| __truncated__ "Watch video here of my statement calling for increased federal support of 1890s Land-grant HBCUs. ICYMI: Earlie"| __truncated__ "NA Children's health should be bipartisan. #TrumpCare cuts billions from Medicaid and leaves 3 million children"| __truncated__ ...
##  $ account: chr  "106631626049851" "109135405838588" "113303673339" "115356957005" ...
##  $ name   : chr  "Ted Poe" "Wm. Lacy Clay" "David Scott" "Henry C. \"Hank\" Johnson, Jr." ...
##  $ party  : chr  "Republican" "Democrat" "Democrat" "Democrat" ...
##  $ type   : chr  "rep" "rep" "rep" "rep" ...
##  $ gender : chr  "M" "M" "M" "M" ...
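
Before building the document-feature matrix, it helps to peek at the raw text of one document to see what we are working with:

# first 300 characters of the first Member's collapsed posts
substr(posts$text[1], 1, 300)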

Creating the document-feature matrix in quanteda:

library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
fbcorpus <- corpus(posts$text)
cdfm <- dfm(fbcorpus, remove=stopwords("english"), verbose=TRUE,
               remove_punct=TRUE, remove_numbers=TRUE)
## Creating a dfm from a corpus ...
##    ... tokenizing texts
##    ... lowercasing
##    ... found 502 documents, 92,444 features
## ...
## dfm_select removed 171 features and 0 documents, padding 0s for 0 features and 0 documents.
##    ... created a 502 x 90,753 sparse dfm
##    ... complete. 
## Elapsed time: 0.488 seconds.
cdfm <- dfm_trim(cdfm, min_docfreq = 2, verbose=TRUE)
## Removing features occurring:
##   - in fewer than 2 document: 54,988
##   Total features removed: 54,988 (60.6%).
# we now export to a format that we can run the topic model with
dtm <- convert(cdfm, to="topicmodels")
## Loading required namespace: tm
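
One caveat before fitting the model: LDA cannot handle documents with zero tokens, so if the trimming step had emptied out any document we would want to drop it from the dfm first. A quick check:

# how many documents were left with zero tokens after trimming?
sum(ntoken(cdfm) == 0)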

And now we can run the topic model:

# estimate LDA with K topics
library(topicmodels)
K <- 50
lda <- LDA(dtm, k = K, method = "Gibbs", 
                control = list(verbose=25L, seed = 123, burnin = 100, iter = 500))

save(lda, file="backup/lda-output.Rdata")
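
Note that K = 50 here is an arbitrary choice. One rough way to compare models, sketched below, is to fit several values of K and look at the log-likelihoods, although held-out performance and the substantive interpretability of the topics are usually better criteria:

# fit models with different numbers of topics and compare their fit
for (k in c(20, 50, 100)){
    fit <- LDA(dtm, k = k, method = "Gibbs",
               control = list(seed = 123, burnin = 100, iter = 500))
    message(k, " topics: logLik = ", round(logLik(fit)))
}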

The code above takes a while to run, so I ran it in advance and saved the output in the backup folder. Let’s start looking at the results…

library(topicmodels)
load("backup/lda-output.Rdata")
terms(lda)
##         Topic 1         Topic 2         Topic 3         Topic 4 
##    "washington"      "grassley"   "connecticut"           "san" 
##         Topic 5         Topic 6         Topic 7         Topic 8 
##      "colorado"          "http"     "president"         "texas" 
##         Topic 9        Topic 10        Topic 11        Topic 12 
##         "today"     "president"           "new"         "trump" 
##        Topic 13        Topic 14        Topic 15        Topic 16 
##      "illinois"          "will"        "senate"        "bit.ly" 
##        Topic 17        Topic 18        Topic 19        Topic 20 
##        "health"      "military"        "school"      "virginia" 
##        Topic 21        Topic 22        Topic 23        Topic 24 
##         "north"         "tulsi"           "joe"       "florida" 
##        Topic 25        Topic 26        Topic 27        Topic 28 
##         "maine"        "people"           "new"      "politics" 
##        Topic 29        Topic 30        Topic 31        Topic 32 
##      "veterans"            "de"      "michigan"          "bill" 
##        Topic 33        Topic 34        Topic 35        Topic 36 
## "massachusetts"          "said"           "new"      "maryland" 
##        Topic 37        Topic 38        Topic 39        Topic 40 
##        "nevada"         "north"    "california"        "kansas" 
##        Topic 41        Topic 42        Topic 43        Topic 44 
##       "georgia"           "new"       "alabama"         "trump" 
##        Topic 45        Topic 46        Topic 47        Topic 48 
##         "https"          "town"  "pennsylvania"          "ohio" 
##        Topic 49        Topic 50 
##           "act"     "minnesota"
# top 10 words associated with each topic
trms <- t(terms(lda, k=10))
# some topics are easy to identify
trms[6,] # media appearances
##  [1] "http"                "https"               "news"               
##  [4] "press-releases"      "media-center"        "watch"              
##  [7] "read"                "www.youtube.com"     "v"                  
## [10] "documentsingle.aspx"
trms[15,] # supreme court nomination
##  [1] "senate"     "senator"    "judge"      "secretary"  "court"     
##  [6] "nomination" "general"    "nominee"    "gorsuch"    "hearing"
trms[17,] # repealing obamacare
##  [1] "health"    "care"      "obamacare" "house"     "american" 
##  [6] "act"       "tax"       "americans" "will"      "repeal"
trms[18,] # foreign policy
##  [1] "military" "u.s"      "security" "united"   "today"    "foreign" 
##  [7] "israel"   "allies"   "defense"  "north"
trms[32,] # budgetary issues?
##  [1] "bill"       "repeal"     "spending"   "debt"       "healthcare"
##  [6] "budget"     "paul"       "rep"        "vote"       "government"
trms[33,] # energy policy
##  [1] "massachusetts" "climate"       "u.s"           "trump"        
##  [5] "clean"         "energy"        "nuclear"       "oil"          
##  [9] "change"        "boston"
trms[44,] # Trump-Russia investigation
##  [1] "trump"         "house"         "president"     "investigation"
##  [5] "committee"     "russian"       "russia"        "trump's"      
##  [9] "intelligence"  "election"
trms[46,] # town halls
##  [1] "town"          "hall"          "office"        "district"     
##  [5] "will"          "questions"     "congressional" "congressman"  
##  [9] "constituents"  "na"
trms[49,] # legislative terms
##  [1] "act"         "h.r"         "house"       "bill"        "passed"     
##  [6] "voted"       "legislation" "vote"        "regulations" "support"
# but many others appear to simply refer to states
trms[1,]
##  [1] "washington" "oregon"     "nebraska"   "seattle"    "community" 
##  [6] "sound"      "oregonians" "rep"        "eastern"    "pacific"
trms[2,]
##  [1] "grassley"  "senator"   "kentucky"  "idaho"     "wyoming"  
##  [6] "comer"     "delaware"  "#kentucky" "https"     "said"
# so we would probably want to clean the data to get rid of state names
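
Since so many topics just pick up state names, a natural next step would be to remove those from the dfm and refit the model. A minimal sketch using R's built-in state.name vector (this only catches single-token names; multi-word states like "new york" would need to be handled at the tokenization stage, e.g. by compounding them first):

# drop state-name tokens from the dfm before re-running LDA
cdfm.clean <- dfm_remove(cdfm, tolower(state.name))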

Let’s look at how often different Members of Congress mention each topic…

# matrix with topic distributions
mat <- lda@gamma
dim(mat)
## [1] 502  50
mat[1, 1:50]
##  [1] 0.0001518142 0.0003036284 0.0004554425 0.0003036284 0.0001518142
##  [6] 0.0027326552 0.0198876575 0.0154850463 0.0156368605 0.0139669045
## [11] 0.0004554425 0.0006072567 0.0015181418 0.1381509033 0.0010626993
## [16] 0.0001518142 0.0347654471 0.1263093973 0.1017155002 0.0003036284
## [21] 0.0001518142 0.0001518142 0.0007590709 0.0003036284 0.0001518142
## [26] 0.1694246243 0.0044026112 0.0004554425 0.0200394717 0.0007590709
## [31] 0.0001518142 0.0004554425 0.0003036284 0.2899650827 0.0003036284
## [36] 0.0015181418 0.0021253985 0.0001518142 0.0003036284 0.0003036284
## [41] 0.0003036284 0.0003036284 0.0001518142 0.0036435403 0.0004554425
## [46] 0.0159404888 0.0001518142 0.0001518142 0.0113860635 0.0012145134
# e.g. Member of Congress 1 devotes 16.9% of their words to Topic 26 (and 29% to Topic 34)
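
Each row of gamma is a probability distribution over topics, so the 50 proportions for every Member should sum to (approximately) one. A quick sanity check:

# topic proportions should sum to 1 within each document
summary(rowSums(mat))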

# which Member of Congress talks the most about...
# the Trump-Russia investigation?
posts$name[which.max(mat[,44])]
## [1] "Maxine Waters"
# repealing Obamacare?
posts$name[which.max(mat[,17])]
## [1] "Steve Scalise"
# the Pacific Northwest?
posts$name[which.max(mat[,1])]
## [1] "Rick Larsen"
# supreme court nomination
posts$name[which.max(mat[,15])]
## [1] "Patrick J. Leahy"
# looking at differences across parties
rep.props <- apply(lda@gamma[posts$party=="Republican",], 2, mean)
dem.props <- apply(lda@gamma[posts$party=="Democrat",], 2, mean)
ratio <- rep.props / (dem.props + rep.props)

# share of each topic's usage that comes from Republicans (0 = all Democratic, 1 = all Republican)
ratio[15] # supreme court nomination
## [1] 0.4614485
ratio[17] # repealing obamacare
## [1] 0.902871
ratio[18] # foreign policy
## [1] 0.6692792
ratio[32] # budgetary issues?
## [1] 0.7922182
ratio[33] # energy policy
## [1] 0.184171
ratio[44] # Trump-Russia investigation
## [1] 0.1612373
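
To find the most partisan topics overall, we can sort the full ratio vector: values near 0 are the most Democratic topics and values near 1 the most Republican:

# indices of the five most Democratic and most Republican topics
order(ratio)[1:5]
order(ratio, decreasing=TRUE)[1:5]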

And now, here’s how I collected and cleaned the data.

First of all, let’s scrape the dataset with the social media accounts of Members of Congress.

scrapeCongressData <- function(){
  ## Downloading Congress data
  txt <- httr::content(httr::GET(paste0("https://raw.githubusercontent.com/unitedstates/",
                            "congress-legislators/master/legislators-current.yaml")), 'text')
  congress <- yaml::yaml.load(txt)
  congress <- data.frame(
    id = unlistCongress(congress, c('id', 'thomas')),
    bioid = unlistCongress(congress, c('id', 'bioguide')),
    name = unlistCongress(congress, c('name', 'official_full')),
    gender = unlistCongress(congress, c('bio', 'gender')),
    type = unlist(lapply(congress, function(x)
      x$terms[[length(x$terms)]]$type)),
    party = unlist(lapply(congress, function(x)
      x$terms[[length(x$terms)]]$party)),
    stringsAsFactors=F)
  ## Downloading List of Social Media Accounts
  txt <- httr::content(httr::GET(paste0("https://raw.githubusercontent.com/unitedstates/",
                            "congress-legislators/master/legislators-social-media.yaml")), 'text')
  sm <- yaml::yaml.load(txt)
  sm <- data.frame(
    bioid = unlistCongress(sm, c('id', 'bioguide')),
    twitter = unlistCongress(sm, c('social', 'twitter')),
    facebook = unlistCongress(sm, c('social', 'facebook')),
    youtube = unlistCongress(sm, c('social', 'youtube')),
    stringsAsFactors=F)
  ## merging
  df <- merge(congress, sm, all.x=TRUE)
  return(df)
}


# helper function: extracts a field (or a nested field, when 'field' has
# two elements) from a list of records, returning NA where it is missing
unlistCongress <- function(lst, field){
  if (length(field)==1){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst, '[[', field))
  }
  if (length(field)==2){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst, function(x) x[[field[1]]][[field[2]]]))
  }
  return(vect)
}

congress <- scrapeCongressData()
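
unlistCongress() extracts a (possibly nested) field from the list of legislator records, filling in NA for records that lack the field. A toy example with made-up data:

lst <- list(list(id = list(bioguide = "A000001")),  # record with the field
            list(id = list()))                      # record missing it
unlistCongress(lst, c('id', 'bioguide'))
## [1] "A000001" NA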

And this is a copy of the file I downloaded…

congress <- read.csv("data/congress-social-media.csv", stringsAsFactors=F)

Keeping only Members of Congress with Facebook accounts:

table(is.na(congress$facebook))
congress <- congress[!is.na(congress$facebook),]

Collecting Facebook page data

library(Rfacebook)
fb_oauth <- "YOUR_TOKEN_HERE"
# get yours from: https://developers.facebook.com/tools/explorer/

# list of accounts
accounts <- congress$facebook

# removing those already downloaded
accounts.done <- list.files("data/facebook")
accounts.left <- accounts[!(tolower(accounts) %in% tolower(gsub("\\.csv$", "", accounts.done)))]

# loop over accounts
for (account in accounts.left){
    message(account)

    # download page data (with error handling)
    error <- tryCatch(df <- getPage(account, token=fb_oauth, n=5000, since='2017/01/01',
                                    reactions=FALSE, api="v2.9"),
        error=function(e) e)
    if (inherits(error, 'error')){ next }
    
    ## cleaning text from \n (line breaks)
    df$message <- gsub("\n", " ", df$message)

    ## save page data csv format
    write.csv(df, file=paste0("data/facebook/", account, ".csv"),
        row.names=F)
}
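
Two things make this loop restartable: tryCatch() ensures that a single failing account (e.g. a deleted page) does not stop the whole run, and already-downloaded accounts are filtered out beforehand, so the script can simply be re-run after an interruption. If the API rate limit becomes an issue, a simple (untested here) fix is to pause between accounts at the end of each iteration:

# hypothetical addition at the end of the loop body
Sys.sleep(1)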

Finally, merging the text from all Facebook pages into a single data frame, with each member’s posts collapsed into a single string:

accounts <- sort(gsub("\\.csv$", "", list.files("data/facebook")))
congress <- read.csv("data/congress-social-media.csv", stringsAsFactors=F)
congress <- congress[congress$facebook %in% accounts,]
congress <- congress[order(congress$facebook),]

posts <- data.frame(
    text = "",
    account = accounts,
    name = congress$name,
    party = congress$party,
    type = congress$type,
    gender = congress$gender, stringsAsFactors=F )

for (account in accounts){
    message(account)
    d <- read.csv(paste0("data/facebook/", account, ".csv"), 
        stringsAsFactors=F)
    posts$text[posts$account==account] <- paste(d$message, collapse=" ")
}
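
One quirk worth noting: posts with no text (e.g. photo-only posts) have NA in the message field, and paste() turns those into the literal string "NA", which is why "NA" shows up at the start of one of the documents in the str() output and "na" appears among the top words of the town-hall topic. A hedged fix would be to drop them before collapsing:

# inside the loop: drop NA messages to avoid literal "NA" tokens
posts$text[posts$account==account] <- paste(na.omit(d$message), collapse=" ")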

nrow(posts)
save(posts, file="data/congress-facebook.rdata")