Loops using the foreach package

The foreach package improves the way in which we run loops in R, and provides a construct to run loops in parallel.

The basic structure of loops with the package is:

# Without parallelization --> %do%
output <- foreach(i = 'some object to iterate over', 'options') %do% {some r code}

# With parallelization --> %dopar%
output <- foreach(i = 'some object to iterate over', 'options') %dopar% {some r code}

As a first example, we can use foreach just like a for loop without parallelization

library(foreach)
result <- foreach(x = c(4,9,16)) %do% sqrt(x)
result
## [[1]]
## [1] 2
## 
## [[2]]
## [1] 3
## 
## [[3]]
## [1] 4

Note that, unlike a regular for loop, foreach returns an object (by default a list) that contains the results compiled across all iterations.

We can change the object returned by specifying the function used to combine results across iterations with the .combine option:

result <- foreach(x = c(4,9,16), .combine = 'c') %do% sqrt(x)
class(result)
## [1] "numeric"

Other options for .combine are: cbind, rbind, +, *:

# cbind...
result <- foreach(x = c(4,9,16), .combine = 'cbind') %do% c(sqrt(x), log(x), x^2)
class(result)
## [1] "matrix"
result
##       result.1  result.2   result.3
## [1,]  2.000000  3.000000   4.000000
## [2,]  1.386294  2.197225   2.772589
## [3,] 16.000000 81.000000 256.000000
# rbind
result <- foreach(x = c(4,9,16), .combine = 'rbind') %do% c(sqrt(x), log(x), x^2)
class(result)
## [1] "matrix"
result
##          [,1]     [,2] [,3]
## result.1    2 1.386294   16
## result.2    3 2.197225   81
## result.3    4 2.772589  256
# sum
result <- foreach(x = c(4,9,16), .combine = '+') %do% sqrt(x)
class(result)
## [1] "numeric"
result
## [1] 9

Parallelizing our loops using foreach and doParallel

Before we can parallelize our code, we need to declare a “cluster” – that is, we need to tell R that we have multiple cores – so that R knows how to execute the code. These are the steps involved in this process:

  1. Create the cluster. Note that we need to load the doParallel package to extend the functionality of foreach.
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
myCluster <- makeCluster(3, # number of cores to use
                         type = "PSOCK") # type of cluster

First, we choose the number of cores we want to use. You can check how many your computer has by running detectCores(). One good rule of thumb is to always leave one core unused for other tasks.

detectCores()
## [1] 4

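We can choose between two types of clusters:

  - "PSOCK": launches fresh R sessions that communicate over sockets. It works on every platform (including Windows), but the workers start empty, so data has to be sent to them and packages have to be loaded on them.
  - "FORK": copies the current R session, so the workers already share its data and loaded packages. It is only available on Unix-like systems (macOS, Linux).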

  2. Register the cluster with the ‘foreach’ package
registerDoParallel(myCluster)

If you’re running this locally, you can check your system’s process monitor (e.g. Activity Monitor on macOS or Task Manager on Windows) to see that new instances of R were launched on your computer.

  3. And now we’re ready to use our cluster! We only have to change %do% to %dopar%:
output <- foreach(i = 'some object to iterate over', 'options') %dopar% {some r code}

For example:

result <- foreach(x = c(4,9,16), .combine = 'c') %dopar% sqrt(x)
  4. Always remember to stop the cluster when you have finished!
stopCluster(myCluster)

Let’s run some tests to see the improvement in performance:

d <- read.csv("../data/UK-tweets.csv", stringsAsFactors=FALSE)
nsims <- 500

# without parallelization

# Sequential bootstrap: every iteration refits the regression on a resample
# of the rows of `d` (drawn with replacement) and keeps one coefficient.
# system.time() reports how long the whole loop takes.
system.time({
  r <- foreach(seq_len(nsims), .combine = "c") %do% {
    # seq_len() is the safe form of 1:n (it is empty when n is 0)
    smp <- sample(seq_len(nrow(d)), replace = TRUE)
    reg <- lm(log(favourites_count + 1) ~ communication + followers_count,
              data = d[smp, ])
    coef(reg)[2]  # second coefficient, i.e. the first term after the intercept
  }
})
##    user  system elapsed 
##   4.517   0.071   4.879
quantile(r, probs=c(.025, 0.975))
##      2.5%     97.5% 
## 0.4277786 0.7541396
# with parallelization

# A FORK cluster clones the current R session, so every worker already sees
# `d` and the loaded packages without anything being exported to it.
# Forking is only available on Unix-alikes; on Windows use type = "PSOCK".
myCluster <- makeCluster(3, type = "FORK")
registerDoParallel(myCluster)

# Same bootstrap as the sequential version; only %do% becomes %dopar%.
system.time({
  r <- foreach(seq_len(nsims), .combine = "c") %dopar% {
    # seq_len() is the safe form of 1:n (it is empty when n is 0)
    smp <- sample(seq_len(nrow(d)), replace = TRUE)
    reg <- lm(log(favourites_count + 1) ~ communication + followers_count,
              data = d[smp, ])
    coef(reg)[2]  # second coefficient, i.e. the first term after the intercept
  }
})
##    user  system elapsed 
##   0.374   0.069   3.998
stopCluster(myCluster)

quantile(r, probs=c(.025, 0.975))
##      2.5%     97.5% 
## 0.4362723 0.7572310

Let’s run another example: here we generate 50 word clouds, one for each inaugural speech. Note that parallelization pays off here by splitting the computation across different cores; the gains do not come as much from reading the files into R.

fls <- list.files("../data/inaugural", full.names=TRUE)[1:50]
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
dir.create("../data/wordclouds")

# regular loop
init <- Sys.time()

# Build one word cloud per speech file and save it as "<index>.pdf".
r <- foreach(i = seq_along(fls)) %do% {
  # read the speech and collapse its lines into a single document
  txt <- paste(readLines(fls[i]), collapse = "\n")
  # stored as `speech_dfm` so the object does not mask quanteda's dfm()
  speech_dfm <- dfm(corpus(txt), remove = stopwords("english"),
                    remove_punct = TRUE, ngrams = 1:2)
  pdf(paste0("../data/wordclouds/", i, ".pdf"), height = 5, width = 5)
  textplot_wordcloud(speech_dfm, rot.per = 0, scale = c(2.5, .75),
                     max.words = 50)
  dev.off()  # close the PDF device so the file is written to disk
}

Sys.time() - init
## Time difference of 1.826907 mins
# parallelized loop

myCluster <- makeCluster(3, type = "FORK") # why "FORK"?
registerDoParallel(myCluster)

init <- Sys.time()

# Parallel version: one word cloud per speech file, saved as "<index>.pdf".
r <- foreach(i = seq_along(fls)) %dopar% {
  # read the speech and collapse its lines into a single document
  txt <- paste(readLines(fls[i]), collapse = "\n")
  # stored as `speech_dfm` so the object does not mask quanteda's dfm()
  speech_dfm <- dfm(corpus(txt), remove = stopwords("english"),
                    remove_punct = TRUE, ngrams = 1:2)
  pdf(paste0("../data/wordclouds/", i, ".pdf"), height = 5, width = 5)
  # same plotting settings as the sequential loop, so that the two timings
  # measure the same amount of work
  textplot_wordcloud(speech_dfm, rot.per = 0, scale = c(2.5, .75),
                     max.words = 50)
  dev.off()  # close the PDF device so the file is written to disk
}

Sys.time() - init
## Time difference of 1.013001 mins
stopCluster(myCluster)