Vectorization of R code

Let’s create the following two vectors with random values sampled from a uniform distribution:

set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)

Which of the following chunks of code will run faster?

loop <- function(){
  z <- rep(NA, n)
  for (i in 1:n){
    z[i] <- x[i] + y[1]
  }
  return(z)
}
sapply_code <- function(){
  z <- sapply(1:n, function(i) x[i] + y[i])
  return(z)
}
vectorized <- function(){
  z <- x + y
  return(z)
}
system.time(loop())
##    user  system elapsed 
##   0.200   0.010   0.275
system.time(sapply_code())
##    user  system elapsed 
##   2.301   0.079   2.552
system.time(vectorized())
##    user  system elapsed 
##   0.003   0.001   0.004

There are many other cases where vectorized functions can dramatically speed up our code. The first example shows the power of ifelse and the [ (replacement) function.

set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)

# vectorized code
group_ideology <- function(){
  groups <- rep(NA, n)
  groups[ideology>5] <- "right"
  groups[ideology<5] <- "left"
  return(groups)
}

# non-vectorized code
ideology_loop <- function(){
  groups <- rep(NA, n)
  for (i in 1:n){
    if (ideology[i]>5) groups[i] <- "left"
    if (ideology[i]>5) groups[i] <- "right"
  }
}

# benchmarks
system.time(group_ideology())
##    user  system elapsed 
##   0.491   0.114   0.643
system.time(ideology_loop())
##    user  system elapsed 
##   3.968   0.215   4.475

This second example shows how matrix operations are vectorized too:

set.seed(777)
n <- 1000000
df <- data.frame(grade1=runif(n, 0, 100),
                 grade2=runif(n, 0, 100),
                 grade3=runif(n, 0, 100),
                 grade4=runif(n, 0, 100),
                 grade5=runif(n, 0, 100))

system.time(total1 <- df$grade1 + df$grade2 + df$grade3 + df$grade4 + df$grade5)
##    user  system elapsed 
##   0.010   0.004   0.015
system.time(total2 <- rowSums(df))
##    user  system elapsed 
##   0.036   0.014   0.053
total3 <- rep(NA, n)
system.time(for (i in 1:n){
  total3[i] <- df$grade1[i] + df$grade2[i] + df$grade3[i] + df$grade4[i] + df$grade5[i]
  }
)
##    user  system elapsed 
##  30.209   0.489  31.469
# why is rowSums slower than + ?

df <- as.matrix(df)

system.time(total1 <- df[,1] + df[,2] + df[,3] + df[,4] + df[,5])
##    user  system elapsed 
##   0.055   0.030   0.092
system.time(total2 <- rowSums(df))
##    user  system elapsed 
##   0.019   0.004   0.024

The other important way to write more efficient code is to avoid memory copy:

set.seed(123)
n <- 10000

func1 <- function(){
  rnd <- c()
  for (i in 1:n){
    rnd <- c(rnd, rnorm(n=1))
  }
  return(rnd)
}

func2 <- function(){
  rnd <- rep(NA, n)
  for (i in 1:n){
    rnd[i] <- rnorm(n=1)
  }
  return(rnd)
}

system.time(func1())
##    user  system elapsed 
##   0.195   0.139   0.341
system.time(func2())
##    user  system elapsed 
##   0.018   0.004   0.023
# of course, a WAY more efficient way to run this, with vectorized function...

func3 <- function(){
  rnd <- rnorm(n=n)
  return(rnd)
}

system.time(func3())
##    user  system elapsed 
##   0.001   0.000   0.000

Memory copy applies as well to data frames. A very frequent scenario looks something like the following. You can avoid it by storing results in a list, and then using do.call(rbind, LIST) to convert back to a single data frame.

set.seed(123)
n <- 200

## using rbind
func1 <- function(){
  df <- data.frame()
  for (i in 1:n){
      df <- rbind(df,  data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      ) )
  }
  return(df)
}

## using lists to avoid memory copy
init <- Sys.time()

func2 <- function(){
  df.list <- list()
  for (i in 1:n){
      df.list[[i]] <- data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      )
  }
  df <- do.call(rbind, df.list)
  return(df)
}


system.time(func1())
##    user  system elapsed 
##   0.983   0.369   1.398
system.time(func2())
##    user  system elapsed 
##   0.156   0.028   0.189

Finding slow spots in your code using Rprof

Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results
## $by.self
##              self.time self.pct total.time total.pct
## "as.vector"       0.34    24.29       0.34     24.29
## "factor"          0.30    21.43       0.46     32.86
## "[<-.factor"      0.30    21.43       0.38     27.14
## "rbind"           0.18    12.86       1.40    100.00
## "attr"            0.08     5.71       0.08      5.71
## "is.na"           0.08     5.71       0.08      5.71
## "[<-"             0.02     1.43       0.40     28.57
## "any"             0.02     1.43       0.02      1.43
## "levels"          0.02     1.43       0.02      1.43
## "sample.int"      0.02     1.43       0.02      1.43
## "sum"             0.02     1.43       0.02      1.43
## "unique"          0.02     1.43       0.02      1.43
## 
## $by.total
##                           total.time total.pct self.time self.pct
## "rbind"                         1.40    100.00      0.18    12.86
## "block_exec"                    1.40    100.00      0.00     0.00
## "call_block"                    1.40    100.00      0.00     0.00
## "eval"                          1.40    100.00      0.00     0.00
## "evaluate_call"                 1.40    100.00      0.00     0.00
## "evaluate"                      1.40    100.00      0.00     0.00
## "func1"                         1.40    100.00      0.00     0.00
## "handle"                        1.40    100.00      0.00     0.00
## "in_dir"                        1.40    100.00      0.00     0.00
## "knitr::knit"                   1.40    100.00      0.00     0.00
## "process_file"                  1.40    100.00      0.00     0.00
## "process_group.block"           1.40    100.00      0.00     0.00
## "process_group"                 1.40    100.00      0.00     0.00
## "rmarkdown::render"             1.40    100.00      0.00     0.00
## "timing_fn"                     1.40    100.00      0.00     0.00
## "withCallingHandlers"           1.40    100.00      0.00     0.00
## "withVisible"                   1.40    100.00      0.00     0.00
## "factor"                        0.46     32.86      0.30    21.43
## "[<-"                           0.40     28.57      0.02     1.43
## "[<-.factor"                    0.38     27.14      0.30    21.43
## "as.vector"                     0.34     24.29      0.34    24.29
## "as.vector.factor"              0.32     22.86      0.00     0.00
## "attr"                          0.08      5.71      0.08     5.71
## "is.na"                         0.08      5.71      0.08     5.71
## "data.frame"                    0.06      4.29      0.00     0.00
## "as.data.frame.character"       0.04      2.86      0.00     0.00
## "as.data.frame"                 0.04      2.86      0.00     0.00
## "any"                           0.02      1.43      0.02     1.43
## "levels"                        0.02      1.43      0.02     1.43
## "sample.int"                    0.02      1.43      0.02     1.43
## "sum"                           0.02      1.43      0.02     1.43
## "unique"                        0.02      1.43      0.02     1.43
## ".deparseOpts"                  0.02      1.43      0.00     0.00
## "%in%"                          0.02      1.43      0.00     0.00
## "deparse"                       0.02      1.43      0.00     0.00
## "mode"                          0.02      1.43      0.00     0.00
## "sample"                        0.02      1.43      0.00     0.00
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 1.4

How to read large files faster? There are different packages to help with this; and also being specific about the data you’re reading will help. readr is a good option:

system.time(d <- read.csv("~/data/UK-tweets.csv"))
##    user  system elapsed 
##   0.159   0.004   0.169
system.time(d <- read.csv("~/data/UK-tweets.csv", stringsAsFactors=F))
##    user  system elapsed 
##   0.092   0.002   0.095
library(readr)
system.time(d <- read_csv("~/data/UK-tweets.csv"))
## Parsed with column specification:
## cols(
##   id = col_double(),
##   screen_name = col_character(),
##   text = col_character(),
##   polite = col_character(),
##   sentiment = col_character(),
##   communication = col_character(),
##   retweet_count = col_integer(),
##   followers_count = col_integer(),
##   favourites_count = col_integer(),
##   created_at = col_character()
## )
##    user  system elapsed 
##   0.035   0.008   0.047