Efficient programming with R

Vectorization of R code

Let’s create the following two vectors with random values sampled from a uniform distribution:

set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)

Which of the following chunks of code will run faster?

loop <- function(){
  z <- rep(NA, n)
  for (i in 1:n){
    z[i] <- x[i] + y[1]
  }
  return(z)
}

sapply_code <- function(){
  z <- sapply(1:n, function(i) x[i] + y[i])
  return(z)
}

vectorized <- function(){
  z <- x + y
  return(z)
}

system.time(loop())

##    user  system elapsed 
##   0.202   0.008   0.229

system.time(sapply_code())

##    user  system elapsed 
##   2.501   0.059   2.652

system.time(vectorized())

##    user  system elapsed 
##   0.002   0.001   0.006

Other examples where vectorized functions can dramatically speed up our code. The first example shows the power of ifelse and the [ (replacement) function.

set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)

# vectorized code
group_ideology <- function(){
  groups <- rep(NA, n)
  groups[ideology>5] <- "right"
  groups[ideology<5] <- "left"
  return(groups)
}

# non-vectorized code
ideology_loop <- function(){
  groups <- rep(NA, n)
  for (i in 1:n){
    if (ideology[i]>5) groups[i] <- "left"
    if (ideology[i]>5) groups[i] <- "right"
  }
}

# benchmarks
system.time(group_ideology())

##    user  system elapsed 
##   0.487   0.080   0.591

system.time(ideology_loop())

##    user  system elapsed 
##   4.273   0.078   4.745

This second example shows how matrix operations are vectorized too:

set.seed(777)
n <- 1000000
df <- data.frame(grade1=runif(n, 0, 100),
                 grade2=runif(n, 0, 100),
                 grade3=runif(n, 0, 100),
                 grade4=runif(n, 0, 100),
                 grade5=runif(n, 0, 100))

system.time(total1 <- df$grade1 + df$grade2 + df$grade3 + df$grade4 + df$grade5)

##    user  system elapsed 
##   0.011   0.003   0.028

system.time(total2 <- rowSums(df))

##    user  system elapsed 
##   0.047   0.030   0.163

total3 <- rep(NA, n)
system.time(for (i in 1:n){
  total3[i] <- df$grade1[i] + df$grade2[i] + df$grade3[i] + df$grade4[i] + df$grade5[i]
  }
)

##    user  system elapsed 
##  33.192   0.376  37.113

# why is rowSums slower than + ?

df <- as.matrix(df)

system.time(total1 <- df[,1] + df[,2] + df[,3] + df[,4] + df[,5])

##    user  system elapsed 
##   0.053   0.022   0.078

system.time(total2 <- rowSums(df))

##    user  system elapsed 
##   0.021   0.008   0.030

The other way to write more efficient code is to avoid memory copy:

set.seed(123)
n <- 10000

func1 <- function(){
  rnd <- c()
  for (i in 1:n){
    rnd <- c(rnd, rnorm(n=1))
  }
  return(rnd)
}

func2 <- function(){
  rnd <- rep(NA, n)
  for (i in 1:n){
    rnd[i] <- rnorm(n=1)
  }
  return(rnd)
}

system.time(func1())

##    user  system elapsed 
##   0.186   0.126   0.330

system.time(func2())

##    user  system elapsed 
##   0.017   0.004   0.020

# of course, a WAY more efficient way to run this, with vectorized function...

func3 <- function(){
  rnd <- rnorm(n=n)
  return(rnd)
}

system.time(func3())

##    user  system elapsed 
##   0.000   0.000   0.001

Memory copy applies as well to data frames. A very frequent scenario looks something like the following. You can avoid it by storing results in a list, and then using do.call(rbind, LIST) to convert back to a single data frame.

set.seed(123)
n <- 200

## using rbind
func1 <- function(){
  df <- data.frame()
  for (i in 1:n){
      df <- rbind(df,  data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      ) )
  }
  return(df)
}

## using lists to avoid memory copy
init <- Sys.time()

func2 <- function(){
  df.list <- list()
  for (i in 1:n){
      df.list[[i]] <- data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      )
  }
  df <- do.call(rbind, df.list)
  return(df)
}


system.time(func1())

##    user  system elapsed 
##   0.933   0.345   1.558

system.time(func2())

##    user  system elapsed 
##   0.260   0.036   0.654

Finding slow spots in your code using Rprof

Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results

## $by.self
##                        self.time self.pct total.time total.pct
## "[<-.factor"                0.50    31.65       0.52     32.91
## "as.vector"                 0.36    22.78       0.38     24.05
## "factor"                    0.34    21.52       0.54     34.18
## "attr"                      0.08     5.06       0.08      5.06
## "rbind"                     0.06     3.80       1.58    100.00
## "as.vector.factor"          0.02     1.27       0.38     24.05
## "as.data.frame.vector"      0.02     1.27       0.04      2.53
## "force"                     0.02     1.27       0.04      2.53
## "attributes"                0.02     1.27       0.02      1.27
## "is.na"                     0.02     1.27       0.02      1.27
## "levels"                    0.02     1.27       0.02      1.27
## "make.names"                0.02     1.27       0.02      1.27
## "mode"                      0.02     1.27       0.02      1.27
## "names"                     0.02     1.27       0.02      1.27
## "sample"                    0.02     1.27       0.02      1.27
## "sort.list"                 0.02     1.27       0.02      1.27
## "unique"                    0.02     1.27       0.02      1.27
## 
## $by.total
##                           total.time total.pct self.time self.pct
## "rbind"                         1.58    100.00      0.06     3.80
## "block_exec"                    1.58    100.00      0.00     0.00
## "call_block"                    1.58    100.00      0.00     0.00
## "eval"                          1.58    100.00      0.00     0.00
## "evaluate_call"                 1.58    100.00      0.00     0.00
## "evaluate"                      1.58    100.00      0.00     0.00
## "func1"                         1.58    100.00      0.00     0.00
## "handle"                        1.58    100.00      0.00     0.00
## "in_dir"                        1.58    100.00      0.00     0.00
## "knitr::knit"                   1.58    100.00      0.00     0.00
## "process_file"                  1.58    100.00      0.00     0.00
## "process_group.block"           1.58    100.00      0.00     0.00
## "process_group"                 1.58    100.00      0.00     0.00
## "rmarkdown::render"             1.58    100.00      0.00     0.00
## "timing_fn"                     1.58    100.00      0.00     0.00
## "withCallingHandlers"           1.58    100.00      0.00     0.00
## "withVisible"                   1.58    100.00      0.00     0.00
## "factor"                        0.54     34.18      0.34    21.52
## "[<-.factor"                    0.52     32.91      0.50    31.65
## "[<-"                           0.52     32.91      0.00     0.00
## "as.vector"                     0.38     24.05      0.36    22.78
## "as.vector.factor"              0.38     24.05      0.02     1.27
## "data.frame"                    0.14      8.86      0.00     0.00
## "as.data.frame"                 0.10      6.33      0.00     0.00
## "attr"                          0.08      5.06      0.08     5.06
## "as.data.frame.character"       0.08      5.06      0.00     0.00
## "as.data.frame.vector"          0.04      2.53      0.02     1.27
## "force"                         0.04      2.53      0.02     1.27
## "attributes"                    0.02      1.27      0.02     1.27
## "is.na"                         0.02      1.27      0.02     1.27
## "levels"                        0.02      1.27      0.02     1.27
## "make.names"                    0.02      1.27      0.02     1.27
## "mode"                          0.02      1.27      0.02     1.27
## "names"                         0.02      1.27      0.02     1.27
## "sample"                        0.02      1.27      0.02     1.27
## "sort.list"                     0.02      1.27      0.02     1.27
## "unique"                        0.02      1.27      0.02     1.27
## "%in%"                          0.02      1.27      0.00     0.00
## "as.data.frame.integer"         0.02      1.27      0.00     0.00
## "deparse"                       0.02      1.27      0.00     0.00
## "paste"                         0.02      1.27      0.00     0.00
## "structure"                     0.02      1.27      0.00     0.00
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 1.58

How to read large files faster? There are different packages to help with this; and also being specific about the data you’re reading will help. readr is a good option:

system.time(d <- read.csv("../data/EP-elections-tweets.csv"))

##    user  system elapsed 
##   0.257   0.008   0.300

system.time(d <- read.csv("../data/EP-elections-tweets.csv", stringsAsFactors=F))

##    user  system elapsed 
##   0.175   0.006   0.194

library(readr)
system.time(d <- read_csv("../data/EP-elections-tweets.csv"))

## Parsed with column specification:
## cols(
##   text = col_character(),
##   screen_name = col_character(),
##   id = col_double(),
##   polite = col_character()
## )

##    user  system elapsed 
##   0.048   0.009   0.063

Efficient programming with R

Pablo Barbera

September 12, 2017

Vectorization of R code