Efficient programming with R

Vectorization of R code

Let’s create the following two vectors with random values sampled from a uniform distribution:

set.seed(777)
n <- 1000000
x <- runif(n)
y <- runif(n)

Which of the following chunks of code will run faster?

# manual loop over each pair of values
loop <- function(){
  z <- rep(NA, n)
  for (i in 1:n){
    z[i] <- x[i] + y[i]
  }
  return(z)
}

# loop using sapply in base R
sapply_code <- function(){
  z <- sapply(1:n, function(i) x[i] + y[i])
  return(z)
}

# vectorized code with `+`
vectorized <- function(){
  z <- x + y
  return(z)
}

system.time(loop())

##    user  system elapsed 
##   0.121   0.005   0.126

system.time(sapply_code())

##    user  system elapsed 
##   1.404   0.042   1.449

system.time(vectorized())

##    user  system elapsed 
##   0.001   0.000   0.001

There are many other cases where vectorized functions can dramatically speed up our code. For example, the ifelse function.

set.seed(123)
n <- 10000000
ideology <- sample(1:10, n, replace=TRUE)

# vectorized code
group_ideology <- function(){
  groups <- ifelse(ideology>5, "right", "left")
  return(groups)
}

# non-vectorized code
ideology_loop <- function(){
  groups <- rep(NA, n)
  for (i in 1:n){
    if (ideology[i]<5) groups[i] <- "left"
    if (ideology[i]>5) groups[i] <- "right"
  }
}

# benchmarks
system.time(group_ideology())

##    user  system elapsed 
##   1.485   0.098   1.585

system.time(ideology_loop())

##    user  system elapsed 
##   2.566   0.017   2.584

The other important way to write more efficient code is to avoid memory copy:

set.seed(123)
n <- 10000
# function to generate n random numbers
func1 <- function(){
  rnd <- c()
  for (i in 1:n){
    rnd <- c(rnd, rnorm(n=1))
  }
  return(rnd)
}

func2 <- function(){
  rnd <- rep(NA, n)
  for (i in 1:n){
    rnd[i] <- rnorm(n=1)
  }
  return(rnd)
}

system.time(func1())

##    user  system elapsed 
##   0.183   0.160   0.343

system.time(func2())

##    user  system elapsed 
##   0.019   0.000   0.019

# of course, a WAY more efficient way to run this, with vectorized function...
func3 <- function(){
  rnd <- rnorm(n=n)
  return(rnd)
}

system.time(func3())

##    user  system elapsed 
##       0       0       0

Memory copy applies as well to data frames. A very frequent scenario looks something like the following..

set.seed(123)
n <- 200

## using rbind
func1 <- function(){
  df <- data.frame()
  for (i in 1:n){
      df <- rbind(df,  data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      ) )
  }
  return(df)
}

You can avoid it by storing results in a list, and then using do.call(rbind, LIST) to convert back to a single data frame

## using lists to avoid memory copy
func2 <- function(){
  df.list <- list()
  for (i in 1:n){
      df.list[[i]] <- data.frame(
        age = sample(18:100, n, replace=TRUE),
        state = sample(state.abb, n, replace=TRUE),
        party = sample(c("R", "D", "I"), n, replace=TRUE)
      )
  }
  df <- do.call(rbind, df.list)
  return(df)
}

system.time(func1())

##    user  system elapsed 
##   0.263   0.071   0.334

system.time(func2())

##    user  system elapsed 
##   0.065   0.005   0.069

For complex script, how can you find what is slowing your code down? You can use Rprof

Rprof() # start monitoring
invisible(func1()) # run your code (but don't show it!)
Rprof(NULL) # stop
summaryRprof() # see results

## $by.self
##              self.time self.pct total.time total.pct
## "rbind"           0.22    73.33       0.30    100.00
## "data.frame"      0.04    13.33       0.08     26.67
## "deparse"         0.02     6.67       0.02      6.67
## "structure"       0.02     6.67       0.02      6.67
## 
## $by.total
##                           total.time total.pct self.time self.pct
## "rbind"                         0.30    100.00      0.22    73.33
## "block_exec"                    0.30    100.00      0.00     0.00
## "call_block"                    0.30    100.00      0.00     0.00
## "eval"                          0.30    100.00      0.00     0.00
## "evaluate_call"                 0.30    100.00      0.00     0.00
## "evaluate::evaluate"            0.30    100.00      0.00     0.00
## "evaluate"                      0.30    100.00      0.00     0.00
## "func1"                         0.30    100.00      0.00     0.00
## "handle"                        0.30    100.00      0.00     0.00
## "in_dir"                        0.30    100.00      0.00     0.00
## "knitr::knit"                   0.30    100.00      0.00     0.00
## "process_file"                  0.30    100.00      0.00     0.00
## "process_group.block"           0.30    100.00      0.00     0.00
## "process_group"                 0.30    100.00      0.00     0.00
## "rmarkdown::render"             0.30    100.00      0.00     0.00
## "timing_fn"                     0.30    100.00      0.00     0.00
## "withCallingHandlers"           0.30    100.00      0.00     0.00
## "withVisible"                   0.30    100.00      0.00     0.00
## "data.frame"                    0.08     26.67      0.04    13.33
## "as.data.frame.character"       0.04     13.33      0.00     0.00
## "as.data.frame"                 0.04     13.33      0.00     0.00
## "deparse"                       0.02      6.67      0.02     6.67
## "structure"                     0.02      6.67      0.02     6.67
## "%in%"                          0.02      6.67      0.00     0.00
## "as.data.frame.vector"          0.02      6.67      0.00     0.00
## "deparse1"                      0.02      6.67      0.00     0.00
## "mode"                          0.02      6.67      0.00     0.00
## "paste"                         0.02      6.67      0.00     0.00
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 0.3

Efficient programming with R

Pablo Barbera

Vectorization of R code