In a recursive/online procedure, we frequently modify some components passed into a function and want the modification to have permanent effect. A simple example is running min and max:
# Naive running min/max update.
# X: new scalar observation; x.min: running min; x.max: running max.
# The assignments below only rebind the function's own copies of x.min and
# x.max, so nothing the caller passed in is changed.
example1 <- function(X, x.min, x.max) {
  if (X < x.min) {
    x.min <- X
  }
  if (X > x.max) {
    x.max <- X
  }
}
Nevertheless, when this function ends, local variables x.min and x.max are destroyed automatically. To have permanent effect on the passed in arguments x.min and x.max, we have several solutions in R.
The first solution to the above problem is to declare x.min and x.max as global variables and use super assignment in the function:
# Running min/max stored in the variables x.min and x.max outside the function.
# X: new scalar observation.
# Each bound is written with `<<-` when it does not exist yet or when X
# improves on it.
example2 <- function(X) {
  lower.needs.update <- !exists("x.min") || X < x.min
  if (lower.needs.update) {
    x.min <<- X
  }
  upper.needs.update <- !exists("x.max") || X > x.max
  if (upper.needs.update) {
    x.max <<- X
  }
}
# running example2
# Feed the observations one at a time and print the running min and max
# (held in the variables x.min and x.max) after every step.
data <- c(9, 5, 2, 7, 11)
for (i in seq_along(data)) {
  example2(data[i])
  print(c(x.min, x.max))
}
## [1] 9 9
## [1] 5 9
## [1] 2 9
## [1] 2 9
## [1] 2 11
Despite its simplicity, super assignment is bad practice in general. As we can see, we need to check whether the variables exist, and we have to make sure that other functions do not modify them unintentionally.
The second category of solution is to return x.min and x.max from the function so that we can store them properly. While we cannot return multiple variables at the same time, we can wrap the variables we need in a list:
# Running min/max carried in a list that is returned to the caller.
# X: new scalar observation.
# L: list with elements x.min and x.max (either may be NULL before the first
#    observation).
# Returns L with x.min / x.max updated; the caller must store the result.
example3 <- function(X, L) {
  lo <- L$x.min
  hi <- L$x.max
  if (is.null(lo) || X < lo) {
    L$x.min <- X
  }
  if (is.null(hi) || X > hi) {
    L$x.max <- X
  }
  L
}
# running example3
# The list returned by example3() is stored back into L on every step so the
# running min and max survive between calls.
data <- c(9, 5, 2, 7, 11)
L <- list(x.min = NULL, x.max = NULL)
for (i in seq_along(data)) {
  L <- example3(data[i], L)
  print(c(L$x.min, L$x.max))
}
## [1] 9 9
## [1] 5 9
## [1] 2 9
## [1] 2 9
## [1] 2 11
While this is safer than using global variables, this approach has two major issues: we have to access the variables through the list, and passing a (large) list repeatedly may hinder the program's performance.
The last category of solutions is to use ‘pass by reference’, which is discussed in many elementary programming courses. This was in fact the first solution that came to my mind, as I learnt C before R. Unfortunately, there is no native support for pointers in R. Some substitutes include eval.parent(), setRefClass(), the R.oo package, etc.
Update: while there are no pointers in R, it is possible to get reference semantics by using an environment. I noticed this property when I tried to unload a list into an environment so that I would not need to access the list multiple times.
# Running min/max stored inside an environment.
# X: new scalar observation.
# E: environment holding x.min and x.max (x.min is NULL before the first
#    observation).
# E is updated in place, so the caller sees the new values without having to
# store a return value.
example4 <- function(X, E) {
  if (is.null(E$x.min)) {
    # first observation initialises both bounds
    E$x.max <- X
    E$x.min <- X
  }
  if (X < E$x.min) {
    E$x.min <- X
  }
  if (X > E$x.max) {
    E$x.max <- X
  }
}
# running example4
# E is created once; example4() updates it in place, so nothing has to be
# stored back after each call.
data <- c(9, 5, 2, 7, 11)
E <- new.env()
E$x.min <- NULL
E$x.max <- NULL
for (i in seq_along(data)) {
  example4(data[i], E)
  print(c(E$x.min, E$x.max))
}
## [1] 9 9
## [1] 5 9
## [1] 2 9
## [1] 2 9
## [1] 2 11
This solution has been added to the comparison.
While different methods are proposed during the discussion in our research group, we did not find any comprehensive comparison of them online. Intuitively, different methods incur different amounts of overhead when the variables are accessed. In order to develop the best package for our project, we try to compare them with different number of variables to be passed and different number of iterations. The functions that implement different solutions are as follows:
# super assignment
# Add 1 to each of the variables x.global1, ..., x.global<n>, writing the
# result back with `<<-`.
# n: number of variables to update (arbitrary; the statements are built as
#    text so the variable names can be generated from the index).
add.global <- function(n) {
  # seq_len(n) rather than 1:n: for n = 0, 1:n is c(1, 0) and the loop would
  # touch x.global1 and the non-existent x.global0 instead of doing nothing.
  for (i in seq_len(n)) {
    eval(parse(text = paste0("x.global", i, "<<-", "x.global", i, "+1")))
  }
}
# return list
# Add 1 to the elements x1, ..., x<n> of a list and return the updated list.
# n: number of elements to update.
# L: list containing the elements x1, ..., x<n>.
# Returns the modified copy of L; the caller's list is not changed.
add.list <- function(n, L) {
  # seq_len(n) rather than 1:n: for n = 0, 1:n is c(1, 0), which would
  # increment x1 and add a spurious element x0 instead of leaving L as is.
  for (i in seq_len(n)) {
    eval(parse(text = paste0("L$x", i, "=", "L$x", i, "+1")))
  }
  L
}
# environment
# Add 1 to the variables x1, ..., x<n> stored in an environment, accessing
# each of them through `E$`.
# n: number of variables to update.
# E: environment containing x1, ..., x<n>; it is modified in place.
add.env <- function(n, E) {
  # seq_len(n) rather than 1:n: for n = 0, 1:n is c(1, 0), which would
  # increment x1 and create x0 in E instead of doing nothing.
  for (i in seq_len(n)) {
    eval(parse(text = paste0("E$x", i, "=", "E$x", i, "+1")))
  }
}
# use with() to avoid accessing the environment multiple times
# Add 1 to the variables x1, ..., x<n> stored in E by evaluating the update
# statements inside with(E, ...), so the variables are referred to by their
# bare names instead of through `E$`.
# E: environment containing x1, ..., x<n> and the variable n itself (the
#    number of variables to update is read from E, not passed as an argument).
add.env.with <- function(E) {
  with(E, {
    # seq_len(n) rather than 1:n: for n = 0, 1:n is c(1, 0) and the loop
    # would try to update x1 and x0 instead of doing nothing.
    for (i in seq_len(n)) {
      eval(parse(text = paste0("x", i, "=", "x", i, "+1")))
    }
  })
}
# use super assignment in a pre-set environment
# Add 1 to the variables x1, ..., x<n> using `<<-`.
# n: number of variables to update.
# The caller is expected to set the function's environment beforehand
# (environment(add.env.set) <- E) so that `<<-` finds x1, ..., x<n> in E.
add.env.set <- function(n) {
  # seq_len(n) rather than 1:n: for n = 0, 1:n is c(1, 0) and the loop would
  # try to update x1 and x0 instead of doing nothing.
  for (i in seq_len(n)) {
    eval(parse(text = paste0("x", i, "<<-", "x", i, "+1")))
  }
}
We compare them with the following setting and functions to perform simulation:
library(microbenchmark)
# setting
nRep <- 30               # repetitions passed to microbenchmark(times = )
n.iter <- c(1e3, 1e4)    # numbers of iterations to compare
n.var <- c(5, 50)        # numbers of variables to compare
# iterating the solutions
# Driver for the super-assignment variant.
# nIter: number of times add.global() is called.
# nVar:  number of variables x.global1, ..., x.global<nVar> to maintain.
# Sets the variables to 0 with `<<-`, applies add.global(nVar) nIter times and
# returns the final value of x.global1.
f.global <- function(nIter, nVar) {
  # seq_len() rather than 1:k in both loops: 1:0 is c(1, 0), so nIter = 0
  # would still run two updates and nVar = 0 would create x.global1/x.global0.
  for (i in seq_len(nVar)) {
    eval(parse(text = paste0("x.global", i, "<<-0")))
  }
  for (i in seq_len(nIter)) {
    add.global(nVar)
  }
  x.global1
}
# Driver for the return-a-list variant.
# nIter: number of times add.list() is called.
# nVar:  number of list elements x1, ..., x<nVar> to maintain.
# Builds a list with all elements set to 0, replaces it with the result of
# add.list(nVar, L) nIter times and returns the final value of L$x1.
f.list <- function(nIter, nVar) {
  L <- list()
  # seq_len() rather than 1:k in both loops: 1:0 is c(1, 0), so nIter = 0
  # would still run two updates and nVar = 0 would create x1 and x0.
  for (i in seq_len(nVar)) {
    eval(parse(text = paste0("L$x", i, "=0")))
  }
  for (i in seq_len(nIter)) {
    L <- add.list(nVar, L)
  }
  L$x1
}
# Driver for the environment variant that accesses variables through `E$`.
# nIter: number of times add.env() is called.
# nVar:  number of variables x1, ..., x<nVar> kept in the environment.
# Creates a fresh environment with all variables set to 0, applies
# add.env(nVar, E) nIter times and returns the final value of E$x1.
f.env <- function(nIter, nVar) {
  E <- new.env()
  # seq_len() rather than 1:k in both loops: 1:0 is c(1, 0), so nIter = 0
  # would still run two updates and nVar = 0 would create x1 and x0.
  for (i in seq_len(nVar)) {
    eval(parse(text = paste0("E$x", i, "=0")))
  }
  for (i in seq_len(nIter)) {
    add.env(nVar, E)
  }
  E$x1
}
# Driver for the environment variant that updates variables inside with().
# nIter: number of times add.env.with() is called.
# nVar:  number of variables x1, ..., x<nVar> kept in the environment.
# Creates a fresh environment with all variables set to 0, applies
# add.env.with(E) nIter times and returns the final value of E$x1.
f.env.with <- function(nIter, nVar) {
  E <- new.env()
  # seq_len() rather than 1:k in both loops: 1:0 is c(1, 0), so nIter = 0
  # would still run two updates and nVar = 0 would create x1 and x0.
  for (i in seq_len(nVar)) {
    eval(parse(text = paste0("E$x", i, "=0")))
  }
  E$n <- nVar # need to put n into the environment
  for (i in seq_len(nIter)) {
    add.env.with(E)
  }
  E$x1
}
# Driver for the variant that uses `<<-` inside a pre-set environment.
# nIter: number of times add.env.set() is called.
# nVar:  number of variables x1, ..., x<nVar> kept in the environment.
# Creates a fresh environment with all variables set to 0, makes it the
# environment of a local copy of add.env.set(), calls that copy nIter times
# and returns the final value of E$x1.
f.env.set <- function(nIter, nVar) {
  E <- new.env()
  # seq_len() rather than 1:k in both loops: 1:0 is c(1, 0), so nIter = 0
  # would still run two updates and nVar = 0 would create x1 and x0.
  for (i in seq_len(nVar)) {
    eval(parse(text = paste0("E$x", i, "=0")))
  }
  environment(add.env.set) <- E # pre-set the environment
  for (i in seq_len(nIter)) {
    add.env.set(nVar)
  }
  E$x1
}
We try to confirm their equivalence by looking at the first variables:
# Run every variant once with the first setting and print the final value of
# the first variable from each; the five numbers should agree.
n <- n.iter[1]
m <- n.var[1]
cat(c(
  f.global(n, m),
  f.list(n, m),
  f.env(n, m),
  f.env.with(n, m),
  f.env.set(n, m)
))
## 1000 1000 1000 1000 1000
The results can be found below:
# compare the performance
# For every combination of iteration count (n.iter) and variable count
# (n.var), time the five drivers with microbenchmark, nRep runs each, and
# print the summary under a heading naming the setting.
# seq_along() rather than 1:length(): an empty n.iter or n.var then skips the
# loop instead of iterating over c(1, 0).
for (i in seq_along(n.iter)) {
  for (j in seq_along(n.var)) {
    n <- n.iter[i]
    m <- n.var[j]
    cat(paste0(n, " iterations, ", m, " variables\n"))
    print(microbenchmark(
      f.global(n, m),
      f.list(n, m),
      f.env(n, m),
      f.env.with(n, m),
      f.env.set(n, m),
      times = nRep
    ))
    cat("\n")
  }
}
## 1000 iterations, 5 variables
## Unit: milliseconds
## expr min lq mean median uq max neval
## f.global(n, m) 62.6507 64.2941 65.29962 64.71595 65.3437 72.8843 30
## f.list(n, m) 72.5899 72.9904 73.79888 73.35795 73.9477 81.9056 30
## f.env(n, m) 69.3427 72.3090 72.69684 72.66405 73.1303 77.2786 30
## f.env.with(n, m) 66.2744 68.6742 69.79636 69.01050 70.2823 75.5699 30
## f.env.set(n, m) 59.8187 62.3075 63.89641 62.48345 62.8760 96.6574 30
##
## 1000 iterations, 50 variables
## Unit: milliseconds
## expr min lq mean median uq max neval
## f.global(n, m) 632.6913 643.4070 648.6035 646.6416 651.3560 676.5365 30
## f.list(n, m) 727.6148 752.8430 755.7736 756.9351 761.8968 792.7547 30
## f.env(n, m) 703.3076 723.4363 728.6458 725.7711 730.7909 762.4145 30
## f.env.with(n, m) 609.1059 624.6730 631.8855 633.3892 636.1545 671.5078 30
## f.env.set(n, m) 603.5491 622.0154 627.9367 628.6824 630.8707 665.4141 30
##
## 10000 iterations, 5 variables
## Unit: milliseconds
## expr min lq mean median uq max neval
## f.global(n, m) 641.7234 645.1672 651.3296 647.7379 651.2587 687.7432 30
## f.list(n, m) 729.6711 733.8312 737.9047 736.3810 737.9839 773.3437 30
## f.env(n, m) 722.2199 726.2229 730.5405 727.2925 730.5958 767.5532 30
## f.env.with(n, m) 686.1373 688.4164 693.6996 691.3629 693.6030 732.8450 30
## f.env.set(n, m) 613.9361 622.5752 631.9774 628.3284 630.2193 665.1760 30
##
## 10000 iterations, 50 variables
## Unit: seconds
## expr min lq mean median uq max neval
## f.global(n, m) 6.396850 6.416284 6.435752 6.435482 6.452797 6.475530 30
## f.list(n, m) 7.517962 7.543795 7.558136 7.557651 7.571331 7.607847 30
## f.env(n, m) 7.201885 7.221154 7.242742 7.236901 7.250907 7.464927 30
## f.env.with(n, m) 6.283786 6.298064 6.315226 6.307654 6.332223 6.382697 30
## f.env.set(n, m) 6.184889 6.199760 6.216334 6.215632 6.226908 6.260251 30
From the simulations, we can see that accessing a list or environment with $ incurs a considerable amount of overhead. Besides, it is not convenient to do so in the code. Therefore, we should avoid accessing their elements with $ if we have many variables to update.
For super assignment, we should only use it if the environment can be well controlled. As in add.env.set(), the environment has to be set outside before we call it in f.env.set(). If we want to set the environment in the add function, we have to use with() instead.