Utility function for distributing computations among a pool of workers for parallel processing.
Usage

distribute_load(x, n = 1)

Arguments

x: integer. Number of items to process.

n: integer. Number of threads.

Value

A list containing an element for each worker. Each element is an integer
vector specifying the indices that the worker should process.
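For example, distributing 10 items across 2 workers yields a list of two
integer vectors; in the examples below, distribute_load(10, n = 2) gives the
first worker indices 1:5 and the second worker indices 6:10. Base R's
parallel::splitIndices() chunks indices in a similar contiguous fashion and
can serve as a rough mental model (an analogy only, not necessarily this
function's actual implementation):

parallel::splitIndices(10, 2)
# a list of two integer vectors: 1:5 and 6:10

Examples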
# \dontrun{
# imagine that we have 10 jobs that need processing. For simplicity,
# our jobs will involve adding 1 to each element in 1:10.
values <- 1:10
# we could complete this processing using the following vectorized code
result <- values + 1
print(result)
#> [1] 2 3 4 5 6 7 8 9 10 11
# however, if our jobs were complex then we would be better off using
# functionals
result <- lapply(values, function(x) x + 1)
print(result)
#> [[1]]
#> [1] 2
#>
#> [[2]]
#> [1] 3
#>
#> [[3]]
#> [1] 4
#>
#> [[4]]
#> [1] 5
#>
#> [[5]]
#> [1] 6
#>
#> [[6]]
#> [1] 7
#>
#> [[7]]
#> [1] 8
#>
#> [[8]]
#> [1] 9
#>
#> [[9]]
#> [1] 10
#>
#> [[10]]
#> [1] 11
#>
# we could do one better, and use the "plyr" package to handle the
# processing
result <- plyr::llply(values, function(x) x + 1)
print(result)
#> [[1]]
#> [1] 2
#>
#> [[2]]
#> [1] 3
#>
#> [[3]]
#> [1] 4
#>
#> [[4]]
#> [1] 5
#>
#> [[5]]
#> [1] 6
#>
#> [[6]]
#> [1] 7
#>
#> [[7]]
#> [1] 8
#>
#> [[8]]
#> [1] 9
#>
#> [[9]]
#> [1] 10
#>
#> [[10]]
#> [1] 11
#>
# we could also use the parallel processing options available through "plyr"
# to use more computational resources to complete the jobs (note that since
# these jobs are very quick to process, the parallel version is actually
# slower here because of the overhead of sending jobs to the workers).
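# set up a cluster with 2 worker processes and register it as the parallel
# backend that "plyr" will send jobs to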
cl <- parallel::makeCluster(2, "PSOCK")
doParallel::registerDoParallel(cl)
result <- plyr::llply(values, function(x) x + 1, .parallel = TRUE)
#> Warning: <anonymous>: ... may be used in an incorrect context: ‘.fun(piece, ...)’
#> Warning: <anonymous>: ... may be used in an incorrect context: ‘.fun(piece, ...)’
parallel::stopCluster(cl)
print(result)
#> [[1]]
#> [1] 2
#>
#> [[2]]
#> [1] 3
#>
#> [[3]]
#> [1] 4
#>
#> [[4]]
#> [1] 5
#>
#> [[5]]
#> [1] 6
#>
#> [[6]]
#> [1] 7
#>
#> [[7]]
#> [1] 8
#>
#> [[8]]
#> [1] 9
#>
#> [[9]]
#> [1] 10
#>
#> [[10]]
#> [1] 11
#>
# however, this approach still iterates over each element individually. We
# could instead use the distribute_load function to split the N jobs into K
# super jobs, and evaluate each super job using vectorized code.
x <- 1:10
cl <- parallel::makeCluster(2, "PSOCK")
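# export the data to each worker so that the super jobs can access it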
parallel::clusterExport(cl, 'x', envir = environment())
doParallel::registerDoParallel(cl)
l <- distribute_load(length(x), n = 2)
result <- plyr::llply(l, function(i) x[i] + 1, .parallel = TRUE)
#> Warning: <anonymous>: ... may be used in an incorrect context: ‘.fun(piece, ...)’
#> Warning: <anonymous>: ... may be used in an incorrect context: ‘.fun(piece, ...)’
parallel::stopCluster(cl)
print(result)
#> [[1]]
#> [1] 2 3 4 5 6
#>
#> [[2]]
#> [1] 7 8 9 10 11
#>
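# as an aside, the same chunked pattern can be written using the base
# "parallel" package alone, without "plyr" (a sketch; parLapply() takes the
# place of plyr::llply() here)
x <- 1:10
cl <- parallel::makeCluster(2, "PSOCK")
parallel::clusterExport(cl, "x", envir = environment())
l <- distribute_load(length(x), n = 2)
result <- parallel::parLapply(cl, l, function(i) x[i] + 1)
parallel::stopCluster(cl)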
# }