ensemble.batch {BiodiversityR}  R Documentation 
Suitability mapping based on ensembles of modelling algorithms: batch processing
Description
The main function allows for batch processing of different species and different environmental RasterStacks. The function makes internal calls to ensemble.calibrate.weights
, ensemble.calibrate.models
and ensemble.raster
.
Usage
ensemble.batch(x = NULL, xn = c(x),
species.presence = NULL, species.absence = NULL,
presence.min = 20, thin.km = 0.1,
an = 1000, excludep = FALSE, target.groups = FALSE,
get.block = FALSE, block.default = runif(1) > 0.5, get.subblocks = FALSE,
SSB.reduce = FALSE, CIRCLES.d = 250000,
k.splits = 4, k.test = 0,
n.ensembles = 1,
VIF.max = 10, VIF.keep = NULL,
SINK = FALSE, CATCH.OFF = FALSE,
RASTER.datatype = "INT2S", RASTER.NAflag = 32767,
models.save = FALSE,
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
ENSEMBLE.weight.min = 0.05,
input.weights = NULL,
MAXENT = 1, MAXNET = 1, MAXLIKE = 1, GBM = 1, GBMSTEP = 0, RF = 1, CF = 1,
GLM = 1, GLMSTEP = 1, GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0,
EARTH = 1, RPART = 1, NNET = 1, FDA = 1, SVM = 1 , SVME = 1, GLMNET = 1,
BIOCLIM.O = 0, BIOCLIM = 1, DOMAIN = 1, MAHAL = 1, MAHAL01 = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
formulae.defaults = TRUE, maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000,
MAXENT.path = paste(getwd(), "/models/maxent", sep=""),
MAXNET.classes = "default", MAXNET.clamp = FALSE, MAXNET.type = "cloglog",
MAXLIKE.formula = NULL, MAXLIKE.method = "BFGS",
GBM.formula = NULL, GBM.n.trees = 2001,
GBMSTEP.tree.complexity = 5, GBMSTEP.learning.rate = 0.005,
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100,
RF.formula = NULL, RF.ntree = 751, RF.mtry = floor(sqrt(raster::nlayers(x))),
CF.formula = NULL, CF.ntree = 751, CF.mtry = floor(sqrt(raster::nlayers(x))),
GLM.formula = NULL, GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, STEP.formula = NULL, GLMSTEP.scope = NULL, GLMSTEP.k = 2,
GAM.formula = NULL, GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.formula = NULL, MGCV.select = FALSE,
MGCVFIX.formula = NULL,
EARTH.formula = NULL,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.formula = NULL, RPART.xval = 50,
NNET.formula = NULL, NNET.size = 8, NNET.decay = 0.01,
FDA.formula = NULL,
SVM.formula = NULL, SVME.formula = NULL,
GLMNET.nlambda = 100, GLMNET.class = FALSE,
BIOCLIM.O.fraction = 0.9,
MAHAL.shape = 1)
ensemble.mean(RASTER.species.name = "Species001", RASTER.stack.name = "base",
positive.filters = c("tif", "_ENSEMBLE_"), negative.filters = c("xml"),
RASTER.format = "GTiff", RASTER.datatype = "INT2S", RASTER.NAflag = 32767,
abs.breaks = 6, pres.breaks = 6, sd.breaks = 9,
p = NULL, a = NULL,
pt = NULL, at = NULL,
threshold = 1,
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE)
ensemble.plot(RASTER.species.name = "Species001", RASTER.stack.name = "base",
plot.method=c("suitability", "presence", "count",
"consensussuitability", "consensuspresence", "consensuscount", "consensussd"),
dev.new.width = 7, dev.new.height = 7,
main = paste(RASTER.species.name, " ", plot.method,
" for ", RASTER.stack.name, sep=""),
positive.filters = c("tif"), negative.filters = c("xml"),
p=NULL, a=NULL,
threshold = 1,
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
abs.breaks = 6, abs.col = NULL,
pres.breaks = 6, pres.col = NULL,
sd.breaks = 9, sd.col = NULL,
absencePresence.col = NULL,
count.col = NULL, ...)
Arguments
x 
RasterStack object ( 
xn 
RasterStack object ( 
species.presence 
presence points used for calibrating the suitability models, available in 3column (species, x, y) or (species, lon, lat) dataframe 
species.absence 
background points used for calibrating the suitability models, either available in a 3column (species, x, y) or (species, lon, lat), or available in a 2column (x, y) or (lon, lat) dataframe. In case of a 2column dataframe, the same background locations will be used for all species. 
presence.min 
minimum number of presence locations for the organism (if smaller, no models are fitted). 
thin.km 
Threshold for minimum distance (km) between presence point locations for focal species for model calibrations in each run. A new data set is randomly selected via 
an 
number of background points for calibration to be selected with 
excludep 
parameter that indicates (if 
target.groups 
Parameter that indicates (if 
get.block 
if 
block.default 
if 
get.subblocks 
if 
SSB.reduce 
If 
CIRCLES.d 
Radius in m of circular neighbourhoods (created with 
k 
If larger than 1, the mumber of groups to split between calibration (k1) and evaluation (1) data sets (for example, 
k.splits 
If larger than 1, the number of splits for the 
k.test 
If larger than 1, the mumber of groups to split between calibration (k1) and evaluation (1) data sets when calibrating the final models (for example, 
n.ensembles 
If larger than 1, the number of different ensembles generated per species in batch processing. 
VIF.max 
Maximum Variance Inflation Factor of variables; see 
VIF.keep 
character vector with names of the variables to be kept; see 
SINK 
Append the results to a text file in subfolder 'outputs' (if 
CATCH.OFF 
Disable calls to function 
RASTER.format 
Format of the raster files that will be generated. See 
RASTER.datatype 
Format of the raster files that will be generated. See 
RASTER.NAflag 
Value that is used to store missing data. See 
models.save 
Save the list with model details to a file (if 
threshold.method 
Method to calculate the threshold between predicted absence and presence; possibilities include 
threshold.sensitivity 
Sensitivity value for 
threshold.PresenceAbsence 
If 
ENSEMBLE.best 
The number of individual suitability models to be used in the consensus suitability map (based on a weighted average). In case this parameter is smaller than 1 or larger than the number of positive input weights of individual models, then all individual suitability models with positive input weights are included in the consensus suitability map. In case a vector is provided, 
ENSEMBLE.min 
The minimum input weight (typically corresponding to AUC values) for a model to be included in the ensemble. In case a vector is provided, function 
ENSEMBLE.exponent 
Exponent applied to AUC values to convert AUC values into weights (for example, an exponent of 2 converts input weights of 0.7, 0.8 and 0.9 into 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). See details. 
ENSEMBLE.weight.min 
The minimum output weight for models included in the ensemble, applying to weights that sum to one. Note that 
input.weights 
array with numeric values for the different modelling algorithms; if 
MAXENT 
Input weight for a maximum entropy model ( 
MAXNET 
number: if larger than 0, then a maximum entropy model ( 
MAXLIKE 
Input weight for a maxlike model ( 
GBM 
Input weight for a boosted regression trees model ( 
GBMSTEP 
Input weight for a stepwise boosted regression trees model ( 
RF 
Input weight for a random forest model ( 
CF 
number: if larger than 0, then a random forest model ( 
GLM 
Input weight for a generalized linear model ( 
GLMSTEP 
Input weight for a stepwise generalized linear model ( 
GAM 
Input weight for a generalized additive model ( 
GAMSTEP 
Input weight for a stepwise generalized additive model ( 
MGCV 
Input weight for a generalized additive model ( 
MGCVFIX 
number: if larger than 0, then a generalized additive model with fixed d.f. regression splines ( 
EARTH 
Input weight for a multivariate adaptive regression spline model ( 
RPART 
Input weight for a recursive partioning and regression tree model ( 
NNET 
Input weight for an artificial neural network model ( 
FDA 
Input weight for a flexible discriminant analysis model ( 
SVM 
Input weight for a support vector machine model ( 
SVME 
Input weight for a support vector machine model ( 
GLMNET 
Input weight for a GLM with lasso or elasticnet regularization ( 
BIOCLIM.O 
Input weight for the original BIOCLIM algorithm ( 
BIOCLIM 
Input weight for the BIOCLIM algorithm ( 
DOMAIN 
Input weight for the DOMAIN algorithm ( 
MAHAL 
Input weight for the Mahalonobis algorithm ( 
MAHAL01 
Input weight for the Mahalanobis algorithm ( 
PROBIT 
If 
Yweights 
chooses how cases of presence and background (absence) are weighted; 
layer.drops 
vector that indicates which layers should be removed from RasterStack 
factors 
vector that indicates which variables are factors; see also 
dummy.vars 
vector that indicates which variables are dummy variables (influences formulae suggestions) 
formulae.defaults 
Suggest formulae for most of the models (if 
maxit 
Maximum number of iterations for some of the models. See also 
MAXENT.a 
background points used for calibrating the maximum entropy model ( 
MAXENT.an 
number of background points for calibration to be selected with 
MAXENT.path 
path to the directory where output files of the maximum entropy model are stored; see also 
MAXNET.classes 
continuous feature classes, either "default" or any subset of "lqpht" (linear, quadratic, product, hinge, threshold). Note that the "default" option chooses feature classes based on the number of presence locations as "l" (< 10 locations), "lq" (10  14 locations), "lqh" (15  79 locations) or "lqph" (> 79 locations). See also 
MAXNET.clamp 
restrict predictors and features to the range seen during model training; see also 
MAXNET.type 
type of response required; see also 
MAXLIKE.formula 
formula for the maxlike algorithm; see also 
MAXLIKE.method 
method for the maxlike algorithm; see also 
GBM.formula 
formula for the boosted regression trees algorithm; see also 
GBM.n.trees 
total number of trees to fit for the boosted regression trees model; see also 
GBMSTEP.tree.complexity 
complexity of individual trees for stepwise boosted regression trees; see also 
GBMSTEP.learning.rate 
weight applied to individual trees for stepwise boosted regression trees; see also 
GBMSTEP.bag.fraction 
proportion of observations used in selecting variables for stepwise boosted regression trees; see also 
GBMSTEP.step.size 
number of trees to add at each cycle for stepwise boosted regression trees (should be small enough to result in a smaller holdout deviance than the initial number of trees [50]); see also 
RF.formula 
formula for the random forest algorithm; see also 
RF.ntree 
number of trees to grow for random forest algorithm; see also 
RF.mtry 
number of variables randomly sampled as candidates at each split for random forest algorithm; see also 
CF.formula 
formula for random forest algorithm; see also 
CF.ntree 
number of trees to grow in a forest; see also 
CF.mtry 
number of input variables randomly sampled as candidates at each node for random forest like algorithms; see also 
GLM.formula 
formula for the generalized linear model; see also 
GLM.family 
description of the error distribution and link function for the generalized linear model; see also 
GLMSTEP.steps 
maximum number of steps to be considered for stepwise generalized linear model; see also 
STEP.formula 
formula for the "starting model" to be considered for stepwise generalized linear model; see also 
GLMSTEP.scope 
range of models examined in the stepwise search; see also 
GLMSTEP.k 
multiple of the number of degrees of freedom used for the penalty (only k = 2 gives the genuine AIC); see also 
GAM.formula 
formula for the generalized additive model; see also 
GAM.family 
description of the error distribution and link function for the generalized additive model; see also 
GAMSTEP.steps 
maximum number of steps to be considered in the stepwise generalized additive model; see also 
GAMSTEP.scope 
range of models examined in the stepwise search n the stepwise generalized additive model; see also 
GAMSTEP.pos 
parameter expected to be set to 1 to allow for fitting of the stepwise generalized additive model 
MGCV.formula 
formula for the generalized additive model; see also 
MGCV.select 
if 
MGCVFIX.formula 
formula for the generalized additive model with fixed d.f. regression splines; see also 
EARTH.formula 
formula for the multivariate adaptive regression spline model; see also 
EARTH.glm 

RPART.formula 
formula for the recursive partioning and regression tree model; see also 
RPART.xval 
number of crossvalidations for the recursive partioning and regression tree model; see also 
NNET.formula 
formula for the artificial neural network model; see also 
NNET.size 
number of units in the hidden layer for the artificial neural network model; see also 
NNET.decay 
parameter of weight decay for the artificial neural network model; see also 
FDA.formula 
formula for the flexible discriminant analysis model; see also 
SVM.formula 
formula for the support vector machine model; see also 
SVME.formula 
formula for the support vector machine model; see also 
GLMNET.nlambda 
The number of 
GLMNET.class 
Use the predicted class to calculate the mean predictions of GLMNET; see also 
BIOCLIM.O.fraction 
Fraction of range representing the optimal limits, default value of 0.9 as in the original BIOCLIM software ( 
MAHAL.shape 
parameter that influences the transformation of output values of 
RASTER.species.name 
First part of the names of the raster files, expected to identify the modelled species (or organism). 
RASTER.stack.name 
Last part of the names of the raster files, expected to identify the predictor stack used. 
positive.filters 
vector that indicates parts of filenames for files that will be included in the calculation of the mean probability values 
negative.filters 
vector that indicates parts of filenames for files that will not be included in the calculation of the mean probability values 
abs.breaks 
Number of breaks in the colouring scheme for absence (only applies to 
pres.breaks 
Number of breaks in the colouring scheme for presence (only applies to 
sd.breaks 
Number of breaks in the colouring scheme for standard deviation (only applies to 
p 
presence points used for calibrating the suitability models, typically available in 2column (x, y) or (lon, lat) dataframe; see also 
a 
background points used for calibrating the suitability models, typically available in 2column (x, y) or (lon, lat) dataframe; see also 
pt 
presence points used for evaluating the suitability models, typically available in 2column (lon, lat) dataframe; see also 
at 
background points used for calibrating the suitability models, typicall available in 2column (lon, lat) dataframe; see also 
threshold 
Threshold value that will be used to distinguish between presence and absence. If < 0, then a threshold value will be calculated from the provided presence 
plot.method 
Choice of maps to be plotted: 
dev.new.width 
Width for new graphics device ( 
dev.new.height 
Heigth for new graphics device ( 
main 
main title for the plots. 
abs.col 
specify colours for absence (see examples on how not to plot areas where the species is predicted absent) 
pres.col 
specify colours for presence 
sd.col 
specify colours for standard deviation 
absencePresence.col 
specify colours for absence  presence maps (see examples on how not to plot areas where the species is predicted absent) 
count.col 
specify colours for number of algorithms or ensembles (see examples on how not to plot areas where the species is predicted absent) 
... 
Other items passed to function 
Details
This function allows for batch processing of different species and different environmental RasterStacks. The function makes internal calls to ensemble.spatialThin
, ensemble.VIF
, ensemble.calibrate.weights
, ensemble.calibrate.models
and ensemble.raster
.
Different ensemble runs allow for different random selection of kfold subsets, background locations or spatial thinning of presence locations.
ensemble.calibrate.weights
results in a crossvalidation procedure whereby the data set is split in calibration and testing subsets and the best weights for the ensemble model are determined (including the possibility for weights = 0).
ensemble.calibrate.models
is the step whereby models are calibrated using all the available presence data.
ensemble.raster
is the final step whereby raster layers are produced for the ensemble model.
Function ensemble.mean
results in raster layers that are based on the summary of several ensemble layers: the new ensemble has probability values that are the mean of the probabilities of the different raster layers, the presenceabsence threshold is derived for this new ensemble layer, whereas the count reflects the number of ensemble layers where presence was predicted. Note the assumption that input probabilities are scaled between 0 and 1000 (as the output from ensemble.raster
), whereas thresholds are based on actual probabilities (scaled between 0 and 1). After the mean probability has been calculated, the niche overlap (nicheOverlap
) with the different input layers is calculated.
Function ensemble.plot
plots suitability, presenceabsence or count maps. In the case of suitability maps, the presenceabsence threshold needs to be provide as suitabilities smaller than the threshold will be coloured red to orange, whereas suitabilities larger than the threshold will be coloured light blue to dark blue.
Value
The function finally results in ensemble raster layers for each species, including the fitted values for the ensemble model, the estimated presenceabsence and the count of the number of submodels prediction presence and absence.
Author(s)
Roeland Kindt (World Agroforestry Centre), Eike Luedeling (World Agroforestry Centre) and Evert Thomas (Bioversity International)
References
Kindt R. 2018. Ensemble species distribution modelling with transformed suitability values. Environmental Modelling & Software 100: 136145. doi:10.1016/j.envsoft.2017.11.009
Buisson L, Thuiller W, Casajus N, Lek S and Grenouillet G. 2010. Uncertainty in ensemble forecasting of species distribution. Global Change Biology 16: 11451157
Phillips SJ, Dudik M, Elith J et al. 2009. Sample selection bias and presenceonly distribution models: implications for background and pseudoabsence data. Ecological Applications 19: 181197.
See Also
ensemble.calibrate.weights
, ensemble.calibrate.models
, ensemble.raster
Examples
## Not run:
# based on examples in the dismo package
# get predictor variables
library(dismo)
predictor.files < list.files(path=paste(system.file(package="dismo"), '/ex', sep=''),
pattern='grd', full.names=TRUE)
predictors < stack(predictor.files)
# subset based on Variance Inflation Factors
predictors < subset(predictors, subset=c("bio5", "bio6",
"bio16", "bio17", "biome"))
predictors
predictors@title < "base"
# presence points
presence_file < paste(system.file(package="dismo"), '/ex/bradypus.csv', sep='')
pres < read.table(presence_file, header=TRUE, sep=',')
pres[,1] < rep("Bradypus", nrow(pres))
# choose background points
background < randomPoints(predictors, n=1000, extf = 1.00)
# north and south for new predictions (as if new climates)
ext2 < extent(90, 32, 0, 23)
predictors2 < crop(predictors, y=ext2)
predictors2 < stack(predictors2)
predictors2@title < "north"
ext3 < extent(90, 32, 33, 0)
predictors3 < crop(predictors, y=ext3)
predictors3 < stack(predictors3)
predictors3@title < "south"
# fit 3 ensembles with batch processing, choosing the best ensemble model based on the
# average weights of 4fold split of calibration and testing data
# final models use all available presence data and average weights determined by the
# ensemble.calibrate.weights function (called internally)
# batch processing can handle several species by using 3column species.presence and
# species.absence data sets
# note that these calculations can take a while
ensemble.nofactors < ensemble.batch(x=predictors,
xn=c(predictors, predictors2, predictors3),
species.presence=pres,
species.absence=background,
k.splits=4, k.test=0,
n.ensembles=3,
SINK=TRUE,
layer.drops=c("biome"),
ENSEMBLE.best=0, ENSEMBLE.exponent=c(1, 2, 3),
ENSEMBLE.min=0.7,
MAXENT=0, MAXNET=1, MAXLIKE=1, GBM=1, GBMSTEP=0, RF=1, CF=1,
GLM=1, GLMSTEP=1, GAM=1, GAMSTEP=1, MGCV=1, MGCVFIX=1,
EARTH=1, RPART=1, NNET=1, FDA=1, SVM=1, SVME=1, GLMNET=1,
BIOCLIM.O=1, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=1,
PROBIT=TRUE,
Yweights="BIOMOD",
formulae.defaults=TRUE)
# summaries for the 3 ensembles for the species
# summaries are based on files in folders ensemble/suitability,
# ensemble/presence and ensemble/count
# ensemble.mean is used internally in ensemble.batch
ensemble.mean(RASTER.species.name="Bradypus", RASTER.stack.name="base",
p=pres, a=background)
# plot mean suitability without specifying colours
plot1 < ensemble.plot(RASTER.species.name="Bradypus", RASTER.stack.name="base",
plot.method="consensussuitability",
p=pres, a=background, abs.breaks=4, pres.breaks=9)
plot1
# only colour the areas where species is predicted to be present
# option is invoked by having no absence breaks
# same colourscheme as \url{http://www.worldagroforestry.org/atlascentralamerica}
LAatlascols < grDevices::colorRampPalette(c("#FFFF80", "#38E009","#1A93AB", "#0C1078"))
plot2 < ensemble.plot(RASTER.species.name="Bradypus", RASTER.stack.name="base",
plot.method="consensussuitability",
p=pres, a=background, abs.breaks=0, pres.breaks=9, pres.col=LAatlascols(8))
plot2
# only colour the areas where species is predicted to be present
# option is invoked by only setting one colour for absencepresence
plot3 < ensemble.plot(RASTER.species.name="Bradypus", RASTER.stack.name="base",
plot.method="consensuspresence",
absencePresence.col=c("#90EE90"))
# only colour presence area by specifying colours > 0
plot4 < ensemble.plot(RASTER.species.name="Bradypus", RASTER.stack.name="base",
plot.method="consensuscount",
count.col=LAatlascols(3))
## End(Not run)