ensemble.calibrate.models {BiodiversityR}  R Documentation 
Suitability mapping based on ensembles of modelling algorithms: calibration of models and weights
Description
The basic function ensemble.calibrate.models
allows to evaluate different algorithms for (species) suitability modelling, including maximum entropy (MAXENT), boosted regression trees, random forests, generalized linear models (including stepwise selection of explanatory variables), generalized additive models (including stepwise selection of explanatory variables), multivariate adaptive regression splines, regression trees, artificial neural networks, flexible discriminant analysis, support vector machines, the BIOCLIM algorithm, the DOMAIN algorithm and the Mahalanobis algorithm. These sets of functions were developed in parallel with the biomod2
package, especially for inclusion of the maximum entropy algorithm, but also to allow for a more direct integration with the BiodiversityR package, more direct handling of model formulae and greater focus on mapping. Researchers and students of species distribution are strongly encouraged to familiarize themselves with all the options of the BIOMOD and dismo packages.
Usage
ensemble.calibrate.models(x = NULL, p = NULL,
a = NULL, an = 1000, excludep = FALSE, target.groups = FALSE,
k = 0, pt = NULL, at = NULL, SSB.reduce = FALSE, CIRCLES.d = 250000,
TrainData = NULL, TestData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE, CATCH.OFF = FALSE,
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
evaluations.keep = FALSE,
models.list = NULL, models.keep = FALSE,
models.save = FALSE, species.name = "Species001",
ENSEMBLE.tune = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
ENSEMBLE.weight.min = 0.05,
input.weights = NULL,
MAXENT = 1, MAXNET = 1, MAXLIKE = 1, GBM = 1, GBMSTEP = 1, RF = 1, CF = 1,
GLM = 1, GLMSTEP = 1, GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0,
EARTH = 1, RPART = 1, NNET = 1, FDA = 1, SVM = 1 , SVME = 1, GLMNET = 1,
BIOCLIM.O = 0, BIOCLIM = 1, DOMAIN = 1, MAHAL = 1, MAHAL01 = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
formulae.defaults = TRUE, maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000,
MAXENT.path = paste(getwd(), "/models/maxent_", species.name, sep=""),
MAXNET.classes = "default", MAXNET.clamp = FALSE, MAXNET.type = "cloglog",
MAXLIKE.formula = NULL, MAXLIKE.method = "BFGS",
GBM.formula = NULL, GBM.n.trees = 2001,
GBMSTEP.gbm.x = 2:(ncol(TrainData.orig)), GBMSTEP.tree.complexity = 5,
GBMSTEP.learning.rate = 0.005, GBMSTEP.bag.fraction = 0.5,
GBMSTEP.step.size = 100,
RF.formula = NULL, RF.ntree = 751, RF.mtry = floor(sqrt(ncol(TrainData.vars))),
CF.formula = NULL, CF.ntree = 751, CF.mtry = floor(sqrt(ncol(TrainData.vars))),
GLM.formula = NULL, GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, STEP.formula = NULL, GLMSTEP.scope = NULL,
GLMSTEP.k = 2,
GAM.formula = NULL, GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.formula = NULL, MGCV.select = FALSE,
MGCVFIX.formula = NULL,
EARTH.formula = NULL,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.formula = NULL, RPART.xval = 50,
NNET.formula = NULL, NNET.size = 8, NNET.decay = 0.01,
FDA.formula = NULL,
SVM.formula = NULL,
SVME.formula = NULL,
GLMNET.nlambda = 100, GLMNET.class = FALSE,
BIOCLIM.O.fraction = 0.9,
MAHAL.shape = 1)
ensemble.calibrate.weights(x = NULL, p = NULL, TrainTestData=NULL,
a = NULL, an = 1000,
get.block = FALSE, block.default = TRUE, get.subblocks = FALSE,
SSB.reduce = FALSE, CIRCLES.d = 100000,
excludep = FALSE, target.groups = FALSE,
k = 4,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE, CATCH.OFF = FALSE,
data.keep = FALSE,
species.name = "Species001",
threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE,
ENSEMBLE.tune = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
ENSEMBLE.weight.min = 0.05,
input.weights = NULL,
MAXENT = 1, MAXNET = 1, MAXLIKE = 1, GBM = 1, GBMSTEP = 1, RF = 1, CF = 1,
GLM = 1, GLMSTEP = 1, GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0,
EARTH = 1, RPART = 1, NNET = 1, FDA = 1, SVM = 1 , SVME = 1, GLMNET = 1,
BIOCLIM.O = 0, BIOCLIM = 1, DOMAIN = 1, MAHAL = 1, MAHAL01 = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
formulae.defaults = TRUE, maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000,
MAXENT.path = paste(getwd(), "/models/maxent_", species.name, sep=""),
MAXNET.classes = "default", MAXNET.clamp = FALSE, MAXNET.type = "cloglog",
MAXLIKE.formula = NULL, MAXLIKE.method = "BFGS",
GBM.formula = NULL, GBM.n.trees = 2001,
GBMSTEP.gbm.x = 2:(length(var.names)+1), GBMSTEP.tree.complexity = 5,
GBMSTEP.learning.rate = 0.005,
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100,
RF.formula = NULL, RF.ntree = 751, RF.mtry = floor(sqrt(length(var.names))),
CF.formula = NULL, CF.ntree = 751, CF.mtry = floor(sqrt(length(var.names))),
GLM.formula = NULL, GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, STEP.formula = NULL, GLMSTEP.scope = NULL, GLMSTEP.k = 2,
GAM.formula = NULL, GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.formula = NULL, MGCV.select = FALSE,
MGCVFIX.formula = NULL,
EARTH.formula = NULL,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.formula = NULL, RPART.xval = 50,
NNET.formula = NULL, NNET.size = 8, NNET.decay = 0.01,
FDA.formula = NULL,
SVM.formula = NULL,
SVME.formula = NULL,
GLMNET.nlambda = 100, GLMNET.class = FALSE,
BIOCLIM.O.fraction = 0.9,
MAHAL.shape = 1)
ensemble.calibrate.models.gbm(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE,
k = 4,
TrainData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE,
species.name = "Species001",
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL,
GBMSTEP.gbm.x = 2:(ncol(TrainData.orig)),
complexity = c(3:6), learning = c(0.005, 0.002, 0.001),
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100)
ensemble.calibrate.models.nnet(x = NULL, p = NULL, a = NULL, an = 1000, excludep = FALSE,
k = 4,
TrainData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE, PLOTS = FALSE,
species.name = "Species001",
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL,
formulae.defaults = TRUE, maxit = 100,
NNET.formula = NULL,
sizes = c(2, 4, 6, 8), decays = c(0.1, 0.05, 0.01, 0.001) )
ensemble.drop1(x = NULL, p = NULL,
a = NULL, an = 1000, excludep = FALSE, target.groups = FALSE,
k = 0, pt = NULL, at = NULL, SSB.reduce = FALSE, CIRCLES.d = 100000,
TrainData = NULL, TestData = NULL,
VIF = FALSE, COR = FALSE,
SINK = FALSE,
species.name = "Species001",
difference = FALSE, variables.alone = FALSE,
ENSEMBLE.tune = FALSE,
ENSEMBLE.best = 0, ENSEMBLE.min = 0.7, ENSEMBLE.exponent = 1,
input.weights = NULL,
MAXENT = 1, MAXNET = 1, MAXLIKE = 1, GBM = 1, GBMSTEP = 0, RF = 1, CF = 1,
GLM = 1, GLMSTEP = 1, GAM = 1, GAMSTEP = 1, MGCV = 1, MGCVFIX = 0,
EARTH = 1, RPART = 1, NNET = 1, FDA = 1, SVM = 1, SVME = 1, GLMNET = 1,
BIOCLIM.O = 0, BIOCLIM = 1, DOMAIN = 1, MAHAL = 1, MAHAL01 = 1,
PROBIT = FALSE,
Yweights = "BIOMOD",
layer.drops = NULL, factors = NULL, dummy.vars = NULL,
maxit = 100,
MAXENT.a = NULL, MAXENT.an = 10000,
MAXENT.path = paste(getwd(), "/models/maxent_", species.name, sep=""),
MAXNET.classes = "default", MAXNET.clamp = FALSE, MAXNET.type = "cloglog",
MAXLIKE.method = "BFGS",
GBM.n.trees = 2001,
GBMSTEP.tree.complexity = 5, GBMSTEP.learning.rate = 0.005,
GBMSTEP.bag.fraction = 0.5, GBMSTEP.step.size = 100,
RF.ntree = 751,
CF.ntree = 751,
GLM.family = binomial(link = "logit"),
GLMSTEP.steps = 1000, GLMSTEP.scope = NULL, GLMSTEP.k = 2,
GAM.family = binomial(link = "logit"),
GAMSTEP.steps = 1000, GAMSTEP.scope = NULL, GAMSTEP.pos = 1,
MGCV.select = FALSE,
EARTH.glm = list(family = binomial(link = "logit"), maxit = maxit),
RPART.xval = 50,
NNET.size = 8, NNET.decay = 0.01,
GLMNET.nlambda = 100, GLMNET.class = FALSE,
BIOCLIM.O.fraction = 0.9,
MAHAL.shape = 1)
ensemble.weights(weights = c(0.9, 0.8, 0.7, 0.5),
best = 0, min.weight = 0,
exponent = 1, digits = 6)
ensemble.strategy(TrainData = NULL, TestData = NULL,
verbose = FALSE,
ENSEMBLE.best = c(4:10), ENSEMBLE.min = c(0.7),
ENSEMBLE.exponent = c(1, 2, 3) )
ensemble.formulae(x,
layer.drops = NULL, factors = NULL, dummy.vars = NULL, weights = NULL)
ensemble.threshold(eval, threshold.method = "spec_sens", threshold.sensitivity = 0.9,
threshold.PresenceAbsence = FALSE, Pres, Abs)
ensemble.VIF(x = NULL, a = NULL, an = 10000,
VIF.max = 10, keep = NULL,
layer.drops = NULL, factors = NULL, dummy.vars = NULL)
ensemble.VIF.dataframe(x=NULL,
VIF.max=10, keep=NULL,
car=TRUE, silent=F)
ensemble.pairs(x = NULL, a = NULL, an = 10000)
Arguments
x 
RasterStack object ( 
p 
presence points used for calibrating the suitability models, typically available in 2column (lon, lat) dataframe; see also 
a 
background points used for calibrating the suitability models (except for 
an 
number of background points for calibration to be selected with 
excludep 
parameter that indicates (if 
target.groups 
Parameter that indicates (if 
k 
If larger than 1, the number of groups to split between calibration (k1) and evaluation (1) data sets (for example, 
pt 
presence points used for evaluating the suitability models, available in 2column (lon, lat) dataframe; see also 
at 
background points used for evaluating the suitability models, available in 2column (lon, lat) dataframe; see also 
SSB.reduce 
If 
CIRCLES.d 
Radius in m of circular neighbourhoods (created with 
TrainData 
dataframe with first column 'pb' describing presence (1) and absence (0) and other columns containing explanatory variables; see also 
TestData 
dataframe with first column 'pb' describing presence (1) and absence (0) and other columns containing explanatory variables; see also 
VIF 
Estimate the variance inflation factors based on a linear model calibrated on the training data (if 
COR 
Provide information on the correlation between the numeric explanatory variables (if 
SINK 
Append the results to a text file in subfolder 'outputs' (if 
PLOTS 
Disabled option of plotting evaluation results(BiodiversityR version 2.91)  see examples how to plot results afterwards and also 
CATCH.OFF 
Disable calls to function 
threshold.method 
Method to calculate the threshold between predicted absence and presence; possibilities include 
threshold.sensitivity 
Sensitivity value for 
threshold.PresenceAbsence 
If 
evaluations.keep 
Keep the results of evaluations (if 
models.list 
list with 'old' model objects such as 
models.keep 
store the details for each suitability modelling algorithm (if 
models.save 
Save the list with model details to a file (if 
species.name 
Name by which the model details will be saved to a file; see also argument 
data.keep 
Keep the data for each kfold crossvalidation run (if 
ENSEMBLE.tune 
Determine weights for the ensemble model based on AUC values (if 
ENSEMBLE.best 
The number of individual suitability models to be used in the consensus suitability map (based on a weighted average). In case this parameter is smaller than 1 or larger than the number of positive input weights of individual models, then all individual suitability models with positive input weights are included in the consensus suitability map. In case a vector is provided, 
ENSEMBLE.min 
The minimum input weight (typically corresponding to AUC values) for a model to be included in the ensemble. In case a vector is provided, function 
ENSEMBLE.exponent 
Exponent applied to AUC values to convert AUC values into weights (for example, an exponent of 2 converts input weights of 0.7, 0.8 and 0.9 into 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). See details. 
ENSEMBLE.weight.min 
The minimum output weight for models included in the ensemble, applying to weights that sum to one. Note that 
input.weights 
array with numeric values for the different modelling algorithms; if 
MAXENT 
number: if larger than 0, then a maximum entropy model ( 
MAXNET 
number: if larger than 0, then a maximum entropy model ( 
MAXLIKE 
number: if larger than 0, then a maxlike model ( 
GBM 
number: if larger than 0, then a boosted regression trees model ( 
GBMSTEP 
number: if larger than 0, then a stepwise boosted regression trees model ( 
RF 
number: if larger than 0, then a random forest model ( 
CF 
number: if larger than 0, then a random forest model ( 
GLM 
number: if larger than 0, then a generalized linear model ( 
GLMSTEP 
number: if larger than 0, then a stepwise generalized linear model ( 
GAM 
number: if larger than 0, then a generalized additive model ( 
GAMSTEP 
number: if larger than 0, then a stepwise generalized additive model ( 
MGCV 
number: if larger than 0, then a generalized additive model ( 
MGCVFIX 
number: if larger than 0, then a generalized additive model with fixed d.f. regression splines ( 
EARTH 
number: if larger than 0, then a multivariate adaptive regression spline model ( 
RPART 
number: if larger than 0, then a recursive partioning and regression tree model ( 
NNET 
number: if larger than 0, then an artificial neural network model ( 
FDA 
number: if larger than 0, then a flexible discriminant analysis model ( 
SVM 
number: if larger than 0, then a support vector machine model ( 
SVME 
number: if larger than 0, then a support vector machine model ( 
GLMNET 
number: if larger than 0, then a GLM with lasso or elasticnet regularization ( 
BIOCLIM.O 
number: if larger than 0, then the original BIOCLIM algorithm ( 
BIOCLIM 
number: if larger than 0, then the BIOCLIM algorithm ( 
DOMAIN 
number: if larger than 0, then the DOMAIN algorithm ( 
MAHAL 
number: if larger than 0, then the Mahalanobis algorithm ( 
MAHAL01 
number: if larger than 0, then the Mahalanobis algorithm ( 
PROBIT 
If 
Yweights 
chooses how cases of presence and background (absence) are weighted; 
layer.drops 
vector that indicates which layers should be removed from RasterStack 
factors 
vector that indicates which variables are factors; see also 
dummy.vars 
vector that indicates which variables are dummy variables (influences formulae suggestions) 
formulae.defaults 
Suggest formulae for most of the models (if 
maxit 
Maximum number of iterations for some of the models. See also 
MAXENT.a 
background points used for calibrating the maximum entropy model ( 
MAXENT.an 
number of background points for calibration to be selected with 
MAXENT.path 
path to the directory where output files of the maximum entropy model are stored; see also 
MAXNET.classes 
continuous feature classes, either "default" or any subset of "lqpht" (linear, quadratic, product, hinge, threshold). Note that the "default" option chooses feature classes based on the number of presence locations as "l" (< 10 locations), "lq" (10  14 locations), "lqh" (15  79 locations) or "lqph" (> 79 locations). See also 
MAXNET.clamp 
restrict predictors and features to the range seen during model training; see also 
MAXNET.type 
type of response required; see also 
MAXLIKE.formula 
formula for the maxlike algorithm; see also 
MAXLIKE.method 
method for the maxlike algorithm; see also 
GBM.formula 
formula for the boosted regression trees algorithm; see also 
GBM.n.trees 
total number of trees to fit for the boosted regression trees model; see also 
GBMSTEP.gbm.x 
indices of column numbers with explanatory variables for stepwise boosted regression trees; see also 
GBMSTEP.tree.complexity 
complexity of individual trees for stepwise boosted regression trees; see also 
GBMSTEP.learning.rate 
weight applied to individual trees for stepwise boosted regression trees; see also 
GBMSTEP.bag.fraction 
proportion of observations used in selecting variables for stepwise boosted regression trees; see also 
GBMSTEP.step.size 
number of trees to add at each cycle for stepwise boosted regression trees (should be small enough to result in a smaller holdout deviance than the initial number of trees [50]); see also 
RF.formula 
formula for random forest algorithm; see also 
RF.ntree 
number of trees to grow for random forest algorithm; see also 
RF.mtry 
number of variables randomly sampled as candidates at each split for random forest algorithm; see also 
CF.formula 
formula for random forest algorithm; see also 
CF.ntree 
number of trees to grow in a forest; see also 
CF.mtry 
number of input variables randomly sampled as candidates at each node for random forest like algorithms; see also 
GLM.formula 
formula for the generalized linear model; see also 
GLM.family 
description of the error distribution and link function for the generalized linear model; see also 
GLMSTEP.steps 
maximum number of steps to be considered for stepwise generalized linear model; see also 
STEP.formula 
formula for the "starting model" to be considered for stepwise generalized linear model; see also 
GLMSTEP.scope 
range of models examined in the stepwise search; see also 
GLMSTEP.k 
multiple of the number of degrees of freedom used for the penalty (only k = 2 gives the genuine AIC); see also 
GAM.formula 
formula for the generalized additive model; see also 
GAM.family 
description of the error distribution and link function for the generalized additive model; see also 
GAMSTEP.steps 
maximum number of steps to be considered in the stepwise generalized additive model; see also 
GAMSTEP.scope 
range of models examined in the stepwise search n the stepwise generalized additive model; see also 
GAMSTEP.pos 
parameter expected to be set to 1 to allow for fitting of the stepwise generalized additive model 
MGCV.formula 
formula for the generalized additive model; see also 
MGCV.select 
if 
MGCVFIX.formula 
formula for the generalized additive model with fixed d.f. regression splines; see also 
EARTH.formula 
formula for the multivariate adaptive regression spline model; see also 
EARTH.glm 

RPART.formula 
formula for the recursive partioning and regression tree model; see also 
RPART.xval 
number of crossvalidations for the recursive partioning and regression tree model; see also 
NNET.formula 
formula for the artificial neural network model; see also 
NNET.size 
number of units in the hidden layer for the artificial neural network model; see also 
NNET.decay 
parameter of weight decay for the artificial neural network model; see also 
FDA.formula 
formula for the flexible discriminant analysis model; see also 
SVM.formula 
formula for the support vector machine model; see also 
SVME.formula 
formula for the support vector machine model; see also 
GLMNET.nlambda 
The number of 
GLMNET.class 
Use the predicted class to calculate the mean predictions of GLMNET; see 
BIOCLIM.O.fraction 
Fraction of range representing the optimal limits, default value of 0.9 as in the original BIOCLIM software ( 
MAHAL.shape 
parameter that influences the transformation of output values of 
TrainTestData 
dataframe with first column 'pb' describing presence (1) and absence (0) and other columns containing explanatory variables; see also 
get.block 
if 
block.default 
if 
get.subblocks 
if 
complexity 
vector with values of complexity of individual trees ( 
learning 
vector with values of weights applied to individual trees ( 
sizes 
vector with values of number of units in the hidden layer for the artificial neural network model; see also 
decays 
vector with values of weight decay for the artificial neural network model; see also 
difference 
if 
variables.alone 
if 
weights 
input weights for the 
best 
The number of final weights. In case this parameter is smaller than 1 or larger than the number of positive input weights of individual models, then this parameter is ignored. 
min.weight 
The minimum input weight to be included in the output. 
exponent 
Exponent applied to AUC values to convert AUC values into weights (for example, an exponent of 2 converts input weights of 0.7, 0.8 and 0.9 into 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). See details. 
digits 
Number of number of decimal places in the output weights; see also 
verbose 
If 
eval 
ModelEvaluation object obtained by 
Pres 
Suitabilities (probabilities) at presence locations 
Abs 
Suitabilities (probabilities) at background locations 
VIF.max 
Maximum Variance Inflation Factor of the selected subset of variables. In case that at least one of the variables has VIF larger than VIF.max, then the variable with the highest VIF will be removed in the next step. 
keep 
character vector with names of the variables to be kept. 
car 
Also provide results from 
silent 
Limit textual output. 
Details
The basic function ensemble.calibrate.models
first calibrates individual suitability models based on presence locations p
and background locations a
, then evaluates these suitability models based on presence locations pt
and background locations at
. While calibrating and testing individual models, results obtained via the evaluate
function can be saved (evaluations.keep
).
As an alternative to providing presence locations p
, models can be calibrated with data provided in TrainData
. In case that both p
and TrainData
are provided, then models will be calibrated with TrainData
.
Calibration of the maximum entropy (MAXENT) algorithm is not based on background locations a
, but based on background locations MAXENT.a
instead. However, to compare evaluations with evaluations of other algorithms, during evaluations of the MAXENT algorithm, presence locations p
and background locations a
are used (and not background locations MAXENT.a
).
Output from the GLMNET algorithm is calculated as the mean of the output from predict.glmnet
. With option GLMNET.class = TRUE
, the mean output is the mean prediction of class 1. With option GLMNET.class = FALSE
, the mean output is the mean of the responses.
As the Mahalanobis function (mahal
) does not always provide values within the range of 0  1, the output values are rescaled with option MAHAL01
by first subtracting the value of 1  MAHAL.shape
from each prediction, followed by calculating the absolute value, followed by calculating the reciprocal value and finally multiplying this reciprocal value with MAHAL.shape
. As this rescaling method does not estimate probabilities, inclusion in the calculation of a (weighted) average of ensemble probabilities may be problematic and the PROBIT
transformation may help here (the same applies to other distancebased methods).
With parameter ENSEMBLE.best
, the subset of best models (evaluated by the individual AUC values) can be selected and only those models will be used for calculating the ensemble model (in other words, weights for models not included in the ensemble will be set to zero). It is possible to further increase the contribution to the ensemble model for models with higher AUC values through parameter ENSEMBLE.exponent
. With ENSEMBLE.exponent = 2
, AUC values of 0.7, 0.8 and 0.9 are converted into weights of 0.7^2=0.49, 0.8^2=0.64 and 0.9^2=0.81). With ENSEMBLE.exponent = 4
, AUC values of 0.7, 0.8 and 0.9 are converted into weights of 0.7^4=0.2401, 0.8^4=0.4096 and 0.9^2=0.6561).
ENSEMBLE.tune
will result in an internal procedure whereby the best selection of parameter values for ENSEMBLE.min
, ENSEMBLE.best
or ENSEMBLE.exponent
can be identified. Through a factorial procedure, the ensemble model with best AUC for a specific combination of parameter values is identified. The procedure also provides the weights that correspond to the best ensemble. In case that ENSEMBLE.tune
is set to FALSE
, then the ensemble is calculated based on the input weights.
Function ensemble.calibrate.weights
splits the presence and background locations in a userdefined (k
) number of subsets (i.e. kfold crossvalidation), then sequentially calibrates individual suitability models with (k1
) combined subsets and evaluates those with the remaining one subset, whereby each subset is used once for evaluation in the userdefined number (k
) of runs. For example, k = 4
results in splitting the locations in 4 subsets, then using one of these subsets in turn for evaluations (see also kfold
). Note that for the maximum entropy (MAXENT) algorithm, the same background data will be used in each crossvalidation run (this is based on the assumption that a large number (~10000) of background locations are used).
Among the output from function ensemble.calibrate.weights
are suggested weights for an ensemble model (output.weights
and output.weights.AUC
), and information on the respective AUC values of the ensemble model with the suggested weights for each of the (k
) subsets. Suggested weights output.weights
are calculated as the average of the weights of the different algorithms (submodels) of the k
ensembles. Suggested weights output.weights.AUC
are calculated as the average of the AUC of the different algorithms of the for the k
runs.
Function ensemble.calibrate.models.gbm
allows to test various combinations of parameters tree.complexity
and learning.rate
for the gbm.step
model.
Function ensemble.calibrate.models.nnet
allows to test various combinations of parameters size
and decay
for the nnet
model.
Function ensemble.drop1
allows to test the effects of leaving out each of the explanatory variables, and comparing these results with the "full" model. Note that option of difference = TRUE
may result in positive values, indicating that the model without the explanatory variable having larger AUC than the "full" model. A procedure is included to estimate the deviance of a model based on the fitted values, using 2 * (sum(x*log(x)) + sum((1x)*log(1x))) where x is a vector of the fitted values for a respective model. (It was checked that this procedure results in similar deviance estimates for the null and 'full' models for glm, but note that it is not certain whether deviance can be calculated in a similar way for other submodels.)
Function ensemble.formulae
provides suggestions for formulae that can be used for ensemble.calibrate.models
and ensemble.raster
. This function is always used internally by the ensemble.drop1
function.
The ensemble.weights
function is used internally by the ensemble.calibrate.models
and ensemble.raster
functions, using the input weights for the different suitability modelling algorithms. Ties between input weights result in the same output weights.
The ensemble.strategy
function is used internally by the ensemble.calibrate.models
function, using the train and test data sets with predictions of the different suitability modelling algorithms and different combinations of parameters ENSEMBLE.best
, ENSEMBLE.min
and ENSEMBLE.exponent
. The final ensemble model is based on the parameters that generate the best AUC.
The ensemble.threshold
function is used internally by the ensemble.calibrate.models
, ensemble.mean
and ensemble.plot
functions. threshold2005.mean
results in calculating the mean value of threshold methods that resulted in better results (calculated by optimal.thresholds
with methods of ObsPrev
, MeanProb
, MaxSens+Spec
, Sens=Spec
and MinROCdist
) in a study by Liu et al. (Ecography 28: 385393. 2005). threshold2005.min
results in calculating the mean value of threshold methods that resulted in better results (calculated by optimal.thresholds
with methods of ObsPrev
, MeanProb
and MaxSens+Spec
) in a study by Liu et al. (Ecography 28: 385393. 2005). threshold2013.mean
results in calculating the mean value of threshold methods that resulted in better results (calculated by optimal.thresholds
with methods of ObsPrev
, MeanProb
, MaxSens+Spec
, Sens=Spec
and MinROCdist
) in a study by Liu et al. (J. Biogeogr. 40: 778789. 2013). threshold2013.min
results in calculating the minimum value of threshold methods that resulted in better results (calculated by optimal.thresholds
with methods of ObsPrev
, MeanProb
, MaxSens+Spec
, Sens=Spec
and MinROCdist
) in a study by Liu et al. (J. Biogeogr. 40: 778789. 2013).
Function ensemble.VIF
implements a stepwise procedure whereby the explanatory variable with highest Variance Inflation Factor is removed from the list of variables. The procedure ends when no variable has VIF larger than parameter VIF.max
.
Function ensemble.pairs
provides a matrix of scatterplots similar to the example of pairs
for version 3.4.3 of that package.
Value
Function ensemble.calibrate.models
(potentially) returns a list with results from evaluations (via evaluate
) of calibration and test runs of individual suitability models.
Function ensemble.calibrate.weights
returns a matrix with, for each individual suitability model, the AUC of each run and the average AUC over the runs. Models are sorted by the average AUC. The average AUC for each model can be used as input weights for the ensemble.raster
function.
Functions ensemble.calibrate.models.gbm
and ensemble.calibrate.models.nnet
return a matrix with, for each combination of model parameters, the AUC of each run and the average AUC. Models are sorted by the average AUC.
Author(s)
Roeland Kindt (World Agroforestry Centre)
References
Kindt R. 2018. Ensemble species distribution modelling with transformed suitability values. Environmental Modelling & Software 100: 136145. doi:10.1016/j.envsoft.2017.11.009
Buisson L, Thuiller W, Casajus N, Lek S and Grenouillet G. 2010. Uncertainty in ensemble forecasting of species distribution. Global Change Biology 16: 11451157
Liu C, Berry PM, Dawson TP and Pearson RC. 2005. Selecting thresholds of occurrence in the prediction of species distributions. Ecography 28: 385393
Liu C, White M and Newell G. 2013. Selecting thresholds for the prediction of species occurrence with presenceonly data. Journal of Biogeography 40: 778789
Phillips SJ, Dudik M, Elith J et al. 2009. Sample selection bias and presenceonly distribution models: implications for background and pseudoabsence data. Ecological Applications 19: 181197.
See Also
ensemble.raster
, ensemble.batch
Examples
## Not run:
# based on examples in the dismo package
# get predictor variables
library(dismo)
predictor.files < list.files(path=paste(system.file(package="dismo"), '/ex', sep=''),
pattern='grd', full.names=TRUE)
predictors < stack(predictor.files)
# subset based on Variance Inflation Factors
predictors < subset(predictors, subset=c("bio5", "bio6",
"bio16", "bio17", "biome"))
predictors
predictors@title < "predictors"
# presence points
presence_file < paste(system.file(package="dismo"), '/ex/bradypus.csv', sep='')
pres < read.table(presence_file, header=TRUE, sep=',')[,1]
# the kfold function randomly assigns data to groups;
# groups are used as calibration (1/4) and training (3/4) data
groupp < kfold(pres, 4)
pres_train < pres[groupp != 1, ]
pres_test < pres[groupp == 1, ]
# choose background points
background < randomPoints(predictors, n=1000, extf=1.00)
colnames(background)=c('lon', 'lat')
groupa < kfold(background, 4)
backg_train < background[groupa != 1, ]
backg_test < background[groupa == 1, ]
# formulae for random forest and generalized linear model
# compare with: ensemble.formulae(predictors, factors=c("biome"))
rfformula < as.formula(pb ~ bio5+bio6+bio16+bio17)
glmformula < as.formula(pb ~ bio5 + I(bio5^2) + I(bio5^3) +
bio6 + I(bio6^2) + I(bio6^3) + bio16 + I(bio16^2) + I(bio16^3) +
bio17 + I(bio17^2) + I(bio17^3) )
# fit four ensemble models (RF, GLM, BIOCLIM, DOMAIN)
# factors removed for BIOCLIM, DOMAIN, MAHAL
ensemble.nofactors < ensemble.calibrate.models(x=predictors, p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
ENSEMBLE.tune=TRUE,
ENSEMBLE.min = 0.65,
MAXENT=0, MAXNET=0, MAXLIKE=0, GBM=0, GBMSTEP=0, RF=1, CF=0,
GLM=1, GLMSTEP=0, GAM=0, GAMSTEP=0, MGCV=0, MGCVFIX=0,
EARTH=0, RPART=0, NNET=0, FDA=0, SVM=0, SVME=0, GLMNET=0,
BIOCLIM.O=0, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=0,
Yweights="BIOMOD",
factors="biome",
evaluations.keep=TRUE, models.keep=TRUE,
RF.formula=rfformula,
GLM.formula=glmformula)
# with option models.keep, all model objects are saved in ensemble object
# the same slots can be used to replace model objects with new calibrations
ensemble.nofactors$models$RF
summary(ensemble.nofactors$models$GLM)
ensemble.nofactors$models$BIOCLIM
ensemble.nofactors$models$DOMAIN
ensemble.nofactors$models$formulae
# evaluations are kept in different slot
attributes(ensemble.nofactors$evaluations)
plot(ensemble.nofactors$evaluations$RF.T, "ROC")
# fit four ensemble models (RF, GLM, BIOCLIM, DOMAIN) using default formulae
# variable 'biome' is not included as explanatory variable
# results are provided in a file in the 'outputs' subfolder of the working
# directory
ensemble.nofactors < ensemble.calibrate.models(x=predictors,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
layer.drops="biome",
species.name="Bradypus",
ENSEMBLE.tune=TRUE,
ENSEMBLE.min=0.65,
SINK=TRUE,
MAXENT=0, MAXNET=0, MAXLIKE=0, GBM=0, GBMSTEP=0, RF=1, CF=0,
GLM=1, GLMSTEP=0, GAM=0,
GAMSTEP=0, MGCV=0, MGCVFIX=0, EARTH=0, RPART=0, NNET=0, FDA=0,
SVM=0, SVME=0, GLMNET=0,
BIOCLIM.O=0, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=0,
Yweights="BIOMOD",
evaluations.keep=TRUE,
formulae.defaults=TRUE)
# after fitting the individual algorithms (submodels),
# transform predictions with a probit link.
ensemble.nofactors < ensemble.calibrate.models(x=predictors,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
layer.drops="biome",
species.name="Bradypus",
SINK=TRUE,
ENSEMBLE.tune=TRUE,
ENSEMBLE.min=0.65,
MAXENT=0, MAXNET=0, MAXLIKE=0, GBM=0, GBMSTEP=0, RF=1, CF=0,
GLM=1, GLMSTEP=0, GAM=0, GAMSTEP=0, MGCV=0, MGCVFIX=0,
EARTH=0, RPART=0, NNET=0, FDA=0, SVM=0, SVME=0, GLMNET=0,
BIOCLIM.O=0, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=0,
PROBIT=TRUE,
Yweights="BIOMOD", factors="biome",
evaluations.keep=TRUE,
formulae.defaults=TRUE)
# Instead of providing presence and background locations, provide data.frames.
# Because 'biome' is a factor, RasterStack needs to be provided
# to check for levels in the Training and Testing data set.
TrainData1 < prepareData(x=predictors, p=pres_train, b=backg_train,
factors=c("biome"), xy=FALSE)
TestData1 < prepareData(x=predictors, p=pres_test, b=backg_test,
factors=c("biome"), xy=FALSE)
ensemble.factors1 < ensemble.calibrate.models(x=predictors,
TrainData=TrainData1, TestData=TestData1,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
SINK=TRUE,
ENSEMBLE.tune=TRUE,
ENSEMBLE.min=0.65,
MAXENT=0, MAXNET=1, MAXLIKE=1, GBM=1, GBMSTEP=0, RF=1, CF=1,
GLM=1, GLMSTEP=1, GAM=1, GAMSTEP=1, MGCV=1, MGCVFIX=0,
EARTH=1, RPART=1, NNET=1, FDA=1, SVM=1, SVME=1, GLMNET=1,
BIOCLIM.O=1, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=1,
Yweights="BIOMOD", factors="biome",
evaluations.keep=TRUE)
# compare different methods of calculating ensembles
ensemble.factors2 < ensemble.calibrate.models(x=predictors,
TrainData=TrainData1, TestData=TestData1,
p=pres_train, a=backg_train,
pt=pres_test, at=backg_test,
species.name="Bradypus",
SINK=TRUE,
ENSEMBLE.tune=TRUE,
MAXENT=0, MAXNET=1, MAXLIKE=1, GBM=1, GBMSTEP=0, RF=1, CF=1,
GLM=1, GLMSTEP=1, GAM=1, GAMSTEP=1, MGCV=1, MGCVFIX=1,
EARTH=1, RPART=1, NNET=1, FDA=1, SVM=1, SVME=1, GLMNET=1,
BIOCLIM.O=1, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=1,
ENSEMBLE.best=c(4:10), ENSEMBLE.exponent=c(1, 2, 3),
Yweights="BIOMOD", factors="biome",
evaluations.keep=TRUE)
# test performance of different suitability models
# data are split in 4 subsets, each used once for evaluation
ensemble.nofactors2 < ensemble.calibrate.weights(x=predictors,
p=pres, a=background, k=4,
species.name="Bradypus",
SINK=TRUE, PROBIT=TRUE,
MAXENT=0, MAXNET=1, MAXLIKE=1, GBM=1, GBMSTEP=0, RF=1, CF=1,
GLM=1, GLMSTEP=1, GAM=1, GAMSTEP=1, MGCV=1, MGCVFIX=1,
EARTH=1, RPART=1, NNET=1, FDA=1, SVM=1, SVME=1, GLMNET=1,
BIOCLIM.O=1, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=1,
ENSEMBLE.tune=TRUE,
ENSEMBLE.best=0, ENSEMBLE.exponent=c(1, 2, 3),
ENSEMBLE.min=0.7,
Yweights="BIOMOD",
formulae.defaults=TRUE)
ensemble.nofactors2$AUC.table
ensemble.nofactors2$eval.table.all
# test the result of leaving out one of the variables from the model
# note that positive differences indicate that the model without the variable
# has higher AUC than the full model
ensemble.variables < ensemble.drop1(x=predictors,
p=pres, a=background, k=4,
species.name="Bradypus",
SINK=TRUE,
difference=TRUE,
VIF=TRUE, PROBIT=TRUE,
MAXENT=0, MAXNET=1, MAXLIKE=1, GBM=1, GBMSTEP=0, RF=1, CF=1,
GLM=1, GLMSTEP=1, GAM=1, GAMSTEP=1, MGCV=1, MGCVFIX=1,
EARTH=1, RPART=1, NNET=1, FDA=1, SVM=1, SVME=1, GLMNET=1,
BIOCLIM.O=1, BIOCLIM=1, DOMAIN=1, MAHAL=0, MAHAL01=1,
ENSEMBLE.tune=TRUE,
ENSEMBLE.best=0, ENSEMBLE.exponent=c(1, 2, 3),
ENSEMBLE.min=0.7,
Yweights="BIOMOD", factors="biome")
ensemble.variables
# use function ensemble.VIF to select a subset of variables
# factor variables are not handled well by the function
# and therefore factors are removed
# however, one can check for factors with car::vif through
# the ensemble.calibrate.models function
# VIF.analysis$var.drops can be used as input for ensemble.calibrate.models or
# ensemble.calibrate.weights
predictors < stack(predictor.files)
predictors < subset(predictors, subset=c("bio1", "bio5", "bio6", "bio8",
"bio12", "bio16", "bio17", "biome"))
ensemble.pairs(predictors)
VIF.analysis < ensemble.VIF(predictors, factors="biome")
VIF.analysis
# alternative solution where bio1 and bio12 are kept
VIF.analysis < ensemble.VIF(predictors, factors="biome",
keep=c("bio1", "bio12"))
VIF.analysis
## End(Not run)