Refit {sharp}    R Documentation

Regression model refitting

Description

Refits the regression model with stably selected variables as predictors (without penalisation). Variables in xdata not evaluated in the stability selection model will automatically be included as predictors.

Usage

Refit(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  Lambda = NULL,
  seed = 1,
  verbose = TRUE,
  ...
)

Recalibrate(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  Lambda = NULL,
  seed = 1,
  verbose = TRUE,
  ...
)

Arguments

xdata

matrix of predictors with observations as rows and variables as columns.

ydata

optional vector or matrix of outcome(s). If family is set to "binomial" or "multinomial", ydata can be a vector with character/numeric values or a factor.

stability

output of VariableSelection or BiSelection. If stability=NULL (the default), a model including all variables in xdata as predictors is fitted. Argument family must be provided in this case.

family

type of regression model. Possible values include "gaussian" (linear regression), "binomial" (logistic regression), "multinomial" (multinomial regression), and "cox" (survival analysis). If provided, this argument must be consistent with input stability.

implementation

optional function to refit the model. If stability is the output of VariableSelection, a regression model is refitted. If implementation=NULL and Lambda=0, this is done using lm (linear regression), coxph (Cox regression), glm (logistic regression), or multinom (multinomial regression); see the sketch after this argument list. If Lambda=NULL, a Ridge regression is fitted and calibrated by cross-validation using cv.glmnet. The function PLS is used if stability is the output of BiSelection.

Lambda

optional vector of penalty parameters.

seed

value of the seed to initialise the random number generator and ensure reproducibility of the results (see set.seed).

verbose

logical indicating if a loading bar and messages should be printed.

...

additional arguments to be passed to the function provided in implementation.
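
A minimal, hedged sketch of how these arguments combine; the object names xdata, ydata and stab are illustrative, and stab is assumed to be the output of VariableSelection:

# Unpenalised refit of the stably selected variables, using lm, glm, coxph or
# multinom depending on the family stored in stab
refitted <- Refit(xdata = xdata, ydata = ydata, stability = stab, Lambda = 0)

# With stability = NULL (the default), all columns of xdata are used as
# predictors and family must be provided explicitly
full_model <- Refit(xdata = xdata, ydata = ydata, family = "gaussian")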

Value

The output as obtained from:

lm (package stats)

for linear regression ("gaussian" family).

coxph (package survival)

for Cox regression ("cox" family).

glm (package stats)

for logistic regression ("binomial" family).

multinom (package nnet)

for multinomial regression ("multinomial" family).
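
Because the value is the standard fitted-model object of the corresponding class, the usual base methods can be applied to it directly. A minimal, hedged sketch for a "gaussian" or "binomial" fit, where refitted denotes the returned object:

summary(refitted) # standard model summary with estimates and standard errors
coef(refitted) # same estimates as refitted$coefficients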

See Also

VariableSelection

Examples


## Linear regression

# Data simulation
set.seed(1)
simul <- SimulateRegression(n = 100, pk = 50, family = "gaussian")

# Data split
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "gaussian"
)
xtrain <- simul$xdata[ids_train, , drop = FALSE]
ytrain <- simul$ydata[ids_train, , drop = FALSE]
xrefit <- simul$xdata[-ids_train, , drop = FALSE]
yrefit <- simul$ydata[-ids_train, , drop = FALSE]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "gaussian")
print(SelectedVariables(stab))

# Refitting the model
refitted <- Refit(
  xdata = xrefit, ydata = yrefit,
  stability = stab
)
refitted$coefficients # refitted coefficients
head(refitted$fitted.values) # refitted predicted values
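
# Hedged illustration (not part of the original example): the refitted object is
# a standard lm fit, so base methods such as summary() apply directly
summary(refitted)

# Hedged illustration of the behaviour noted in the Description: columns of
# xdata that were not evaluated in the stability selection (here a hypothetical
# extra covariate "extra") are automatically included as predictors
xplus <- cbind(xrefit, extra = rnorm(nrow(xrefit)))
refitted_plus <- Refit(xdata = xplus, ydata = yrefit, stability = stab)
refitted_plus$coefficients # includes "extra" alongside the selected variables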

# Fitting the full model (including all possible predictors)
refitted <- Refit(
  xdata = simul$xdata, ydata = simul$ydata,
  family = "gaussian"
)
refitted$coefficients # refitted coefficients


## Logistic regression

# Data simulation
set.seed(1)
simul <- SimulateRegression(n = 200, pk = 20, family = "binomial")

# Data split
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, , drop = FALSE]
ytrain <- simul$ydata[ids_train, , drop = FALSE]
xrefit <- simul$xdata[-ids_train, , drop = FALSE]
yrefit <- simul$ydata[-ids_train, , drop = FALSE]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "binomial")

# Refitting the model
refitted <- Refit(
  xdata = xrefit, ydata = yrefit,
  stability = stab
)
refitted$coefficients # refitted coefficients
head(refitted$fitted.values) # refitted predicted probabilities
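
# Hedged illustration (not part of the original example): the refitted object is
# a standard glm fit, so the predicted probabilities can equivalently be
# obtained with predict()
head(predict(refitted, type = "response"))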

## Partial Least Squares (multiple components)
if (requireNamespace("sgPLS", quietly = TRUE)) {
  # Data simulation
  set.seed(1)
  simul <- SimulateRegression(n = 500, pk = 15, q = 3, family = "gaussian")

  # Data split
  ids_train <- Resample(
    data = simul$ydata,
    tau = 0.5, family = "gaussian"
  )
  xtrain <- simul$xdata[ids_train, , drop = FALSE]
  ytrain <- simul$ydata[ids_train, , drop = FALSE]
  xrefit <- simul$xdata[-ids_train, , drop = FALSE]
  yrefit <- simul$ydata[-ids_train, , drop = FALSE]

  # Stability selection
  stab <- BiSelection(
    xdata = xtrain, ydata = ytrain,
    family = "gaussian", ncomp = 3,
    LambdaX = seq_len(ncol(xtrain) - 1),
    LambdaY = seq_len(ncol(ytrain) - 1),
    implementation = SparsePLS
  )
  plot(stab)

  # Refitting the model
  refitted <- Refit(
    xdata = xrefit, ydata = yrefit,
    stability = stab
  )
  refitted$Wmat # refitted X-weights
  refitted$Cmat # refitted Y-weights
}



[Package sharp version 1.4.6]