Refit {sharp}    R Documentation

Regression model refitting

Description

Refits the regression model with stably selected variables as predictors (without penalisation). Variables in xdata not evaluated in the stability selection model will automatically be included as predictors.

Usage

Refit(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  Lambda = NULL,
  seed = 1,
  verbose = TRUE,
  ...
)

Recalibrate(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  Lambda = NULL,
  seed = 1,
  verbose = TRUE,
  ...
)

Arguments

xdata

matrix of predictors with observations as rows and variables as columns.

ydata

optional vector or matrix of outcome(s). If family is set to "binomial" or "multinomial", ydata can be a vector with character/numeric values or a factor.

stability

output of VariableSelection or BiSelection. If stability=NULL (the default), a model including all variables in xdata as predictors is fitted. Argument family must be provided in this case.

family

type of regression model. Possible values include "gaussian" (linear regression), "binomial" (logistic regression), "multinomial" (multinomial regression), and "cox" (survival analysis). If provided, this argument must be consistent with input stability.

implementation

optional function to refit the model. If stability is the output of VariableSelection, a regression model is refitted. If implementation=NULL and Lambda=0, this is done using lm (linear regression), coxph (Cox regression), glm (logistic regression), or multinom (multinomial regression); see the sketch after this argument list. If Lambda=NULL, a Ridge regression is fitted and calibrated by cross-validation using cv.glmnet. The function PLS is used if stability is the output of BiSelection.

Lambda

optional vector of penalty parameters.

seed

value of the seed to initialise the random number generator and ensure reproducibility of the results (see set.seed).

verbose

logical indicating if a loading bar and messages should be printed.

...

additional arguments to be passed to the function provided in implementation.
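
A minimal, hedged sketch of how these arguments combine; the object names xdata, ydata and stab are illustrative, and stab is assumed to be the output of VariableSelection:

# Unpenalised refit of the stably selected variables, using lm, glm, coxph or
# multinom depending on the family stored in stab
refitted <- Refit(xdata = xdata, ydata = ydata, stability = stab, Lambda = 0)

# With stability = NULL (the default), all columns of xdata are used as
# predictors and family must be provided explicitly
full_model <- Refit(xdata = xdata, ydata = ydata, family = "gaussian")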

Value

The output as obtained from:

lm (package stats)

for linear regression ("gaussian" family).

coxph (package survival)

for Cox regression ("cox" family).

glm (package stats)

for logistic regression ("binomial" family).

multinom (package nnet)

for multinomial regression ("multinomial" family).
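
Because the value is the standard fitted-model object of the corresponding class, the usual base methods can be applied to it directly. A minimal, hedged sketch for a "gaussian" or "binomial" fit, where refitted denotes the returned object:

summary(refitted) # standard model summary with estimates and standard errors
coef(refitted) # same estimates as refitted$coefficients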

See Also

VariableSelection

Examples


## Linear regression

# Data simulation
set.seed(1)
simul <- SimulateRegression(n = 100, pk = 50, family = "gaussian")

# Data split
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "gaussian"
)
xtrain <- simul$xdata[ids_train, , drop = FALSE]
ytrain <- simul$ydata[ids_train, , drop = FALSE]
xrefit <- simul$xdata[-ids_train, , drop = FALSE]
yrefit <- simul$ydata[-ids_train, , drop = FALSE]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "gaussian")
print(SelectedVariables(stab))

# Refitting the model
refitted <- Refit(
  xdata = xrefit, ydata = yrefit,
  stability = stab
)
refitted$coefficients # refitted coefficients
head(refitted$fitted.values) # refitted predicted values
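
# Hedged illustration (not part of the original example): the refitted object is
# a standard lm fit, so base methods such as summary() apply directly
summary(refitted)

# Hedged illustration of the behaviour noted in the Description: columns of
# xdata that were not evaluated in the stability selection (here a hypothetical
# extra covariate "extra") are automatically included as predictors
xplus <- cbind(xrefit, extra = rnorm(nrow(xrefit)))
refitted_plus <- Refit(xdata = xplus, ydata = yrefit, stability = stab)
refitted_plus$coefficients # includes "extra" alongside the selected variables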

# Fitting the full model (including all possible predictors)
refitted <- Refit(
  xdata = simul$xdata, ydata = simul$ydata,
  family = "gaussian"
)
refitted$coefficients # refitted coefficients


## Logistic regression

# Data simulation
set.seed(1)
simul <- SimulateRegression(n = 200, pk = 20, family = "binomial")

# Data split
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, , drop = FALSE]
ytrain <- simul$ydata[ids_train, , drop = FALSE]
xrefit <- simul$xdata[-ids_train, , drop = FALSE]
yrefit <- simul$ydata[-ids_train, , drop = FALSE]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "binomial")

# Refitting the model
refitted <- Refit(
  xdata = xrefit, ydata = yrefit,
  stability = stab
)
refitted$coefficients # refitted coefficients
head(refitted$fitted.values) # refitted predicted probabilities
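
# Hedged illustration (not part of the original example): the refitted object is
# a standard glm fit, so the predicted probabilities can equivalently be
# obtained with predict()
head(predict(refitted, type = "response"))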

## Partial Least Squares (multiple components)
if (requireNamespace("sgPLS", quietly = TRUE)) {
  # Data simulation
  set.seed(1)
  simul <- SimulateRegression(n = 500, pk = 15, q = 3, family = "gaussian")

  # Data split
  ids_train <- Resample(
    data = simul$ydata,
    tau = 0.5, family = "gaussian"
  )
  xtrain <- simul$xdata[ids_train, , drop = FALSE]
  ytrain <- simul$ydata[ids_train, , drop = FALSE]
  xrefit <- simul$xdata[-ids_train, , drop = FALSE]
  yrefit <- simul$ydata[-ids_train, , drop = FALSE]

  # Stability selection
  stab <- BiSelection(
    xdata = xtrain, ydata = ytrain,
    family = "gaussian", ncomp = 3,
    LambdaX = seq_len(ncol(xtrain) - 1),
    LambdaY = seq_len(ncol(ytrain) - 1),
    implementation = SparsePLS
  )
  plot(stab)

  # Refitting the model
  refitted <- Refit(
    xdata = xrefit, ydata = yrefit,
    stability = stab
  )
  refitted$Wmat # refitted X-weights
  refitted$Cmat # refitted Y-weights
}



[Package sharp version 1.4.6]