qe-Series Predictive Functions {qeML}          R Documentation

Quick-and-Easy Machine Learning Wrappers

Description

Quick access to machine learning methods, with a very simple interface. "Works right out of the box!": Just one call needed to fit, no preliminary setup of model etc. The simplicity also makes the series useful for teaching.

Usage

qeLogit(data,yName,holdout=floor(min(1000,0.1*nrow(data))),yesYVal=NULL)
qeLin(data,yName,noBeta0=FALSE,holdout=floor(min(1000,0.1*nrow(data))))
qeKNN(data,yName,k,scaleX=TRUE,smoothingFtn=mean,yesYVal=NULL,
   expandVars=NULL,expandVals =NULL,holdout=floor(min(1000,0.1*nrow(data))))
qeRF(data,yName,nTree=500,minNodeSize=10,mtry=floor(sqrt(ncol(data)))+1,
   holdout=floor(min(1000,0.1*nrow(data))))
qeRFranger(data,yName,nTree=500,minNodeSize=10,
   mtry=floor(sqrt(ncol(data)))+1,deweightPars=NULL,
   holdout=floor(min(1000,0.1*nrow(data))),yesYVal="") 
qeRFgrf(data,yName,nTree=2000,minNodeSize=5,mtry=floor(sqrt(ncol(data)))+1,
   ll=FALSE,lambda=0.1,splitCutoff=sqrt(nrow(data)),
   holdout=floor(min(1000,0.1*nrow(data))))
qeSVM(data,yName,gamma=1.0,cost=1.0,kernel='radial',degree=2,
   allDefaults=FALSE,holdout=floor(min(1000,0.1*nrow(data))))
qeGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1,
   holdout=floor(min(1000,0.1*nrow(data))))
qeAdaBoost(data,yName,treeDepth=3,nRounds=100,rpartControl=NULL,
   holdout=floor(min(1000,0.1*nrow(data))))
qeLightGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1,
   holdout=floor(min(1000,0.1*nrow(data))))
qeNeural(data,yName,hidden=c(100,100),nEpoch=30,
   acts=rep("relu",length(hidden)),learnRate=0.001,
   conv=NULL,xShape=NULL,
   holdout=floor(min(1000,0.1*nrow(data))))
qeLASSO(data,yName,alpha=1,holdout=floor(min(1000,0.1*nrow(data))))
qePolyLin(data,yName,deg=2,maxInteractDeg = deg,
   holdout=floor(min(1000,0.1*nrow(data))))
qePolyLog(data,yName,deg=2,maxInteractDeg = deg,
   holdout=floor(min(1000,0.1*nrow(data))))
qePCA(data,yName,qeName,opts=NULL,pcaProp,
   holdout=floor(min(1000,0.1*nrow(data))))
qeUMAP(data,yName,qeName,opts=NULL,
   holdout=floor(min(1000,0.1*nrow(data))),scaleX=FALSE,
   nComps=NULL,nNeighbors=NULL)
qeDT(data,yName,alpha=0.05,minsplit=20,minbucket=7,maxdepth=0,mtry=0,
   holdout=floor(min(1000,0.1*nrow(data))))
qeFOCI(data,yName,numCores=1,parPlat="none",
   yesYLevel=NULL)
qeFOCIrand(data,yName,xSetSize,nXSets)
qeFOCImult(data,yName,numCores=1,
   parPlat="none",coalesce='union')
qeLinKNN(data,yName,k=25,scaleX=TRUE,smoothingFtn=mean,
   expandVars=NULL,expandVals=NULL,
   holdout=floor(min(1000,0.1*nrow(data))))
qePolyLASSO(data,yName,deg=2,maxInteractDeg=deg,alpha=0,
   holdout=floor(min(1000,0.1*nrow(data))))
qeROC(dataIn,qeOut,yLevelName)
qeXGBoost(data,yName,nRounds=250,
   params=list(eta=0.3,max_depth=6,alpha=0),
   holdout=floor(min(1000,0.1*nrow(data))))
qeDeepnet(data,yName,hidden=c(10),activationfun="sigm",
   learningrate=0.8,momentum=0.5,learningrate_scale=1,
   numepochs=3,batchsize=100,hidden_dropout=0,yesYVal=NULL,
   holdout=floor(min(1000,0.1*nrow(data))))
qeRpart(data,yName,minBucket=10,holdout=floor(min(1000,
   0.1*nrow(data)))) 
qeParallel(data,yName,qeFtnName,dataName,opts=NULL,cls=1,
   libs=NULL,holdout=NULL)
checkPkgLoaded(pkgName,whereObtain='CRAN') 
## S3 method for class 'qeParallel'
predict(object,newx,...)
## S3 method for class 'qeLogit'
predict(object,newx,...)
## S3 method for class 'qeLin'
predict(object,newx,useTrainRow1=TRUE,...)
## S3 method for class 'qeKNN'
predict(object,newx,newxK=1,...)
## S3 method for class 'qeRF'
predict(object,newx,...)
## S3 method for class 'qeRFranger'
predict(object,newx,...)
## S3 method for class 'qeRFgrf'
predict(object,newx,...)
## S3 method for class 'qeSVM'
predict(object,newx,...)
## S3 method for class 'qeGBoost'
predict(object,newx,newNTree=NULL,...)
## S3 method for class 'qeLightGBoost'
predict(object,newx,...)
## S3 method for class 'qeNeural'
predict(object,newx,k=NULL,...)
## S3 method for class 'qeLASSO'
predict(object,newx,...)
## S3 method for class 'qePoly'
predict(object,newx)
## S3 method for class 'qePCA'
predict(object,newx,...)
## S3 method for class 'qeUMAP'
predict(object,newx,...)
## S3 method for class 'qeDeepnet'
predict(object,newx,...)
## S3 method for class 'qeRpart'
predict(object,newx,...)
## S3 method for class 'qeLASSO'
plot(x,...)
## S3 method for class 'qeRF'
plot(x,...)
## S3 method for class 'qeRpart'
plot(x,boxPalette=c("red","yellow","green","blue"),...) 

Arguments

...

Further arguments.

cls

Cluster in the sense of the parallel package. If not of class 'cluster', this is either a positive integer, indicating the desired number of cores, or a character vector naming the machines on which the cluster is to be formed.

libs

Character vector listing the libraries that need to be loaded for qeFtnName.

dataName

Name of the data argument.

hidden_dropout

Dropout fraction for the hidden layer.

batchsize

Batch size.

numepochs

Number of iterations to conduct.

learningrate

Learning rate.

momentum

Momentum.

learningrate_scale

Learning rate will be multiplied by this at each iteration, allowing for decay.

activationfun

Can be 'sigm', 'tanh' or 'linear'.

newNTree

Number of trees to use in prediction.

newxK

If predicting new cases, number of nearest neighbors to smooth in the object returned by qeKNN.

useTrainRow1

If TRUE, take names in newx from row 1 in the training data.

newx

New data to be predicted.

object

An object returned by a qe-series function.

minsplit

Minimum number of data points in a node.

minbucket

Minimum number of data points in a terminal node.

minBucket

Minimum number of data points in a terminal node.

maxdepth

Maximum number of levels in a tree.

qeName

Name of qe-series predictive function.

qeFtnName

Name of qe-series predictive function.

conv

R list specifying the convolutional layers, if any.

deweightPars

Values for de-emphasizing variables in a tree node split, e.g. 'list(age=0.2,gender=0.5)'.

allDefaults

Use all default values of the wrapped function.

expandVars

Columns to be emphasized.

expandVals

Emphasis values; a value less than 1 means de-emphasis.

mtry

Number of variables randomly tried at each split.

yesYVal

Y value to be considered "yes," to be coded 1 rather than 0.

yesYLevel

Y value to be considered "yes," to be coded 1 rather than 0.

noBeta0

No intercept term.

pcaProp

Desired proportion of overall variance for the PCs.

data

Data frame, the training set. The classification case is signaled by the labels column being an R factor.

dataIn

See data.

qeOut

Output from a qe-series function.

yName

Name of the class labels column.

holdout

If not NULL, form a holdout set of the specified size. After fitting to the remaining data, evaluate accuracy on the holdout set.

k

Number of nearest neighbors. In functions other than qeKNN for which this is an argument, it is the number of neighbors to use in finding conditional probabilities via knnCalib.

smoothingFtn

As in kNN.

scaleX

Scale the features.

nTree

Number of trees.

minNodeSize

Minimum number of data points in a tree node.

learnRate

Learning rate.

hidden

Vector of units per hidden layer. Fractional values indicate dropout proportions. Can be specified as a string, e.g. '100,50', for use with qeFT.

nEpoch

Number of iterations in neural net.

acts

Vector of names of the activation functions, one per hidden layer. Choices include 'relu', 'sigmoid', 'tanh', 'softmax', 'elu', 'selu'.

alpha

In the case of qeDT, a p-value cutoff criterion. Otherwise, 1 for LASSO, 0 for ridge.

gamma

Scale parameter in e1071::svm.

cost

Cost parameter in e1071::svm.

kernel

In the case of qeSVM, one of 'linear', 'radial', 'polynomial' or 'sigmoid'.

degree

Degree of SVM polynomial kernel, if any.

opts

R list of optional arguments for none, some or all of the functions in qeFtnList.

nComps

Number of UMAP components to extract.

nNeighbors

Number of nearest neighbors to use in UMAP.

ll

If TRUE, use local linear forest.

lambda

Ridge lambda for local linear forest.

splitCutoff

For leaves smaller than this value, do not fit a linear model; just use the linear model fit to the entire dataset.

xShape

Input X data shape, e.g. c(28,28) for 28x28 grayscale images. Must be non-NULL if conv is.

treeDepth

Number of levels in each tree.

nRounds

Number of boosting rounds.

rpartControl

An R list specifying properties of fitted trees.

numCores

Number of cores to use in parallel computation.

parPlat

Parallel platform. Valid values are 'none', 'cluster' (output of parallel::makeCluster), and 'locThreads' (local cores).

xSetSize

Size of subsets of the predictor variables.

nXSets

Number of subsets of the predictor variables.

coalesce

Method for combining variable sets.

deg

Degree of a polynomial.

maxInteractDeg

Maximum degree of interaction terms in a polynomial.

yLevelName

Name of the class to be considered a positive response in a classification problem.

params

Tuning parameters for xgboost, e.g. params=list(eta=0.1,max_depth=8).

boxPalette

Color palette.

pkgName

Name of wrapped package.

whereObtain

Location from which the wrapped package can be obtained, e.g. 'CRAN'.

x

A qe-series function return object.

Details

As noted, these functions are intended for quick, first-level analysis of regression/machine learning problems. Emphasis here is on convenience and simplicity.

The idea is that, given a new dataset, the analyst can quickly and easily try fitting a number of models in succession, say first k-NN, then random forests:

# built-in data on major league baseball players
> data(mlb)  
> mlb <- mlb[,3:6]  # position, height, weight, age

# fit models
> knnout <- qeKNN(mlb,'Weight',k=25)
> rfout <- qeRF(mlb,'Weight')

# mean abs. pred. error on holdout set, in pounds
> knnout$testAcc
[1] 11.75644
> rfout$testAcc
[1] 12.6787

# predict a new case
> newx <- data.frame(Position='Catcher',Height=73.5,Age=26)
> predict(knnout,newx)
       [,1]
[1,] 204.04
> predict(rfout,newx)
      11 
199.1714

# many of the functions include algorithm-specific output
> lassout <- qeLASSO(mlb,'Weight')
holdout set has  101 rows
> lassout$testAcc
[1] 14.27337
> lassout$coefs  # sparse result?
10 x 1 sparse Matrix of class "dgCMatrix"
                                    s1
(Intercept)               -109.2909416
Position.Catcher             0.4408752
Position.First_Baseman       4.8308437
Position.Outfielder          .        
Position.Relief_Pitcher      .        
Position.Second_Baseman     -0.7846501
Position.Shortstop          -4.2291338
Position.Starting_Pitcher    .        
Height                       4.0039114
Age                          0.5352793

The holdout argument triggers formation of a holdout set and the corresponding cross-validation evaluation of predictive power. Note that if a holdout is formed, the return value will consist of the fit on the training set, not on the full original dataset.
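
For instance, continuing the mlb example above (a brief sketch; testAcc values will vary from run to run, since the holdout rows are chosen at random):

# explicit holdout size: 250 randomly chosen rows
knnout <- qeKNN(mlb,'Weight',k=25,holdout=250)
knnout$testAcc  # mean abs. prediction error on those 250 rows

# no holdout: fit on the full dataset, with no accuracy evaluation
knnout <- qeKNN(mlb,'Weight',k=25,holdout=NULL)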

The qe* functions perform the model fitting. Each of them has a predict method, and some also have a plot method.
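
For instance (a sketch, using the plot methods listed in the Usage section; what is plotted is algorithm-specific):

lassout <- qeLASSO(mlb,'Weight')
plot(lassout)   # LASSO-specific plot
rfout <- qeRF(mlb,'Weight')
plot(rfout)     # random-forest-specific plot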

Arguments for qe* are at least data, yName, and holdout.

Typically there are also algorithm-specific hyperparameter arguments.

Arguments for predict are at least object, the value returned by the qe-series function, and newx, a data frame of new cases to be predicted.

For both the fitting function and the prediction function, there may be additional algorithm-specific parameters; default values are provided.
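
For instance (a sketch; the hyperparameter names appear in the Usage section, and the values here are arbitrary):

# algorithm-specific hyperparameters at fit time
rfout <- qeRF(mlb,'Weight',nTree=1000,minNodeSize=5)
# and, for some methods, at prediction time as well
gbout <- qeGBoost(mlb,'Weight',nTree=200)
newx <- data.frame(Position='Catcher',Height=73.5,Age=26)
predict(gbout,newx,newNTree=100)  # use 100 of the 200 trees in prediction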

A note on the wrapped packages: in most cases, the full basket of options of the wrapped function is not reflected in the qe wrapper. Use of arguments not presented in the qe function requires direct use of the relevant package.

Value

The value returned by the qe* functions depends on the algorithm, but there are some common components, e.g. classif, a logical value indicating whether the problem was of classification type.

If a holdout set was requested, an additional returned component will be testAcc, the accuracy on the holdout set. This will be Mean Absolute Prediction Error in the regression case, and proportion of misclassified cases in the classification case.

The value returned by the predict functions is an R list with components as follows:

Classification case: predClasses, the predicted class labels, and, where available, probs, the estimated conditional class probabilities.

Regression case: vector of predicted values
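
As an informal sketch, using the base-R iris data (component names are as described above; numerical results will vary):

rfout <- qeRF(iris,'Species')   # Species is an R factor, so classification
rfout$classif                   # TRUE
rfout$testAcc                   # proportion misclassified on the holdout
preds <- predict(rfout,iris[1,-5])
preds$predClasses               # predicted class label
preds$probs                     # estimated conditional class probabilities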

Author(s)

Norm Matloff

Examples


# see also 'details' above

## Not run: 

data(peFactors)  
pef <- peFactors[,c(1,3,5,7:9)]  
# most people in the dataset have at least a Bachelor's degree; so let's
# just consider Master's (code 14) and PhD (code 16) as special
pef$educ <- toSubFactor(pef$educ,c('14','16'))  

# predict occupation; 6 classes, 100, 101, 102, 106, 140, 141, using SVM
svmout <- qeSVM(pef,'occ',holdout=NULL) 
# as example of prediction, take the 8th case, but change the gender and
# age to female and 25; note that by setting k to non-null, we are
# requesting that conditional probabilities be calculated, via
# knnCalib(), here using 25 nearest neighbors
newx <- pef[8,-3] 
newx$sex <- '2'
newx$age <- 25
predict(svmout,newx,k=25)
# $predClasses
#   8 
# 100 
# Levels: 100 101 102 106 140 141
# $dvals
#      102/101    102/100   102/141  102/140  102/106    101/100  101/141
# 8 -0.7774038 -0.5132022 0.9997894 1.003251 0.999688 -0.4023077 1.000419
#    101/140   101/106  100/141  100/140  100/106   141/140    141/106   140/106
# 8 1.000474 0.9997371 1.000088 1.000026 1.000126 0.9460703 -0.4974625 -1.035721
# 
# $probs
#       100  101  102  106 140  141
# [1,] 0.24 0.52 0.12 0.08   0 0.04
#
# so, occupation code 100 is predicted, with a 0.24 conditional
# probability (the value shown for class 100 in $probs above)

# if holdout evaluation is desired as well, say 1000 cases, seed 9999:
svmout <- qeSVM(pef,'occ',holdout=c(1000,9999))
svmout$testAcc
# [1] 0.622, i.e. about 62% of the holdout cases were misclassified
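
# a further sketch, not part of the original example: gradient boosting on
# the same data, predicting wage income; hyperparameter values are arbitrary
gbout <- qeGBoost(pef,'wageinc',nTree=200,learnRate=0.05)
gbout$testAcc             # mean abs. prediction error on the holdout
predict(gbout,pef[1,-5])  # column 5 of pef is wageinc, as in qeLin below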

# linear
# lm() doesn't like numeric factor levels, so prepend an 'a'
pef$occ <- prepend('a',pef$occ)
lmout <- qeLin(pef,'occ')
predict(lmout,pef[1,-3])  # occ 100, prob 0.3316
lmout <- qeLin(pef,'wageinc')
predict(lmout,pef[1,-5])  # 70857.79


## End(Not run)

