catSplitEncoding {catSplit}R Documentation

Encode categorical variables using split information of CART

Description

Encodes the categorical variables of a dataset using the split information of CART (classification and regression trees).

Usage

catSplitEncoding(
  targetVariable,
  trainData,
  testData,
  problemType,
  datasetName,
  catVariables
)

Arguments

targetVariable

Name of the target variable that we want to predict.

trainData

training data.

testData

testing data.

problemType

Type of the prediction problem: either "classification" or "regression".

datasetName

Name of the dataset, could be any string name.

catVariables

Character vector with the names of the categorical variables in the dataset (the target variable should not be included).

Value

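A list with two elements: the data frame encoding the categorical variables of the training data, followed by the data frame encoding the categorical variables of the testing data.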

Examples


library("OpenML")
library("farff")
library("stringr")
library("stats")
library("data.table")
library("rpart")
library("catSplit")




# An example dataset from OpenML
datInfo <- getOMLDataSet(data.id = 41283, verbosity = 0)
targetVariable <- datInfo$target.features
dat <- datInfo$data
datasetName <- datInfo$desc$name
# Categorical variables are the factor columns of the data
catVariables <- names(Filter(is.factor, dat))
# Remove target variable from catVariables
catVariables <- catVariables[!(catVariables %in% targetVariable)]
problemType <- "classification"
# Split dat into train (75% of the rows, sampled at random) and test sets
smp_size <- floor(0.75 * nrow(dat))
train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
# NOTE(review): as.data.frame.matrix() is called directly on a data frame
# subset here -- confirm this is intended rather than as.data.frame().
train <- as.data.frame.matrix(dat[train_ind, ])
test <- as.data.frame.matrix(dat[-train_ind, ])
# Outputs a list of 2 elements: the encoding frame for the train data and
# the encoding frame for the test data
train_and_test_cat <- catSplitEncoding(
  targetVariable = targetVariable,
  trainData = train,
  testData = test,
  problemType = problemType,
  datasetName = datasetName,
  catVariables = catVariables
)
# Get transformed train and test sets from the output list.
# Use [[ ]] so that the elements themselves are extracted; [ ] would return
# length-one lists wrapping the encoding frames.
trainCat <- train_and_test_cat[[1]]
testCat <- train_and_test_cat[[2]]

# Drop categorical variables from the original train and test data
trainData <- train[!names(train) %in% catVariables]
testData <- test[!names(test) %in% catVariables]

# Merge encoding frame and original data (column-wise)
train <- cbind(trainCat, trainData)
test <- cbind(testCat, testData)



[Package catSplit version 0.1.0 Index]