w_naive_bayes {rbooster}R Documentation

Naive Bayes algorithm with case weights

Description

Function for Naive Bayes algorithm classification with case weights.

Usage

w_naive_bayes(x_train, y_train, w = NULL, discretize = TRUE, breaks = 3)

w_gaussian_naive_bayes(x_train, y_train, w = NULL)

w_discrete_naive_bayes(x_train, y_train, breaks = 3, w = NULL)

Arguments

x_train

explanatory variables.

y_train

a factor class variable.

w

a vector of case weights.

discretize

If TRUE, numerical variables are discretized and discrete naive Bayes is applied. If FALSE, Gaussian naive Bayes is applied.

breaks

number of break points for discretization. Ignored if discretize = FALSE.

Details

w_naive_bayes calls w_gaussian_naive_bayes or w_discrete_naive_bayes.

If discretize = FALSE, w_gaussian_naive_bayes is called. It uses Gaussian densities with case weights and allows multiclass classification.

If discretize = TRUE, w_discrete_naive_bayes is called. It uses conditional probabilities for each category with Laplace smoothing and allows multiclass classification.

Value

A w_naive_bayes object with the components below.

n_train

Number of cases in the input dataset.

p

Number of explanatory variables.

x_classes

A list of datasets, which are x_train separated for each class.

n_classes

Number of cases for each class in input dataset.

k_classes

Number of classes in class variable.

priors

Prior probabilities.

class_names

Names of classes in class variable.

means

Weighted mean estimations for each variable.

stds

Weighted standard deviation estimations for each variable.

categories

Labels for discretized variables.

boundaries

Upper and lower boundaries for discretization.

ps

Probabilities for each category of each variable.

Examples


library(rbooster)
## short functions for cross-validation and data simulation
# Draw a stratified training sample of row indices.
#
# For every distinct class in `y`, randomly picks (without replacement)
# round(class size * train_proportion) of that class's row indices.
#
# Args:
#   y: class labels (factor or vector), one per observation.
#   train_proportion: fraction of each class to put in the training set;
#     must not exceed 1, since sampling is without replacement.
#
# Returns:
#   A vector of selected row indices, grouped by class in the order the
#   classes first appear in `y`.
cv_sampler <- function(y, train_proportion) {
  unlist(lapply(unique(y), function(m) {
    idx <- which(y == m)
    # Round the product (not just the count) so the sample size is a whole
    # number instead of a fractional value that gets silently truncated.
    size <- round(length(idx) * train_proportion)
    # Sample positions and index back into idx: sample(idx, size) would draw
    # from 1:idx when a class has exactly one member.
    idx[sample.int(length(idx), size)]
  }))
}

# Simulate a k-class dataset of Gaussian predictors and split it into
# stratified training and test sets.
#
# Each class receives round(n / k) rows of p independent normal predictors
# with standard deviation 2; the class means are evenly spaced between 0 and
# k * 1.5. Classes are labelled with the first k lowercase letters.
#
# Args:
#   n: total number of observations (divided evenly across classes).
#   p: number of explanatory variables.
#   k: number of classes.
#   train_proportion: fraction of each class assigned to the training set.
#
# Returns:
#   A list with `data` (all rows, predictors plus factor column `y`),
#   `data_train` (sampled training rows) and `data_test` (remaining rows).
data_simulation <- function(n, p, k, train_proportion) {
  n_per_class <- round(n / k)
  class_means <- seq(0, k * 1.5, length.out = k)

  # One n_per_class x p block of draws per class, stacked row-wise.
  class_blocks <- lapply(class_means, function(m) {
    matrix(
      data = rnorm(n = n_per_class * p, mean = m, sd = 2),
      nrow = n_per_class
    )
  })
  x <- do.call(rbind, class_blocks)
  y <- factor(rep(letters[1:k], each = n_per_class))
  train_i <- cv_sampler(y, train_proportion)

  data <- data.frame(x, y = y)
  # Select test rows positively: data[-train_i, ] would return zero rows
  # (rather than every row) when train_i is empty.
  test_i <- setdiff(seq_len(nrow(data)), train_i)

  list(
    data = data,
    data_train = data[train_i, ],
    data_test = data[test_i, ]
  )
}

### binary classification example
# Simulate 500 observations of 10 predictors in 2 classes, 80% for training.
n <- 500
p <- 10
k <- 2
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)

# Predictors occupy the first p columns; the class label is column p + 1.
x <- dat$data[, seq_len(p)]
y <- dat$data[[p + 1]]

x_train <- dat$data_train[, seq_len(p)]
y_train <- dat$data_train[[p + 1]]

x_test <- dat$data_test[, seq_len(p)]
y_test <- dat$data_test[[p + 1]]

## discretized Naive Bayes classification
# Through the general wrapper ...
mm1 <- w_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  discretize = TRUE,
  breaks = 4
)
preds1 <- predict(mm1, newdata = x_test, type = "pred")
table(y_test, preds1)

# ... or by calling the discrete variant directly.
mm2 <- w_discrete_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  breaks = 4
)
preds2 <- predict(mm2, newdata = x_test, type = "pred")
table(y_test, preds2)

## Gaussian Naive Bayes classification
# Through the general wrapper ...
mm3 <- w_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  discretize = FALSE
)
preds3 <- predict(mm3, newdata = x_test, type = "pred")
table(y_test, preds3)

# ... or by calling the Gaussian variant directly.
mm4 <- w_gaussian_naive_bayes(
  x_train = x_train,
  y_train = y_train
)
preds4 <- predict(mm4, newdata = x_test, type = "pred")
table(y_test, preds4)

## multiclass example
# Same setup as before, now with 5 classes.
n <- 500
p <- 10
k <- 5
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)

# Predictors occupy the first p columns; the class label is column p + 1.
x <- dat$data[, seq_len(p)]
y <- dat$data[[p + 1]]

x_train <- dat$data_train[, seq_len(p)]
y_train <- dat$data_train[[p + 1]]

x_test <- dat$data_test[, seq_len(p)]
y_test <- dat$data_test[[p + 1]]

# discretized
mm5 <- w_discrete_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  breaks = 4
)
preds5 <- predict(mm5, newdata = x_test, type = "pred")
table(y_test, preds5)

# gaussian
mm6 <- w_gaussian_naive_bayes(
  x_train = x_train,
  y_train = y_train
)
preds6 <- predict(mm6, newdata = x_test, type = "pred")
table(y_test, preds6)

## example for case weights
# Simulate a fresh 5-class dataset (500 observations, 10 predictors, 80% train).
n <- 500
p <- 10
k <- 5
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)
x <- dat$data[,1:p]
y <- dat$data[,p+1]

x_train <- dat$data_train[,1:p]
y_train <- dat$data_train[,p+1]

# Take the test split from this same simulation; otherwise the predictions
# below would be evaluated on a test set left over from an earlier dataset.
x_test <- dat$data_test[,1:p]
y_test <- dat$data_test[,p+1]

# discretized
# Classes "a" and "c" keep full weight; every other class is down-weighted.
weights <- ifelse(y_train == "a" | y_train == "c", 1, 0.01)

mm7 <- w_discrete_naive_bayes(x_train = x_train, y_train = y_train, breaks = 4, w = weights)

preds7 <- predict(object = mm7, newdata = x_test, type = "pred")
table(y_test, preds7)

# gaussian
# Classes "b" and "d" keep full weight; every other class is down-weighted.
weights <- ifelse(y_train == "b" | y_train == "d", 1, 0.01)

mm8 <- w_gaussian_naive_bayes(x_train = x_train, y_train = y_train, w = weights)

preds8 <- predict(object = mm8, newdata = x_test, type = "pred")
table(y_test, preds8)


[Package rbooster version 1.1.0 Index]