R: Naive Bayes algorithm with case weights

w_naive_bayes {rbooster}

R Documentation

Naive Bayes algorithm with case weights

Description

Function for Naive Bayes algorithm classification with case weights.

Usage

w_naive_bayes(x_train, y_train, w = NULL, discretize = TRUE, breaks = 3)

w_gaussian_naive_bayes(x_train, y_train, w = NULL)

w_discrete_naive_bayes(x_train, y_train, breaks = 3, w = NULL)

Arguments

`x_train`	explanatory variables.
`y_train`	a factor class variable.
`w`	a vector of case weights.
`discretize`	If `TRUE`, numerical variables are discretized and discrete naive Bayes is applied; if `FALSE`, Gaussian naive Bayes is applied.
`breaks`	number of break points for discretization. Ignored if `discretize = FALSE`.

Details

w_naive_bayes calls w_gaussian_naive_bayes or w_discrete_naive_bayes.

If `discretize = FALSE`, w_gaussian_naive_bayes is called. It uses Gaussian densities with case weights and allows multiclass classification.

If `discretize = TRUE`, w_discrete_naive_bayes is called. It uses conditional probabilities for each category with Laplace smoothing and allows multiclass classification.

Value

a w_naive_bayes object with below components.

`n_train`	Number of cases in the input dataset.
`p`	Number of explanatory variables.
`x_classes`	A list of datasets, which are `x_train` separated for each class.
`n_classes`	Number of cases for each class in input dataset.
`k_classes`	Number of classes in class variable.
`priors`	Prior probabilities.
`class_names`	Names of classes in class variable.
`means`	Weighted mean estimations for each variable.
`stds`	Weighted standard deviation estimations for each variable.
`categories`	Labels for discretized variables.
`boundaries`	Upper and lower boundaries for discretization.
`ps`	Probabilities for each category of each variable.

Examples


library(rbooster)
## short functions for cross-validation and data simulation
# Draw a stratified training sample: for every class in `y`, randomly pick
# `train_proportion` of that class's cases.
#
# y                 class labels (vector or factor).
# train_proportion  fraction of each class to sample, between 0 and 1.
#
# Returns an integer vector of the selected positions in `y`, grouped by class
# in order of first appearance.
cv_sampler <- function(y, train_proportion) {
  unlist(lapply(unique(y), function(m) {
    idx <- which(y == m)
    # Round the product, not the count: rounding sum(y == m) alone is a no-op
    # and leaves a fractional size that sample() silently truncates.
    n_pick <- round(length(idx) * train_proportion)
    # Index through sample.int() so a class with a single case is not
    # misread by sample() as "draw from 1:idx".
    idx[sample.int(length(idx), size = n_pick)]
  }))
}

# Simulate a k-class dataset of Gaussian predictors and split it into
# training and test parts.
#
# n                 approximate total number of cases; each class receives
#                   round(n / k) cases.
# p                 number of explanatory variables.
# k                 number of classes, labelled with the first k lowercase
#                   letters.
# train_proportion  fraction of each class placed in the training set.
#
# Every variable of a class is drawn from a normal distribution with sd = 2;
# class means are spread evenly from 0 to 1.5 * k.
#
# Returns a list with `data` (all cases), `data_train` and `data_test`.
data_simulation <- function(n, p, k, train_proportion) {
  n_per_class <- round(n / k)
  means <- seq(0, k * 1.5, length.out = k)
  x <- do.call(rbind, lapply(means, function(m) {
    matrix(data = rnorm(n = n_per_class * p, mean = m, sd = 2),
           nrow = n_per_class)
  }))
  y <- factor(rep(letters[1:k], each = n_per_class))
  train_i <- cv_sampler(y, train_proportion)

  data <- data.frame(x, y = y)
  # Select the test rows positively: `data[-train_i, ]` would return zero rows
  # (instead of every row) whenever `train_i` is empty.
  test_i <- setdiff(seq_len(nrow(data)), train_i)
  list(data = data,
       data_train = data[train_i, ],
       data_test = data[test_i, ])
}

### binary classification example
n <- 500
p <- 10
k <- 2
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)

# predictors occupy the first p columns; the class label follows them
x <- dat$data[, seq_len(p)]
y <- dat$data[[p + 1]]

x_train <- dat$data_train[, seq_len(p)]
y_train <- dat$data_train[[p + 1]]

x_test <- dat$data_test[, seq_len(p)]
y_test <- dat$data_test[[p + 1]]

## discretized Naive Bayes classification
# through the general interface ...
mm1 <- w_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  discretize = TRUE,
  breaks = 4
)
preds1 <- predict(mm1, newdata = x_test, type = "pred")
table(y_test, preds1)

# ... or by calling the discrete fitter directly
mm2 <- w_discrete_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  breaks = 4
)
preds2 <- predict(mm2, newdata = x_test, type = "pred")
table(y_test, preds2)

## Gaussian Naive Bayes classification
# through the general interface ...
mm3 <- w_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  discretize = FALSE
)
preds3 <- predict(mm3, newdata = x_test, type = "pred")
table(y_test, preds3)

# ... or by calling the Gaussian fitter directly
mm4 <- w_gaussian_naive_bayes(x_train = x_train, y_train = y_train)
preds4 <- predict(mm4, newdata = x_test, type = "pred")
table(y_test, preds4)

## multiclass example
n <- 500
p <- 10
k <- 5
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)

# predictors occupy the first p columns; the class label follows them
x <- dat$data[, seq_len(p)]
y <- dat$data[[p + 1]]

x_train <- dat$data_train[, seq_len(p)]
y_train <- dat$data_train[[p + 1]]

x_test <- dat$data_test[, seq_len(p)]
y_test <- dat$data_test[[p + 1]]

# discretized
mm5 <- w_discrete_naive_bayes(
  x_train = x_train,
  y_train = y_train,
  breaks = 4
)
preds5 <- predict(mm5, newdata = x_test, type = "pred")
table(y_test, preds5)

# gaussian
mm6 <- w_gaussian_naive_bayes(x_train = x_train, y_train = y_train)
preds6 <- predict(mm6, newdata = x_test, type = "pred")
table(y_test, preds6)

## example for case weights
n <- 500
p <- 10
k <- 5
dat <- data_simulation(n = n, p = p, k = k, train_proportion = 0.8)
x <- dat$data[, seq_len(p)]
y <- dat$data[, p + 1]

x_train <- dat$data_train[, seq_len(p)]
y_train <- dat$data_train[, p + 1]

# Take the test set from this simulation as well; otherwise the predictions
# below would be scored on test data left over from an earlier simulation
# (or fail when this section is run on its own).
x_test <- dat$data_test[, seq_len(p)]
y_test <- dat$data_test[, p + 1]

# discretized: classes "a" and "c" get full weight, all others almost none
weights <- ifelse(y_train %in% c("a", "c"), 1, 0.01)

mm7 <- w_discrete_naive_bayes(x_train = x_train, y_train = y_train,
                              breaks = 4, w = weights)

preds7 <- predict(object = mm7, newdata = x_test, type = "pred")
table(y_test, preds7)

# gaussian: classes "b" and "d" get full weight, all others almost none
weights <- ifelse(y_train %in% c("b", "d"), 1, 0.01)

mm8 <- w_gaussian_naive_bayes(x_train = x_train, y_train = y_train,
                              w = weights)

preds8 <- predict(object = mm8, newdata = x_test, type = "pred")
table(y_test, preds8)

[Package rbooster version 1.1.0 Index]