R: Calculate the Neural Network model domain applicability score

nn_domain_score {viraldomain}

R Documentation

Calculate the Neural Network model domain applicability score

Description

This function fits a Neural Network model to the provided data and computes a domain applicability score based on PCA distances.

Usage

nn_domain_score(
  featured_col,
  train_data,
  nn_hyperparameters,
  test_data,
  threshold_value
)

Arguments

`featured_col`	The name of the featured column in the training data.
`train_data`	The training data used to fit the Neural Network model.
`nn_hyperparameters`	A list of Neural Network hyperparameters, including hidden_units, penalty, and epochs.
`test_data`	The testing domain data used to calculate the domain applicability score.
`threshold_value`	The threshold value for domain applicability scoring.

Value

A tibble with the domain applicability scores.

Examples

library(viraldomain)
library(dplyr)

# Set the seed for reproducibility
set.seed(1234)

# Load the training data
data(viral)

# Number of imputations needed: viral loads at or below 40 cpm
num_imputations <- sum(viral$vl_2022 <= 40)

# Draw exactly one imputed value per flagged row
imputed_values <- rexp(num_imputations, rate = 1 / 13)

# Create a new tibble with mutated/imputed viral load.
# replace() hands the draws to the flagged rows one-to-one. Supplying the
# shorter `imputed_values` vector as the `yes` argument of ifelse() would
# recycle it by row position instead, so some draws would be skipped and
# others could be reused for several rows.
imputed_viral <- viral |>
  mutate(
    imputed_vl_2022 = replace(vl_2022, vl_2022 <= 40, imputed_values),
    log10_imputed_vl_2022 = log10(imputed_vl_2022),
    jittered_log10_imputed_vl_2022 = jitter(log10_imputed_vl_2022)
  )

# Create a new tibble with mutated/imputed cd4 counts: repeated values are
# shifted upward by a random integer in 1..100 to reduce ties, while the
# first occurrence of each value is kept as is
imputed_viral <- imputed_viral |>
  mutate(
    jittered_cd_2022 = ifelse(
      duplicated(cd_2022),
      cd_2022 + sample(1:100, length(cd_2022), replace = TRUE),
      cd_2022
    )
  )

# New data frame holding only the two mutated/imputed columns, each centred
# and scaled by scale()
imp_viral <- imputed_viral |>
  select(jittered_cd_2022, jittered_log10_imputed_vl_2022) |>
  scale() |>
  as.data.frame()

# Set the seed for reproducibility
set.seed(1234)

# Load the testing data
data(sero)

# Number of imputations needed: viral loads at or below 40 cpm
num_imputations <- sum(sero$vl_2022 <= 40)

# Draw exactly one imputed value per flagged row
imputed_values <- rexp(num_imputations, rate = 1 / 13)

# Create a new tibble with mutated/imputed viral load.
# replace() hands the draws to the flagged rows one-to-one. Supplying the
# shorter `imputed_values` vector as the `yes` argument of ifelse() would
# recycle it by row position instead, so some draws would be skipped and
# others could be reused for several rows.
imputed_sero <- sero |>
  mutate(
    imputed_vl_2022 = replace(vl_2022, vl_2022 <= 40, imputed_values),
    log10_imputed_vl_2022 = log10(imputed_vl_2022),
    jittered_log10_imputed_vl_2022 = jitter(log10_imputed_vl_2022)
  )

# Create a new tibble with mutated/imputed cd4 counts: repeated values are
# shifted upward by a random integer in 1..100 to reduce ties, while the
# first occurrence of each value is kept as is
imputed_sero <- imputed_sero |>
  mutate(
    jittered_cd_2022 = ifelse(
      duplicated(cd_2022),
      cd_2022 + sample(1:100, length(cd_2022), replace = TRUE),
      cd_2022
    )
  )

# New data frame holding only the two mutated/imputed columns, each centred
# and scaled by scale().
# NOTE(review): the testing data are standardised with their own column
# means and standard deviations, not those of the training data; confirm
# that this is intended.
imp_sero <- imputed_sero |>
  select(jittered_cd_2022, jittered_log10_imputed_vl_2022) |>
  scale() |>
  as.data.frame()

# Inputs for the scoring call: the two prepared data sets first, then the
# model settings
train_data <- imp_viral
test_data <- imp_sero
featured_col <- "jittered_cd_2022"
threshold_value <- 0.99
nn_hyperparameters <- list(
  hidden_units = 1,
  penalty = 0.3746312,
  epochs = 480
)

# Compute the domain applicability scores, matching every argument by name
nn_domain_score(
  featured_col = featured_col,
  train_data = train_data,
  nn_hyperparameters = nn_hyperparameters,
  test_data = test_data,
  threshold_value = threshold_value
)

[Package viraldomain version 0.0.3 Index]