R: Preprocessed data set to mimic a small pharmacogenomic...

exampleGDSC {BayesSUR}

R Documentation

Preprocessed data set to mimic a small pharmacogenomic example

Description

Preprocessed data set to mimic a small pharmacogenetic example from the Genomics of Drug Sensitivity in Cancer (GDSC) database, with p=850 gene features as explanatory variables, s=7 drugs sensitivity data as response variables and data for n=498 cell lines. Gene features include p1=343 gene expression features (GEX), p2=426 by copy number variations (CNV) and p3=68 mutated genes (MUT). Loading the data will load the associated blockList (and mrfG) objects needed to fit the model with BayesSUR(). The R code for generating the simulated data is given in the Examples paragraph.

#importFrom plyr mapvalues #importFrom data.table like

Usage

exampleGDSC

Format

An object of class list of length 3.

Examples

# Load the GDSC sample dataset
data("exampleGDSC", package = "BayesSUR")
str(exampleGDSC)

## Not run: 
# ===============
# This code below is to do preprocessing of GDSC data and obtain the complete dataset
# "exampleGDSC.rda" above. The user needs load the datasets from
# https://www.cancerrxgene.org release 5.
# But downloading and transforming the three used datasets below to *.csv files first.
# ===============

requireNamespace("plyr", quietly = TRUE)
requireNamespace("data.table", quietly = TRUE)


features <- data.frame(read.csv("/gdsc_en_input_w5.csv", head = T))
names.fea <- strsplit(rownames(features), "")
features <- t(features)
p <- c(13321, 13747 - 13321, 13818 - 13747)
Cell.Line <- rownames(features)
features <- data.frame(Cell.Line, features)

ic50_00 <- data.frame(read.csv("gdsc_drug_sensitivity_fitted_data_w5.csv", head = T))
ic50_0 <- ic50_00[, c(1, 4, 7)]
drug.id <- data.frame(read.csv("gdsc_tissue_output_w5.csv", head = T))[, c(1, 3)]
drug.id2 <- drug.id[!duplicated(drug.id$drug.id), ]
# delete drug.id=1066 since ID1066 and ID156 both correspond drug AZD6482,
# and no ID1066 in the "suppl.Data1" by Garnett et al. (2012)
drug.id2 <- drug.id2[drug.id2$drug.id != 1066, ]
drug.id2$drug.name <- as.character(drug.id2$drug.name)
drug.id2$drug.name <- substr(drug.id2$drug.name, 1, nchar(drug.id2$drug.name) - 6)
drug.id2$drug.name <- gsub(" ", "-", drug.id2$drug.name)

ic50 <- ic50_0
# mapping the drug_id to drug names in drug sensitivity data set
ic50$drug_id <- plyr::mapvalues(ic50$drug_id, from = drug.id2[, 2], to = drug.id2[, 1])
colnames(ic50) <- c("Cell.Line", "compound", "IC50")

# transform drug sensitivity overall cell lines to a data matrix
y0 <- reshape(ic50, v.names = "IC50", timevar = "compound", 
              idvar = "Cell.Line", direction = "wide")
y0$Cell.Line <- gsub("-", ".", y0$Cell.Line)

# ===============
# select nonmissing pharmacological data
# ===============
y00 <- y0
m0 <- dim(y0)[2] - 1
eps <- 0.05
# r1.na is better to be not smaller than r2.na
r1.na <- 0.3
r2.na <- 0.2
k <- 1
while (sum(is.na(y0[, 2:(1 + m0)])) > 0) {
  r1.na <- r1.na - eps / k
  r2.na <- r1.na - eps / k
  k <- k + 1
  ## select drugs with <30% (decreasing with k) missing data overall cell lines
  na.y <- apply(y0[, 2:(1 + m0)], 2, function(xx) sum(is.na(xx)) / length(xx))
  while (sum(na.y < r1.na) < m0) {
    y0 <- y0[, -c(1 + which(na.y >= r1.na))]
    m0 <- sum(na.y < r1.na)
    na.y <- apply(y0[, 2:(1 + m0)], 2, function(xx) sum(is.na(xx)) / length(xx))
  }

  ## select cell lines with treatment of at least 80% (increasing with k) drugs
  na.y0 <- apply(y0[, 2:(1 + m0)], 1, function(xx) sum(is.na(xx)) / length(xx))
  while (sum(na.y0 < r2.na) < (dim(y0)[1])) {
    y0 <- y0[na.y0 < r2.na, ]
    na.y0 <- apply(y0[, 2:(1 + m0)], 1, function(xx) sum(is.na(xx)) / length(xx))
  }
  num.na <- sum(is.na(y0[, 2:(1 + m0)]))
  message("#{NA}=", num.na, "\n", "r1.na =", r1.na, ", r2.na =", r2.na, "\n")
}

# ===============
# combine drug sensitivity, tissues and molecular features
# ===============
yx <- merge(y0, features, by = "Cell.Line")
names.cell.line <- yx$Cell.Line
names.drug <- colnames(yx)[2:(dim(y0)[2])]
names.drug <- substr(names.drug, 6, nchar(names.drug))
# numbers of gene expression features, copy number festures and muatation features
p <- c(13321, 13747 - 13321, 13818 - 13747)
num.nonpen <- 13
yx <- data.matrix(yx[, -1])
y <- yx[, 1:(dim(y0)[2] - 1)]
x <- cbind(yx[, dim(y0)[2] - 1 + sum(p) + 1:num.nonpen], yx[, dim(y0)[2] - 1 + 1:sum(p)])

# delete genes with only one mutated cell line
x <- x[, 
  -c(num.nonpen + p[1] + p[2] + which(colSums(x[, num.nonpen + p[1] + p[2] + 1:p[3]]) <= 1))]
p[3] <- ncol(x) - num.nonpen - p[1] - p[2]

GDSC <- list(
  y = y, x = x, p = p, num.nonpen = num.nonpen, names.cell.line = names.cell.line,
  names.drug = names.drug
)


## ================
## ================
## select a small set of drugs
## ================
## ================

name_drugs <- c(
  "Methotrexate", "RDEA119", "PD-0325901", "CI-1040", "AZD6244", "Nilotinib",
  "Axitinib"
)

# extract the drugs' pharmacological profiling and tissue dummy
YX0 <- cbind(GDSC$y[, colnames(GDSC$y) %in% paste("IC50.", name_drugs, sep = "")]
[, c(1, 3, 6, 4, 7, 2, 5)], GDSC$x[, 1:GDSC$num.nonpen])
colnames(YX0) <- c(name_drugs, colnames(GDSC$x)[1:GDSC$num.nonpen])
# extract the genetic information of CNV & MUT
X23 <- GDSC$x[, GDSC$num.nonpen + GDSC$p[1] + 1:(p[2] + p[3])]
colnames(X23)[1:p[2]] <- paste(substr(
  colnames(X23)[1:p[2]], 1,
  nchar(colnames(X23)[1:p[2]]) - 3
), ".CNV", sep = "")

# locate all genes with CNV or MUT information
name_genes_duplicate <- c(
  substr(colnames(X23)[1:p[2]], 1, nchar(colnames(X23)[1:p[2]]) - 4),
  substr(colnames(X23)[p[2] + 1:p[3]], 1, nchar(colnames(X23)[p[2] + 1:p[3]]) - 4)
)
name_genes <- name_genes_duplicate[!duplicated(name_genes_duplicate)]

# select the GEX which have the common genes with CNV or MUT
X1 <- 
  GDSC$x[, GDSC$num.nonpen + which(colnames(GDSC$x)[GDSC$num.nonpen + 1:p[1]] %in% name_genes)]

p[1] <- ncol(X1)
X1 <- log(X1)

# summary the data information
exampleGDSC <- list(data = cbind(YX0, X1, X23))
exampleGDSC$blockList <- list(
  1:length(name_drugs), length(name_drugs) + 1:GDSC$num.nonpen,
  ncol(YX0) + 1:sum(p)
)

# ========================
# construct the G matrix: edge potentials in the MRF prior
# ========================

# edges between drugs: Group1 ("RDEA119","17-AAG","PD-0325901","CI-1040" and "AZD6244")
# indexed as (2:5)
# http://software.broadinstitute.org/gsea/msigdb/cards/KEGG_MAPK_SIGNALING_PATHWAY
pathway_genes <- read.table("MAPK_pathway.txt")[[1]]
Idx_Pathway1 <- which(c(colnames(X1), name_genes_duplicate) %in% pathway_genes)
Gmrf_Group1Pathway1 <- t(combn(rep(Idx_Pathway1, each = length(2:5)) +
  rep((2:5 - 1) * sum(p), times = length(Idx_Pathway1)), 2))

# edges between drugs: Group2 ("Nilotinib","Axitinib") indexed as (6:7)
# delete gene ABL2
Idx_Pathway2 <- which(c(colnames(X1), name_genes_duplicate) %like% "BCR" |
  c(colnames(X1), name_genes_duplicate) %like% "ABL")[-c(3, 5)]
Gmrf_Group2Pathway2 <- t(combn(rep(Idx_Pathway2, each = length(6:7)) +
  rep((6:7 - 1) * sum(p), times = length(Idx_Pathway2)), 2))

# edges between the common gene in different data sources
Gmrf_CommonGene <- NULL
list_CommonGene <- list(0)
k <- 1
for (i in 1:length(name_genes)) {
  Idx_CommonGene <- which(c(colnames(X1), name_genes_duplicate) == name_genes[i])
  if (length(Idx_CommonGene) > 1) {
    Gmrf_CommonGene <- rbind(Gmrf_CommonGene, 
    t(combn(rep(Idx_CommonGene, each = length(name_drugs))
    + rep((1:length(name_drugs) - 1) * sum(p), times = length(Idx_CommonGene)), 2)))
    k <- k + 1
  }
}
Gmrf_duplicate <- rbind(Gmrf_Group1Pathway1, Gmrf_Group2Pathway2, Gmrf_CommonGene)
Gmrf <- Gmrf_duplicate[!duplicated(Gmrf_duplicate), ]
exampleGDSC$mrfG <- Gmrf

# create the target gene names of the two groups of drugs
targetGenes1 <- matrix(Idx_Pathway1, nrow = 1)
colnames(targetGenes1) <- colnames(exampleGDSC$data)[seq_along(targetGene$group1)]
targetGenes2 <- matrix(Idx_Pathway2, nrow = 1)
colnames(targetGenes2) <- colnames(exampleGDSC$data)[seq_along(targetGene$group2)]

targetGene <- list(group1 = targetGenes1, group2 = targetGenes2)

## Write data file exampleGDSC.rda to the user's directory by save()

## End(Not run)

[Package BayesSUR version 2.2-1 Index]