R: Simulated data set to mimic a small expression quantitative...

exampleEQTL {BayesSUR}

R Documentation

Simulated data set to mimic a small expression quantitative trait loci (eQTL) example

Description

Simulated data set to mimic a small expression quantitative trait loci (eQTL) example, with p=150 single nucleotide polymorphisms (SNPs) as explanatory variables, s=10 gene expression features as response variables and data for n=100 observations. Loading the data will load the associated blockList object needed to fit the model with BayesSUR(). The R code for generating the simulated data is given in the Examples paragraph.

#importFrom BDgraph rgwish #importFrom gRbase mcsMAT #importFrom scrime simulateSNPs

Usage

exampleEQTL

Format

An object of class list of length 4.

Examples

# Load the eQTL sample dataset
data("exampleEQTL", package = "BayesSUR")
str(exampleEQTL)

## Not run: 
# ===============
# The code below is to show how to generate the dataset "exampleEQTL.rda" above
# ===============

requireNamespace("BDgraph", quietly = TRUE)
requireNamespace("gRbase", quietly = TRUE)
requireNamespace("scrime", quietly = TRUE)

########################### Problem Dimensions
n <- 100
p <- 150
s <- 10

############################ Select a set of n x p (SNPs) covariates

## The synthetic data in the paper use a subset of the real SNPs as covariates,
# but as the NFBC66 dataset is confidential we'll use scrime to sample similar data

x <- scrime::simulateSNPs(c(n, 10), p, c(3, 2), prop.explain = c(0.9, 0.95))$data[1:n, ]
x <- cbind(rep(1, n), x)

####################################################################

graph_pattern <- 2

snr <- 25

corr_param <- 0.9

### Create the underlying graph
if (graph_pattern == 1) {
  ### 1) Random but full
  G <- matrix(1, s, s)
  Prime <- list(c(1:s))
  Res <- Prime
  Sep <- list()
} else if (graph_pattern == 2) {
  ### 2) Block Diagonal structure
  Prime <- list(
    c(1:floor(s * 2 / 3)),
    c((floor(s * 2 / 3) + 1):(ceiling(s * 4 / 5) - 1)),
    c(ceiling(s * 4 / 5):s)
  )

  Res <- Prime
  Sep <- lapply(Res, function(x) which(x == -99))

  G <- matrix(0, s, s)
  for (i in Prime) {
    G[i, i] <- 1
  }
} else if (graph_pattern == 3) {
  ### 3) Decomposable model
  Prime <- list(
    c(1:floor(s * 5 / 12), ceiling(s * 9 / 10):s),
    c(floor(s * 2 / 9):(ceiling(s * 2 / 3) - 1)),
    c(ceiling(s * 2 / 3):(ceiling(s * 4 / 5) - 1)),
    c(ceiling(s * 4 / 5):s)
  )

  Sep <- list()
  H <- list()
  for (i in 2:length(Prime)) {
    H <- union(H, Prime[[i - 1]])
    Sep[[i - 1]] <- intersect(H, Prime[[i]])
  }

  Res <- list()
  Res[[1]] <- Prime[[1]]
  for (i in 2:length(Prime)) {
    Res[[i]] <- setdiff(Prime[[i]], Sep[[i - 1]])
  }

  G <- matrix(0, s, s)
  for (i in Prime) {
    G[i, i] <- 1
  }

  ## decomp check
  dimnames(G) <- list(1:s, 1:s)
  length(gRbase::mcsMAT(G - diag(s))) > 0
} else if (graph_pattern == 4) {
  ### 4) Non-decomposable model
  nblocks <- 5
  nElemPerBlock <- c(
    floor(s / 4), floor(s / 2) - 1 - floor(s / 4),
    ceiling(s * 2 / 3) - 1 - floor(s / 2), 7
  )
  nElemPerBlock <- c(nElemPerBlock, s - sum(nElemPerBlock))
  res <- 1:s
  blockIdx <- list()
  for (i in 1:nblocks) {
    # blockIdx[[i]] = sample(res,nElemPerBlock[i])
    blockIdx[[i]] <- res[1:nElemPerBlock[i]]
    res <- setdiff(res, blockIdx[[i]])
  }

  G <- matrix(0, s, s)
  ## add diagonal
  for (i in 1:nblocks) {
    G[blockIdx[[i]], blockIdx[[i]]] <- 1
  }
  ## add cycle
  G[blockIdx[[1]], blockIdx[[2]]] <- 1
  G[blockIdx[[2]], blockIdx[[1]]] <- 1
  G[blockIdx[[1]], blockIdx[[5]]] <- 1
  G[blockIdx[[5]], blockIdx[[1]]] <- 1
  G[blockIdx[[2]], blockIdx[[3]]] <- 1
  G[blockIdx[[3]], blockIdx[[2]]] <- 1
  G[blockIdx[[3]], blockIdx[[5]]] <- 1
  G[blockIdx[[5]], blockIdx[[3]]] <- 1

  ## decomp check
  dimnames(G) <- list(1:s, 1:s)
  length(gRbase::mcsMAT(G - diag(s))) > 0

  # Prime = blockIdx
  Res <- blockIdx ## this is not correct but not used in the non-decomp case
}

### Gamma Pattern
gamma <- matrix(0, p + 1, s)
gamma[1, ] <- 1


### 2) Extra Patterns

## outcomes (correlated in the decomp model) have some predictors in common
gamma[6:10, 6:9] <- 1

## outcomes (correlated in the decomp model) have some predictors in common
# gamma[16:20,14:15] = 1

## outcomes (sort-of correlated [pair-wise] in the decomp model)
# have predictors in common 6:15
gamma[26:30, 4:8] <- 1

## outcomes (NOT correlated in the decomp model) have predictors in common 16:17
gamma[36:40, c(3:5, 9:10)] <- 1

## these predictors are associated with ALL the outcomes
gamma[46:50, ] <- 1

combn11 <- combn(rep((6:9 - 1) * p, each = length(6:10 - 1)) + rep(6:10 - 1, 
                  times = length(6:9)), 2)
combn31 <- combn(rep((4:8 - 1) * p, each = length(26:30 - 1)) + rep(26:30 - 1, 
                  times = length(4:8)), 2)
combn32 <- combn(rep((4:8 - 1) * p, each = length(46:50 - 1)) + rep(46:50 - 1, 
                  times = length(4:8)), 2)
combn41 <- combn(rep((3:5 - 1) * p, each = length(36:40 - 1)) + rep(36:40 - 1, 
                  times = length(3:5)), 2)
combn42 <- combn(rep((3:5 - 1) * p, each = length(46:50 - 1)) + rep(46:50 - 1, 
                  times = length(3:5)), 2)
combn51 <- combn(rep((9:10 - 1) * p, each = length(36:40 - 1)) + rep(36:40 - 1, 
                  times = length(9:10)), 2)
combn52 <- combn(rep((9:10 - 1) * p, each = length(46:50 - 1)) + rep(46:50 - 1, 
                  times = length(9:10)), 2)

Gmrf <- rbind(t(combn11), t(combn31), t(combn32), t(combn41), t(combn42), t(combn51), t(combn52))

## get for every correlated bunch in the decomposable model,

if (graph_pattern < 4) {
  # a different set of predictors
  for (i in 1:length(Prime)) {
    gamma[6:10 + (i + 6) * 10, Prime[[i]]] <- 1
  } ## for each Prime component

  ## for every Residual instead
  for (i in 1:length(Res)) {
    gamma[6:10 + (i + 10) * 10, Res[[i]]] <- 1
  }
} else {
  for (i in 1:length(Prime)) {
    gamma[6:10 + (i + 4) * 10, Prime[[i]]] <- 1
  } ## for each Prime component

  ## for every Residual instead
  for (i in 1:length(Res)) {
    gamma[6:10 + (i + 9) * 10, Res[[i]]] <- 1
  }
}

#### Sample the betas
sd_b <- 1
b <- matrix(rnorm((p + 1) * s, 0, sd_b), p + 1, s)

xb <- matrix(NA, n, s)

for (i in 1:s) {
  if (sum(gamma[, i]) > 1) {
    xb[, i] <- x[, gamma[, i] == 1] %*% b[gamma[, i] == 1, i]
  } else {
    xb[, i] <- rep(1, n) * b[1, i]
  }
}

## Sample the variance
v_r <- mean(diag(var(xb))) / snr

nu <- s + 1

M <- matrix(corr_param, s, s)
diag(M) <- rep(1, s)

P <- BDgraph::rgwish(n = 1, adj = G, b = 3, D = v_r * M)

var <- solve(P)

factor <- 10
factor_min <- 0.01
factor_max <- 1000
count <- 0
maxit <- 10000

factor_prev <- 1

repeat{
  var <- var / factor * factor_prev

  ### Sample the errors and the Ys
  cVar <- chol(as.matrix(var))
  # err = matrix(rnorm(n*s),n,s) %*% cVar
  err <- matrix(rnorm(n * s, sd = 0.5), n, s) %*% cVar
  y <- xb + err

  ## Reparametrisation ( assuming PEO is 1:s )
  cVar <- t(cVar) # make it lower-tri
  S <- diag(diag(cVar))
  sigma <- S * S
  L <- cVar %*% solve(S)
  rho <- diag(s) - solve(L)

  ### S/N Ratio
  emp_snr <- mean(diag(var(xb) %*% solve(sigma)))
  emp_g_snr <- mean(diag(var((err) %*% t(rho)) %*% solve(sigma)))

  ##############

  if (abs(emp_snr - snr) < (snr / 10) | count > maxit) {
    break
  } else {
    if (emp_snr < snr) { # increase factor
      factor_min <- factor
    } else { # decrease factor
      factor_max <- factor
    }
    factor_prev <- factor
    factor <- (factor_min + factor_max) / 2
  }
  count <- count + 1
}

#################
colnames(y) <- paste("GEX", 1:ncol(y), sep = "")
colnames(G) <- colnames(y)
Gy <- G
gamma <- gamma[-1, ]
mrfG <- Gmrf[!duplicated(Gmrf), ]
data <- cbind(y, x[, -1]) # leave out the intercept because is coded inside already

exampleEQTL <- list(data = data, blockList = list(1:s, s + 1:p))

## Write data file to the user's directory by save()

## End(Not run)

[Package BayesSUR version 2.2-1 Index]