DPMGibbsSkewN {NPflow} | R Documentation |
Slice Sampling of Dirichlet Process Mixture of skew normal distributions
Description
Slice Sampling of Dirichlet Process Mixture of skew normal distributions
Usage
DPMGibbsSkewN(
z,
hyperG0,
a = 1e-04,
b = 1e-04,
N,
doPlot = TRUE,
nbclust_init = 30,
plotevery = N/10,
diagVar = TRUE,
use_variance_hyperprior = TRUE,
verbose = TRUE,
...
)
Arguments
z |
data matrix |
hyperG0 |
prior mixing distribution. |
a |
shape hyperparameter of the Gamma prior
on the concentration parameter of the Dirichlet Process. Default is |
b |
scale hyperparameter of the Gamma prior
on the concentration parameter of the Dirichlet Process. Default is |
N |
number of MCMC iterations. |
doPlot |
logical flag indicating whether to plot MCMC iteration or not.
Default to |
nbclust_init |
number of clusters at initialization. Default to 30 (or less if there are less than 30 observations). |
plotevery |
an integer indicating the interval between plotted iterations when |
diagVar |
logical flag indicating whether the variance of a cluster is a diagonal matrix.
Default is |
use_variance_hyperprior |
logical flag indicating whether a hyperprior is added
for the variance parameter. Default is |
verbose |
logical flag indicating whether partition info is written in the console at each MCMC iteration. |
... |
additional arguments to be passed to |
Value
a object of class DPMclust
with the following attributes:
mcmc_partitions: |
a list of length |
alpha: |
a vector of length |
U_SS_list: |
a list of length |
weights_list: |
|
logposterior_list: |
a list of length |
data: |
the data matrix |
nb_mcmcit: |
the number of MCMC iterations |
clust_distrib: |
the parametric distribution of the mixture component - |
hyperG0: |
the prior on the cluster location |
Author(s)
Boris Hejblum
References
Hejblum BP, Alkhassim C, Gottardo R, Caron F and Thiebaut R (2019) Sequential Dirichlet Process Mixtures of Multivariate Skew t-distributions for Model-based Clustering of Flow Cytometry Data. The Annals of Applied Statistics, 13(1): 638-660. <doi: 10.1214/18-AOAS1209> <arXiv: 1702.04407> https://arxiv.org/abs/1702.04407 doi:10.1214/18-AOAS1209
Examples
rm(list=ls())
#Number of data
n <- 1000
set.seed(123)
d <- 2
ncl <- 4
# Sample data
sdev <- array(dim=c(d,d,ncl))
#xi <- matrix(nrow=d, ncol=ncl, c(-1.5, 1, 1.5, 1, 1.5, -2, -2, -2))
xi <- matrix(nrow=d, ncol=ncl, c(-0.5, 0, 0.5, 0, 0.5, -1, -1, 1))
##xi <- matrix(nrow=d, ncol=ncl, c(-0.3, 0, 0.5, 0.5, 0.5, -1.2, -1, 1))
psi <- matrix(nrow=d, ncol=4, c(0.4, -0.6, 0.8, 0, 0.3, -0.7, -0.3, -1.2))
p <- c(0.2, 0.1, 0.4, 0.3) # frequence des clusters
sdev[, ,1] <- matrix(nrow=d, ncol=d, c(0.3, 0, 0, 0.3))
sdev[, ,2] <- matrix(nrow=d, ncol=d, c(0.1, 0, 0, 0.3))
sdev[, ,3] <- matrix(nrow=d, ncol=d, c(0.3, 0.15, 0.15, 0.3))
sdev[, ,4] <- .3*diag(2)
c <- rep(0,n)
z <- matrix(0, nrow=d, ncol=n)
for(k in 1:n){
c[k] = which(rmultinom(n=1, size=1, prob=p)!=0)
z[,k] <- xi[, c[k]] + psi[, c[k]]*abs(rnorm(1)) + sdev[, , c[k]]%*%matrix(rnorm(d, mean = 0,
sd = 1), nrow=d, ncol=1)
#cat(k, "/", n, " observations simulated\n", sep="")
}
# Set parameters of G0
hyperG0 <- list()
hyperG0[["b_xi"]] <- rep(0,d)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.0001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d + 1
hyperG0[["lambda"]] <- diag(d)
# hyperprior on the Scale parameter of DPM
a <- 0.0001
b <- 0.0001
# do some plots
doPlot <- TRUE
nbclust_init <- 30
## Data
########
library(ggplot2)
p <- (ggplot(data.frame("X"=z[1,], "Y"=z[2,]), aes(x=X, y=Y))
+ geom_point()
+ ggtitle("Simple example in 2d data")
+xlab("D1")
+ylab("D2")
+theme_bw())
p
c2plot <- factor(c)
levels(c2plot) <- c("3", "2", "4", "1")
pp <- (ggplot(data.frame("X"=z[1,], "Y"=z[2,], "Cluster"=as.character(c2plot)))
+ geom_point(aes(x=X, y=Y, colour=Cluster, fill=Cluster))
+ ggtitle("Slightly overlapping skew-normal simulation\n")
+ xlab("D1")
+ ylab("D2")
+ theme_bw()
+ scale_colour_discrete(guide=guide_legend(override.aes = list(size = 6, shape=22))))
pp
## alpha priors plots
#####################
prioralpha <- data.frame("alpha"=rgamma(n=5000, shape=a, scale=1/b),
"distribution" =factor(rep("prior",5000),
levels=c("prior", "posterior")))
p <- (ggplot(prioralpha, aes(x=alpha))
+ geom_histogram(aes(y=..density..),
colour="black", fill="white")
+ geom_density(alpha=.2, fill="red")
+ ggtitle(paste("Prior distribution on alpha: Gamma(", a,
",", b, ")\n", sep=""))
)
p
if(interactive()){
# Gibbs sampler for Dirichlet Process Mixtures
##############################################
MCMCsample_sn <- DPMGibbsSkewN(z, hyperG0, a, b, N=2500,
doPlot, nbclust_init, plotevery=200,
gg.add=list(theme_bw(),
guides(shape=guide_legend(override.aes = list(fill="grey45")))),
diagVar=FALSE)
s <- summary(MCMCsample_sn, burnin = 2000, thin=10)
#cluster_est_binder(MCMCsample_sn$mcmc_partitions[1000:1500])
print(s)
plot(s)
#plot_ConvDPM(MCMCsample_sn, from=2)
# k-means
plot(x=z[1,], y=z[2,], col=kmeans(t(z), centers=4)$cluster,
xlab = "d = 1", ylab= "d = 2", main="k-means with K=4 clusters")
KM <- kmeans(t(z), centers=4)
KMclust <- factor(KM$cluster)
levels(KMclust) <- c("2", "4", "1", "3")
dataKM <- data.frame("X"=z[1,], "Y"=z[2,],
"Cluster"=as.character(KMclust))
dataCenters <- data.frame("X"=KM$centers[,1],
"Y"=KM$centers[,2],
"Cluster"=c("2", "4", "1", "3"))
p <- (ggplot(dataKM)
+ geom_point(aes(x=X, y=Y, col=Cluster))
+ geom_point(aes(x=X, y=Y, fill=Cluster, order=Cluster),
data=dataCenters, shape=22, size=5)
+ scale_colour_discrete(name="Cluster",
guide=guide_legend(override.aes=list(size=6, shape=22)))
+ ggtitle("K-means with K=4 clusters\n")
+ theme_bw()
)
p
postalpha <- data.frame("alpha"=MCMCsample_sn$alpha[501:1000],
"distribution" = factor(rep("posterior",1000-500),
levels=c("prior", "posterior")))
postK <- data.frame("K"=sapply(lapply(postalpha$alpha, "["),
function(x){sum(x/(x+0:(1000-1)))}))
p <- (ggplot(postalpha, aes(x=alpha))
+ geom_histogram(aes(y=..density..), binwidth=.1,
colour="black", fill="white")
+ geom_density(alpha=.2, fill="blue")
+ ggtitle("Posterior distribution of alpha\n")
# Ignore NA values for mean
# Overlay with transparent density plot
+ geom_vline(aes(xintercept=mean(alpha, na.rm=T)),
color="red", linetype="dashed", size=1)
)
p
p <- (ggplot(postK, aes(x=K))
+ geom_histogram(aes(y=..density..),
colour="black", fill="white")
+ geom_density(alpha=.2, fill="blue")
+ ggtitle("Posterior distribution of predicted K\n")
# Ignore NA values for mean
# Overlay with transparent density plot
+ geom_vline(aes(xintercept=mean(K, na.rm=T)),
color="red", linetype="dashed", size=1)
#+ scale_x_continuous(breaks=c(0:6)*2, minor_breaks=c(0:6)*2+1)
+ scale_x_continuous(breaks=c(1:12))
)
p
p <- (ggplot(drop=FALSE, alpha=.6)
+ geom_density(aes(x=alpha, fill=distribution),
color=NA, alpha=.6,
data=postalpha)
+ geom_density(aes(x=alpha, fill=distribution),
color=NA, alpha=.6,
data=prioralpha)
+ ggtitle("Prior and posterior distributions of alpha\n")
+ scale_fill_discrete(drop=FALSE)
+ theme_bw()
+ xlim(0,100)
)
p
#Skew Normal
n=100000
xi <- 0
d <- 0.995
alpha <- d/sqrt(1-d^2)
z <- rtruncnorm(n,a=0, b=Inf)
e <- rnorm(n, mean = 0, sd = 1)
x <- d*z + sqrt(1-d^2)*e
o <- 1
y <- xi+o*x
nu=1.3
w <- rgamma(n,scale=nu/2, shape=nu/2)
yy <- xi+o*x/w
snd <- data.frame("Y"=y,"YY"=yy)
p <- (ggplot(snd)+geom_density(aes(x=Y), fill="blue", alpha=.2)
+ theme_bw()
+ ylab("Density")
+ ggtitle("Y~SN(0,1,10)\n")
+ xlim(-1,6)
+ ylim(0,0.8)
)
p
p <- (ggplot(snd)+geom_density(aes(x=YY), fill="blue", alpha=.2)
+ theme_bw()
+ ylab("Density")
+ ggtitle("Y~ST(0,1,10,1.3)\n")
+ xlim(-2,40)
+ ylim(0,0.8)
)
p
p <- (ggplot(snd)
+ geom_density(aes(x=Y, fill="blue"), alpha=.2)
+ geom_density(aes(x=YY, fill="red"), alpha=.2)
+ theme_bw()
+theme(legend.text = element_text(size = 13), legend.position="bottom")
+ ylab("Density")
+ xlim(-2,40)
+ ylim(0,0.8)
+ scale_fill_manual(name="", labels=c("Y~SN(0,1,10) ", "Y~ST(0,1,10,1.3)"),
guide="legend", values=c("blue", "red"))
)
p
#Variations
n=100000
xi <- -1
d <- 0.995
alpha <- d/sqrt(1-d^2)
z <- rtruncnorm(n,a=0, b=Inf)
e <- rnorm(n, mean = 0, sd = 1)
x <- d*z + sqrt(1-d^2)*e
snd <- data.frame("X"=x)
p <- (ggplot(snd)+geom_density(aes(x=X), fill="blue", alpha=.2)
+ theme_bw()
+ ylab("Density")
+ ggtitle("X~SN(10)\n")
+ xlim(-1.5,4)
+ ylim(0,1.6)
)
p
o <- 0.5
y <- xi+o*x
snd <- data.frame("Y"=y)
p <- (ggplot(snd)+geom_density(aes(x=Y), fill="blue", alpha=.2)
+ theme_bw()
+ ylab("Density")
+ ggtitle("Y~SN(-1,1,10)\n")
+ xlim(-1.5,4)
+ ylim(0,1.6)
)
p
#Simple toy example
###################
n <- 500
set.seed(12345)
d <- 2
ncl <- 4
# Sample data
sdev <- array(dim=c(d,d,ncl))
xi <- matrix(nrow=d, ncol=ncl, c(-1.5, 1, 1.5, 1, 1.5, -2, -2, -2))
psi <- matrix(nrow=d, ncol=4, c(0.4, -0.6, 0.8, 0, 0.3, -0.7, -0.3, -1.2))
p <- c(0.2, 0.1, 0.4, 0.3) # frequence des clusters
sdev[, ,1] <- matrix(nrow=d, ncol=d, c(0.3, 0, 0, 0.3))
sdev[, ,2] <- matrix(nrow=d, ncol=d, c(0.1, 0, 0, 0.3))
sdev[, ,3] <- matrix(nrow=d, ncol=d, c(0.3, 0.15, 0.15, 0.3))
sdev[, ,4] <- .3*diag(2)
#' # Set parameters of G0
hyperG0 <- list()
hyperG0[["b_xi"]] <- rep(0,d)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.0001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d + 1
hyperG0[["lambda"]] <- diag(d)
c <- rep(0,n)
z <- matrix(0, nrow=d, ncol=n)
for(k in 1:n){
c[k] = which(rmultinom(n=1, size=1, prob=p)!=0)
z[,k] <- xi[, c[k]] + psi[, c[k]]*abs(rnorm(1)) + sdev[, , c[k]]%*%matrix(rnorm(d, mean = 0,
sd = 1), nrow=d, ncol=1)
cat(k, "/", n, " observations simulated\n", sep="")
}
MCMCsample_sn_sep <- DPMGibbsSkewN(z, hyperG0, a, b, N=600,
doPlot, nbclust_init, plotevery=100,
gg.add=list(theme_bw(),
guides(shape=guide_legend(override.aes = list(fill="grey45")))),
diagVar=TRUE)
s <- summary(MCMCsample_sn, burnin = 400)
}