R code for SimSeq Simulation

## SIMSEQ: INDUCED COVARIANCE STRUCTURE

 

# n.group <- 5

# n.genes <- 10

# p <- 500

# n <- 100

# ncom <- 3

 

simseq.data <- function(n, p, n.group, n.genes, ncom, P){

 

  library(SimSeq)

  library(mgcv)

 

  # lists for results

  data.list <- list()

  score.list <- list()

 

  # pars

  de.n <- n.group*n.genes

  ee.n <- p-de.n

 

  # INPUT DATA

 

  data(kidney)

  counts <- kidney$counts # Matrix of read counts from KIRC dataset

  treatment <- kidney$treatment # Treatment vector indicating Non-Tumor or Tumor columns

 

  # mean read count > 10

  nonzero <- rowMeans(counts) > 10

  counts2 <- counts[nonzero,]

  # vahintaan kymmenessa sarakkeessa read count

  more2 <- rowSums(counts2 != 0) > 100

  nonzero.data <- counts2[more2,]

  # transpose

  t.nonzero <- t(nonzero.data)

 

  #################################################

 

  # sample randomly 50 genes to be the differentially expressed genes

  ind.de <- sort(sample(1:ncol(t.nonzero), de.n))

  # make a list of groups

  group.l <- split(ind.de, ceiling(seq_along(ind.de)/n.genes))

 

  # replicate data

  rep.data <- rbind(t.nonzero, t.nonzero, t.nonzero, t.nonzero, t.nonzero)

  # trt group

  rep.trt <- c(treatment, treatment, treatment, treatment, treatment)

  # sample id

  rep.replic <- rep(1:(nrow(rep.data)/2), each = 2)

 

  # matrix of the same size as t.nonzero

  # ind.de columns have nonzero values, rest are zeros

 

  add.m <- matrix(0, nrow=nrow(rep.data), ncol = ncol(rep.data))

  lambdas <- rep(0, n.group)

  for(j in 1:n.group){

    for(i in 1:nrow(rep.data)){

      add.m[i, group.l[[j]]] <- sample(c(rep.data[,group.l[[j]]]), 1)

     

    }

  }

 

  source.data <- rep.data+add.m

 

  source.data2 <- source.data

  for(j in 1:n.group){

    source.data2[, group.l[[j]]] <- round(source.data[,group.l[[j]]]/2, 0)

  }

 

 

  # all genes

  genes <- sort(c(sample((1:ncol(rep.data))[-ind.de], (p-de.n)), ind.de))

 

  diff.ind <- match(sort(ind.de), sort(genes))

 

  # interactions (within each group of DE-genes)

 

  split.diff <- split(diff.ind, ceiling(seq_along(diff.ind)/n.genes))

  cons <- lapply(split.diff, FUN=combn, 2 )

  t.cons <- lapply(cons, FUN=t)

  pair.con <- do.call(rbind, t.cons)

 

  # equally expressed

  ee.genes <- genes[-diff.ind]

 

  # true positives

  cond.pos <- matrix(0, ncol = p, nrow = p)

  cond.pos[pair.con] <- 1

  pos.m <- cond.pos

 

  # true negatives

  cond.neg <- matrix(1, ncol = p, nrow = p)

  cond.neg <- ifelse(upper.tri(cond.neg), 1, 0)

  cond.neg <- cond.neg - cond.pos

  neg.m <- cond.neg

 

  # normalization factor

  nf <- apply(source.data, 1, quantile, 0.75)

 

  for(sim.round in 1:P){

   

    cat("sim = ", iter <- sim.round, "\n")

   

    s <- matrix(0, p, p)

    # generate data

    data.sim <- SimData(counts = t(source.data2), treatment = rep.trt, sort.method = "paired",

                        replic = rep.replic,

                        k.ind = (n/2), genes.select = genes, genes.diff = ind.de,

                        samp.independent = FALSE, norm.factors = nf)

   

    sim.counts <- data.sim$counts

    t.sim.counts <- t(sim.counts)

    boxplot(t.sim.counts)

   

    data.list[[sim.round]] <- t.sim.counts

   

    # scale data

    data.sc <- scale(t.sim.counts)

   

    # PLS-components

    standr <- function(x) x/as.numeric(sqrt(t(x)%*%x))

   

    tot.c <- as.vector(apply(t.sim.counts, 1, sum))

   

    for(i in 1:p){

     

      X <- data.sc[,-i]

      y <- data.sc[,i]

     

      c <- matrix(0, p-1, ncom)

      TT <- matrix(0, nrow(data.sc), ncom)

     

      c.var <- rep(0, ncom)

      SS <- rep(0, ncom)

      var.expl <- rep(0, ncom)

     

      tX<- X

      for(k in 1:ncom){ # construct ncom PLS components

        c[,k] <- standr(t(tX)%*%y)

        TT[,k] <- tX%*%c[,k]

        tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX

      }

     

      # glm with negative-binomial link

     

      temp.data <- as.data.frame(cbind(t.sim.counts[,i], TT, tot.c))

      comp.names <- paste("C", 1:ncom, sep = '')

      colnames(temp.data) <- c('y', comp.names, 't.counts')

     

      names <- colnames(temp.data[2:(ncom+1)])

      osa1 <- paste("y~")

      osa2 <- paste(names, collapse = "+")

      form <- paste(osa1, osa2, sep = "")

      form2 <- as.formula(form)

     

      mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data)

      summary(mod)

     

      b <- as.matrix(coef(mod)[-1])

     

      temp.beta <- c%*%b

     

      if (i == 1){

        s[i,] <- c(0,temp.beta)

      } else {

        if (i == p){

          s[i,] <- c(temp.beta,0)

        } else {

          s[i,] <- c(temp.beta[1:(i-1)],0,temp.beta[i:(p-1)])

        } 

      }

     

    }

   

    s <- 0.5*(s+t(s))

    s <- s-0.5*(max(s)+min(s))

    s <- s/(0.5*(max(s)-min(s)))

    diag(s) <- 1

   

    score.list[[sim.round]] <- s

   

  }

  return(list(data.list = data.list,

              score.list = score.list,

              pos.m = pos.m, neg.m = neg.m))

}

 

 

data1 <- simseq.data(20, 100, 5, 5, 3, 1000)

data2 <- simseq.data(20, 1000, 5, 15, 3, 1000)

data3 <- simseq.data(100, 100, 5, 5, 3, 1000)

data4 <- simseq.data(100, 1000, 5, 15, 3, 1000)

 

 

## FDR

 

fdr.fun <- function(qvals, score, pos, neg){

 

  # save the results

  tp.data <- list()

  sens.data <- list()

  fp.ind.l <- list()

 

  # condition positive

  cond.pos <- pos

  # condition negative

  cond.neg <- neg

 

  # loop over all selected qvalues

 

  j <- 1

 

  for(qvalue in qvals){

    cat("qval = ", iter <- qvalue , "\n")

   

    i <- 1

   

    sens.q <- matrix(0, ncol = 5, nrow = length(score))

    tp.q <- matrix(0, ncol = 5, nrow = length(score))

   

    # loop over all generated data sets

    for(d in 1:length(score)){

     

      s <- score[[d]]

      s.up <- s[upper.tri(s)]

      m <- ncol(s)

     

      b <- matrix(0, m, m)

      out.density <- density(s.up, adjust = 1)

      #out.density <- density(s.up,bw="nrd")

      b[upper.tri(b, diag=FALSE)] <- (dnorm(s.up,mean(s.up),sd(s.up))/approx(out.density$x, out.density$y,

                                                                            s.up, method = "linear")$y)

     

      apu <- matrix(0, m, m)

      apu[upper.tri(b, diag=FALSE)] <- 1

      apu[which(b > qvalue, arr.ind = TRUE)] <- 0

     

      fdr.m <- which(apu == 1, arr.ind=T)

     

      # test positive

      test.pos.fdr <- matrix(0, ncol = m, nrow = m)

      test.pos.fdr[fdr.m] <- 1

     

      # test negative

      test.neg.fdr <- matrix(1, ncol = m, nrow = m)

      test.neg.fdr <- ifelse(upper.tri(test.neg.fdr), 1, 0)

      test.neg.fdr[fdr.m] <- 0

     

      # true positives

      tp.m.fdr <- cond.pos+test.pos.fdr

      tp.fdr <- length(which(tp.m.fdr == 2))

     

      # false positives

      fp.m.fdr <- cond.neg + test.pos.fdr

      fp.fdr <- length(which(fp.m.fdr == 2))

      fp.ind <- which(fp.m.fdr ==2, arr.ind = TRUE)

      fp.ind.l[[d]] <- fp.ind

 

     

      # true negatives

      tn.m.fdr <- cond.neg + test.neg.fdr

      tn.fdr <- length(which(tn.m.fdr == 2))

     

      # false negatives

      fn.m.fdr <- cond.pos + test.neg.fdr

      fn.fdr <- length(which(fn.m.fdr == 2))

     

      # sensitivity

      if(tp.fdr+fn.fdr == 0){

        sens.fdr <- 0

      } else {

        sens.fdr <- tp.fdr/(tp.fdr+fn.fdr)

      }

     

      # specificity

      if(fp.fdr+tn.fdr == 0){

        spec.fdr <- 0

      } else {

        spec.fdr <- tn.fdr/(fp.fdr+tn.fdr)

      }

     

      # true discovery rate

      if(tp.fdr+fp.fdr == 0){

        tdr.fdr <- 0

      } else {

        tdr.fdr <- tp.fdr/(tp.fdr+fp.fdr)

      }

     

      # true non-discovery rate

      if(tn.fdr+fn.fdr == 0){

        tnr.fdr <- 0

      } else {

        tnr.fdr <- tn.fdr/(tn.fdr+fn.fdr)

      }

     

     

      # save the results into ith row of a matrix

      tp.q[i,1] <- qvalue

      tp.q[i,2] <- tp.fdr

      tp.q[i,3] <- fp.fdr

      tp.q[i,4] <- tn.fdr

      tp.q[i,5] <- fn.fdr

     

      sens.q[i,1] <- qvalue

      sens.q[i,2] <- sens.fdr

      sens.q[i,3] <- spec.fdr

      sens.q[i,4] <- tdr.fdr

      sens.q[i,5] <- tnr.fdr

     

      i <- i+1

    }

   

    tp.data[[j]] <- tp.q

    sens.data[[j]] <- sens.q

   

    j <- j+1

   

  }

  return(list(tp.list = tp.data, sens.list = sens.data, fp.ind.l = fp.ind.l ))

}

 

 

## CALL FUNCTION

 

qvals <- seq(0, 1, by = 0.05)

FDR1 <- fdr.fun(qvals, data1$score.list, data1$pos.m, data1$neg.m)

FDR2 <- fdr.fun(qvals, data2$score.list, data2$pos.m, data2$neg.m)

FDR3 <- fdr.fun(qvals, data3$score.list, data3$pos.m, data3$neg.m)

FDR4 <- fdr.fun(qvals, data4$score.list, data4$pos.m, data4$neg.m)

means1 <- lapply(FDR1$sens.list, colMeans)

means2 <- lapply(FDR2$sens.list, colMeans)

means3 <- lapply(FDR3$sens.list, colMeans)

means4 <- lapply(FDR4$sens.list, colMeans)

 

 

Made with Namu6