## cPLS -FUNCTION # data = takes in a matrix of raw read count data (integers), # where rows represent genes and columns represent samples # ncom = number of PLS-components computed, 3 by default cPLS <- function(data, ncom=3){ library(MASS) library(mgcv) # transpose data tdata <- t(data) # number of genes g <- ncol(tdata) # total read counts tot.c <- as.vector(apply(tdata, 1, sum)) # scale data data.sc <- scale(tdata) # matrix for association scores s <- matrix(0, g, g) # function for stand. pls-components standr <- function(x) x/as.numeric(sqrt(t(x)%*%x)) for(i in 1:g){ cat("gene = ", i, "\n") # PLS-components X <- data.sc[,-i] y <- data.sc[,i] c <- matrix(0, g-1, ncom) TT <- matrix(0, nrow(data.sc), ncom) tX<- X for(k in 1:ncom){ # construct ncom PLS components c[,k] <- standr(t(tX)%*%y) TT[,k] <- tX%*%c[,k] tX <- tX-TT[,k]%*%solve(t(TT[,k])%*%TT[,k])%*%t(TT[,k])%*%tX } # glm with negative-binomial link temp.data <- as.data.frame(cbind(tdata[,i], TT, tot.c)) comp.names <- paste("C", 1:ncom, sep = '') colnames(temp.data) <- c('y', comp.names, 't.counts') names <- colnames(temp.data[2:(ncom+1)]) osa1 <- paste("y~") osa2 <- paste(names, collapse = "+") form <- paste(osa1, osa2, sep = "") form2 <- as.formula(form) mod <- gam(form2, offset=log(t.counts), family=nb(),data=temp.data) summary(mod) b <- as.matrix(coef(mod)[-1]) beta <- c%*%b if (i == 1){ s[i,] <- c(0,beta) } else { if (i == g){ s[i,] <- c(beta,0) } else { s[i,] <- c(beta[1:(i-1)],0,beta[i:(g-1)]) } } } # symmetrize and scale the scores (max score = 1, min = -1) s <- 0.5*(s+t(s)) s <- s-0.5*(max(s)+min(s)) s <- s/(0.5*(max(s)-min(s))) diag(s) <- 1 return(list(score.m = s)) }