### ### FUNCTIONS FOR ESTIMATING MARGINAL PEARSON AND KENDALL ### CORRELATION COEFFICIENTS FOR CLUSTERED DATA ### ### FROM: Lorenz DJ, Datta S, Harkema, SJ. "Marginal Association Measures for Clustered Data" ### Statistics in Medicine. ### ### CODE BY: Doug Lorenz ### ### ### MARGINAL PEARSON CORRELATION FUNCTION ### ### Inputs: vectors "x" and "y" are the data ### vector "cluster" identifies the clusters ### ### Outputs: Correlation estimate and asymptotic variance ### pearson.clust <- function(x, y, cluster) { M <- length(unique(cluster)) clmean.x <- tapply(x, cluster, mean) clmean.y <- tapply(y, cluster, mean) clmean.xy <- tapply(x*y, cluster, mean) clmean.xx <- tapply(x^2, cluster, mean) clmean.yy <- tapply(y^2, cluster, mean) est <- (mean(clmean.xy) - mean(clmean.x)*mean(clmean.y))/ sqrt((mean(clmean.xx)-mean(clmean.x)^2)*(mean(clmean.yy)-mean(clmean.y)^2)) moment.var <- var(cbind(clmean.x, clmean.y, clmean.xy, clmean.xx, clmean.yy)) grad <- c(est*mean(clmean.x)/(mean(clmean.xx)-mean(clmean.x)^2)- mean(clmean.y)/sqrt((mean(clmean.xx)-mean(clmean.x)^2)*(mean(clmean.yy)-mean(clmean.y)^2)), est*mean(clmean.y)/(mean(clmean.yy)-mean(clmean.y)^2)- mean(clmean.x)/sqrt((mean(clmean.xx)-mean(clmean.x)^2)*(mean(clmean.yy)-mean(clmean.y)^2)), 1/sqrt((mean(clmean.xx)-mean(clmean.x)^2)*(mean(clmean.yy)-mean(clmean.y)^2)), -est/(2*(mean(clmean.xx)-mean(clmean.x)^2)), -est/(2*(mean(clmean.yy)-mean(clmean.y)^2))) var.est <- (1/M) * t(grad) %*% moment.var %*% grad c(est, var.est) } ### ### MARGINAL KENDALL CORRELATION FUNCTION ### ### Inputs: vectors "x" and "y" are the data ### vector "cluster" identifies the clusters ### ### Outputs: Correlation estimate and asymptotic variance ### ### Requires functions ecdf.clust() and hajek.var() given below ### cor.clust <- function(x, y, cluster) { cl.size <- as.numeric(tapply(cluster, cluster, length)) M <- length(unique(cluster)) ecdf.x <- ecdf.clust(x, x, cluster, cl.size) ecdf.y <- ecdf.clust(y, y, cluster, cl.size) ni.cl <- outer(1/cl.size, 1/cl.size) temp.x <- outer(ecdf.x, ecdf.x, function(x1, x2) sign(x1 - x2)) temp.y <- outer(ecdf.y, ecdf.y, function(y1, y2) sign(y1 - y2)) temp <- rowsum(temp.x*temp.y, cluster, reorder=F) sum.cl <- t(rowsum(t(temp), cluster, reorder=F)) diag(sum.cl) <- 0 est <- sum(ni.cl*sum.cl)/(M*(M-1)) var.est <- (16/M)*var(hajek.var(x, y, cluster, cl.size)) c(est, var.est) } ### ### FUNCTION TO CALCULATE WITHIN-CLUSTER EMPIRICAL DISTRIBUTION FUNCTION ### Required for Kendall coefficient ### ### Inputs: vector "q", the quantiles at which the calculate the ecdf (generally will be "x") ### vector "x", the data ### vector "cluster", identifying the clusters ### vector "cl.size", the size of each cluster ### ### Output: vector of same length as "x" giving the within-cluster ecdf ### ecdf.clust <- function(q, x, cluster, cl.size) { temp <- outer(x, q, "<=")*1 sum.x <- rowsum(temp, cluster, reorder=F) ecdf.x <- sum.x/cl.size colSums(ecdf.x)/(nrow(ecdf.x)+1) } ### ### FUNCTION REQUIRED TO CALCULATE KENDALL VARIANCE VIA HAJEK PROJECTION ### ### Inputs: vectors "x" and "y", the data ### vector "cluster", the cluster identifier ### vector cl.size, the cluster sizes ### ### Outputs: Single number, a multiple of the Kendall coefficient variance ### hajek.var <- function(x, y, cluster, cl.size) { temp.less.x <- outer(x, x, "<")*1 temp.greater.x <- 1-temp.less.x temp.less.y <- outer(y, y, "<")*1 temp.greater.y <- 1-temp.less.y temp <- temp.less.x*temp.less.y + temp.greater.x*temp.greater.y temp.sum <- colSums(temp)/length(cluster) tapply(temp.sum, cluster, sum)/cl.size }