# ================================================== # # Exploratory Analysis of Biological Data using R # # # # Boris Steipe # # # # # # Some prior contributions by: # # Raphael Gottardo, FHCRC # # Sohrab Shah, UBC # # # # # # Module 5: Hypothesis testing # # # # ================================================== # # ================================================== # Basic statistical tests # ================================================== # Two sample t-test: # In the two-sample t-test, we take multiple # observations of our samples and ask: is the # mean of the observations of the samples different # from each other. # t-test a single gene of the GSE26922 dataset for # differential expression. Remember that these are # triplicates. Therefore our first sample is characterized # by three replicates at T0 (columns 1:3), and the # second sample comprises all other columns lumped # together. Remember: under the null hypothesis # they should all be the same, therefore we do not # need to consider the different time points # separately. # Let's look at the top and the bottom differentially # expressed genes in tT as determined by the # eBayes() function of the limma package? # No. 1 in tT tT$ID[1] ntT1 <- which(rownames(ex) == tT$ID[1]) # No. 250 in tT tT$ID[nrow(tT)] ntT250 <- which(rownames(ex) == tT$ID[nrow(tT)]) # retrieve the actual log(expression) values # from ex gtT1 <- ex[ntT1, ] gtT250 <- ex[ntT250, ] plot(gtT1, type="b", ylim=c(8,14), col="darkseagreen") lines(gtT250, type="b", col="limegreen") # perform a sample-against-sample t-test for the cycle # values against the T0 values g1 <- t.test(ex[ntT1, 1:3], ex[ntT1, 4:18]) g1 g250 <- t.test(gtT250[1:3], gtT250[4:18]) g250 # What is in the results object? typeInfo(g1) g1$p.value # Let's compute p-values for # all the 32,321 rows of ex tAll <- rep(0, nrow(ex)) for (i in 1:nrow(ex)) { tAll[i] <- t.test(ex[i, 1:3], ex[i, 4:18])$p.value } hist(tAll) range(tAll) gpMin <- which(tAll == min(tAll)) gpMax <- which(tAll == max(tAll)) plot(ex[gpMin, ], type="b", ylim=c(8,14)) lines(ex[gpMax, ], type="b", col="red") lines(a, type="b", ylim=c(8,14), col="darkseagreen") lines(b, type="b", col="limegreen") # what are the t-values for these four genes? tVal <- t.test(ex[ntT1, 1:3], ex[ntT1, 4:18])$statistic tVal[2] <- t.test(ex[ntT250, 1:3], ex[ntT250, 4:18])$statistic tVal[3] <- t.test(ex[gpMin, 1:3], ex[gpMin, 4:18])$statistic tVal[4] <- t.test(ex[gpMax, 1:3], ex[gpMax, 4:18])$statistic names(tVal) <- c("ntT1", "ntT250", "gpMin", "gpMax") tVal # How do the observed p-values relate to the # t-statistic? x <- seq(-7, 7, 0.1) f <- dt(x, df=16) plot(x, f, xlab="x", ylab="density", type="l", lwd=2) abline(h = 0, lwd = 0.3) segments(tVal, -0.01, tVal, 0.0, col="red") # Note that the underlying assumptions for t-tests # are oK for microarrays, but less suitable for # NGS data. See here: # http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0123658 # for alternatives. # ============================================= # Wilcoxon test # Non-parametric test principle illustrated. # generate two random data samples with slightly # different means set.seed(53) n <- 25 M <- matrix(nrow = n+n, ncol=2) for (i in 1:n) { M[i,1] <- rnorm(1, 10, 1) M[i,2] <- 1 M[i+n,1] <- rnorm(1, 11, 1) M[i+n,2] <- 2 } boxplot(M[1:25,1], M[26:50,1]) # this is how different the two groups are plot(M[,1], col=M[,2]) # we can see the trend in the values, plotted by index plot(sample(1:(2*n)), M[,1], col=M[,2]) # however, with random indices it is # quite hard to tell that these are # two different distributions ... # The Wilcoxon test counts for each observation of one # group how many observations from the other group are # ranked below it. # To illustrate: o <- order(M[,1]) plot(M[o,1], col=M[o,2]) wilcox.test(M[1:n,1], M[(1:n)+n,1]) # Exercise - try this on expression profiles ... # - what profiles would you like to check for similarity? # - pull the data out of "ex" # - should you use all 18 datapoints, or compare the # six means of three replicates? # What is H0 (your null hypothesis) in case? # ============================================= # Multiple testing problem # When we repeat a test that can have a false # positive outcome, there is a small chance # that it will turn out falsely positive. When # we repeat the test many times, it is virtually # gueranteed that we will have a number - albeit # small - of false positives. When we test for # significant differential expression across the # genome, we are repeating our test 32,000 times. set.seed(11358) n <- 32321 synthVals <- as.matrix(rep(0, n*18)) dim(synthVals) <- c(n, 18) for (i in 1:n) { synthVals[i,1:18] <- sample(ex[i, ]) } # To do: spike in some "real data"! # To discuss: is sample a good approach? myt.test <- function(n){ p <- t.test(synthVals[n,1:3], synthVals[n,4:18])$p.value return(p) } pVals <- rep(0, n) for (i in 1:n) { pVals[i] <- myt.test(i) } hist(pVals, breaks=50) abline(v=0.05, col="red") as.numeric(TRUE) # 1 ... TRUE is coerced into 1 sum(as.numeric(pVals < 0.05)) # ... so summing # over all TRUE # values counts # them. min(pVals) rowMin <- which(pVals == min(pVals)) plot(synthVals[rowMin, ], type="b", col="darkseagreen") # "Bonferroni correction" # The solution is to rescale the p-value cutoff # for significance, by dividing it by the number # of observations. Our cutoff of p = 0.05 # becomes ... 0.05 / n # 1.546982e-06 # ... i.e. a very small number instead. # What we have done is: replaced false positive # problems with false negatives. # But let's consider what that means, applied to # our real data, and our synthetic data: # our real data ... sum(as.numeric(tAll < 0.05)) # as is sum(as.numeric(tAll < 0.05 / n)) # Bonferroni corrected # the synthetic data ... sum(as.numeric(pVals < 0.05)) # as is sum(as.numeric(pVals < 0.05 / n)) # Bonferroni corrected # ============================================= # FDR # A modern new concept is to use FDR # (False Discovery Rate) instead, to control p-Values # ... set.seed(100) N <- 10000 alpha <- 0.05 y1 <- matrix(rnorm(9000*4, 0, 1), 9000, 4) y2 <- matrix(rnorm(1000*4, 5, 1), 1000, 4) y <- rbind(y1, y2) myt.test <- function(y){ t.test(y, alternative="two.sided")$p.value } P <- apply(y, 1, myt.test) sum(P