# ================================================== #
# Exploratory Analysis of Biological Data using R    #
#                                                    #
# Boris Steipe                                       #
#                                                    #
# Some prior contributions by:                       #
#   Raphael Gottardo, FHCRC                          #
#   Sohrab Shah, UBC                                 #
#                                                    #
# Module 2: Regression                               #
# ================================================== #


# =============================================
# Scenario
# =============================================

# =============================================
# Synthetic data: linear model
# =============================================

# This synthetic sample generates observations that
# could come from measuring the height and weight of
# a human cohort.
# We generate random heights in an interval, then
# calculate hypothetical weights according to a simple
# linear equation. Finally we add "errors" to the
# weights.
# The goal of our analysis is to recover the parameters
# of our synthetic data.

synthHWsamples <- function(n) {
    set.seed(83)

    # parameters for a height vs. weight plot
    hmin <- 1.5
    hmax <- 2.3

    M <- matrix(nrow = n, ncol = 2)

    # generate a column of heights, uniform in the interval
    M[ , 1] <- runif(n, hmin, hmax)

    # generate a column of weights from a linear model
    M[ , 2] <- 40 * M[ , 1] + 1

    # add some errors
    M[ , 2] <- M[ , 2] + rnorm(n, 0, 15)

    return(M)
}

HW <- synthHWsamples(50)   # generate 50 probands' heights and weights

plot(HW, xlab = "Height (m)", ylab = "Weight (kg)")   # plot with labels

cor(HW[ , 1], HW[ , 2])   # calculate the correlation

# =============================================
# What does a correlation coefficient "mean"?

x <- rnorm(50)   # 50 random deviates from a N(0,1)

# y values: a weighted mixture of x and random noise
r <- 0.99; y <- (r * x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)
r <- 0.8;  y <- (r * x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)
r <- 0.4;  y <- (r * x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)
r <- 0.01; y <- (r * x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)

# What if the relation is not linear?
x <- runif(50, -1, 1)
r <- 0.9

# linear ...
y <- (r * x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)

# periodic ...
y <- (r * cos(x*pi)) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)

# polynomial ...
y <- (r * x*x) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)

# exponential ...
y <- (r * exp(5*x)) + ((1-r) * rnorm(50)); plot(x, y); cor(x, y)

# circular ...
a <- (r * cos(x*pi)) + ((1-r) * rnorm(50))
b <- (r * sin(x*pi)) + ((1-r) * rnorm(50))
plot(a, b); cor(a, b)

# =============================================
# After becoming more familiar with correlations,
# we return to analyzing our data.

# R provides lm() for regression analysis.
# Remember: the true parameters were  weight = 40 * height + 1

?lm

lm(HW[ , 2] ~ HW[ , 1])

# What are these numbers?
# How do they relate to our question?
# Is the estimate any good?

# replot the data
plot(HW, xlab = "Height (m)", ylab = "Weight (kg)")   # plot with labels

# plot a regression line
abline(lm(HW[ , 2] ~ HW[ , 1]), col = "firebrick", lwd = 3)

# calculate residuals
res <- resid(lm(HW[ , 2] ~ HW[ , 1]))

# calculate fitted (idealized) values
fit <- fitted(lm(HW[ , 2] ~ HW[ , 1]))

# plot the differences
segments(HW[ , 1], HW[ , 2], HW[ , 1], fit, col = "#AA000044")

# plot and analyze the residuals
plot(fit, res)
cor(fit, res)

# Calculate and plot prediction and confidence limits.
#
# PREDICTION limits give boundaries on future observations:
# they characterize how well the model is expected to
# accommodate new data.
#
# CONFIDENCE limits give boundaries on adequate models:
# they characterize how well we can trust our model
# parameters.
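# As a minimal sketch of the difference, we can ask for both intervals at a
# single new height of 1.80 m. (The model is refitted from a data frame here
# so that predict() can accept newdata; the names hwDF, fitHW, "h" and "w"
# are chosen only for this illustration.)

hwDF  <- data.frame(h = HW[ , 1], w = HW[ , 2])
fitHW <- lm(w ~ h, data = hwDF)

predict(fitHW, newdata = data.frame(h = 1.8), interval = "confidence")
# narrow: where the MEAN weight at 1.80 m is expected to lie
predict(fitHW, newdata = data.frame(h = 1.8), interval = "prediction")
# wide: where a NEW individual observation may fall

# Now compute both sets of limits for all observations: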
pp <- predict(lm(HW[ , 2] ~ HW[ , 1]), interval = "prediction")
pc <- predict(lm(HW[ , 2] ~ HW[ , 1]), interval = "confidence")
head(pc)

# Now plot the pp and pc limits.
# First, sort on x:
o <- order(HW[ , 1])   # o is an index vector, sorted on x-values
HW2 <- HW[o, ]

# Second, recompute pc and pp in sorted order:
pc <- predict(lm(HW2[ , 2] ~ HW2[ , 1]), interval = "confidence")
pp <- predict(lm(HW2[ , 2] ~ HW2[ , 1]), interval = "prediction")

# Then plot:
plot(HW2, xlab = "Height (m)", ylab = "Weight (kg)",
     ylim = range(HW2[ , 2], pp))
matlines(HW2[ , 1], pc, lty = c(1, 2, 2), col = "slategrey")
matlines(HW2[ , 1], pp, lty = c(1, 3, 3), col = "firebrick")

# =============================================
# Non-linear least-squares fit
# =============================================

# Here is an example of a non-linear least-squares fit
# to a set of observations, using a formula that we
# define ourselves.
# We simulate the risk of contracting a disease at
# a certain age according to a logistic function
# centred on 50 years, with onset spread approximately
# between 0 and 100 years.

# =============================================
# Synthetic data: arbitrary model
# =============================================

# We employ a strategy that can be used to generate samples
# from any arbitrary target distribution (rejection sampling).
# To achieve this we generate a random, uniformly distributed
# variate x in the interval we are interested in. For each
# variate, we calculate the corresponding function value f(x).
# Then we "roll dice" to decide whether to accept or reject
# the variate: we generate a second uniform variate z in the
# range of the target function on our interval. If z is smaller
# than f(x), we accept x as a sample from our distribution.

# Consider this code:

ageOfOnset <- function(n) {
    X <- c()
    i <- 1
    while (i <= n) {                # collect n successful attempts
        age <- floor(runif(1, 1, 100))
        s <- runif(1)               # sample uniformly between 0 and 1,
                                    # i.e. in the range of the function
        if (s < 1 - (1 / (1 + exp(0.1 * (age - 50))))) {
            # if s is smaller than the function value ...
            X <- append(X, age)     # ... add this age to the vector
            i <- i + 1              # ... and increment i.
        }   # Else, try again.
    }
    return(X)
}

# the shape of the underlying logistic risk function
# (plotted here centred on 0 for illustration)
fx <- function(x) { 1 - (1 / (1 + exp(0.1 * x))) }
curve(fx, xlim = c(-50, 50), col = "red", lwd = 2)

ages <- ageOfOnset(10000)
head(ages, 20)

myCounts <- tabulate(ages)
head(myCounts)
plot(myCounts)

# How can we recover the parameters 0.1 and 50 ...
#   1 - (1/(1 + exp(0.1*(age - 50))))
#                   ^^^        ^^
# ... from the dataset?

# Non-linear least-squares fit of a self-defined function!

fz <- function(t, S, tm, B) { S * (1 - (1 / (1 + exp(B * (t - tm))))) }
# here:
#   t  is the time (age)
#   S  is a scaling factor
#   tm is the mean age of onset
#   B  determines how steeply the risk rises around tm

# Try some "reasonable" starting parameters, otherwise the
# curve-fit might not converge.
curve(fz(x, S = 180, tm = 48, B = 0.2),
      add = TRUE, col = "slategrey", lwd = 1, lty = 2)

# now use nls() to fit the formula against our data
age <- 1:99
nlsFz <- nls(myCounts ~ fz(age, S, tm, B), start = c(S = 120, tm = 30, B = 0.9))
nlsFz
summary(nlsFz)
coef(nlsFz)

p <- coef(nlsFz)

# plot the curve with the fitted parameters
curve(fz(x, S = p[1], tm = p[2], B = p[3]), add = TRUE, col = "firebrick", lwd = 2)

# Let us explore the set of curves that are compatible with the solution.
# For this, we calculate a number of curves with randomly modified parameters
# and plot them with semitransparent lines.
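# For the random modifications we need a measure of the spread of each fitted
# parameter. As a quick sketch (an aside, not part of the workflow below), the
# standard errors can be read directly from the fit summary:

summary(nlsFz)$coefficients[ , "Std. Error"]   # one standard error per parameter

# Below, we instead approximate the spread from the confidence intervals
# reported by confint().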
sdev <- confint(nlsFz)
sdev

# approximate a spread for each of the three parameters from the
# half-width of its confidence interval
a <- p - sdev[ , 1]
b <- sdev[ , 2] - p
a; b
sdev <- (a + b) / 2

plot(myCounts)
n <- 5
for (i in 1:n) {
    rS <- rnorm(1, mean = p[1], sd = sdev[1])
    rT <- rnorm(1, mean = p[2], sd = sdev[2])
    rB <- rnorm(1, mean = p[3], sd = sdev[3])
    curve(fz(x, S = rS, tm = rT, B = rB),
          add = TRUE, col = cm.colors(n)[i], lwd = 3)
}

# do this many times and plot each curve with a thin, blue line
plot(myCounts, type = "n")
n <- 800
for (i in 1:n) {
    rS <- rnorm(1, mean = p[1], sd = sdev[1])
    rT <- rnorm(1, mean = p[2], sd = sdev[2])
    rB <- rnorm(1, mean = p[3], sd = sdev[3])
    curve(fz(x, S = rS, tm = rT, B = rB),
          add = TRUE, col = "#77BEFB", lwd = 0.03)
}
points(myCounts)

# For a discussion of how to apply a weighting function to the weights=
# parameter of nls(), including a good use of deparse() and sys.calls(), see:
# http://www.r-bloggers.com/a-weighting-function-for-nls-nlslm/

# =============================================
# Alternatives to Pearson correlation - the MIC
# =============================================

# The Maximal Information Coefficient is implemented
# in the R package minerva. Let's try it out with
# an example.
# We consider which cell types in the sup3 data are
# most highly correlated regarding their expression
# differences.

# Let's first look at one cell pair: B-cells and
# macrophages.

dB  <- sup3[ , "B.ctrl"]  - sup3[ , "B.LPS"]
dMF <- sup3[ , "MF.ctrl"] - sup3[ , "MF.LPS"]

plot(dB, dMF)
cor(dB, dMF)   # 0.461983

# That looks like a substantial correlation. What value could we
# expect to observe by chance? To evaluate that, we can shuffle the
# data and compile the distribution of correlation coefficients from
# random comparisons of genes:

L <- length(dMF)
R_results <- numeric(10000)
for (i in 1:10000) {
    R_results[i] <- cor(dB, dMF[sample(1:L)])
}
hist(R_results, xlim = c(-0.6, 0.6))
abline(v = cor(dB, dMF), col = "firebrick")

# Let's try the same thing with the MIC:
if (!require(minerva, quietly = TRUE)) {
    install.packages("minerva")
    library(minerva)
}
?mine

mine(dB, dMF)   # MIC: 0.1468241

# This will take MUCH longer, thus run for 1000 trials only ...
MIC_results <- numeric(1000)
for (i in 1:1000) {
    MIC_results[i] <- mine(dB, dMF[sample(1:L)])$MIC
}
hist(MIC_results, xlim = c(0.0, 0.2))
abline(v = mine(dB, dMF)$MIC, col = "turquoise")

# Try to calculate R and MIC for all pairs. The strategy is as follows
# (a sketch appears below):
#   1 - create a dataframe that contains each cell type's
#       differential expression values for all genes;
#   2 - calculate correlations for all unique pairs and
#       save the values;
#   3 - calculate MIC for the same pairs;
#   4 - evaluate the correlations.
# Which two cell types are the most highly correlated?

# You can also calculate correlations for genes -
# which genes are the most similar in their response?
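# A minimal sketch of this strategy. It assumes that every cell type in sup3
# is represented by a pair of columns named "<type>.ctrl" and "<type>.LPS"
# (only the B and MF pairs are confirmed above) - adjust the column handling
# if your data are named differently. Note that the MIC loop can be slow.

ctrlCols  <- grep("\\.ctrl$", colnames(sup3), value = TRUE)
cellTypes <- sub("\\.ctrl$", "", ctrlCols)

# 1 - dataframe of differential expression values, one column per cell type
dExpr <- as.data.frame(sapply(cellTypes, function(ct) {
    sup3[ , paste0(ct, ".ctrl")] - sup3[ , paste0(ct, ".LPS")]
}))

# 2, 3 - correlation and MIC for all unique pairs of cell types
cellPairs <- combn(cellTypes, 2)
results <- data.frame(type1 = cellPairs[1, ],
                      type2 = cellPairs[2, ],
                      r     = NA,
                      MIC   = NA)
for (i in seq_len(ncol(cellPairs))) {
    u <- dExpr[[cellPairs[1, i]]]
    v <- dExpr[[cellPairs[2, i]]]
    results$r[i]   <- cor(u, v)
    results$MIC[i] <- as.numeric(mine(u, v)$MIC)
}

# 4 - evaluate: which pair ranks highest by each measure?
head(results[order(results$r,   decreasing = TRUE), ], 3)
head(results[order(results$MIC, decreasing = TRUE), ], 3)


# [End]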