# ================================================== #
# Exploratory Analysis of Biological Data using R    #
#                                                    #
# Boris Steipe <boris.steipe@utoronto.ca>            #
#                                                    #
#                                                    #
# Some prior contributions by:                       #
#   Raphael Gottardo, FHCRC                          #
#   Sohrab Shah, UBC                                 #
#                                                    #
#                                                    #
# Module 3: Dimension reduction                      #
#                                                    #
# ================================================== #


# =============================================
# PCA introduction
# =============================================

# Synthetic data example
# 500 normally distributed samples each: uncorrelated
set.seed(2707)
x1 <- rnorm(500,0,1)
y1 <- rnorm(500,0,1)

# generate y2 corrleated with (dependent on) x1
y2 <- 2*x1 + y1
y2 <- y2-mean(y2)
y2 <- y2 / sd(y2)

# Create a lattice plot with two rows and two columns
oPar <- par(mfrow = c(2,2)) # set new and save old graphics state
par(mfrow = c(2,2))
# four plots ...
hist(x1)
hist(y2)
plot(x1, y1)
plot(x1, y2)
par(oPar) # restore graphics state parameters


# calculate a PCA of x1 and y2
pcaSample <- prcomp(cbind(x1,y2))

# here are the information items from the returned list of results
pcaSample
pcaSample$sdev
pcaSample$rotation
summary(pcaSample)
head(pcaSample$x)

# Compare the histograms before and after the rotation:
oPar <- par(mfrow = c(2,2))
hist(x1, xlim=c(-4,4), ylim=c(0,150), main="")
hist(y2, xlim=c(-4,4), ylim=c(0,150), main="")
hist(pcaSample$x[,1], xlim=c(-4,4), ylim=c(0,150), 
	main="", col=rgb(0.86,0,0,0.5))
hist(pcaSample$x[,2], xlim=c(-4,4), ylim=c(0,150), 
	main="", col=rgb(0.31, 0.5, 0.74, 0.5))
par(oPar) # restore graphics state parameters

# Plot the sample along the Principal Components as axes
plot(pcaSample$x[,1],pcaSample$x[,2], xlim=c(-4,4), ylim=c(-4,4))

# ==================================================
# EDA with PCA
# The relative importance of PCs
# ==================================================

# load one of the sample data sets in the R distribution

library(MASS)
data(crabs)

head(crabs)
# Two types: blue and orange
# Two genders: female and male
# FL frontal lobe size (mm)
# RW rear width (mm)
# CL carapace length (mm)
# CW carapace width (mm)
# BD body depth (mm)

# annotate...
fac <- as.factor(paste(crabs[, 1], crabs[, 2],sep="."))
head(fac)
c(fac[1], fac[51], fac[101], fac[151])
as.numeric(c(fac[1], fac[51], fac[101], fac[151]))

plot(crabs[, 4:8], pch=as.numeric(fac))
plot(crabs[, 4:5], pch=as.numeric(fac))
plot(crabs[, 5:6], pch=as.numeric(fac))

# Apply principal components analysis to the five measured dimensions
head(crabs)pcaCrabs <- prcomp(crabs[, 4:8])plot(pcaCrabs)summary(pcaCrabs)
str(pcaCrabs)

# Plot projections along the components into a scatterplot.# Axes for points are scaled as values, for vectors as variance
# Default for biplot() is the first and second component.

biplot(pcaCrabs, xlabs=as.numeric(fac))
legend(81, -63,c("1: B.F", "2: B.M", "3: O.F", "4: O.M"), box.col=1, bg="lightgrey")

# Plot the first against the third principal component
biplot(pcaCrabs, xlabs=as.numeric(fac), choices = c(1, 3))
legend(84, -63, c("1: B.F", "2: B.M", "3: O.F", "4: O.M"), box.col=1, bg="lightgrey")

# Plot the second against the third principal component
biplot(pcaCrabs, xlabs=as.numeric(fac), choices = c(2, 3))
legend(-14.8, 16.2, c("1: B.F", "2: B.M", "3: O.F", "4: O.M"), box.col=1, bg="lightgrey")

# ===================================================
# Task: plot the last plot (without vectors) with plotting
# symbols that correspond to the gender and type of crab:
# orange and blue circles for females and triangles for males.

# Advanced: also make symbol-size depend on carapace length or
# the mean of all five measurements.
# ===================================================


# My solution.
# 1. Create only the plotting frame
plot(pcaCrabs$x[,2], pcaCrabs$x[,3], type = "n")
crabPoints <- function(n) {
	if (crabs[n,"sp"] == "B") { 
		sp = "#0066FF"
	} else {
		sp = "#FF9900"
	}
	if (crabs[n,"sex"] == "M") { 
		sex = 24  # triangle
	} else {
		sex = 21  # circle
	}
	points(pcaCrabs$x[n,2],
	       pcaCrabs$x[n,3],
	       col = sp, bg = sp,
	       pch = sex)
}

for (i in 1:nrow(crabs)) {
	crabPoints(i)
}


# ==================================================
# EDA with PCA
# Exploring the structure of datasets
# ==================================================

# Load the Raymond Cho dataset of expression profiles
# containing 237 genes known or suspected to be involved
# in cell-cycle regulation.

cho.data <- read.table("Logcho_237_4class.txt", skip=1)[, 3:19]nrow(cho.data)head(cho.data)

# compare the general trends of the measurements
boxplot(cho.data)

# Since we are looking for expression changes
# in a cycle, we are less interested in the 
# absolute values of expression, and more in
# whether values change in a cyclical fashion.
# As a first step to normalize the data, we can
# therefore subtract the row-mean from each row.

for (i in 1:nrow(cho.data)) {
	cho.data[i,] <- cho.data[i,] - mean(as.numeric(cho.data[i,]))
}

boxplot(cho.data)

# This has caused the averages and ranges to become quite
# divergent between experiments. We should normalize them
# so as to make sure every experiment (dimension)
# contributes equally to our principal component analysis.
 
# Let's create a function to normalize the data.

normDat <- function(x) {
	# input: a column from a dataframe
	x <- as.numeric(x)
	norm <- x - mean(x)       # subtract means
	norm <- norm / sd(norm)   # scale SD to 1
	return(norm)
}

for (i in 1:ncol(cho.data)) {
	cho.data[ ,i] <- normDat(cho.data[ ,i])
}

boxplot(cho.data)
# Change the column namesnames(cho.data) <- paste("t", 1:17, sep="")head(cho.data)

pcaCho <- prcomp(cho.data)plot(pcaCho, n=17)

# Compare the PCs to PCs from a matrix of random numbers
# ... eg as follows:
rU = matrix(runif(17 * 237,0,3), nrow = 237, ncol=17)
plot(prcomp(rU), n=17, ylim = c(0, 3))

# Exercise: compare the PCs to a matrix of permuted
# observations.

# Explore the correlations along the first few principal components

oPar <- par(mfrow = c(2,2))
biplot(pcaCho)biplot(pcaCho, choices = c(1, 3))biplot(pcaCho, choices = c(2, 3))biplot(pcaCho, choices = c(3, 4))
par(oPar)


# Examine the actual principal components in a paralell-coordinates
# plot for the seventeen dimensions. In microarray analysis we call
# these the "eigengenes", with reference to the "eigenvalues" of 
# linear algebra.
matplot(pcaCho$rotation[, 1:3],
        type="b", lwd=3,
        ylab="PCs")
        
legend(14, 0.6, 
       c("PC1", "PC2", "PC3"),
       bg="lightgrey",
       col=1:3, lty=1:3, lwd=2,
       pch=as.character(1:3))


# ====================
# ... select some genes to see how similar they are

# label with gene names
cho.genes <- as.character(read.table("logcho_237_4class.txt", skip=1)[, 1])
head(cho.genes)


biplot(pcaCho, choices = c(1, 2), cex=0.5)

Sel1 <- c(193, 142, 235, 98, 73, 54, 114)

matplot(t(cho.data[Sel1, ]), 
        type="b", lwd=3, col="3",
        ylab="Clustered genes",
        xlab = "Timepoints")
        
        
# TASK: select and plot 10 random genes for comparison


# ====================

# Choose a few gene indices from a different biplot and
# compare...

biplot(pcaCho, choices = c(1, 3), cex=0.8)

Sel2 <- c(20, 227, 143, 128, 192, 75)matplot(t(cho.data[Sel1, ]),         type="b", lwd=3, col="1",        ylab="Clustered genes",        xlab = "Timepoints",        ylim = c(-4, 4))oPar <- par(new=TRUE)matplot(t(cho.data[Sel2, ]),        type="b", lwd=3, col="2",        ylab="",        xlab="",        ylim = c(-4, 4))par(oPar)

# ==================================================
# Exploring the structure of datasets
# relative to preconceived models
# ==================================================

# PCA will show us what is actually found in our data. However we might
# already have some a priori ideas of what might be interesting. If we
# can cast these ideas in a model, we can simply calculate correlation
# coefficients between observation and model and project the data along
# these dimensions.

# PCA is a projection of the data under a high-dimensional
# rotation. This model-based analysis can be seen as a 
# projection along a particular model.

# Here we define three simple models: 
# a half-sine, one sine wave and two sine waves over the course of
# the experiment.


Nval <- 17ProfHalf <- 1:NvalProfFull <- 1:Nval
ProfTwo  <- 1:Nval
ProfHalf <- sin((ProfHalf-1) * (pi / (Nval-1)))ProfFull <- sin((ProfFull-1) * ((2 * pi) / (Nval-1)))
ProfTwo  <- sin((ProfTwo-1) * ((4 * pi) / (Nval-1)))plot(ProfHalf, type="l",    xlim = c(1, Nval),    ylim = c(-2, 2), col="2")lines(ProfFull, type="l",
   xlim = c(1, Nval), 
   ylim = c(-2, 2))
lines(ProfTwo, type="l",
   xlim = c(1, Nval), 
   ylim = c(-2, 2), col="3")

# plotting the correlation of each datapoint against our two models
n <- dim(cho.data)[1]CorHalf <- rep(0, n)CorFull <- rep(0, n)
CorTwo <- rep(0, n)
for (i in 1:n) {  x <- c(t(cho.data[i, ]))  CorHalf[i] <- cor(x, ProfHalf)  CorFull[i]  <- cor(x, ProfFull)
  CorTwo[i]  <- cor(x, ProfTwo)
}

plot(CorHalf, CorFull, type="n")text(CorHalf, CorFull)

plot(CorTwo, CorFull, type="n")
text(CorTwo, CorFull)


# Select indices from the plot and plot paralell coordinates
# of the actual values.

Sel1 <- c(233, 237, 143, 170, 229, 128 )Sel2 <- c(236, 187, 205, 171, 181, 102)Sel3 <- c(142, 3, 90, 78, 130, 101, 91)
oPar <- par(mfrow = c(3,1), mar=c(1,1,1,1)) # reduce margin sizesmatplot(t(cho.data[Sel1, ]), type="b", lwd=3,   ylab="Similar genes",    xlab = "Timepoints",    ylim = c(-4, 4),    col="2")matplot(t(cho.data[Sel2, ]), type="b", lwd=3,    ylab="",    xlab="",    ylim = c(-4, 4),    col="1")matplot(t(cho.data[Sel3, ]), type="b", lwd=3,    ylab="",   xlab="",   ylim = c(-4, 4),   col="4")par(oPar)

# TASK: create the same plot, but this time choose the 5
#       genes each that have the highest correlation with
#       our models.

# ============================================================
# t-SNE
# ============================================================
# t-Stochastic Neighbour Embedding is a powerful dimension re-
# duction algorithm developed in the lab of Geoff Hinton at UofT.
# Its implementation for flow cytometry - viSNE -is available
# from the Dana Pe'er lab (as Matlab code). 
# http://www.c2b2.columbia.edu/danapeerlab/html/cyt.html
# It is the basis for a very powerful flow-cytometry exploration tool:
# Amir et al. (2013) Nature Biotechnology, doi:10.1038/nbt.2594
#
# Below we will try the t-SNE algorithm for exploration of  
# some of the data we have looked at before.

if (!require(tsne, quietly=TRUE)) {
  install.packages("tsne")
  library(tsne)
}
?tsne

# Apply tsne to crabs data
# set up a plotting routine
ecb <- function(x){ 
	plot(x,t='n');
	points(x, pch=crabs.pch, bg=crabs.col, cex=crabs.c)
	}

set.seed(112358)
tsne_crabs <- tsne(crabs[,4:8], epoch_callback = ecb, perplexity=50)


# Apply tsne to the cell-cycle data

sel1 <- which(CorTwo > 0.77) # Pick a subset, as above. We will track whether
sel1                         # the subset is cohesive during the iterations.

sel2 <- which(CorTwo < -0.6) # Negative correlations
sel2                         


matplot(t(cho.data[sel1, ]), type="b", lwd=3, 
   ylab="Normalized expression (AU)",
   xlab="timepoint",
   ylim = c(-4, 4),
   col="firebrick")
for (i in 1:length(sel2)) {
	lines(t(cho.data[sel2[i], ]), type="l", lwd=2, lty=2, col="turquoise")
}

   
ecb <- function(x){ 
	plot(x,t='n');
	points(x[sel1,], bg="firebrick", pch=21, cex=2)
	points(x[sel2,], bg="turquoise", pch=21, cex=2)
	text(x, labels=paste(1:dim(cho.data)[1]), cex=0.7)
}
	
set.seed(123456)
tsne_cho <- tsne(cho.data[,1:17], epoch_callback = ecb, perplexity=50)

# Pick a cluster of genes in the vicinity of the originally
# selected values

sel3 <- c(226, 227, 237, 75, 4, 192, 128, 146, 200, 107)
matplot(t(cho.data[sel1, ]), type="b", lwd=3, 
   ylab="Normalized expression (AU)",
   xlab="timepoint",
   ylim = c(-4, 4),
   col="firebrick")
for (i in 1:length(sel3)) {
	lines(t(cho.data[sel3[i], ]), type="l", lwd=2, col="slategray")
}

# TASK: where are these genes on the model-correlation plot?


# Exercise for self study: plot the position of a two-cycle sine function,
# its inverse, a two-cycle cosine function and its inverse on the SNE map.
# Are there genes in the vicinity of these reference points? Are these
# related to the ones you found earlier when projecting along models?
# Hint: you'll have to re-run the embedding after adding the profiles
# to the data, and then highlight the profile points.

# [end]