R Gene expression clustering

From "A B C"
Revision as of 19:28, 5 March 2014 by Boris (talk | contribs) (Created page with "<div id="APB"> <div class="b1"> R gene expression clustering </div> {{dev}} Gene expression clustering <source lang=R> # =================================================...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

R gene expression clustering


This page is a placeholder, or under current development; it is here principally to establish the logical framework of the site. The material on this page is correct, but incomplete.


Gene expression clustering

# ================================================== #
# BCB420 / JTB2020                                   #
# March 5 2014                                       #
# Clustering                                         #
#                                                    #
# Boris Steipe <boris.steipe@utoronto.ca>            #
# ================================================== #

# ==================================================
# Data
# ==================================================
# 
# Let's find some cell-cycle data in GEO, for clustering.
# The goal is to identify coregulated genes, but we don't
# know what their response to time in the cell-cycle will
# be: going up? Going down?

# The following code is slightly adapted from GEO2R on 
# "Cell cycle expression profiles in HeLa cells" (GSE26922)
# see: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26922
# There are triplicate measurements for t0 (blocked) and 
# t= 2,4,6,8 and 12h post block-release

################################################################
# First, we need to install some analysis packages from bioconductor
# The following commands do for the bioconductor system what 
# install.packages() does for CRAN.

source("http://bioconductor.org/biocLite.R")
biocLite("Biobase")
biocLite("GEOquery")
biocLite("limma")

# Then we load the libraries....
library(Biobase)
library(GEOquery)
library(limma)

# load series and platform data from GEO
gset <- getGEO("GSE26922", GSEMatrix =TRUE)

if (length(gset) > 1) idx <- grep("GPL6244", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]



# check what we have

head(gset)

# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group names for all samples
sml <- c("G0","G0","G0","G1","G1","G1","G2","G2","G2","G3","G3","G3","G4","G4","G4","G5","G5","G5");

# log2 transform
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
          (qx[6]-qx[1] > 50 && qx[2] > 0) ||
          (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
  exprs(gset) <- log2(ex) }

# set up the data and proceed with analysis
fl <- as.factor(sml)
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
cont.matrix <- makeContrasts(G5-G0, G1-G0, G2-G1, G3-G2, G4-G3, G5-G4, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)

# load NCBI platform annotation
gpl <- annotation(gset)
platf <- getGEO(gpl, AnnotGPL=TRUE)
ncbifd <- data.frame(attr(dataTable(platf), "table"))

# replace original platform annotation
tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))]
tT <- merge(tT, ncbifd, by="ID")
tT <- tT[order(tT$P.Value), ]  # restore correct order

tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))
# We will use these 250 genes for clustering
# The top gene has the ID 8117594: what are the original data?

exprs(gset)["8117594",]

# let's make a table from these data
symbols <- tT$Gene.Symbol
data <- c()

for (i in 1:length(tT$ID)) {
	v <- c()
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 1:3]))
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 4:6]))
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 7:9]))
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 10:12]))
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 13:15]))
	v  <- c(v,  mean(exprs(gset)[tT$ID[i], 16:18]))
	data <- rbind(data, v)
}
colnames(data) <- c("t0", "t2", "t4", "t6", "t8", "t12")
rownames(data) <- tT$ID


# ==================================================
# Heatmap
# ==================================================
# This is a standard view of mutlidimensional data.

heatmap(data)


# ==================================================
# Hierarchical clustering
# ==================================================

# first we need to produce a distance table

distData <-dist(data, method = "euclidean")

# single-linkage hierarchical clustering
hc.single<-hclust(distData, method = "single", members=NULL)

plot(hc.single)

# draw rectangles at different cut-levels
rect.hclust(hc.single,k=2)
rect.hclust(hc.single,k=5)
rect.hclust(hc.single,k=10)
rect.hclust(hc.single,k=20)
rect.hclust(hc.single,k=50)

# now retrieve the actual indices and use them to generate
# paralell coordinate plots

class.single<-cutree(hc.single, k = 50)

# Explain the output
class.single

table(class.single)
sort(table(class.single))

oPar <- par(mfrow=c(2,2))
matplot(t(data[class.single==2,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==9,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==4,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==17,]),type="l", xlab="time",ylab="log expression value")
par(oPar)



# as an alternative, try Wards- linkage clustering (and read up on the 
# options: complete - and average-linkage clustering)
hc.ward <-hclust(distData, method = "ward", members=NULL)

plot(hc.ward)

# draw rectangles
rect.hclust(hc.ward,k=50)

# now retrieve the actual indices and use them to generate
# paralell coordinate plots

class.ward<-cutree(hc.ward, k = 50)
sort(table(class.ward))

oPar <- par(mfrow=c(2,2))
matplot(t(data[class.ward==2,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==15,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==34,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==25,]),type="l", xlab="time",ylab="log expression value")
par(oPar)


# ==================================================
# Partitioning clustering
# ==================================================

# === K-means ======================================

?kmeans
cl<-kmeans(data, 5)

plot(data,col=cl$cluster)
points(cl$centers, col = 1:5, pch = 8, cex=2)

# But: be aware ...
# ... K-means does not guarantee a globally optimal solution,
# merely a locally converged one.

# === K-medoids ======================================

# load library "cluster" for K-medoid partitioning
library(cluster)
  set.seed(112358)
  cl<-pam(data, 5)
  plot(data, col=cl$cluster)
  plot(cl) # shows boundary and silhouette plots

# ==================================================
# Affinity propagation clustering
# ==================================================
# Based on B. J. Frey and D. Dueck. Clustering by
# passing messages between data points. 
# Science, 315(5814):972–976, 2007

install.packages("apcluster")
library(apcluster)

apRes <- apcluster(negDistMat(r=2), data)
apRes

heatmap(apRes)

length(apRes)
cutree(apRes)

oPar <- par(mfrow=c(3,2))
matplot(t(data[unlist(apRes[2]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[3]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[8]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[14]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[15]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[9]),]),type="l",xlab="time",ylab="log expression value")
par(oPar)


# try the same with the full 237 row dataset
cho.all<-as.matrix(read.table("Data/logcho_237_4class.txt",skip=1)[,3:19])
apAll <- apcluster(negDistMat(r=2), cho.all)
apAll

oPar <- par(mfrow=c(3,3))
for (i in 1:8) {
	matplot(t(cho.all[unlist(apAll[i]),]), type="l",
	xlab="time",ylab="log expression value")
}
par(oPar)


# ==================================================
# t-stochastic Neighbour embedding
# ==================================================

install.packages("tsne")
library(tsne)

# set up a plotting routine
plotProgress <- function(x){ 
	plot(x, type='n');
#	text(x, labels = rownames(data), cex=0.5)
	points(x, pch=21, col="#6677FF", bg="firebrick")
	}

set.seed(112359)
tsneOut <- tsne(data, epoch = 100, epoch_callback = plotProgress, perplexity=10)



# [End]



-->