R Gene expression clustering
Jump to navigation
Jump to search
R gene expression clustering
This page is a placeholder, or under current development; it is here principally to establish the logical framework of the site. The material on this page is correct, but incomplete.
Gene expression clustering
# ================================================== #
# BCB420 / JTB2020 #
# March 5 2014 #
# Clustering #
# #
# Boris Steipe <boris.steipe@utoronto.ca> #
# ================================================== #
# ==================================================
# Data
# ==================================================
#
# Let's find some cell-cycle data in GEO, for clustering.
# The goal is to identify coregulated genes, but we don't
# know what their response to time in the cell-cycle will
# be: going up? Going down?
# The following code is slightly adapted from GEO2R on
# "Cell cycle expression profiles in HeLa cells" (GSE26922)
# see: http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26922
# There are triplicate measurements for t0 (blocked) and
# t= 2,4,6,8 and 12h post block-release
################################################################
# First, we need to install some analysis packages from bioconductor
# The following commands do for the bioconductor system what
# install.packages() does for CRAN.
source("http://bioconductor.org/biocLite.R")
biocLite("Biobase")
biocLite("GEOquery")
biocLite("limma")
# Then we load the libraries....
library(Biobase)
library(GEOquery)
library(limma)
# load series and platform data from GEO
gset <- getGEO("GSE26922", GSEMatrix =TRUE)
if (length(gset) > 1) idx <- grep("GPL6244", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# check what we have
head(gset)
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
# group names for all samples
sml <- c("G0","G0","G0","G1","G1","G1","G2","G2","G2","G3","G3","G3","G4","G4","G4","G5","G5","G5");
# log2 transform
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprs(gset) <- log2(ex) }
# set up the data and proceed with analysis
fl <- as.factor(sml)
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
cont.matrix <- makeContrasts(G5-G0, G1-G0, G2-G1, G3-G2, G4-G3, G5-G4, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
# load NCBI platform annotation
gpl <- annotation(gset)
platf <- getGEO(gpl, AnnotGPL=TRUE)
ncbifd <- data.frame(attr(dataTable(platf), "table"))
# replace original platform annotation
tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))]
tT <- merge(tT, ncbifd, by="ID")
tT <- tT[order(tT$P.Value), ] # restore correct order
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))
# We will use these 250 genes for clustering
# The top gene has the ID 8117594: what are the original data?
exprs(gset)["8117594",]
# let's make a table from these data
symbols <- tT$Gene.Symbol
data <- c()
for (i in 1:length(tT$ID)) {
v <- c()
v <- c(v, mean(exprs(gset)[tT$ID[i], 1:3]))
v <- c(v, mean(exprs(gset)[tT$ID[i], 4:6]))
v <- c(v, mean(exprs(gset)[tT$ID[i], 7:9]))
v <- c(v, mean(exprs(gset)[tT$ID[i], 10:12]))
v <- c(v, mean(exprs(gset)[tT$ID[i], 13:15]))
v <- c(v, mean(exprs(gset)[tT$ID[i], 16:18]))
data <- rbind(data, v)
}
colnames(data) <- c("t0", "t2", "t4", "t6", "t8", "t12")
rownames(data) <- tT$ID
# ==================================================
# Heatmap
# ==================================================
# This is a standard view of mutlidimensional data.
heatmap(data)
# ==================================================
# Hierarchical clustering
# ==================================================
# first we need to produce a distance table
distData <-dist(data, method = "euclidean")
# single-linkage hierarchical clustering
hc.single<-hclust(distData, method = "single", members=NULL)
plot(hc.single)
# draw rectangles at different cut-levels
rect.hclust(hc.single,k=2)
rect.hclust(hc.single,k=5)
rect.hclust(hc.single,k=10)
rect.hclust(hc.single,k=20)
rect.hclust(hc.single,k=50)
# now retrieve the actual indices and use them to generate
# paralell coordinate plots
class.single<-cutree(hc.single, k = 50)
# Explain the output
class.single
table(class.single)
sort(table(class.single))
oPar <- par(mfrow=c(2,2))
matplot(t(data[class.single==2,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==9,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==4,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.single==17,]),type="l", xlab="time",ylab="log expression value")
par(oPar)
# as an alternative, try Wards- linkage clustering (and read up on the
# options: complete - and average-linkage clustering)
hc.ward <-hclust(distData, method = "ward", members=NULL)
plot(hc.ward)
# draw rectangles
rect.hclust(hc.ward,k=50)
# now retrieve the actual indices and use them to generate
# paralell coordinate plots
class.ward<-cutree(hc.ward, k = 50)
sort(table(class.ward))
oPar <- par(mfrow=c(2,2))
matplot(t(data[class.ward==2,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==15,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==34,]),type="l", xlab="time",ylab="log expression value")
matplot(t(data[class.ward==25,]),type="l", xlab="time",ylab="log expression value")
par(oPar)
# ==================================================
# Partitioning clustering
# ==================================================
# === K-means ======================================
?kmeans
cl<-kmeans(data, 5)
plot(data,col=cl$cluster)
points(cl$centers, col = 1:5, pch = 8, cex=2)
# But: be aware ...
# ... K-means does not guarantee a globally optimal solution,
# merely a locally converged one.
# === K-medoids ======================================
# load library "cluster" for K-medoid partitioning
library(cluster)
set.seed(112358)
cl<-pam(data, 5)
plot(data, col=cl$cluster)
plot(cl) # shows boundary and silhouette plots
# ==================================================
# Affinity propagation clustering
# ==================================================
# Based on B. J. Frey and D. Dueck. Clustering by
# passing messages between data points.
# Science, 315(5814):972–976, 2007
install.packages("apcluster")
library(apcluster)
apRes <- apcluster(negDistMat(r=2), data)
apRes
heatmap(apRes)
length(apRes)
cutree(apRes)
oPar <- par(mfrow=c(3,2))
matplot(t(data[unlist(apRes[2]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[3]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[8]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[14]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[15]),]),type="l",xlab="time",ylab="log expression value")
matplot(t(data[unlist(apRes[9]),]),type="l",xlab="time",ylab="log expression value")
par(oPar)
# try the same with the full 237 row dataset
cho.all<-as.matrix(read.table("Data/logcho_237_4class.txt",skip=1)[,3:19])
apAll <- apcluster(negDistMat(r=2), cho.all)
apAll
oPar <- par(mfrow=c(3,3))
for (i in 1:8) {
matplot(t(cho.all[unlist(apAll[i]),]), type="l",
xlab="time",ylab="log expression value")
}
par(oPar)
# ==================================================
# t-stochastic Neighbour embedding
# ==================================================
install.packages("tsne")
library(tsne)
# set up a plotting routine
plotProgress <- function(x){
plot(x, type='n');
# text(x, labels = rownames(data), cex=0.5)
points(x, pch=21, col="#6677FF", bg="firebrick")
}
set.seed(112359)
tsneOut <- tsne(data, epoch = 100, epoch_callback = plotProgress, perplexity=10)
# [End]
-->