# ================================================== #
# Exploratory Analysis of Biological Data using R    #
#                                                    #
# Boris Steipe <boris.steipe@utoronto.ca>            #
#                                                    #
# Some prior contributions by:                       #
#   Raphael Gottardo, FHCRC                          #
#   Sohrab Shah, UBC                                 #
#                                                    #
#                                                    #
# Module 1: EDA (Exploratory Data Analysis)          #
#                                                    #
# ================================================== #


# ==================================================
# Preparations
# ==================================================

# These items should already have been done yesterday.

# Task:
# 1 - You should have a workshop directory on your
#     computer.
# 2 - R Studio should be open and you should have
#     set the current "working directory" to your
#     workshop directory.
# 3 - This file (the one you are reading now) should
#     be saved in that directory...
# 4 - ... and opened in R Studio.
#     Keep the code and comments you write all
#     together in this file.
# 5 - Download PlottingReference.R from the Wiki to
#     your workshop directory - and also open it in
#     R Studio.
# 6 - A file utilities.R should exist in your directory
#     it should contain the function typeInfo(); the
#     function should have been loaded when R Studio
#     started up.
#     (If the function is not loaded, open yesterday's
#     script and reload it.)
# 7 - Confirm:
getwd()   # Confirm the correct directory
dir()     # Confirm that the right files are present.


# ==== CHECKPOINT  ... =====================


# Optional task:
# If you are waiting for others to finish at a "Checkpoint"
# here are two suggestions:
# - Study how RUnit can be used to write tests for
#   code correctness as you develop code:
#     1 - Install/load the "RUnit" package.
#     2 - Explore the vignettes it contains.
#     3 - Explore the functions it contains.
# - Learn about regular expressions at
#    http://steipe.biochemistry.utoronto.ca/abc/index.php/Regular_Expressions


# ==================================================
# Load Data
# ==================================================

# Task:
# In yesterday's workshop we have worked with a
# a supplementary datafile from a 2014 publication
# on single-cell RNAseq for the automatic definition
# of tissue types. The data consists of gene-names,
# Expression values of clustered cells in the
# presence and absence of LPS stimulation, and
# cluster assignment labels. The corresponding
# paper is on the Wiki.

# We have experimented with loading the data and 
# identified a number of issues. Here is a command
# to load the data that addresses these issues:
# - unneccesary header rows are skipped
# - the first data row is not used as a column header
# - the column classes have been defined as one
#   character column and 15 numeric columns
# - column names corresponding to Fig 3B have
#   been defined
# - strings are treated as strings, not factors

# TASK: Read the Supplementary Table 3 into R
#       with the following command:

sup3 <- read.csv("Table_S3.csv",
                  skip = 6,
                  header = FALSE,
                  colClasses = c("character", rep("numeric", 15)),
                  col.names = c("genes",
                                "B.ctrl", "B.LPS",
                                "MF.ctrl", "MF.LPS",
                                "NK.ctrl", "NK.LPS",
                                "Mo.ctrl", "Mo.LPS",
                                "pDC.ctrl", "pDC.LPS",
                                "DC1.ctrl", "DC1.LPS",
                                "DC2.ctrl", "DC2.LPS",
                                "cluster"),
                  stringsAsFactors = FALSE)
str(sup3)
head(sup3)

# ==================================================
# Subsetting
# ==================================================

# Here is a quiz on subsetting. Write R expressions
# that get the following data from sup3:

# rows 1:10 of the first two columns 

# rows 1:10 of the first two columns in reverse order

# rows 1:10 of the first two columns in reverse order,
# but not the third row of the result

# rows 1:10 of the first two columns in random order
#     hint: use sample()

# rows 1:10 of the first two columns, ordered by
# the value in the second column, ascending
#     hint: use order()


# rows 1:10 of the column named Mo.LPS

# rows 1:10 of the columns named Mo.LPS and Mo.ctrl


# all genes with gene-names that are three characters long
#    hint: use nchar()

# column 1:2 of all rows with gene-names that contain
#    the string "Il"
#    hint: use grep()

# all genes for which B-cells are stimulated by LPS by
# more than 2 log units.


# ==================================================
# Descriptive statistics
# ==================================================

set.seed(100)
x <- rnorm(100, mean=0, sd=1)
mean(x)
median(x)
IQR(x)
var(x)
sd(x)
summary(x)

# Task:
# 1 - What would be interesting characterizations
#     of sup3? Explore them.

# =============================================
# Quantiles: what is the threshold that has a 
# given fraction of values above/below it?

x <- seq(-4, 4, 0.1)
f <- dnorm(x, mean=0, sd=1)
q90 <- qnorm(0.90, mean = 0, sd = 1)
plot(x, f, xlab="x", ylab="density", type="l", lwd=5)
abline(v=q90, col=2, lwd=5)

# =============================================
# empirical quantiles

set.seed(100)
x <- rnorm(100, mean=0, sd=1)
quantile(x)
quantile(x, probs=c(.1, .2, .9))

# =============================================

set.seed(100)
x <- rnorm(1000, mean=0, sd=1)
boxplot(x)


# Careful - a boxplot per se can obscure
# important structure in your data. Consider
# for example this bimodal distribution
x <- rnorm(100, mean=-2, sd=1)
x <- c(x, rnorm(100, mean=2, sd=1))
hist(x)
boxplot(x)

# Violin plot
if (!require(ggplot2, quietly=TRUE)) {
  install.packages("ggplot2")
  library(ggplot2)
}
X <- as.data.frame(x)

p <- ggplot(X, aes(1,x))
p + geom_violin()
# See ggplot2 introductory tutorial at 
#     http://www.r-bloggers.com/basic-introduction-to-ggplot2/
# and violin plot documentation at
#     http://docs.ggplot2.org/current/geom_violin.html


# Plotting more than one column with a boxplot
# places the plots side by side.
colnames(sup3)
boxplot(sup3[ ,c("MF.ctrl", "MF.LPS")])
boxplot(sup3[ ,2:14])


# Task:
# 1 - What would be interesting quantiles
#     and boxplots in sup3? Explore this.
#     Interpret the result.


# ==================================================
# Explore plot types
# (Section 1 of PlottingReference.R)
# ==================================================

# ==================================================
# Probability distributions
# ==================================================

# The binomial distribution
?dbinom
x <- 0:1
f <- dbinom(x, size=1, 0.1)
plot(x, f, xlab="x", ylab="density", type="h", lwd=5)

set.seed(100)
x <- rbinom(100, size=1, prob=.1)
hist(x)


# ==================================================
# The Normal distribution
# ==================================================
?dnorm
x <- seq(-4, 4, 0.1)
f <- dnorm(x, mean=0, sd=1)
plot(x, f, xlab="x", ylab="density", lwd=5, type="l")


# ==================================================
# Explore lines (Section 3 of PlottingReference.R)
# ==================================================

x <- seq(-4, 4, 0.1)
f <- dnorm(x, mean=0, sd=1)
plot(x, f, xlab="x", ylab="density", lwd=5, lty=3, type="l")

# =============================================
# Overlay a histogram with a line
set.seed(100)
x <- rnorm(100, mean=0, sd=1)
hist(x)
lines(seq(-3,3,0.1),50*dnorm(seq(-3,3,0.1)), col="red")


# ==== QQ PLOTS ====================================

set.seed(100)
x <- rnorm(100, mean=0, sd=1)
qqnorm(x)
qqline(x, col=2)

# =============================================
# Plotting lines and legends
# Example: compare the normal distribution with
# the t-distribution
x <- seq(-4, 4, 0.1)
f1 <- dnorm(x, mean=0, sd=1)
f2 <- dt(x, df=2)
plot(x, f1, xlab="x", ylab="density", lwd=5, type="l")
lines(x, f2, lwd=5, col=2)
legend(-4, .4, c("Normal", "t2"), col=1:2, lwd=5)

# =============================================
# use qqplot to compare normally distributed samples with
# t-distributed samples

set.seed(100)
t <- rt(100, df=2)
qqnorm(t)
qqline(t, col=2)


# =============================================
# QQ- plot: sample against sample
set.seed(100)
x <- rnorm(100, mean=0, sd=1)
t <- rt(100, df=2)
qqplot(x, t)

# Task:
# 1 - What columns of sup3 could be compared with
#     a qqplot? Explore this. Interpret the result.

# ==================================================
# Flow cytometry data
# ==================================================

# Task:
# 1 - Download the data file GvHD.txt and save it to
#     your project directory.


# GvHD flow cytometry data
gvhd <- read.table("GvHD.txt", header=TRUE)
# Only extract the CD3 positive cells
gvhdCD3p <- as.data.frame(gvhd[gvhd[, 5]>280, 3:6])
plot(gvhdCD3p[, 1:2])

# ==================================================
# Explore scatter plots
# Topics:
# Section 6 - Plotting symbols and characters
#         2 - Colors
#         4 - Coordinates
# ... of PlottingReference.R)
# ==================================================
# Scatter plots are extremely useful, but learning
# a bit about R's plotting capabilities will help
# tremendously to create INFORMATIVE plots.


# Some special packages
# The overlap in the GvHD data can obscure
# data relationships. Here are some alternatives
# for dense plots:

# load "hexbin" package from CRAN
if (!require(hexbin, quietly=TRUE)) {
  install.packages("hexbin")
  library(hexbin)
}

# variant one: hexbin
hb <- hexbin(gvhdCD3p[, 1:2], xbins = 20)
plot(hb, colramp = colorRampPalette(c("#FFFFDD",
                                      "#77EE99",
                                      "#3377AA",
                                      "#0033BB")))

# load "prada" package from BioConductor
if (!require(prada, quietly=TRUE)) {
    source("http://www.bioconductor.org/biocLite.R")
    biocLite("prada")
}

# variant two: smoothScatter
smoothScatter(gvhdCD3p[, 1:2], nrpoints=50, pch=20, cex=0.5, col="#6633BB55")

# variant three: colors vary by density
plot (gvhdCD3p[, c(1,2)], col=densCols(gvhdCD3p[, 1], gvhdCD3p[, 2]), pch=16)


# Task:
# 1 - Create a scatterplot from differences in
#     LPS activation between a pair of cell types in sup3.
# 2 - Add a line that shows the situation if all
#     activations were equal.
# 3 - Redo the plot with density Coloring

plot(sup3[ ,"Mo.ctrl"] - sup3[ ,"Mo.LPS"], 
     sup3[ ,"MF.ctrl"] - sup3[ ,"MF.LPS"])
abline(0,1,col = "red")

# =============================================
# Trellis plots: all against all

plot(gvhdCD3p, pch=".")

# =============================================

boxplot(gvhdCD3p)

# =============================================

oPar <- par(mfrow=c(2, 2))
hist(gvhdCD3p[, 1], 50, main=names(gvhdCD3p)[1], xlab="fluorescent intensity", ylim=c(0, 120))
hist(gvhdCD3p[, 2], 50, main=names(gvhdCD3p)[2], xlab="fluorescent intensity", ylim=c(0, 120))
hist(gvhdCD3p[, 3], 50, main=names(gvhdCD3p)[3], xlab="fluorescent intensity", ylim=c(0, 120))
hist(gvhdCD3p[, 4], 50, main=names(gvhdCD3p)[4], xlab="fluorescent intensity", ylim=c(0, 120))
par(oPar)


# =============================================

# ==================================================
# Explore 3D plots
# 
# Section 8 - 8 - Plots of X-Y-Z coordinates
# ... of PlottingReference.R)
# ==================================================


# [End]