# ================================================== #
# Introduction to R                                  #
#                                                    #
# Boris Steipe <boris.steipe@utoronto.ca>            #
#                                                    #
# ================================================== #

# ================================================== #
#                                                    #
# Sample solutions for tasks                         #
#                                                    #
# ================================================== #


# TASK - getting text into R objects - the "characteristic"
#        genes ...
#   part 1: manually
# ================================================
#
# use a text-processor to replace each paragraph break
# with the string ", ". Then wrap the resulting string
# in c() and assign it.

# The "characteristic" genes, typed in by hand as one character vector
# (46 gene symbols, in the order in which they were listed).
cGenes <- c(
    "Cd19",   "Cd79b",  "Cd22",   "Cd37",    "Ctsd",  "Apoe",     "C1qa", "C1qb",
    "C1qc",   "Csf1r",  "Slpi",   "Tlr2",    "Mmp13", "Marco",    "Ifng", "Gzmb",
    "Myc",    "Xcl1",   "Ccl5",   "Gzma",    "Nkg7",  "Spic",     "Cebpb", "Lyz2",
    "Sfpi1",  "Nfkbiz", "Bst2",   "Siglech", "Ly6d",  "Irf8",     "Cst3", "Naaa",
    "Ccr7",   "Cxcl9",  "Traf1",  "Relb",    "Itgax", "Tmem176b", "Tnf",  "Tnfaip3",
    "Nfkbia", "Il15",   "Cxcl10", "Ifit1",   "Isg15", "Irf7"
)

#   part 2: all at once
# ================================================
#
# Paste the gene list verbatim (one symbol per line) into a single string
# literal and split that string into its words.
# The split pattern matches a *run* of white space: with a single-character
# pattern, Windows line endings (CR LF), blank lines or trailing blanks in
# the pasted text would each produce an empty string in the result.
# strsplit() returns a list with one element per input string; unlist()
# turns that into a plain character vector.
cGenes2 <- unlist(strsplit("Cd19
Cd79b
Cd22
Cd37
Ctsd
Apoe
C1qa
C1qb
C1qc
Csf1r
Slpi
Tlr2
Mmp13
Marco
Ifng
Gzmb
Myc
Xcl1
Ccl5
Gzma
Nkg7
Spic
Cebpb
Lyz2
Sfpi1
Nfkbiz
Bst2
Siglech
Ly6d
Irf8
Cst3
Naaa
Ccr7
Cxcl9
Traf1
Relb
Itgax
Tmem176b
Tnf
Tnfaip3
Nfkbia
Il15
Cxcl10
Ifit1
Isg15
Irf7", "\\s+"))

# Both routes should have produced exactly the same vector: expect TRUE.
identical(x = cGenes, y = cGenes2)


# TASK - binomial name 
#   part 1: function for 1 string
# ================================================

# The functionality:
x <- "arabidopsis thaliana"
y <- strsplit(toupper(x), " ")    # a list holding one vector of words
y
paste0(substr(y[[1]][1], 1, 3),   # first three letters of the first word ...
       substr(y[[1]][2], 1, 2))   # ... followed by two of the second word
y                                 # y itself has not been changed

# as a function:
biCode <- function(s) {
    # Return a five letter species code built from the first two
    # white-space separated words of a string: the first three letters
    # of the first word followed by the first two letters of the second
    # word, in upper case.
    #
    # Parameters:
    #    s   a single string such as "arabidopsis thaliana". If a longer
    #        vector is passed, only its first element is used.
    # Value:
    #    a single string such as "ARATH", or NA if s is missing or has
    #    fewer than two words (no code can be formed in that case).

    s <- as.character(s)[1]
    if (is.na(s)) {
        return(NA_character_)
    }
    # Trim first, then split on runs of white space, so that leading
    # blanks, tabs or doubled blanks do not produce empty "words".
    words <- strsplit(trimws(toupper(s)), "\\s+")[[1]]
    if (length(words) < 2) {
        return(NA_character_)
    }
    paste0(substr(words[1], 1, 3), substr(words[2], 1, 2))
}
biCode(x)


# TASK - cell type labels 
# ================================================

# One cell-type label per gene: each label is repeated as many times as
# given by the matching entry of 'times' (46 labels in total).
# NOTE(review): the expression-table columns named further down read
# "DC1.*" / "DC2.*" while the labels here read "pDC1" / "pDC2" -
# confirm which spelling is intended.
ctLabels <- rep(c("B", "MF", "NK", "Mo", "pDC", "pDC1", "pDC2", "all"),
                times = c(4, 10, 7, 5, 4, 5, 3, 8))

# Bind the two vectors side by side into a two-column character matrix;
# the columns are named after the vectors.
cGeneCells <- cbind(cGenes = cGenes, ctLabels = ctLabels)


# TASK - read Table_S3.csv
# ================================================

# I see stuff in the header 
# - I can either fix this in the csv
# - skip it with read.table, or
# - delete it from the result


# First attempt: read the file as it is, treating no line as a header, so
# that the extra material at the top shows up as ordinary rows that can be
# inspected and then removed.
# stringsAsFactors is spelled out because its default changed from TRUE to
# FALSE in R 4.0.0; the "redo with stringsAsFactors = FALSE" step that
# follows only makes sense if this first attempt produces factors.
rawDat <- read.table("Table_S3.csv",
                     header = FALSE,
                     sep = ",",
                     stringsAsFactors = TRUE)
head(rawDat, 10)
rawDat <- rawDat[-(1:6), ]   # drop the first six rows (the header material)
head(rawDat, 10)   # now note rownames problem

# Column names: the gene symbol, a ctrl / LPS pair of columns for each
# cell type, and the cluster column.
types <- c("genes",
           "B.ctrl",   "B.LPS",
           "MF.ctrl",  "MF.LPS",
           "NK.ctrl",  "NK.LPS",
           "Mo.ctrl",  "Mo.LPS",
           "pDC.ctrl", "pDC.LPS",
           "DC1.ctrl", "DC1.LPS",
           "DC2.ctrl", "DC2.LPS",
           "cluster")

colnames(rawDat) <- types

# Fix rownames problem: after deleting rows the remaining row names no
# longer start at 1, so renumber them. seq_len() is used instead of
# 1:nrow() because it also behaves correctly for a table with zero rows.
nrow(rawDat)
rownames(rawDat) <- seq_len(nrow(rawDat))

# typeInfo() is a helper that is not defined in this part of the file.
typeInfo(rawDat)
                      
# redo with stringsAsFactors = FALSE

# Same steps as before, but strings are kept as character vectors.
# The file name is spelled exactly as in the first read ("Table_S3.csv"):
# on a case-sensitive file system a differently capitalized name is a
# different (missing) file.
rawDat <- read.table("Table_S3.csv",
                     header = FALSE,
                     sep = ",",
                     stringsAsFactors = FALSE)
rawDat <- rawDat[-(1:6), ]                  # drop the header material
colnames(rawDat) <- types                   # 'types' was defined above
rownames(rawDat) <- seq_len(nrow(rawDat))   # renumber the rows from 1

# Columns 2 to 16 were read as text. Convert them to a matrix and change
# the storage mode of that matrix to double; this keeps the dimensions and
# the column names. (as.matrix() has no 'ncol' argument, so none is given.)
sup3 <- as.matrix(rawDat[ , 2:16])
storage.mode(sup3) <- "double"

# Put the gene symbols back in front of the numeric columns.
sup3 <- data.frame(genes = rawDat[ , 1], sup3, stringsAsFactors = FALSE)
head(sup3)
typeInfo(sup3)   # helper that is not defined in this part of the file
                  
# Done.
                      
# read.csv version
?read.csv
# read.csv() already uses "," as the separator, so 'sep' is not needed.
# The file name is spelled exactly as in the first read ("Table_S3.csv"):
# on a case-sensitive file system a differently capitalized name is a
# different (missing) file.
rawDat <- read.csv("Table_S3.csv",
                   header = FALSE,
                   stringsAsFactors = FALSE)

head(rawDat, 10)
rawDat <- rawDat[-(1:6), ]   # drop the first six rows (the header material)
head(rawDat, 10)   # now note rownames problem

# Column names: the gene symbol, a ctrl / LPS pair of columns for each
# cell type, and the cluster column.
types <- c("genes",
           "B.ctrl",   "B.LPS",
           "MF.ctrl",  "MF.LPS",
           "NK.ctrl",  "NK.LPS",
           "Mo.ctrl",  "Mo.LPS",
           "pDC.ctrl", "pDC.LPS",
           "DC1.ctrl", "DC1.LPS",
           "DC2.ctrl", "DC2.LPS",
           "cluster")

colnames(rawDat) <- types

# Fix rownames problem: renumber the remaining rows from 1. seq_len() is
# used instead of 1:nrow() because it is also correct for zero rows.
nrow(rawDat)
rownames(rawDat) <- seq_len(nrow(rawDat))

typeInfo(rawDat)   # helper that is not defined in this part of the file


# Columns 2 to 16 were read as text. Convert them to a matrix and change
# the storage mode of that matrix to double; this keeps the dimensions and
# the column names. (as.matrix() has no 'ncol' argument, so none is given.)
sup3 <- as.matrix(rawDat[ , 2:16])
storage.mode(sup3) <- "double"

# Put the gene symbols back in front of the numeric columns.
sup3 <- data.frame(genes = rawDat[ , 1], sup3, stringsAsFactors = FALSE)
head(sup3)
typeInfo(sup3)

                      
# TASK: check if our "characteristic genes" are all in the table
#=========================
# One TRUE / FALSE per characteristic gene: is it listed in the table?
cGenes %in% sup3$genes


# then find the enrichment vectors for the subset
# Bst2, Siglech, Ly6d, Irf8

# Position of the first gene of the subset within cGenes; the other
# three follow directly after it.
which(cGenes == "Bst2")  # 27

# Look up the table row of each of the four genes, one at a time, and
# collect the row numbers.
v <- c(which(sup3$genes == cGenes[27]),
       which(sup3$genes == cGenes[28]),
       which(sup3$genes == cGenes[29]),
       which(sup3$genes == cGenes[30]))

# The rows of the table that belong to the subset.
sup3[v, ]

# ... but this is awkward. Better:
?apply

               
# TASK: write a biCode() function for vectors
#=========================

# The function pattern:
biCodes <- function(s) {
    # Skeleton only - it shows the steps a vector version has to take:
    #   1. initialize a vector for the results
    #   2. loop over the input
    #   3. collect the code of each element
    #   4. return the result
    NULL   # nothing is computed yet, so the value is NULL
}


biCode <- function(s) {
    # Return a vector of five letter species codes, one per element of s.
    # Each code is built from the first two white-space separated words
    # of the element: the first three letters of the first word followed
    # by the first two letters of the second word, in upper case.
    #
    # Parameters:
    #    s   a character vector such as c("Salvia officinalis", ...);
    #        it may be empty.
    # Value:
    #    a character vector of the same length as s. Elements that are
    #    missing or have fewer than two words yield NA, since no code
    #    can be formed from them.

    s <- as.character(s)
    codes <- rep(NA_character_, length(s))   # result; NA until filled in

    # seq_along() gives an empty sequence for empty input, whereas
    # 1:length(s) would count 1, 0 and fail.
    for (i in seq_along(s)) {
        if (is.na(s[i])) {
            next
        }
        # Trim first, then split on runs of white space, so that leading
        # blanks, tabs or doubled blanks do not produce empty "words".
        words <- strsplit(trimws(toupper(s[i])), "\\s+")[[1]]
        if (length(words) >= 2) {
            codes[i] <- paste0(substr(words[1], 1, 3),
                               substr(words[2], 1, 2))
        }
    }
    codes
}

x <- c("Petroselinum crispum",
       "Salvia officinalis",
       "Rosmarinus officinalis",
       "Thymus vulgaris")

biCode(x)


# TASK: differentially enriched genes
# =========================================


# Overview of one expression column and of the table as a whole.
summary(sup3[ , 2])
head(sup3)

# Absolute difference between the two Mo columns, one value per gene.
dfMo <- abs(sup3[ , "Mo.ctrl"] - sup3[ , "Mo.LPS"])
summary(dfMo)

# example of order()
dfMo[1:5]                  # the first five differences ...
order(dfMo[1:5])           # ... the positions that sort them ascending ...
dfMo[order(dfMo[1:5])]     # ... and the same five values in sorted order

# for all genes ...
# order() sorts ascending, so the largest differences come last and
# tail() picks the ten genes with the largest differences.
# na.last = NA drops missing differences from the ordering; by default
# they would be sorted to the end and reported as if they were the largest.
dfMoOrdered <- order(dfMo, na.last = NA)
dfGenes <- tail(sup3[dfMoOrdered, "genes"], 10)

[END]