# ================================================== #
# Introduction to R                                  #
#                                                    #
# Boris Steipe                                       #
#                                                    #
# ================================================== #

# ================================================== #
#                                                    #
# Sample solutions for tasks                         #
#                                                    #
# ================================================== #


# TASK - getting text into R objects - the "characteristic"
#        genes ...

# part 1: manually
# ================================================
#
# Use a text-processor to replace each paragraph-break
# with the string ", " (including the quotation marks), add
# the opening and closing quotes, then wrap the result
# into c() and assign it.

cGenes <- c("Cd19", "Cd79b", "Cd22", "Cd37", "Ctsd", "Apoe", "C1qa", "C1qb",
            "C1qc", "Csf1r", "Slpi", "Tlr2", "Mmp13", "Marco", "Ifng", "Gzmb",
            "Myc", "Xcl1", "Ccl5", "Gzma", "Nkg7", "Spic", "Cebpb", "Lyz2",
            "Sfpi1", "Nfkbiz", "Bst2", "Siglech", "Ly6d", "Irf8", "Cst3",
            "Naaa", "Ccr7", "Cxcl9", "Traf1", "Relb", "Itgax", "Tmem176b",
            "Tnf", "Tnfaip3", "Nfkbia", "Il15", "Cxcl10", "Ifit1", "Isg15",
            "Irf7")


# part 2: all at once
# ================================================
#
# Put the whole list into ONE string and split it on whitespace.
# strsplit() returns a list (one element per input string), so
# unlist() is needed to get a plain character vector.

cGenes2 <- unlist(strsplit(
  "Cd19 Cd79b Cd22 Cd37 Ctsd Apoe C1qa C1qb C1qc Csf1r Slpi Tlr2 Mmp13 Marco Ifng Gzmb Myc Xcl1 Ccl5 Gzma Nkg7 Spic Cebpb Lyz2 Sfpi1 Nfkbiz Bst2 Siglech Ly6d Irf8 Cst3 Naaa Ccr7 Cxcl9 Traf1 Relb Itgax Tmem176b Tnf Tnfaip3 Nfkbia Il15 Cxcl10 Ifit1 Isg15 Irf7",
  "\\s"))

# Both approaches must give exactly the same vector.
identical(cGenes, cGenes2)


# TASK - binomial name

# part 1: function for 1 string
# ================================================

# The functionality:
x <- "arabidopsis thaliana"
y <- strsplit(toupper(x), " ")   # a list with one character vector
y
paste0(substr(y[[1]][1], 1, 3), substr(y[[1]][2], 1, 2))
y

# as a function:
biCode <- function(s) {
  # Return a five letter species code built from the first two
  # elements of a single, space-separated string: the first three
  # letters of element one followed by the first two letters of
  # element two, in upper case.
  #
  # s: a character string such as "arabidopsis thaliana".
  # Value: a single character string, e.g. "ARATH".
  v <- strsplit(toupper(s), " ")
  code <- paste0(substr(v[[1]][1], 1, 3),
                 substr(v[[1]][2], 1, 2))
  return(code)
}

biCode(x)


# TASK - cell type labels
# ================================================

# One label per gene in cGenes, in the same order (46 labels in total).
# NOTE(review): the labels "pDC1" and "pDC2" differ from the column
# names "DC1.*" and "DC2.*" used for the table below - confirm which
# spelling is intended.
ctLabels <- c(rep("B", 4),
              rep("MF", 10),
              rep("NK", 7),
              rep("Mo", 5),
              rep("pDC", 4),
              rep("pDC1", 5),
              rep("pDC2", 3),
              rep("all", 8))

# Two-column character matrix: gene name, cell type label.
cGeneCells <- cbind(cGenes, ctLabels)


# TASK - read Table_S3.csv
# ================================================

# I see stuff in the header
#  - I can either fix this in the csv
#  - skip it with read.table, or
#  - delete it from the result

rawDat <- read.table("Table_S3.csv", header = FALSE, sep = ",")
head(rawDat, 10)
rawDat <- rawDat[-(1:6), ]   # drop the six header rows
head(rawDat, 10)             # now note rownames problem

types <- c("genes",
           "B.ctrl", "B.LPS",
           "MF.ctrl", "MF.LPS",
           "NK.ctrl", "NK.LPS",
           "Mo.ctrl", "Mo.LPS",
           "pDC.ctrl", "pDC.LPS",
           "DC1.ctrl", "DC1.LPS",
           "DC2.ctrl", "DC2.LPS",
           "cluster")
colnames(rawDat) <- types

# Fix rownames problem: after dropping rows the row names no longer
# start at 1, so renumber them.
nrow(rawDat)
rownames(rawDat) <- seq_len(nrow(rawDat))

# typeInfo() is not defined in this script; it is expected to be
# available in the workspace already.
typeInfo(rawDat)

# redo with stringsAsFactors = FALSE
# (file name spelled as in the first call, so that the script also
# works on case-sensitive file systems)
rawDat <- read.table("Table_S3.csv",
                     header = FALSE,
                     sep = ",",
                     stringsAsFactors = FALSE)
rawDat <- rawDat[-(1:6), ]
colnames(rawDat) <- types
rownames(rawDat) <- seq_len(nrow(rawDat))

# Columns 2 to 16 were read as text because of the header rows:
# turn them into a matrix, convert the matrix to numeric, and
# re-attach the gene names as the first column.
sup3 <- as.matrix(rawDat[, 2:16])
class(sup3) <- "numeric"
sup3 <- data.frame(genes = rawDat[, 1],
                   sup3,
                   stringsAsFactors = FALSE)
head(sup3)
typeInfo(sup3)

# Done.

# read.csv version
?read.csv

# read.csv() already assumes sep = ","; the file name uses the same
# spelling as the task title ("Table_S3.csv") so that the script also
# works on case-sensitive file systems.
rawDat <- read.csv("Table_S3.csv",
                   header = FALSE,
                   stringsAsFactors = FALSE)
head(rawDat, 10)
rawDat <- rawDat[-(1:6), ]   # drop the six header rows
head(rawDat, 10)             # now note rownames problem

types <- c("genes",
           "B.ctrl", "B.LPS",
           "MF.ctrl", "MF.LPS",
           "NK.ctrl", "NK.LPS",
           "Mo.ctrl", "Mo.LPS",
           "pDC.ctrl", "pDC.LPS",
           "DC1.ctrl", "DC1.LPS",
           "DC2.ctrl", "DC2.LPS",
           "cluster")
colnames(rawDat) <- types

# Fix rownames problem: after dropping rows the row names no longer
# start at 1, so renumber them.
nrow(rawDat)
rownames(rawDat) <- seq_len(nrow(rawDat))

# typeInfo() is not defined in this script; it is expected to be
# available in the workspace already.
typeInfo(rawDat)

# Columns 2 to 16 were read as text because of the header rows:
# turn them into a matrix, convert the matrix to numeric, and
# re-attach the gene names as the first column.
sup3 <- as.matrix(rawDat[, 2:16])
class(sup3) <- "numeric"
sup3 <- data.frame(genes = rawDat[, 1],
                   sup3,
                   stringsAsFactors = FALSE)
head(sup3)
typeInfo(sup3)


# TASK: check if our "characteristic genes" are all in the table
# =========================

cGenes %in% sup3$genes

# then find the enrichment vectors for the subset
# Bst2, Siglech, Ly6d, Irf8
which("Bst2" == cGenes)  # 27

# Row index in sup3 for each of the four genes cGenes[27] to cGenes[30]
v <- c(which(cGenes[27] == sup3$genes),
       which(cGenes[28] == sup3$genes),
       which(cGenes[29] == sup3$genes),
       which(cGenes[30] == sup3$genes))
sup3[v, ]

# ... but this is awkward. Better:
?apply


# TASK: write a biCode() function for vectors
# =========================

# The function pattern (a skeleton only - the body is all comments):
biCodes <- function(s) {
  # initialize a vector
  # loop over the input
  #   collect the code
  # return the result
}

biCode <- function(s) {
  # Return a vector of five letter species codes from the first two
  # elements of space-separated strings: the first three letters of
  # element one followed by the first two letters of element two,
  # in upper case.
  #
  # s: a character vector such as c("Salvia officinalis", ...).
  # Value: a character vector of the same length as s.
  codes <- character(length(s))
  # seq_along() yields an empty sequence for a zero-length input,
  # whereas 1:length(s) would iterate over c(1, 0).
  for (i in seq_along(s)) {
    v <- strsplit(toupper(s[i]), " ")
    codes[i] <- paste0(substr(v[[1]][1], 1, 3),
                       substr(v[[1]][2], 1, 2))
  }
  return(codes)
}

x <- c("Petroselinum crispum",
       "Salvia officinalis",
       "Rosmarinus officinalis",
       "Thymus vulgaris")
biCode(x)


# TASK: differentially enriched genes =========================================

summary(sup3[, 2])
head(sup3)

# Absolute difference between the Mo.ctrl and Mo.LPS columns,
# one value per gene.
dfMo <- abs(sup3[, "Mo.ctrl"] - sup3[, "Mo.LPS"])
summary(dfMo)

# example of order(): it returns the indices that would sort its
# argument in increasing order, not the sorted values themselves
dfMo[1:5]
order(dfMo[1:5])
dfMo[order(dfMo[1:5])]

# for all genes ...
# Indices that sort dfMo in increasing order.
dfMoOrdered <- order(dfMo)

# First column of sup3 for the last ten of those indices, i.e. the
# ten rows with the largest dfMo values.
dfGenes <- tail(sup3[dfMoOrdered, 1], 10)