# ================================================== # # Introduction to R # # # # Boris Steipe # # # # ================================================== # # ================================================== # # # # Chapter 1: The R Environment # # # # ================================================== # # Set up your workspace # ===================== # Let's set the current "working directory" to your # workshop directory. You have a workshop directory, right? # Where are you now? # Select the line below, or place your cursor in the line, # then hit to execute the line. getwd() # ==== CHECKPOINT ========================= # Place a green post-it on the lid of your # laptop when this worked, place a pink- # post-it if it didn't work. # ========================================= # syntax to set the working directory... setwd("/path/of/your/directory") # the argument is a string setwd("/path/of/your/course\ directory") # "escape" blank spaces list.files() # A note for Windows: the "\" (backslash) has a particular # meaning in strings: it "escapes" the following character. # Therefore something like setwd("C:\My\R\files\") will NOT # work. You have to "escape the escape character" to turn it # into a "literal" backslash: setwd("C:\\My\\R\\files\\") or # R will translate for you: setwd("C:/My/R/files/") # I usually write setwd("whatever/path") as the first command # in my scripts. # But how do you know what the full path for the working # directory is? # There is a neat trick to get the path for a file (on the Mac). # Drag and drop a file into a script. This will put the full path # of the file into your script (... as the argument to a source() # command). # But unfortunately this does not work in R-Studio. There, you can # use the menu to change the working directory. Then copy the # command, which includes the path, into your script. # ================================================== # Using the console # ================================================== # Note when you type a " or (, R automatically adds the # corresponding closing element. Similar, if you select # characters and type a quotation mark or bracket # your text is not overwritten but enclosed. # Sometimes you get stuck when the console tries to # break a statement over several lines and you don't # supply the correct closing element... # Try using the escape character to break out of this. # Also: code completion in R Studio... # TASK: type "sys" in the console to get a list of system- # related functions. Find Sys.time(), and use the tab # to autocomplete. # Note that code completion works only in the console, # not in the script editor. # ================================================== # Using scripts # ================================================== # You can simply type R commands in the console and work this way. # This is convenient, since you can view # your command history (how?) and use , # etc. to retrieve and edit previous commands. # But it's much better to use scripts for everything. # In fact: # ALL YOUR R WORK SHOULD ALWAYS GO INTO SCRIPTS. # This way you are on a good path to robust, reproducible # work habits. # You simply type your commands in the script, as you # would type them into the console. # Then, put the cursor in any line, or select a block of code # and press # (Mac) # (Win) # to execute it in the console. # Try: length(dir("~", pattern = ".txt")) # Now select only what's between the outer brackets, # and executing this portion of code. # Select multiple lines to execute a whole block. for (i in 1:length(dir("~"))) { print(dir("~")[i]) } # You can use source("filename") to execute an entire # script at once. # TASK: download a template script from the workshop Wiki, # save it to your workshop directory, and open a copy(!) # in R Studio. Save that under "workshopNotes.R" or similar # and use it as a scratchpad for the workshop. # ==== CHECKPOINT ========================= # Place a green post-it on the lid of your # laptop when you are done, place a pink- # post-it if there are problems. # ========================================= # The idea is to have your notes and your R code in # the same place. This is a first step towards # "literate programming". # ================================================== # Help, documentation other information # ================================================== # Recapitulate ... ?dir # help("dir") ??dir # same as help.search("dir") apropos("^dir") # the "sos" package to look for functions everywhere if (!require(sos, quietly=TRUE)) { install.packages("sos") library(sos) } ls("package:sos") # contents of "sos" browseVignettes("sos") # documentation # use "sos" to find more functions... findFn("directory") # ================================================== # # # # Chapter 2: First steps # # # # ================================================== # # ================================================== # Getting data into R # ================================================== # === Assigning variables x <- "1" x phi <- 2 * pi phi x <- pi x # Question: how many significant digits does R process? # How do you print them all? # TASK: find out how to control the number of digits # printed in a print() expression. # ==== CHECKPOINT ========================= # Place a green post-it on the lid of your # laptop when you are done, place a pink- # post-it if you can't solve this. # ========================================= # sprintf() gives the most control sprintf("%50.49f", pi) # === Digression: Anatomy of a function ... # Various functions exist to display the properties of R objects. # Here is a function that combines them: typeInfo <- function(x) { print(x, digits=22) cat("str: ") str(x) cat("mode: ", mode(x), "\n") cat("typeof: ", typeof(x), "\n") cat("class: ", class(x), "\n") # if there are attributes, print them too if (! is.null(attributes(x))) { cat("attributes:\n") print(attributes(x)) } } # That's a useful utility to have. # Let's take it apart. # Where do we save it so it becomes available whenever # we start up R? # TASK: Place the function typeInfo() into a file named "Utilities.R" # Configure R to load Utilities.R upon startup. # Exit R Studio, and start it again. How do you know # the function is now available? # ==== CHECKPOINT ========================= # Place a green post-it on the lid of your # laptop when you are done, place a pink- # post-it if there are problems. # ========================================= # === DATA TYPES ... # === Creating vectors # Recapitulate: v <- c(1, 1, 2, 3, 5, 8) v v <- c(v, 13, 25) v 1:3 seq(-0.5, 0.5, by = 0.1) rep("Ha", 3) genes <- c("Spic", "Cebpb", "Lyz2", "Sfpi1", "Nfkbiz") # These are some of the genes that are markers for # monocytes... # Digression: A paper from modern high-throughput # biology... # TASK: Download and browse the Jaitin et al. paper. # # We will make it our goal to analyse the supplementary data from this # paper. # Often the data we need can be copied and pasted from simple # text files. # TASK: open the text file for "Fig_3-CharacteristicGenes". # (the link is in the Wiki) in your browser. I have # prepared this file from the text in the actual # paper. # How do we get this data into a vector? # First think of a way to do this by hand. # Then, design a function that would # do this conveniently. # -> See sample solution if needed. # ==== CHECKPOINT ========================= # Place a green post-it on the lid of your # laptop when you are done, place a pink- # post-it if there are problems. # ========================================= # TASK: Consolidate working with text: convert a binomial # scientific name into a 5-letter label. The string # you work with should be something like # "arabidopsis thaliana". The result should be # "ARATH". # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # Vectors can be joined together to make matrices, # but be careful that they have the same type. When # in doubt, use dataframes instead. # TASK: Make a vector with cell-type labels, the elements # (taken from the columns of Fig. 3B) should # match each row of Fig. 3. Use rep() expressions, # (e.g. rep("B", 4), don't type out every line. # TASK: Make a matrix, so that each row contains # a characteristic gene and the cell type it # characterizes. Hint: use cbind(). # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # =================================================== # Lists # =================================================== # Lists are the workhorses of R. They are objects that # can contain data of any type. Most of R's more advanced # objects are lists "under the hood". # A brief example: pUC19 <- list(size = 2686, # number marker = "ampicillin", # character ori = "ColE1", accession = "L01397", inLab = TRUE, # Boolean sites = list(EcoRI=c(396), # List AclI=c(1924, 2297), DraI=c(1563, 1582, 2274), BanI=c(235, 408, 550, 1647) ) ) pUC19[[1]] pUC19[[3]] pUC19$ori pUC19$sites pUC19$sites$BanI pUC19$sites$BanI[2] unlist(pUC19) # Explain .... # =================================================== # Data frames # =================================================== # A data frame is a matrix or a "set" of data. It is # a list of vectors and/or factors of the same length, # that are related "across", such that data in the # same position come from the same experimental unit # (subject, animal, etc). # Since it is basically a list, the columns can have different type! myDF <- data.frame(genes = c("Abc1", "Qrz", "Fubr31"), expr = c(168059, 23490578, 34), induced = c(TRUE, FALSE, FALSE)) myDF[-2,] typeInfo(myDF) # What is it with the factors? ... myDF <- data.frame(genes = c("Abc1", "Qrz", "Fubr31"), expr = c(168059, 23490578, 34), induced = c(TRUE, FALSE, FALSE), stringsAsFactors = FALSE) typeInfo(myDF) # Why do we need data frames if they do the much # the same as a list? # More efficient storage, and indexing! # R's read...() functions return data frames. # ... which gets us back to our goal of studying details # of the Jaitin et al. paper: # Are the "known" markers of Fig. 2 D enriched # as expected in the cell types? # The relevant data is in supplementary table 3, # which is an Excel spreadsheet. # TASK: Download supplementary table 3 from the workshop Wiki: # A word on Excel: it's a very good spreadsheet program, # it is miserable and often wrong on statistics, # and it makes horrible, horrible plots. # To elaborate - see the two links below: # http://www.practicalstats.com/xlsstats/excelstats.html # http://www.burns-stat.com/documents/tutorials/spreadsheet-addiction/ # ... these are not merely cosmetic problems! # Therefore: It's oK to keep data in Excel spreadsheets # if you must - but read it into R for any analysis! # But be cautious: one of the problems of Excel is that # it truncates numeric precision. Protip: convert all # cells to "text" before export. # There are many other "read" functions. # Refer to the R Data Import / Export manual # http://cran.r-project.org/doc/manuals/R-data.html # See: ?read.table # ... includes read.csv and read.delim ?read.fwf # ... for "fixed width format" ?readLines # ... for reading in text-files line by line # Excel spreadsheets should be converted to csv # or tsv format. Alternatively the package # xlsreadwrite is available via CRAN ... see # http://cran.r-project.org/web/packages/xlsReadWrite/ # ... but I think this is unsound practice. # Task: # 1 - load the data in Table_S3.xls on the Wiki # into Excel, and save it as # a .csv (comma separated values) file. # 2 - Examine the file (a plain text file) in a # text-editor (such as R). # 3 - Read the table into R, assigning it to a variable. # I usually give the first input of data the variable # name "rawDat" since it will usually be processed before # it becomes meaningful for analysis. # 4 - Use head() to look at the beginning of the object. # 5 - Remove any unneeded header rows. # 6 - Give the columns names that reflect the cell type (cf. # Figure 2c), and the stimulus status. # 7 - Use typeInfo() to analyse the object you have created. # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # Much output. Nb. for a heavy-duty function, we should rewrite # typeInfo() to limit the output ... # Now: what is it with the "factors". # ================================================== # Digression: Factors... # ================================================== # Many of R's dataframe methods convert all strings # into factors by default. Factors are special types: # they are nominally strings - (m, f) or (agree, neutral, # disagree) or such. But underlyingly they are coded as integers # that identify the "levels" of the factor. # To illustrate. genders <- factor(c("m", "f", "f", "m", "f")) genders typeInfo(genders) is.ordered(genders) # We can define ordered factors by adding some details to # our factor() command - e.g. tumor grades: sampleGrades <- factor(c("G1", "G3", "G3", "G1", "G2", "G1"), levels = c("G1", "G2", "G3", "G4"), ordered = TRUE) sampleGrades # Note that G4 is a level although it # was not observed in the data is.ordered(sampleGrades) # Factors are useful since they support a number of analysis # methods such as ordering boxplots, or calculating # For more on factors, have a look at this factor tutorial # by Jenny Bryan: # http://www.stat.ubc.ca/~jenny/STAT545A/block08_bossYourFactors.html # and this discussion on their use: # http://stackoverflow.com/questions/3445316/factors-in-r-more-than-an-annoyance # # But for our purposes, the default behavior of R, to # treat all strings as factors is entirely unwanted # and needs to be turned off. Always use the parameter # stringsAsFactors = FALSE to achieve this. If you don't # you are in for some bad surprises if e.g. there is # a character "contaminant" in a numeric column. # Note: you can change R's default behaviour and set a global # option to "turn factors off". Some advocate this, # I don't think this is a good idea, since you may # encounter packages or functions that make # assumptions about R's default behaviour and will thus # fail. myDF <- data.frame(data = c("N/A", 1, 1, 2, 3, 5, 8)) typeInfo(myDF) myDF <- myDF[-1, ] myDF myDF2 <- as.numeric(myDF) myDF2 # Whoa! what just happened ? myDF3 <- as.numeric(as.character(myDF)) myDF3 # :-) # TASK: Repair the sup3 data.frame - reload it with # stringsAsFactors = FALSE # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # Check to see what we have ... sup3[1:10,] head(sup3) tail(sup3) nrow(sup3) ncol(sup3) sup3$genes[1:10] sup3$genes[1:10] # Now we can finally return to our original question and # check how enrichment profiles characterize cell-types # in the data, we can try sup3[sup3$genes == "Cd19", ] sup3[sup3$genes == "Lyz2", ] # But! sup3[sup3$genes == "B220", ] # Not found! # B220 has a number of synonyms as a quick Google search shows: B220 CD45 CD45R GP180 L-CA LCA LY5 PTPRC T200 # We should check if the gene is listed under any of # these synonyms in the data... # Is there a convenient way to convert such lines into # a character vector? # Yes: as we have done above, define the list as one string constant, then use # strsplit() on it. s <- "B220 CD45 CD45R GP180 L-CA LCA LY5 PTPRC T200" B220synonyms <- unlist(strsplit(s, "\n")) # The way to check whether a string is contained # in data is the _extremely_ useful %in% operator. ?"%in%" B220synonyms %in% toupper(sup3$genes) # However - no luck. None of the synonyms is in the table either. # How do we know that this is not a problem with our expression? # Positive control! c(B220synonyms, "CD19") %in% toupper(sup3$genes) # TASK: check if our "characteristic genes" are all in the table # then find the enrichment vectors for the subset # Bst2, Siglech, Ly6d, Irf8 # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # To do more than that, we really need to look at writing # "programs". # ================================================== # # # # Chapter 3: Programming basics # # # # ================================================== # # We need two major concepts to program: # - defining conditions (TRUE/FALSE) # - repeating commands (loops) x <- -2 if(x>0) { print(x) } else if (x==0) { print(0) } else { print(-x) } # Logical vectors ?TRUE # Explore conditional statements if (TRUE) {print("true")} else {print("false")} if ("true") {print("true")} else {print("false")} if ("t") {print("true")} else {print("false")} if (1) {print("true")} else {print("false")} if (0) {print("true")} else {print("false")} if (-1) {print("true")} else {print("false")} if (pi) {print("true")} else {print("false")} if (Inf) {print("true")} else {print("false")} if (NULL) {print("true")} else {print("false")} if (NA) {print("true")} else {print("false")} if (NaN) {print("true")} else {print("false")} if (Inf) {print("true")} else {print("false")} # Logical operators TRUE ! TRUE FALSE > TRUE FALSE < TRUE FALSE < -1 0 == FALSE "x" == "u" "x" >= "u" "x" <= "u" "x" != "u" TRUE | FALSE TRUE & FALSE # equality and identity ?identical a <- TRUE b <- TRUE a; b a == b identical(a, b) b <- 1 a; b a == b identical(a, b) # some other useful tests for conditional expressions ?all ?any ?duplicated ?exists ?is.character ?is.factor ?is.integer ?is.null ?is.numeric ?is.unsorted ?is.vector # ============================================= # Loops # for loop n <- 1000000 x <- rnorm(n,10,1) y <- sqrt(x) # Vectors can grow as necessary ... lim <- 50000 # try this with successively larger values l <- c(); system.time( for (i in 1:lim) { l[i] <- sqrt(x[i]) } ) # ... at a great performance cost. Precreating them for the # required length is much more efficient. lim <- 50000 # try this with successively larger values l <- numeric(lim) system.time( for (i in 1:lim) { l[i] <- sqrt(x[i]) } ) # while loop x <- "" while (substr(x, nchar(x), nchar(x)) != "G") { x <- paste(x, sample(LETTERS[1:15], 1), sep="") } x # ============================================= # Efficiency of loops # A big topic among R-tisans... y <- numeric(length(x)) z <- y system.time(for (i in 1:length(x)) {y[i] <- sqrt(x[i])}) system.time(z <- sqrt(x)) identical(y, z) # The efficient vector code underneath R causes # operations that can be vectorized to be about # 100 times faster than if done in a loop. # But what to do with matrices? # Consider: n <- 1000000 x <- rnorm(n,10,1) x <- cbind(x, rnorm(n,10,1)) x <- cbind(x, rnorm(n,10,1)) y <- numeric(length(x)) system.time(for (i in 1:nrow(x)) {y[i] <- sqrt(sum(x[i,]))}) # R has a set of functions that take advantage # of this: ?apply and its siblings: z <- numeric(length(x)) system.time(z <- apply(x, 1, function(swan) {sqrt(sum(swan))})) identical(y,z) # Usually the time savings are considered important. # I often prefer the explicitness of simple for-loops # for development and debugging. # ============================================= # A note on namespaces mf <- function(a){ for (i in 1:a) { cat("Hello ") } } mf(3) typeof(3) typeof <- function(a) { # Here we "inadvertently" for (i in 1:a) { # overwrite an inbuilt cat("Hello ") # function name. } } typeof(3) rm(typeof) typeof(3) # TASK: write a biCodes function that # operates on vectors. # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # ================================================== # Back to our dataset # ================================================== # TASK: what genes have the characteristics of the # unlabelled cluster in Figure 3? # Develop the structure of a program to find them. # Implement it step by step. # ==== CHECKPOINT ... ===================== # ================================================== # # # # Chapter 4: Basic Analysis # # # # ================================================== # # ================================================== # Filtering data # ================================================== # TASK: find the top 30 most differentially enriched # genes in the Mo cells. Hint: you will need to sort # results ... but sort() is not the function you need, # you need order(). # Then find the same for Mf cells. # Then find the union of the two sets. Plot the # differential enrichment of one against the other. # -> See sample solution if needed. # ==== CHECKPOINT ... ===================== # ================================================== # Basic plots and slightly more advanced plots # ================================================== # Task: Plot boxplots for the different cell-types, # then plot the actual values of requested # genes. # Task: show the differential expression as a # barplot. # Task: Barplots are bad. Improve the barplot according to # Weissgerber et al.'s ideas # More plotting topics. # ================================================== # Integrating data # ================================================== # TASK: Data integration # 1 - find all genes that look like a Monocyte # response to LPS stimulation. # 2 - translate their gene symbols to Entrez # IDs using http://biodbnet.abcc.ncifcrf.gov/ # 3 - see whether they are co-expressed (i.e. # presumably coregulated) at http://coxpresdb.jp/ # [End]