# subsettingExercises.R
#
# A subsetting and filtering review.
#
# Creates synthetic expression profiles for 1000 genes: 12 timesteps,
# starting at ~ 0 and then adding or subtracting one expression unit (plus a
# bit of noise) at every step. The second part of the script works through a
# series of subsetting exercises on that data; each bare expression is meant
# to be evaluated so that its value is printed.


# ==== Create the data ========================================================

# randName() creates a random string of n lowercase letters.
#
# Parameters:
#   n   number of letters in the string. sample() draws without replacement
#       here, so no letter is repeated and n must not exceed 26.
# Value:
#   a single character string with n characters.
randName <- function(n) {
  paste(sample(letters, n), collapse = "")
}

nGenes <- 1000  # number of rows (genes)
nSteps <- 12    # number of expression columns (timesteps)

set.seed(112358)

# Create a data frame with a name column: one random 8-letter name per gene.
myData <- data.frame(
  genes = vapply(seq_len(nGenes), function(i) randName(8), character(1)),
  stringsAsFactors = FALSE
)

# Starting expression values: add an empty numeric column, then fill it.
myData <- cbind(myData, numeric(nrow(myData)))
head(myData)

myData[, 2] <- rnorm(nrow(myData), 0, 0.5)
head(myData)

# Add 11 more expression levels: previous level +- 1 + a bit of noise.
# The two random draws are made row by row (first the direction, then the
# noise) so the random numbers are consumed in a fixed, reproducible order.
for (step in seq_len(nSteps - 1)) {
  previous <- myData[[ncol(myData)]]
  expValues <- vapply(
    previous,
    function(level) level + sample(c(1, -1), 1) + rnorm(1, 0, 0.3),
    numeric(1)
  )
  myData <- cbind(myData, expValues)
}

# Column names: "genes", then "V01" ... "V12".
colnames(myData) <- c("genes", sprintf("V%02d", seq_len(nSteps)))
head(myData)


# ==== Now analyze the data ===================================================

# rows 1:10 of columns 1 and 2
1:10
1:2
myData[c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), c(1, 2)]
myData[1:10, 1:2]

# rows 1:10 of the first two columns in reverse order
10:1
myData[10:1, 1:2]

# rows 1:10 of the first two columns in reverse order,
# but not the third row of the result
# (the third row of the reversed result is original row 8, so that is the
# index to leave out; both expressions give the same rows)
myData[c(10:9, 7:1), 1:2]
(myData[10:1, 1:2])[-3, ]

# rows 1:10 of the first two columns in random order
# hint: use sample()
sample(1:10)
myData[sample(1:10), 1:2]

# rows 1:10 of the first two columns, ordered by
# the value in the second column, ascending
# hint: use order()
myData[1:10, 2]
order(myData[1:10, 2])
# order() returns positions within the 10-element vector; these can be used
# directly as row numbers of myData only because the selection starts at
# row 1.
myData[order(myData[1:10, 2]), 1:2]

# columns 1:2 of all rows with gene-names that begin with "q"
# hint: use substr(x, start, stop)
substr(myData[, "genes"], 1, 1) == "q"
myData[substr(myData[, "genes"], 1, 1) == "q", 1:2]

# the row of the gene with the highest final expression level
# (comparing with == is safe here: max() returns one of the column's own
# values unchanged)
max(myData[, ncol(myData)])
myData[myData[, ncol(myData)] == max(myData[, ncol(myData)]), ]

# [END]