# subsettingExercises.R
# A subsetting and filtering review

# Create some synthetic expression profiles for 1000 genes:
#   12 timesteps starting at ~ 0 and adding or subtracting one expression
#   unit each.
#

# Build one random name: n distinct lowercase letters, drawn from `letters`
# without replacement and glued together into a single string.
# Because the letters are drawn without replacement, n must not exceed 26.
#
randName <- function(n) {
  chosen <- sample(letters, size = n, replace = FALSE)
  paste0(chosen, collapse = "")
}

# Create a data frame of synthetic expression profiles: one row per gene,
# with a "genes" name column plus one numeric column per timestep
# (V01 ... V12).
#
# Random draws are made in a fixed order: all names first, then all starting
# values, then - timestep by timestep and gene by gene - a +/-1 step followed
# by its noise term. A given seed therefore always yields the same data.

nGenes <- 1000   # number of synthetic genes (rows)
nSteps <- 12     # number of timesteps (expression columns)

set.seed(112358)

# Gene names: random 8-letter strings.
geneNames <- vapply(seq_len(nGenes), function(i) randName(8), character(1))
myData <- data.frame(genes = geneNames, stringsAsFactors = FALSE)
head(myData)

# Expression values are collected in a preallocated matrix instead of growing
# the data frame with one cbind() per timestep.
expValues <- matrix(0, nrow = nGenes, ncol = nSteps)
colnames(expValues) <- sprintf("V%02d", seq_len(nSteps))

# Starting expression values: centred on 0.
expValues[, 1] <- rnorm(nGenes, mean = 0, sd = 0.5)
head(expValues[, 1])

# Every later timestep is the previous level +- 1 plus a bit of noise.
# The per-gene loop is kept on purpose: sample() and rnorm() alternate for
# each gene, and vectorizing them would change the order of the draws.
for (thisStep in seq_len(nSteps)[-1]) {
  for (thisRow in seq_len(nGenes)) {
    expValues[thisRow, thisStep] <- expValues[thisRow, thisStep - 1] +
      sample(c(1, -1), 1) +
      rnorm(1, mean = 0, sd = 0.3)
  }
}

# Attach the expression columns (named V01 ... V12) to the gene names.
myData <- cbind(myData, expValues)

head(myData)


# Now analyze the data
# rows 1:10 of columns 1 and 2


         1:10    # the row selector on its own
         1:2     # the column selector on its own
    # (Two expressions cannot share a line without a separator, so the two
    #  selectors are shown on separate lines.)
    # Spelling the indices out with c() and using the range operator
    # select exactly the same rows and columns:
    myData[c(1,2,3,4,5,6,7,8,9,10), c(1,2)]
    myData[1:10, 1:2]


# rows 1:10 of the first two columns in reverse order


           rev(1:10)    # the row indices, last one first
    # Indexing with a descending vector returns the rows in that order.
    myData[rev(1:10), 1:2]


# rows 1:10 of the first two columns in reverse order,
# but not the third row of the result


     # Rows 10:1 come out as 10, 9, 8, ...: the third row of that result is
     # row 8, so an explicit index vector has to skip 8 (not 3).
     myData[c(10, 9, 7:1), 1:2]
     # Equivalent: build the reversed selection first, then drop its third row
     # with a negative index.
    (myData[10:1, 1:2])[-3, ]


# rows 1:10 of the first two columns in random order
#     hint: use sample()


           sample(seq_len(10))    # one random permutation of 1 ... 10
    # Each call to sample() draws a fresh permutation, so the row order shown
    # here differs from the permutation printed above.
    myData[sample(seq_len(10)), 1:2]


# rows 1:10 of the first two columns, ordered by
# the value in the second column, ascending
#     hint: use order()


             myData[1:10, "V01"]     # the values to sort by
       order(myData[1:10, "V01"])    # positions that put them in ascending order
# The positions returned by order() refer to the ten-element vector; since
# that vector holds rows 1:10, they can be used directly as row indices.
myData[order(myData[1:10, "V01"]), c("genes", "V01")]


# columns 1:2 of all rows with gene names that
# begin with "a"
# hint: use substr(x, start, stop)


       substr(myData$genes, start = 1, stop = 1) == "a"    # one TRUE/FALSE per row
# A logical vector as row index keeps exactly the rows where it is TRUE.
myData[substr(myData$genes, start = 1, stop = 1) == "a", 1:2]


# the row of the gene with the highest final expression level


                                max(myData[[ncol(myData)]])    # highest value in the last column
# Compare every value of the last column against that maximum and keep the
# matching row(s), with all of their columns.
myData[myData[[ncol(myData)]] == max(myData[[ncol(myData)]]), ]


# [END]