Difference between revisions of "R tutorial"

input prompt

/Users/Pierette/Documents/BCB420
C:\Users\Pulcinella\Documents\CBW
C:\Users\Pantalone\Documents\JTB2020-2017
/Users/Brighella/Documents/UofT Stuffz/Courses/more/Comp Sys biol. course
C:\Users\Tartaglia\Documents\KUWTK\<Coursecode>
>

C:\Program Files\Git\cmd\git.exe
git
(None)
(None)
git
> getwd()

getwd()

<your name>

Elcid Barrett

> help(rnorm)
>
> ?rnorm
>
> ?binom     
No documentation for 'binom' in specified packages and libraries:
you could try '??binom'
> ??binom
>
> ?Binomial
>
> ?apropos     
> apropos("med")   # all functions that contain the string "med"
> apropos("^med")  # all functions that begin with the string 
> apropos("med$")  # all functions that end with the string
> ?"+"
> ?"~"
> ?"["
> ?"%in%"
>
> ?var
> getwd()
[1] "/Users/steipe/R"
> setwd("~") # Note: ~ is the "tilde" - the squiggly line - not the straight hyphen
> getwd()
[1] "/Users/steipe"
> setwd("~/../chen")  
> getwd()
[1] "/Users/chen"
> setwd("/Users/steipe/abc/R_samples")  
> getwd()
[1] "/Users/steipe/abc/R_samples"
> ?Startup
> ls() 
character(0)
>
> a <- 1; b <-2; eps <- 0.0001
> ls() 
[1] "a"   "b"   "eps"
>
> rm(a) 
> ls()
[1] "b"   "eps"
>
> rm(list= ls())
> ls() 
character(0)
>
> library()
> search() 
 [1] ".GlobalEnv"        "tools:RGUI"        "package:stats"     "package:graphics" 
 [5] "package:grDevices" "package:utils"     "package:datasets"  "package:methods"  
 [9] "Autoloads"         "package:base"
> ?vignette
> ??install
> ?install.packages
> install.packages("seqinr")   # Note: quoted string!
also installing the dependency ‘ade4’

trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.2/ade4_1.7-2.tgz'
Content type 'application/x-gzip' length 3365088 bytes (3.2 MB)
==================================================
downloaded 3.2 MB

trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.2/seqinr_3.1-3.tgz'
Content type 'application/x-gzip' length 2462893 bytes (2.3 MB)
==================================================
downloaded 2.3 MB

The downloaded binary packages are in
	/var/folders/mx/ld0hdst54jjf11hpcjh8snfr0000gn/T//Rtmpsy5GMx/downloaded_packages

> library(seqinr)     # This refers to an installed package. No quotes here...
> library(help="seqinr")
> ls("package:seqinr")
  [1] "a"                       "aaa"                     "AAstat"                 
  [4] "acnucclose"              "acnucopen"               "al2bp"                  
     [...]
[205] "where.is.this.acc"       "words"                   "words.pos"              
[208] "write.fasta"             "zscore"                 
> ?a
> a("Tyr")
[1] "Y"
> choosebank()
 [1] "genbank"       "embl"          "emblwgs"       "swissprot"     "ensembl"      
    [...]
 [31] "refseqViruses"
?data
data(package="seqinr")   # list the available data
data(aaindex)            # load ''aaindex''
?aaindex                 # what is this?
aaindex$FASG890101       # two of the indices ...
aaindex$PONJ960101

# Let's use the data: plot amino acid codes by hydrophobicity and volume

plot(aaindex$FASG890101$I,
     aaindex$PONJ960101$I, 
     xlab="hydrophobicity", ylab="volume", type="n")
text(aaindex$FASG890101$I, 
     aaindex$PONJ960101$I, 
     labels=a(names(aaindex$FASG890101$I)))
choosebank("swissprot")
mySeq <- query("mySeq", "N=MBP1_YEAST")
mbp1 <- getSequence(mySeq)
closebank()
x <- AAstat(mbp1[[1]])
barplot(sort(x$Compo))
# Make sure the seqinr package is available, then attach it.
# requireNamespace() only tests whether the package can be loaded; it does
# not attach it. Installation is attempted only if that test fails. The
# final library() call attaches the package in either case and stops with an
# error if the package is still missing, so a failed installation does not
# go unnoticed.
if (!requireNamespace("seqinr", quietly=TRUE)) {
    install.packages("seqinr")
}
library(seqinr)
# Make sure the sos package is available, then attach it.
# requireNamespace() only tests whether the package can be loaded; it does
# not attach it. Installation is attempted only if that test fails. The
# final library() call attaches the package in either case and stops with an
# error if the package is still missing.
if (!requireNamespace("sos", quietly=TRUE)) {
    install.packages("sos")
}
library(sos)

findFn("moving average")
# sample script:
# define a vector
a <- c(1, 1, 2, 3, 5, 8, 13)
# list its contents
a
# calculate the mean of its values
mean(a)
5
5 + 3
5 + 1 / 2 # Think first: is this 3 or 5.5?
3 * 2 + 1
3 * (2 + 1)
2^3 # Exponentiation
8 ^ (1/3) # Third root via exponentiation
7 %% 2  # Modulo operation (remainder of integer division)
7 %/% 2 # Integer division
cos(pi) #"pi" is a predefined constant.
sin(pi) # Note the rounding error. This number is not really different from zero.
sin(30 * pi/180) # Trigonometric functions use radians as their argument - this conversion calculates sin(30 degrees)
exp(1) # "e" is not predefined, but easy to calculate.
log(exp(1)) # functions can be arguments to functions - they are evaluated from the inside out.
log(10000) / log(10) # log() calculates natural logarithms; convert to any base by dividing by the log of the base. Here: log to base 10.
exp(complex(r=0, i=pi)) #Euler's identity
complex(1)
complex(4)
complex(1, 2) # imaginary part missing: if it's missing it defaults to zero
complex(1, 2, 3) # one complex number
complex(4, 2, 3) # four complex numbers
complex(real = 0, imaginary = pi) # defining values via named parameters
complex(imaginary = pi, real = 0) # same thing - if names are used, order is not important
complex(re = 0, im = pi) # names can be abbreviated ...
complex(r = 0, i = pi)   # ... to the shortest string that is unique among the named parameters.
                         # Use this feature with discretion to keep your code readable.
complex(i = pi, 1, 0) # Think: what have I done here? Why does this work?
exp(complex(i = pi, 1, 0)) # (The complex number above is the same as in Euler's identity.)
?make.names
?reserved
a <- 5
a
a + 3
b <- 8
b
a + b
a == b # not assignment: equality test
a != b # not equal
a < b  # less than
# I don't like...
col <- c("red", "grey")
hist(rnorm(200), col=col)

# I prefer instead...
rgStripes <- c("red", "grey")
hist(rnorm(200), col=rgStripes)
info <- function(x) {
    # Print an object, then report how R describes it: its structure, mode,
    # storage type and class, followed by its attributes if it has any.

    # Helper: write one report line made of a label and the reported value(s).
    report <- function(label, value) {
        cat(label, value, "\n")
    }

    print(x)
    cat("str:    ")   # no newline here: the output of str() continues the line
    str(x)
    report("mode:   ", mode(x))
    report("typeof: ", typeof(x))
    report("class:  ", class(x))

    # Objects without attributes are done at this point.
    objAttributes <- attributes(x)
    if (is.null(objAttributes)) {
        return(invisible(NULL))
    }
    cat("attributes:\n")
    print(objAttributes)
}
info( 3 > 5 ) # Note: 3 > 5 is a logical expression, its value is FALSE.
info( 3 < 5 ) 

info( 3.0 )  # Double precision floating point number
info( 3.0e0 )  # Same value, exponential notation

info( 3 )  # Note: numbers are double precision floats by default.
info( as.integer(3) )  # If we really want an integer, we must coerce to type integer.

info( as.character(3) )  # Forcing the number to be interpreted as a character.

# More coercions. For each of these, first think what result you would expect:
info( as.numeric("3") )   # character as numeric
info( as.numeric("3.141592653") )   # string as numeric
info( as.numeric(pi) )   # not a string, but a predefined constant
info( as.numeric("pi") )   # another string as numeric. Ooops - what went wrong?

info( as.complex(1) )  
info( as.logical(0) )  
info( as.logical(1) )  
info( as.logical(-1) )  
info( as.logical(pi) )      # any non-zero number is TRUE ...
info( as.logical("pie") )   # ... but not non-numeric types. NA means "Not Available".

info( as.character(pi) )

info( Inf )
info( NaN )
info( NA )
info( NULL )
info( as.factor("M") )     # factor
info( Sys.time() )         # time
info( letters )            # inbuilt
info( 1:4 )                # numeric vector
info( matrix(1:4, nrow=2)) # numeric matrix
info( list(arabic = 1:3, roman = c("I", "II", "III")))
info( data.frame(arabic = 1:3, roman = c("I", "II", "III"), stringsAsFactors=FALSE))
info( a ~ b )              # a formula
info( info )               # the function itself
a <- 7
b <- 6:7
str(a)             # num 7
str(b)             # int [1:2] 6 7
a == b[2]          # TRUE
identical(b[2], a) # FALSE ! Not identical! Why?
                   # (see the str() results above.)

# If you need to be sure that a number is an
# integer, write it with an "L" after the number:
c <- 7L
str(c)             # int 7
identical(b[2], c) # TRUE
# Create a vector that holds the beginning of the Fibonacci series (from the
# third element on, each element is the sum of its two predecessors), then
# list its contents and length:
f <- c(1, 1, 2, 3, 5, 8, 13)
f
length(f)

# Various ways to retrieve values from the vector.
f[1] # By index: "1" is first element. 
f[length(f)] # length() is the index of the last element.
1:4 # This is the range operator
f[1:4] # using the range operator (it generates a sequence and returns it in a vector)
f[4:1] # same thing, backwards
seq(from=2, to=6, by=2) # The seq() function is a flexible, generic way to generate sequences
seq(2, 6, 2) # Same thing: arguments in default order
f[seq(2, 6, 2)]

# since a scalar is a vector of length 1, does this work?
5[1]

# ...using an index vector with positive indices
a <- c(1, 3, 4, 1) # the elements of index vectors must be 
                   # valid indices of the target vector. 
                   # The index vector can be of any length.
f[a] # In this case, four elements are retrieved from f[]

# ...using an index vector with negative indices
a <- -(1:4) # If elements of index vectors are negative integers,
            # the corresponding elements are excluded.
f[a] # Here, the first four elements are omitted from f[]
f[-((length(f)-3):length(f))] # Here, the last four elements are omitted from f[]

# ...using a logical vector
f>4 # A logical expression operating on the target vector
    # returns a vector of logical elements. It has the
    # same length as the target vector.
f[f>4]; # We can use this logical vector to extract only
        # elements for which the logical expression evaluates as TRUE.
        # This is sometimes called "filtering".

# Example: extending the Fibonacci series for three steps. 
# Think: How does this work? What numbers are we adding here and why does the result end up in the vector?
f <- c(f, f[length(f)-1] + f[length(f)]); f 
f <- c(f, f[length(f)-1] + f[length(f)]); f 
f <- c(f, f[length(f)-1] + f[length(f)]); f 

# coercion: all elements of vectors must be of the same mode
c(1, 2.0, "3", TRUE)
[1] "1"    "2"    "3"    "TRUE"
f
f+1
f*2

# computing with two vectors of same length
a <- f[-1]; a # like f[], but omitting the first element
b <- f[1:(length(f)-1)]; b # like f[], but shortened by the last element
c <- a / b # the "golden ratio", phi (~1.61803 or (1+sqrt(5))/2 ), 
           # an irrational number, is approximated by the ratio of
           # two consecutive Fibonacci numbers.
c
abs(c - ((1+sqrt(5))/2)) # Calculating the error of the approximation, element by element
x <- 8; sample(6:x)
x <- 7; sample(6:x)
x <- 6; sample(6:x)  # Oi!

# also consider
x <- 6:8; seq(x)
x <- 6:7; seq(x)
x <- 6:6; seq(x)    # Oi vay!
safeSample <- function(x, size, ...) {
    # Replace the sample() function to ensure sampling from a single
    # value gives that value with probability p == 1.
    #
    # x:    the values to sample from.
    # size: number of items to draw. If missing, a single positive number
    #       is returned once; in all other cases sample() applies its own
    #       default.
    # ...:  additional arguments, passed on to sample() (they are not used
    #       when x is a single positive number).
    #
    # Value: rep(x, size) if x is a single positive number, otherwise the
    #        result of sample(x, size, ...).

    # The NA test has to come before the comparison: for a single NA or NaN,
    # "x > 0" is NA and if() would stop with an error. Such values are handed
    # on to sample(), which returns them unchanged.
    isSinglePositive <- length(x) == 1 && is.numeric(x) && !is.na(x) && x > 0

    if (isSinglePositive) {
        if (missing(size)) size <- 1
        return(rep(x, size))
    }
    sample(x, size, ...)
}
a <- 1:12
a
dim(a) <- c(2,6)
a
dim(a) <- c(2,2,3)
a
dim(a)    # returns a vector
dim(a)[3]  # only the third value of the vector
a <- 1:4
b <- 5:8
m1 <- rbind(a, b)
m1   # difference between rbind() and cbind()
m2 <- cbind(a, b)
m2
m <- cbind(m2, 9:12)
m
m[1,] # first row
m[,2] # second column
m[3,2] # element at row == 3, column == 2
m[3:4, 1:2] # submatrix: rows 3 to 4 and columns 1 to 2
pUC19 <- list(size=2686, marker="ampicillin", ori="ColE1", accession="L01397", BanI=c(235, 408, 550, 1647) )
pUC19[[1]]
pUC19[[2]]
pUC19$ori
pUC19$BanI[2]
Name	Size	Marker	Ori	Sites
pUC19	2686	Amp	ColE1	EcoRI, SacI, SmaI, BamHI, XbaI, PstI, HindIII
pBR322	4361	Amp, Tet	ColE1	EcoRI, ClaI, HindIII
pACYC184	4245	Tet, Cam	p15A	ClaI, HindIII

plasmidData <- read.table("plasmidData.tsv", sep="\t", header=TRUE, stringsAsFactors = FALSE)
plasmidData   # show what the data frame contains
pD2 <- edit(plasmidData)
> ?"["     # Note that you need quotation marks around the operator for this.
plasmidData[1, ]
plasmidData[2, ]

# we can extract more than one row by specifying
# the rows we want in a vector ...
plasmidData[c(1, 2), ]   

# ... this works in any order ...
plasmidData[c(3, 1), ]   

# ... and for any number of rows ...
plasmidData[c(1, 2, 1, 2, 1, 2), ]   

# Same for columns
plasmidData[ , 2 ]

# We can select rows and columns by name if a name has been defined...
plasmidData[,"Name"]
plasmidData$Name      # different syntax, same thing. This is the syntax I use most frequently.

# Watch this!
plasmidData$Name[plasmidData$Ori != "ColE1"]
# What happened here?
# plasmidData$Ori != "ColE1" is a logical expression, it gives a vector of TRUE/FALSE values
plasmidData$Ori != "ColE1"

# We insert this vector into the square brackets. R then returns all rows for
# which the vector is TRUE.

# In this way we can "filter" for values
plasmidData$Size > 3000
plasmidData$Name[plasmidData$Size > 3000]

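# This principle is what we use when we want to "sort" an object
# by some value. The function order() does not return the sorted values
# themselves, but the indices that put the values into sorted order; we
# then use these indices to subset the object.
# Remember this: not sort() but order().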
order(plasmidData$Size)
plasmidData[order(plasmidData$Size), ]

# grep() matches substrings in strings 
grep("Tet", plasmidData$Marker)
plasmidData[grep("Tet", plasmidData$Marker), ]
plasmidData[grep("Tet", plasmidData$Marker), "Ori"]
x <- sample(1:10)
x
x[4] <- 99
x
x <- x[order(x)]
x
# Simple "if" statement:
# Rolling a die. If you get a "six", you get to roll again.

x <- sample(1:6, 1)
if (x == 6) {
    x <- c(x, sample(1:6, 1))
}
print(x)

# "if", "else if", and "else"
# Here is a popular dice game called high-low.

a <-  sample(1:6, 1)
b <-  sample(1:6, 1)
if (a + b > 7) {
    print("high")
} else if (a + b < 7) {
    print("low")
} else {
    print("seven")
}
x <- c(1, 3, 5, 7, 11, 13, 17)
x > 3 &  x < 17 # FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
x [x > 3 &  x < 17]  #  5  7 11 13

x > 3 && x < 17 # FALSE in older versions of R (only the first elements are compared);
                # since R 4.3 this is an error, because the operands have length > 1.
x <- numeric()

if (length(x) == 0 |  is.na(x)) { print("zero") }  # throws an error, because is.na() is
                                                   # evaluated even though x has length zero.

if (length(x) == 0 || is.na(x)) { print("zero") }  # no error: length test is TRUE so is.na()
                                                   # never gets evaluated.
# Let's stay with the high-low game for a moment:
# What are the odds of winning?
# Let's simulate some runs with a "for" loop.

N <- 25000
outcomes <- character(N)  # initialize an empty vector
for (i in 1:N) {          # repeat, assign each element of 1:N to
                          # the variable "i" in turn
    a <-  sample(1:6, 1)
    b <-  sample(1:6, 1)
    if (a + b > 7) {
        outcomes[i] <- "high"
    } else if  (a + b < 7) {
        outcomes[i] <- "low"
    } else {
        outcomes[i] <- "seven"
    }
}
head(outcomes, 36)
table(outcomes)  # the table() function tabulates the elements
                 # of a vector

round((36 * table(outcomes))/N) # Can you explain this expression?
# Let's assume we are playing high-low in a casino. You can bet
# high or low. You get two dollars for one if you win, nothing
# if you lose. If you bet "high", you lose if we roll "low"
# or "seven". Thus your chances of winning are 15/36 = 42%. You play
# the following strategy: start with 33 dollars. Bet one dollar.
# If you win, good. If you lose, triple your bet. Stop the game
# when your funds are gone (bad), or if you have more than 100
# dollars (good) - i.e. you have tripled the funds you risked.
# Also stop if we've played more than 100 rounds and start
# getting bored.

# Simulate the betting strategy described above. We always bet "high", so a
# round is won only if the two dice add up to more than 7. The loop ends when
# the funds are gone, when they have reached 100 dollars or more, or after
# MAXPLAYS rounds.

set.seed(1234567)  # fixed seed: the same run can be repeated
funds <- 33
bet <- 1         # our first bet

nPlays <- 0      # this counts how often we've played
MAXPLAYS <- 100

while (funds > 0 && funds < 100 && nPlays < MAXPLAYS) {

    nPlays <- nPlays + 1    # count the round first: the report starts at "Round 1"
    bet <- min(bet, funds)  # can't bet more than we have.
    funds <- funds - bet    # place the bet
    a <- sample(1:6, 1)     # roll the dice
    b <- sample(1:6, 1)

    # we always play "high"
    if (a + b > 7) {        # we win :-)
        result <- "Win!  "
        funds <- funds + (2 * bet)
        bet <- 1            # reset the bet to one dollar
    } else {                # we lose :-(
        result <- "Lose. "  # padded to the same width as "Win!  "
        bet <- 3 * bet      # increase the bet to 3 times previous
    }
    print(paste("Round", nPlays, result,
                "Funds now:", funds,
                "Next bet:", bet))
}

# Now before you get carried away - try this with different seeds
# and you'll quickly figure out that the odds of beating the game
# are not all that great...
#defining the function:
# Template for a function definition: a name, a parameter list and a body.
# doSomethingWith() stands in for the actual computation; it is a placeholder
# and is not defined in the code shown here.
myFunction <- function(myParameters) { 
	result <- doSomethingWith(myParameters)
	return(result)   # hand the computed value back to the caller
}

# using the function:
myEpiphany <- myFunction(Arguments)
biCode <- function(s) {
    # Make five-letter codes from binomial names: the first three letters of
    # the first word (genus) followed by the first two letters of the second
    # word (species), in upper case.
    #
    # s: character vector of names; words are separated by whitespace.
    #
    # Value: character vector with one code per element of s. Elements that
    #        are NA or contain fewer than two words give NA. Words after the
    #        second one are ignored. If the genus has fewer than three
    #        letters, the code is correspondingly shorter.
    words <- strsplit(as.character(s), "\\s+")
    vapply(words, function(w) {
        w <- w[nzchar(w)]   # leading whitespace produces an empty first string
        if (length(w) < 2 || anyNA(w)) {
            return(NA_character_)
        }
        toupper(paste0(substr(w[1], 1, 3), substr(w[2], 1, 2)))
    }, character(1), USE.NAMES = FALSE)
}

biCode("Homo sapiens")              # HOMSA
biCode("saccharomyces cerevisiae")  # SACCE
fibSeq <- function(n) {
    # Return the first n elements of the Fibonacci series 1, 1, 2, 3, 5, ...
    #
    # n: a single number, the requested length. A fractional part is
    #    dropped (2.5 is treated as 2).
    #
    # Value: numeric vector of length n; for n < 1 the single value 0.
    stopifnot(is.numeric(n), length(n) == 1, !is.na(n))
    n <- floor(n)   # without this, 3:n would count backwards for e.g. n == 2.5

    if (n < 1) {
        return(0)
    }

    # Allocate the whole result once instead of growing it in the loop.
    v <- numeric(n)
    v[seq_len(min(n, 2))] <- 1   # the first one or two elements are 1
    if (n > 2) {
        for (i in 3:n) {
            v[i] <- v[i - 2] + v[i - 1]
        }
    }
    v
}
# First attempt at a dice-rolling function: returns a vector of len values,
# each computed from one uniform random deviate between MIN and MAX.
# NOTE: this version is examined step by step in the debugging session that
# follows, which quotes its code; check its output with table() before
# relying on it.
rollDice <- function(len=1, MIN=1, MAX=6) {
    v <- rep(0, len)                     # result vector, one slot per roll
    for (i in 1:len) {
        x <- runif(1, min=MIN, max=MAX)  # one uniform random deviate
        x <- as.integer(x)               # convert it to a whole number
        v[i] <- x
    }
    return(v)
}
rollDice()
table(rollDice(1000))
debug(rollDice)
rollDice(10)
debugging in: rollDice(10)
debug at #1: {
    v <- rep(0, len)
    for (i in 1:len) {
        x <- runif(1, min = MIN, max = MAX)
        x <- as.integer(x)
    	v[i] <- x
    }
    return(v)
}
Browse[2]> 
debug at #2: v <- rep(0, len)
Browse[2]> 
debug at #3: for (i in 1:len) {
    x <- runif(1, min = MIN, max = MAX)
    x <- as.integer(x)
    v[i] <- x
}
Browse[2]> 
debug at #4: x <- runif(1, min = MIN, max = MAX)
Browse[2]> 
debug at #5: x <- as.integer(x)
Browse[2]> x   # Here we examine the current value of x
[1] 4.506351
Browse[2]> 
debug at #6: v[i] <- x
Browse[2]> 
debug at #4: x <- runif(1, min = MIN, max = MAX)
Browse[2]> v
[1] 4      # Aha: as.integer() truncates, but doesn't round!
Browse[2]> Q
undebug(rollDice)
# Corrected dice-rolling function: the upper limit passed to runif() is
# MAX + 1, so that the value MAX is still produced after as.integer() has
# dropped the fractional part.
#
# len:      number of rolls (0 gives an empty vector).
# MIN, MAX: smallest and largest face value.
#
# Value: numeric vector of length len.
rollDice <- function(len=1, MIN=1, MAX=6) {
    v <- rep(0, len)
    for (i in seq_len(len)) {   # seq_len(): no iteration at all if len == 0,
                                # whereas 1:0 would run over c(1, 0)
        x <- runif(1, min=MIN, max=MAX+1)
        x <- as.integer(x)
        v[i] <- x
    }
    return(v)
}
table(rollDice(1000))
# Disclaimer 1: this function would be better
# written as ...

rollDice <- function(len=1, MIN=1, MAX=6) {
    # Draw all len deviates with a single call to runif(), then let
    # as.integer() drop their fractional parts.
    deviates <- runif(len, min = MIN, max = MAX + 1)
    as.integer(deviates)
}

# Check the output:
table(rollDice(1000))

# This works, since runif() can return a vector of deviates,
# but if we write the function this way we can't check the value of
# individual trials.

# Disclaimer 2: the function relies on the way as.integer() converts a
# floating point number: it drops the digits after the decimal point, i.e.
# it truncates towards zero. More explicit and therefore clearer would be to
# use the function floor() instead. Here, the truncation is not a side
# effect, but the desired behaviour. This is actually important: a reader who
# does not remember how as.integer() constructs an integer from a float might
# assume that it rounds, instead of truncating. But rounding would give a
# wrong distribution! An error that may be hard to spot. (You can easily try
# using the round() function and think about how the result is wrong.)

# A better alternative is thus to write:

rollDice <- function(len=1, MIN=1, MAX=6) {
    # Draw len uniform deviates with MAX + 1 as upper limit, then round each
    # one down explicitly with floor().
    deviates <- runif(len, min = MIN, max = MAX + 1)
    floor(deviates)
}

# Disclaimer 3
# A base R function exists that already rolls dice in the required way: sample()

table(sample(1:6, 1000, replace=TRUE))

@@ Line 50: / Line 50: @@
 A '''path''' is the complete specification of where a file is located in the directory tree of your computer. Paths are simply directories strung together into a long string, separated by a forward slash "/" (on Mac or Unix) or a backslash "\" on Windows. Take note! When writing Windows paths in '''R''',  you have to use the "wrong" forward slash to specify the path. '''R''' will translate Unix-style paths into Windows-style paths automatically - but the backslash would be interpreted as an "escape" character that gives the following character a special meaning.
-;Examples
+;Folder name and path examples
-*<code>/Users/Pierette/Documents/BCB420</code> ◁ Looking good on a Mac.
+*<span style="background:#AAEEBB;"> <tt>/Users/Pierette/Documents/BCB420</tt>&nbsp;&nbsp;◁&nbsp;Looking good on a Mac.</span>
-*<code>C:\Users\Pulcinella\Documents\CBW</code> ◁ Looking good on a Windows computer.
+*<span style="background:#AAEEBB;"> <tt>C:\Users\Pulcinella\Documents\CBW</tt>&nbsp;&nbsp;◁&nbsp;Looking good on a Windows computer.</span>
-*<code>C:\Users\Pantalone\Documents\JTB2020-2017</code> ◁ Problem. No special characters please.
-*<code>/Users/Brighella/Documents/UofT Stuffz/Courses/more/Comp Sys biol. course</code> ◁ Problem. Please follow instructions carefully.
+*<span style="background:#EEA9AF;"> <tt>C:\Users\Pantalone\Documents\JTB2020-2017</tt>&nbsp;&nbsp;◁&nbsp;Problem. No special characters please.</span>
-*<code>C:\Users\Tartaglia\Documents\KUWTK</code> ◁ I can't even ...
+*<span style="background:#EEA9AF;"> <tt>/Users/Brighella/Documents/UofT Stuffz/Courses/more/Comp Sys biol. course</tt>&nbsp;&nbsp;◁&nbsp;Problem. Please read instructions more carefully.</span>
+*<span style="background:#EEA9AF;"> <tt>C:\Users\Tartaglia\Documents\KUWTK\&lt;Coursecode&gt;</tt>&nbsp;&nbsp;◁&nbsp;I can't even ...</span>

Difference between revisions of "R tutorial"

Revision as of 23:52, 17 December 2016

Contents

The environment

Files, directories and paths

Install R

Install R Studio

"Projects"

Git Version control

Notation

User interface

The Help system

Working directory

.Rprofile - startup commands

... unix systems

... Mac OS X systems

...Windows systems

The "Workspace"

Packages

Scripts, Projects and Version Control

Simple commands

Operators

Functions

Variables

Scalar data

Vectors

Matrices

Lists

Data frames

Subsetting

Control structures

if and else

for

while

Writing your own functions

Coding style

Debugging

Finishing

Notes

Further reading and resources

Navigation menu

Search