R tutorial

input prompt

<your name>

Elcid Barrett

your name
<Elcid Barret>
> getwd()

getwd()

 “”„«» 
 ‘’‚‹› 
/Users/Pierette/Documents/BCB420
C:\Users\Pulcinella\Documents\CBW
"C:/Users/Pulcinella/Documents/CBW"
C:\Users\Pantalone\Documents\JTB2020-2017
/Users/Brighella/Documents/UofT Stuffz/Courses/more/Comp Sys biol. course
C:\Users\Tartaglia\Documents\KUWTK\<Coursecode>
>

C:\Program Files\Git\cmd\git.exe
git
(None)
(None)
git
> help(rnorm)
>
> ?rnorm
>
> ?binom     
No documentation for 'binom' in specified packages and libraries:
you could try '??binom'
> ??binom
>
> ?Binomial
>
> ?apropos     
> apropos("med")   # all functions that contain the string "med"
> apropos("^med")  # all functions that begin with the string 
> apropos("med$")  # all functions that end with the string
> ?"+"
> ?"~"
> ?"["
> ?"%in%"
>
> ?var
> getwd()
[1] "/Users/steipe/R"
> setwd("~") # Note: ~ is the "tilde" - the squiggly line - not the straight hyphen
> getwd()
[1] "/Users/steipe"
> setwd("~/../chen")  
> getwd()
[1] "/Users/chen"
> setwd("/Users/steipe/abc/R_samples")  
> getwd()
[1] "/Users/steipe/abc/R_samples"
> ?Startup
> ls() 
character(0)
>
> a <- 3
> rm(a) 
> ls()
[1] "b"   "c"
>
rm(list = ls()) 
> ls() 
character(0)
>
> library()
> search() 
 [1] ".GlobalEnv"        "tools:RGUI"        "package:stats"     "package:graphics" 
 [5] "package:grDevices" "package:utils"     "package:datasets"  "package:methods"  
 [9] "Autoloads"         "package:base"
> ?vignette
> ??install
> ?install.packages
> install.packages("seqinr")   # Note: quoted string!
also installing the dependency ‘ade4’

trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.2/ade4_1.7-2.tgz'
Content type 'application/x-gzip' length 3365088 bytes (3.2 MB)
==================================================
downloaded 3.2 MB

trying URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.2/seqinr_3.1-3.tgz'
Content type 'application/x-gzip' length 2462893 bytes (2.3 MB)
==================================================
downloaded 2.3 MB

The downloaded binary packages are in
	/var/folders/mx/ld0hdst54jjf11hpcjh8snfr0000gn/T//Rtmpsy5GMx/downloaded_packages

> library(seqinr)     # This refers to an installed package. No quotes here...
> library(help="seqinr")
> ls("package:seqinr")
  [1] "a"                       "aaa"                     "AAstat"                 
  [4] "acnucclose"              "acnucopen"               "al2bp"                  
     [...]
[205] "where.is.this.acc"       "words"                   "words.pos"              
[208] "write.fasta"             "zscore"                 
> ?a
> a("Tyr")
[1] "Y"
> choosebank()
 [1] "genbank"       "embl"          "emblwgs"       "swissprot"     "ensembl"      
    [...]
 [31] "refseqViruses"
?data
data(package="seqinr")   # list the available data
data(aaindex)            # load ''aaindex''
?aaindex                 # what is this?
aaindex$FASG890101       # two of the indices ...
aaindex$PONJ960101

# Let's use the data: plot amino acid codes by hydrophobicity and volume

plot(aaindex$FASG890101$I,
     aaindex$PONJ960101$I, 
     xlab="hydrophobicity", ylab="volume", type="n")
text(aaindex$FASG890101$I, 
     aaindex$PONJ960101$I, 
     labels=a(names(aaindex$FASG890101$I)))
choosebank("swissprot")
mySeq <- query("mySeq", "N=MBP1_YEAST")
mbp1 <- getSequence(mySeq)
closebank()
x <- AAstat(mbp1[[1]])
barplot(sort(x$Compo))
# Install each package only if it is not yet available, then attach it.
# requireNamespace() only tests whether the package can be loaded (it does
# not attach it); library() then attaches the package and stops with an
# error if that fails, instead of silently returning FALSE like require().
if (!requireNamespace("seqinr", quietly=TRUE)) {
    install.packages("seqinr")
}
library(seqinr)

if (!requireNamespace("sos", quietly=TRUE)) {
    install.packages("sos")
}
library(sos)

findFn("moving average")
5
5 + 3
5 + 1 / 2 # Think first is this 3 or 5.5
3 * 2 + 1
3 * (2 + 1)
2^3 # Exponentiation
8 ^ (1/3) # Third root via exponentiation
7 %% 2  # Modulo operation (remainder of integer division)
7 %/% 2 # Integer division
cos(pi) #"pi" is a predefined constant.
sin(pi) # Note the rounding error. This number is not really different from zero.
sin(30 * pi/180) # Trigonometric functions use radians as their argument - this conversion calculates sin(30 degrees)
exp(1) # "e" is not predefined, but easy to calculate.
log(exp(1)) # functions can be arguments to functions - they are evaluated from the inside out.
log(10000) / log(10) # log() calculates natural logarithms; convert to any base by dividing by the log of the base. Here: log to base 10.
exp(complex(r=0, i=pi)) #Euler's identity
complex(1)
complex(4)
complex(1, 2) # imaginary part missing: if it's missing it defaults to zero
complex(1, 2, 3) # one complex number
complex(4, 2, 3) # four complex numbers
complex(real = 0, imaginary = pi) # defining values via named parameters
complex(imaginary = pi, real = 0) # same thing - if names are used, order is not important
complex(re = 0, im = pi) # names can be abbreviated ...
complex(r = 0, i = pi)   # ... to the shortest string that is unique among the named parameters.
                         # Use this feature with discretion to keep your code readable.
complex(i = pi, 1, 0) # Think: what have I done here? Why does this work?
exp(complex(i = pi, 1, 0)) # (The complex number above is the same as in Euler's identity.)
?make.names
?reserved
a <- 5
a
a + 3
b <- 8
b
a + b
a == b # not assignment: equality test
a != b # not equal
a < b  # less than
# I don't like...
col <- c("red", "grey")
hist(rnorm(200), col=col)

# I prefer instead...
rgStripes <- c("red", "grey")
hist(rnorm(200), col=rgStripes)
typeof(TRUE)
class(3L)
mode(print)
#Let's have a brief look at the function itself: typing a function name without its parentheses returns the source code for the function:
objectInfo

# Various objects:

#Scalars:
objectInfo( 3.0 )    # Double precision floating point number
objectInfo( 3.0e0 )  # Same value, exponential notation

objectInfo( 3 )   # Note: integers are double precision floats by default.
objectInfo( 3L )  # If we really want an integer, we must use R's 
                  # special integer notation ...
objectInfo( as.integer(3) )  # or explicitly "coerce" to type integer...

# Coercions: For each of these, first think what result you would expect:
objectInfo( as.character(3) )  # Forcing the number to be interpreted as a character.
objectInfo( as.numeric("3") )   # character as numeric
objectInfo( as.numeric("3.141592653") )  # string as numeric. Where do the
                                         # non-zero digits at the end come from?
objectInfo( as.numeric(pi) )    # not a string, but a predefined constant
objectInfo( as.numeric("pi") )  # another string as numeric ... Ooops -
                                # why the warning?
objectInfo( as.complex(1) )  

objectInfo( as.logical(0) )  
objectInfo( as.logical(1) )  
objectInfo( as.logical(-1) )  
objectInfo( as.logical(pi) )      # any non-zero number is TRUE ...
objectInfo( as.logical("pie") )   # ... but not non-numeric types.
                                  # NA means "Not Available".
objectInfo( as.character(pi) )    # Interesting: the conversion eats digits.

objectInfo( Inf )                # Larger than the largest representable number
objectInfo( -Inf )               # ... or smaller
objectInfo( NaN )                # "Not a Number" is numeric
objectInfo( NA )                 # "Not Available" - i.e. missing value is
                                 # logical

# NULL
objectInfo( NULL )     # NULL is nothing. Not 0, not NaN,
                       # not FALSE - nothing. NULL is the value that is 
                       # returned by expressions or
                       # functions when the result is undefined. 

objectInfo( as.factor("M") )     # factor
objectInfo( Sys.time() )         # time
objectInfo( letters )            # inbuilt
objectInfo( 1:4 )                # numeric vector
objectInfo( matrix(1:4, nrow=2)) # numeric matrix
objectInfo( data.frame(arabic = 1:3,                           # dataframe
                       roman = c("I", "II", "III"), 
                       stringsAsFactors = FALSE))
objectInfo( list(arabic = 1:7, roman = c("I", "II", "III")))   # list

# Expressions:
objectInfo( 3 > 5 ) # Note: any combination of variables via the logical
                    # operators ! == != > < >= <= | || & and && is a 
                    # logical expression, with values TRUE or FALSE.
objectInfo( 3 < 5 ) 
objectInfo( 1:6 > 4 ) 

objectInfo( a ~ b )              # a formula
objectInfo( objectInfo )         # this function itself
a <- 7
b <- 6:7
str(a)             # num 7
str(b)             # int [1:2] 6 7
a == b[2]          # TRUE
identical(b[2], a) # FALSE ! Not identical! Why?
                   # (see the str() results above.)

# If you need to be sure that a number is an
# integer, write it with an "L" after the number:
c <- 7L
str(c)             # int 7
identical(b[2], c) # TRUE
# The c() function concatenates elements into a vector
c(2, 4, 6)

#Create a vector and list its contents and length:
f <- c(1, 1, 3, 5, 8, 13, 21)
f
length(f)

# Often, for teaching code, I want to demonstrate the contents of an object after 
# assigning it. I can simply wrap the assignment into parentheses to achieve that.
# Parentheses return the value of whatever they enclose. So ...
a <- 17
# ... assigns 17 to the variable "a". But this happens silently. However ...
( a <- 17 )
# ... returns the result of the assignment. I will use this idiom often.

( f <- c(1, 1, 3, 5, 8, 13, 21, 34, 55, 89) )

# Coercion: 
# all elements of vectors must be of the same mode
c(1, 2.0, "3", TRUE)  # trying to get a vector with mixed modes ...
[1] "1"    "2"    "3"    "TRUE" 

# ... shows that all elements are silently being coerced
# to character mode. The emphasis is on _silently_. This might
# be unexpected, for example if you are reading numeric data
# from a text-file but someone has entered a " " for a missing
# value... 

# Various ways to retrieve values from the vector:

# Extracting by index ...
f[1] # "1" is first element. 
f[length(f)] # length() is the index of the last element.

# With a vector of indices ...
1:4 # This is the range operator
f[1:4] # using the range operator (it generates a sequence and returns it in a vector)
f[4:1] # same thing, backwards
seq(from=2, to=6, by=2) # The seq() function is a flexible, generic way to generate sequences
seq(2, 6, 2) # Same thing: arguments in default order
f[seq(2, 6, 2)]

# since a scalar is a vector of length 1, does this work?
5[1]

# ...using an index vector with positive indices
a <- c(1, 3, 4, 1) # the elements of index vectors must be 
                   # valid indices of the target vector. 
                   # The index vector can be of any length.
f[a] # In this case, four elements are retrieved from f[]

# Negative indices omit elements ...
# ...using an index vector with negative indices

# If elements of index vectors are negative integers,
# the corresponding elements are excluded.
( a <- -(1:4) ) # Note that this is NOT the same as -1:4

f[a] # Here, the first four elements are omitted from f[]
f[-((length(f)-3):length(f))] # Here, the last four elements are omitted from f[]

# Extracting with a logical vector...
f > 4 # A logical expression operating on the target vector
      # returns a vector of logical elements. It has the
      # same length as the target vector.
f[f > 4]; # We can use this logical vector to extract only
          # elements for which the logical expression evaluates as TRUE.
          # This is sometimes called "filtering".
# Note: the logical vector is aligned with the elements of the original
# vector. You can't retrieve elements more than once, as you could
# with index vectors. If the logical vector is shorter than its target
# it is "recycled" to the full length.

# Example: extending the Fibonacci series for three steps. 
# Think: How does this work? What numbers are we adding here and why does the result end up in the vector?
( f <- c(f, f[length(f)-1] + f[length(f)]) )
( f <- c(f, f[length(f)-1] + f[length(f)]) )
( f <- c(f, f[length(f)-1] + f[length(f)]) )

# Some more thoughts about "["
# "[" is not just a special character, it is an operator. It
# operates on whatever it is attached to on the left. We have attached it
# to vectors above, but we can also attach it directly to function
# expressions, if the function returns a vector. For example, the 
# summary() function returns some basic statistics on a vector:
summary(f)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    5.00   21.00   75.69   89.00  377.00 

# This is a vector of six numbers:
length(summary(f))

# We can extract e.g. the median like so:
summary(f)[3]

# ... or the boundaries of the interquartile range:
summary(f)[c(2, 5)]

# Note that the elements that summary() returns are "named".
# "Names" are attributes.
objectInfo(summary(f))

# The names() function can retrieve (or set) names:
names(summary(f))

# ... which brings us to yet another way to extract elements from Vectors:

# Extracting named elements...
# If the vector has named elements, vectors of names can be used exactly like
# index vectors:

summary(f)["Median"]
summary(f)[c("Max", "Min")]  # Oooops - I mistyped. But you can fix the expression, right?
f
f+1
f*2

# computing with two vectors of same length
f  # the Fibonacci numbers you have defined above
( a <- f[-1] )  # like f, but omitting the first element
( b <- f[1:(length(f)-1)] ) # like f, but shortened by the last element
c <- a / b # the "golden ratio", phi (~1.61803 or (1+sqrt(5))/2 ), 
           # an irrational number, is approximated by the ratio of
           # two consecutive Fibonacci numbers.
c
abs(c - ((1+sqrt(5))/2)) # Calculating the error of the approximation, element by element
x <- 8; sample(6:x)
x <- 7; sample(6:x)
x <- 6; sample(6:x)  # Oi!

# also consider
x <- 6:8; seq(x)
x <- 6:7; seq(x)
x <- 6:6; seq(x)    # Oi vay!
safeSample <- function(x, size, ...) {
    # Stand-in for sample() that avoids its single-number surprise:
    # sample(6) draws from 1:6, whereas safeSample(6) always yields 6.
    #   x:    the values to sample from
    #   size: number of items to draw (optional)
    #   ...:  further arguments, handed on to sample()
    # A single positive number is simply repeated "size" times (once if
    # size is not given); every other input goes to sample() unchanged.
    isSinglePositive <- length(x) == 1 && is.numeric(x) && x > 0
    if (!isSinglePositive) {
        return(sample(x, size, ...))
    }
    if (missing(size)) {
        size <- 1
    }
    rep(x, size)
}
( a <- 1:12 )
dim(a) <- c(2,6)
a
dim(a) <- c(2,2,3)
a
dim(a)    # returns a vector
dim(a)[3]  # only the third value of the vector
( a  <- 1:4 )
( b  <- 5:8 )
( m1 <- rbind(a, b) )
( m2 <- cbind(a, b) )
( m  <- cbind(m2, c = 9:12) )  # naming a column "c" while cbind()'ing it
m[1,] # first row
m[, 2] # second column
m[3, 2] # element at row == 3, column == 2
m[3:4, 1:2] # submatrix: rows 3 to 4 and columns 1 to 2
Name	Size	Marker	Ori	Sites
pUC19	2686	Amp	ColE1	EcoRI, SacI, SmaI, BamHI, XbaI, PstI, HindIII
pBR322	4361	Amp, Tet	ColE1	EcoRI, ClaI, HindIII
pACYC184	4245	Tet, Cam	p15A	ClaI, HindIII

plasmidData <- read.table("plasmidData.tsv", sep="\t", header=TRUE, stringsAsFactors = FALSE)
plasmidData   # show what the data frame contains
pUC19 <- list(size=2686, marker="ampicillin", ori="ColE1", accession="L01397", BanI=c(235, 408, 550, 1647) )
pUC19[[1]]
pUC19[[2]]
pUC19$ori
pUC19$BanI[2]
> ?"["     # Note that you need quotation marks around the operator for this.
plasmidData[1, ]
plasmidData[2, ]

# we can extract more than one row by specifying
# the rows we want in a vector ...
plasmidData[c(1, 2), ]   

# ... this works in any order ...
plasmidData[c(3, 1), ]   

# ... and for any number of rows ...
plasmidData[c(1, 2, 1, 2, 1, 2), ]   

# Same for columns
plasmidData[ , 2 ]

# We can select rows and columns by name if a name has been defined...
plasmidData[,"Name"]
plasmidData$Name      # different syntax, same thing. This is the syntax I use most frequently.

# Watch this!
plasmidData$Name[plasmidData$Ori != "ColE1"]
# What happened here?
# plasmidData$Ori != "ColE1" is a logical expression, it gives a vector of TRUE/FALSE values
plasmidData$Ori != "ColE1"

# We insert this vector into the square brackets. R then returns all rows for
# which the vector is TRUE.

# In this way we can "filter" for values
plasmidData$Size > 3000
plasmidData$Name[plasmidData$Size > 3000]

# This principle is what we use when we want to "sort" an object
# by some value. The function order() returns the indices that put
# the values into sorted order; using them as row indices sorts the
# object. Remember this: not sort() but order().
order(plasmidData$Size)
plasmidData[order(plasmidData$Size), ]

# grep() matches substrings in strings and returns a vector of indices
grep("Tet", plasmidData$Marker)
plasmidData[grep("Tet", plasmidData$Marker), ]
plasmidData[grep("Tet", plasmidData$Marker), "Ori"]
( x <- sample(1:10) )
x[4] <- 99
x
( x <- x[order(x)] )
# Simple "if" statement:
# Rolling a die. If you get a "six", you get to roll again.

x <- sample(1:6, 1)
if (x == 6) {
    x <- c(x, sample(1:6, 1))
}
print(x)

# "if", "else if", and "else"
# Here is a popular dice game called high-low.

a <-  sample(1:6, 1)
b <-  sample(1:6, 1)
if (a + b > 7) {
    print("high")
} else if (a + b < 7) {
    print("low")
} else {
    print("seven")
}
x <- c(1, 3, 5, 7, 11, 13, 17)
x > 3 &  x < 17 # FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE: all comparisons
x [x > 3 &  x < 17]  #  5  7 11 13

x > 3 && x < 17 # FALSE: stop at the first FALSE (note: since R 4.3.0, && with operands longer than one is an error)
x <- numeric()

if (length(x) == 0 |  is.na(x)) { print("zero") }  # throws an error, because is.na() is
                                                   # evaluated even though x has length zero.

if (length(x) == 0 || is.na(x)) { print("zero") }  # no error: length test is TRUE so is.na()
                                                   # never gets evaluated.
# Let's stay with the high-low game for a moment:
# What are the odds of winning?
# Let's simulate some runs with a "for" loop.

N <- 25000
outcomes <- character(N)  # initialize an empty vector
for (i in 1:N) {          # repeat, assign each element of 1:N to
                          # the variable "i" in turn
    a <-  sample(1:6, 1)
    b <-  sample(1:6, 1)
    if (a + b > 7) {
        outcomes[i] <- "high"
    } else if  (a + b < 7) {
        outcomes[i] <- "low"
    } else {
        outcomes[i] <- "seven"
    }
}
head(outcomes, 36)
table(outcomes)  # the table() function tabulates the elements
                 # of a vector

round((36 * table(outcomes))/N) # Can you explain this expression?
# Let's assume we are playing high-low in a casino. You can bet
# high or low. You get two dollars for one if you win, nothing
# if you lose. If you bet "high", you lose if we roll "low"
# or "seven". Thus your chances of winning are 15/36 = 42%. You play
# the following strategy: start with 33 dollars. Bet one dollar.
# If you win, good. If you lose, triple your bet. Stop the game
# when your funds are gone (bad), or if you have more than 100
# dollars (good) - i.e. you have tripled the funds you risked.
# Also stop if you've played more than 100 rounds and start
# getting bored.

# Simulate one session of always betting "high" in the high-low game.
# The bet is tripled after every loss and reset to one dollar after a win.
set.seed(1234567) # fixed seed: the sequence of rolls is reproducible
funds <- 33      # the money we start with
bet <- 1         # our first bet

nPlays <- 0      # this counts how often we've played
MAXPLAYS <- 100  # upper limit on the number of rounds

# Keep playing while we still have money, have not reached 100 dollars,
# and have not used up all rounds.
while (funds > 0 && funds < 100 && nPlays < MAXPLAYS) {

    bet <- min(bet, funds)  # can't bet more than we have.
    funds <- funds - bet    # place the bet
    a <-  sample(1:6, 1)    # roll the dice
    b <-  sample(1:6, 1)

    # we always play "high"
    if (a + b > 7) {        # we win :-)
        result <- "Win!  "
        funds <- funds + (2 * bet)  # stake back plus equal winnings
        bet <- 1            # reset the bet to one dollar
    } else {                # we lose :-(
        result <- "Loose."
        bet <- 3 * bet      # increase the bet to 3 times previous
    }
    # nPlays is printed before it is incremented: rounds are numbered from 0.
    # "Next bet" is the nominal bet; it is capped to the available funds
    # at the top of the next iteration.
    print(paste("Round", nPlays, result, 
                "Funds now:", funds,
                "Next bet:", bet))
    nPlays <- nPlays + 1
}

# Now before you get carried away - try this with different seeds
# and you'll quickly figure out that the odds of beating the game
# are not all that great...
#defining the function:
myFunction <- function(myParameters) {
    # Template of a function definition: take the parameters, compute
    # something with them, and hand the computed value back to the caller.
    # The value of the last expression is what the function returns.
    outcome <- doSomethingWith(myParameters)
    outcome
}
biCode <- function(s) {
    # Make a five-letter "binomial code" from a binomial species name:
    # the first three letters of the genus followed by the first two
    # letters of the species, in upper case ("Homo sapiens" -> "HOMSA").
    #   s: character vector of names; words are separated by whitespace.
    # Returns a character vector as long as s. Elements that do not
    # contain at least two words (or are NA) give NA. If the genus has
    # fewer than three letters the code is correspondingly shorter.
    #
    # Every element is processed on its own, so vectors of names work.
    s <- sub("^\\s+", "", s)   # leading blanks would create an empty first word
    words <- strsplit(s, "\\s+")
    genus   <- vapply(words, function(w) w[1], character(1))
    species <- vapply(words, function(w) w[2], character(1))
    code <- toupper(paste0(substr(genus, 1, 3), substr(species, 1, 2)))
    # paste0() turns a missing word into the text "NA" - restore real NAs.
    code[is.na(genus) | is.na(species)] <- NA_character_
    return(code)
}

biCode("Homo sapiens")              # HOMSA
biCode("saccharomyces cerevisiae")  # SACCE
fibSeq <- function(n) {
   # Return the first n numbers of the Fibonacci series 1, 1, 2, 3, 5, ...
   #   n: how many numbers to return.
   # Returns a numeric vector of length n; for n < 1 the single value 0.
   if (n < 1) { return( 0 ) }
   else if (n == 1) { return( 1 ) }
   else if (n == 2) { return( c(1, 1) ) }
   else {
      v <- numeric(n)
      v[1] <- 1
      v[2] <- 1
      # n is at least 3 here, so 3:n counts upwards.
      for ( i in 3:n ) {
         # Fill position i (the loop index, not n) from its two predecessors.
         v[i] <- v[i-2] + v[i-1]
      }
      return( v )
   }
}
# rollDice(): simulate "len" rolls of a die with faces MIN ... MAX.
#   len: number of rolls;  MIN, MAX: smallest and largest face value.
# Returns a numeric vector with one value per roll.
# NOTE: this first version is the subject of the debugging session that
# follows: runif() draws from the interval MIN to MAX and as.integer()
# truncates, so the value MAX does not turn up. The code is left exactly
# as it is because the debugger output below quotes it line by line.
rollDice <- function(len=1, MIN=1, MAX=6) {
    v <- rep(0, len)
    for (i in 1:len) {
        x <- runif(1, min=MIN, max=MAX)
        x <- as.integer(x)
        v[i] <- x
    }
    return(v)
}
rollDice()
table(rollDice(1000))
debug(rollDice)
rollDice(10)
debugging in: rollDice(10)
debug at #1: {
    v <- rep(0, len)
    for (i in 1:len) {
        x <- runif(1, min = MIN, max = MAX)
        x <- as.integer(x)
    	v[i] <- x
    }
    return(v)
}
Browse[2]> 
debug at #2: v <- rep(0, len)
Browse[2]> 
debug at #3: for (i in 1:len) {
    x <- runif(1, min = MIN, max = MAX)
    x <- as.integer(x)
    v[i] <- x
}
Browse[2]> 
debug at #4: x <- runif(1, min = MIN, max = MAX)
Browse[2]> 
debug at #5: x <- as.integer(x)
Browse[2]> x   # Here we examine the current value of x
[1] 4.506351
Browse[2]> 
debug at #6: v[i] <- x
Browse[2]> 
debug at #4: x <- runif(1, min = MIN, max = MAX)
Browse[2]> v
[1] 4      # Aha: as.integer() truncates, but doesn't round!
Browse[2]> Q
undebug(rollDice)
rollDice <- function(len=1, MIN=1, MAX=6) {
    # Simulate "len" rolls of a die with faces MIN ... MAX, one at a time.
    #   len: number of rolls;  MIN, MAX: smallest and largest face value.
    # Returns a numeric vector of length len.
    v <- rep(0, len)
    # seq_len() gives an empty sequence for len == 0, whereas 1:len would
    # count 1, 0 and produce a value although no roll was requested.
    for (i in seq_len(len)) {
        # Draw from MIN to MAX+1: as.integer() truncates, so every face
        # MIN ... MAX covers an interval of the same width.
        x <- runif(1, min=MIN, max=MAX+1)
        x <- as.integer(x)
        v[i] <- x
    }
    return(v)
}
table(rollDice(1000))
# Disclaimer 1: this function would be better
# written as ...

rollDice <- function(len=1, MIN=1, MAX=6) {
    # All "len" rolls at once: draw the uniform deviates in one call and
    # convert the whole vector to integers.
    deviates <- runif(len, min=MIN, max=MAX+1)
    as.integer(deviates)
}

# Check the output:
table(rollDice(1000))

# This works, since runif() can return a vector of deviates,
# but if we write the function this way we can't check the value of
# individual trials.

# Disclaimer 2: the function relies on a side-effect of as.integer(), which is
# to drop the digits after the decimal point when it converts. More explicit and
# therefore clearer would be to use the function floor() instead. Here, the
# truncation is not a side effect, but the desired behaviour. This is
# actually important: as.integer() truncates towards zero, whereas floor()
# always rounds down, so the two differ for negative values (e.g. a negative
# MIN). And rounding to the nearest integer would give a wrong distribution!
# An error that may be hard to spot. (You can easily try using the round()
# function and think about how the result is wrong.)

# A better alternative is thus to write:

rollDice <- function(len=1, MIN=1, MAX=6) {
    # Draw "len" uniform deviates from MIN to MAX+1 and round each one
    # down explicitly with floor().
    upper <- MAX + 1
    floor(runif(len, min=MIN, max=upper))
}

# Disclaimer 3
# A base R function exists that already rolls dice in the required way: sample()

table(sample(1:6, 1000, replace=TRUE))

R tutorial

Contents

Before you begin: Notation and Formatting

The environment

Files, directories and paths

Install R

Install R Studio

"Projects"

Git Version control

Typing code or executing it?

User interface

The Help system

Working directory

.Rprofile - startup commands

... unix systems

... Mac OS X systems

...Windows systems

The "Workspace"

Packages

Simple commands

Operators

Functions

Variables

Scalar data

Vectors

Matrices

Data frames

Lists

Subsetting

Control structures

if and else

for

while

Writing your own functions

Coding style

Debugging

Finishing

Notes

Further reading and resources

Navigation menu

Search