User:Boris/Temp2

Computing with PDB files

Let's apply some of the principles we have covered above to perform computations with biological coordinates.
Task:
First, orient yourself to the actual contents of a PDB file:
Access the PDB page for 1BM8.
Use the pulldown menu towards the top-right to Display files → PDB file.
Study the file, in particular the SEQRES, HELIX and SHEET records, and the structure of ATOM records.
To compute with these coordinates, copy the following R script to your computer and follow it carefully. Make sure you understand each line of code. Ask on the list if anything is not clear.
#bio3d example
# Boris Steipe, October 2013 
setwd("~/Documents/07.TEACHING/36-BCH441\ Bioinformatics\ 2013/")

# In this example of protein structure interpretation, we ...
#   - load the library "bio3D" which supports work with
#     protein structure files,
#   - explore some elementary functions of the library
#   - assemble a script to compare H-bond lengths between
#     alpha-helical and beta-sheet structures.

# bio3d is not available via CRAN, but needs to be installed
# from source. Instructions are here:
# http://thegrantlab.org/bio3d/download/download.html

library(bio3d)

lbio3d() # ... lists the newly installed  functions.
         # they all have help files associated.
         # More information is available on the so-called
         # "vignette", distributed with most R packages:
         # http://thegrantlab.org/bio3d/bio3d_vignette.pdf         

apses <- read.pdb("1bm8")  # load a molecule directly from PDB

# ======== RAMACHANDRAN PLOT ==============================
# Calculate a Ramachandran plot for the structure
tor <- torsion.pdb(apses)
plot(tor$phi, tor$psi)

# As you can see, there are a number of points in the upper-right
# quadrant of the plot. This combination of phi-psi angles defines
# the conformation of a left-handed alpha helix and is generally
# only observed for glycine residues. Let's replot the data, but
# color the points for glycine residues differently. First, we 
# get a vector of glycine residue indices in the structure:

seq <- seq.pdb(apses)

# Explore the result object and extract the indices of GLY resiues.
seq == "G"
which(seq == "G")
as.numeric(which(seq == "G"))
Glys <- as.numeric(which(seq == "G"))

# Now plot all non-gly residues.
# Remember: negative indices exclude items from a vector
plot(tor$phi[-Glys], tor$psi[-Glys], xlim=c(-180,180), ylim=c(-180,180))

# Now plot GLY only, but with red dots:
points(tor$phi[Glys], tor$psi[Glys], pch=21, cex=0.9, bg="#BB0000")

# As you see, four residues in the upper-right quadrant are
# not glycine. But what residues are these? Is there an
# error in our script? 
# You could try as an additional exercise to add the residue
# types or indices of the non-GLY outliers to the plot. 
# Use the function text(). Then you could verify the
# backbone torsion angles in VMD.

# Are there any cis-peptide bonds in the structure?
tor$omega
#... gives us a quick answer. But wait - what values do we
# expect? And why are the values so different?
# Consider this plot: what am I doing here and why?
x <- tor$omega[tor$omega > 0]
x <- c(x, 360 + tor$omega[tor$omega < 0])
hist(x, xlim=c(90,270))
abline(v=180, col="red")

# ======= DISTANCES =================================
# Let's do something a little less trivial and compare
# backbone H-bond distances between helices and strands.
#
# For this, we need to poke around a bit in the way 
# bio3d stores information.
# The basic object is a list:

typeof(apses)

# The list contains components...

names(apses)

# ... which we can access either through the double square
# bracket, or the $ syntax:

apses[["seqres"]]
apses$seqres

# Most of these lists objects can be vectors, other lists
# or scalars

typeof(apses$atom)
typeof(apses$helix)
typeof(apses$sheet)
typeof(apses$seqres)
typeof(apses$xyz)
typeof(apses$calpha)

# This means information can be obtained in several
# different ways. 

# By atom ...
i <- 5
apses$atom[i,]
apses$atom[i, c("x", "y", "z")]   # Note that these are strings!
apses$xyz[c(i*3-2, i*3-1, i*3)]   # Note that these are numbers

# By residue ...
i <- "48"	#string!
apses$atom[apses$atom[,"resno"] == i, ]
i <- 7
apses$seqres[7]

# ... wait, what???
# Isn't that supposed to be the same?
# What's going on here?
# Can you figure it out?

# By chain ...
# (Well - the 1BM8 file has only chain "A", but if there were more
# than one chain, you could select them this way ...). 

paste(aa321(apses$seqres[names(apses$seqres)=="A"]), collapse='')
paste(aa321(apses$atom[apses$atom[,"chain"] == "A" & apses$atom[,"elety"] == "CA","resid"]), collapse='')

# To understand nested statements, especially like the one 
# above, select them piece by piece and execute the pieces to
# see  how the output of one expression works as the argument
# in another expression.



# Back to measuring H-bond distances:
# secondary structure is defined in the list components ...$helix
# and ...$strand.

# We need to 
# - collect all residue indices for alpha helices resp.
#      beta strands;
# - fetch the atom coordinates;
# - calculate all N, O distances using dist.xyz();
# - filter them for distances that indicate H-bonds; and,
# - plot the results.

# Secondary structure can either be obtained from
# definitions contained in many PDB files, or by running
# the DSSP algorithm IF(!) you have it installed on your
# machine. The 1BM8 file contains definitions 

apses$helix
apses$sheet


# (1): collect the residue numbers
# between the segment boundaries.

H <- c() # This will contain the helix residue numbers
for (i in 1:length(apses$helix)) {
	H <- c(H, apses$helix$start[i]:apses$helix$end[i])
}

# Doing the same for the sheet residue numbers requires
# very similar code. Rather than retype the code, it is
# better to write a function that can do both.

getSecondary <- function(sec) {
	R <- c()	
	for (i in 1:length(sec$start)) {
		R <- c(R, sec$start[i]:sec$end[i])
	}
	return(R)
}




# Compare ...
H
getSecondary(apses$helix)

# ... and use for strands

E <- getSecondary(apses$sheet)


# Now here's a problem: these numbers refer to the 
# residue numbers as defined in the atom records. They
# can't be used directly to address e.g. the first, second
# third residue etc. since the first residue has the
# residue number 4...  
apses$atom[1,]

# Therefore we need to
# 1: convert the numbers to strings;
# 2: search for atom records that contain these strings.
#
# Essentially, we don't treat the "residue numbers" as numbers,
# but as generic labels. That's fine, as long as they are unique.

# (2): fetch coordinates of N and O atoms
# for residues in alpha- and beta- conformation.

# For one residue, the procedure goes as follows:

res <- H[17] # pick an arbitrary alpha-helix residue to illustrate
res
res <- as.character(res)
res

# all atom rows for this residue
apses$atom[apses$atom[,"resno"] == res, ]

# add condition: row with "N" atom only
apses$atom[apses$atom[,"resno"] == res & 
           apses$atom[,"elety"] == "N", ]

# add column selection: "x", "y", "z"
apses$atom[apses$atom[,"resno"] == res &
           apses$atom[,"elety"] == "N", 
	       c("x", "y", "z")] 

# convert to numbers
as.numeric (
	apses$atom[apses$atom[,"resno"] == res &
    apses$atom[,"elety"] == "N", 
	c("x", "y", "z")] 
)

# Now we need to add this into a matrix as we iterate over
# the desired residues. We need to execute the procedure
# four times for alpha and beta Ns and Os respectively.
# Rather than duplicate the code four times over,
# we write a function. This is prudent practice for writing
# software: never duplicate code, because if you need to make
# changes it is too easy to forget making the change
# consistently in all copies.


getAtom <- function(PDB, r, AT) {
	# Function to iterate over residue number strings and 
	# return a matrix of x, y, z triplets for each atom
	# of a requested type.
	mat <- c() 	#initialize as empty matrix
	for (i in 1:length(r)) {
		res <- as.character(r[i])	
		v <- as.numeric (
				PDB$atom[PDB$atom[,"resno"] == res &
	    		PDB$atom[,"elety"] == AT, 
				c("x", "y", "z")] 
		)
		mat <- rbind(mat, v)
	}
	return(mat)
}

# Now run the functions with the parameters we need...
H.xyz.N <- getAtom(apses, H, "N")
H.xyz.O <- getAtom(apses, H, "O")
E.xyz.N <- getAtom(apses, E, "N")
E.xyz.O <- getAtom(apses, E, "O")


# (3): calculate distances between N and
# O atoms to find H-bonds.

# We spent most of our effort so far just
# preparing our raw data for analysis. Now
# we can actually start measuring. bio3d provides
# the function dist.xyz() - but we
# could easily code this ourselves.

# Consider the distance of the first (N,O) pair.
H.xyz.N[1,]
H.xyz.O[1,]

a <- H.xyz.N[1,]
b <- H.xyz.O[1,]

dist.xyz(a, b)

# or ...
sqrt( (a[1]-b[1])^2 + (a[2]-b[2])^2 + (a[3]-b[3])^2 )
# ... i.e. pythagoras theorem in 3D.


# Calculating all pairwise distances from a matrix of
# xyz coordinates sounds like a useful function.

PairDist.xyz <- function(a, b) {
	PD <- c()
	for (i in 1:nrow(a)) {
		for (j in 1:nrow(b)) {
			PD <- c(PD, dist.xyz(a[i,], b[j,]))					
		}
	}
	return(PD)
}

# And see what we get:
X <- PairDist.xyz(H.xyz.N, H.xyz.O)
hist(X)

# let's zoom in on the shorter distances, in which we expect
# hydrogen bonds:
hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))

# There is a large peak at about 2.2Å, and another
# large peak above 3.5Å. But these are not typical hydrogen
# bond distances! Rather these are (N,O) pairs in peptide
# bonds, and within residues. That's not good, it will
# cause all sorts of problems with statistics later.
# We should exclude all distances between N of a residue
# and O of a preceeding residue, and all (N,O) pairs in the
# same residue. But for this, we need to store atom type
# and residue information with the coordinates. Our code
# will get a bit bulkier. It's often hard to find a good
# balance between terse generic code, and code that
# handles special cases.

# We could do two things:
# - add a column with residue information to the coordinates
# - add a column with atom type information 
# ... but actually we also would need chain information, and
# then we really have almost everything that is contained in the record
# in the first place.

# This suggests a different strategy: let's rewrite our function
# getAtom() to return indices, not coordinates, and use the indices
# to extract coordinates, and THEN we can add all sorts of
# additional constraints.

# Here we set up the function with a default chain argument

getAtomIndex <- function(PDB, V_res, elety, chain="A") {
	# Function to access a bio3d pdb object, iterate over
	# a vector of residues and return a vector of indices
	# to matching atom elements. Nb. bio3d handles insert
	# and alt fields incorrectly: their values should not
	# be NA but " ",  i.e. a single blank. Therefore this
	# function does not test for "alt" and "insert".
	
	v <- c() 	#initialize as empty vector
	for (i in 1:length(V_res)) {
		res <- as.character(V_res[i])	
		x <- which(PDB$atom[,"resno"] == res &
				   PDB$atom[,"chain"] == chain &
	    		   PDB$atom[,"elety"] == elety)
		v <- c(v, x)
	}
	return(v)
}

# test this ...
getAtomIndex(apses, H, "N")
getAtomIndex(apses, H, "O")

# That looks correct: O atoms should be stored three
# rows further down: the sequence of atoms in a PDB
# file is usually N, CA, C, O ... followed by the side
# chain coordinates.

# Now to extract the coordinates and calculate distances.
# Our function needs to take the PDB object and
# two vectors of atom indices as argument, and return
# a vector of pair-distances.

PairDist.idx <- function(PDB, a, b) {
	PD <- c()
	for (i in 1:length(a)) {
		p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])
		for (j in 1:length(b)) {
			q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
			PD <- c(PD, dist.xyz(p, q))					
		}
	}
	return(PD)
}

# Let's see if this looks correct:

H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
X <- PairDist.idx(apses, H.N, H.O)
hist(X[X < 4.0], breaks=seq(2.0, 4.0, 0.1), xlim=c(2.0,4.0))

# Now we are back where we started out from, but with 
# a different logic of the function that we can easily
# modify to exclude (N_i, O_i-1) distances (peptide bond),
# and (N_i, O_i) distances (within residue).

HB.idx <- function(PDB, a, b) {
	HBcutoff <- 4.0
	PD <- c()
	for (i in 1:length(a)) {
		p <- as.numeric(PDB$atom[a[i], c("x", "y", "z")])
		res_i <- as.numeric(PDB$atom[a[i], "resno"])
		for (j in 1:length(b)) {
			q <- as.numeric(PDB$atom[b[j], c("x", "y", "z")])
			res_j <- as.numeric(PDB$atom[a[j], "resno"])
			if (res_i != res_j+1 &
			    res_i != res_j     ) {
			    	d <- dist.xyz(p, q)
			    	if (d < HBcutoff) {
 						PD <- c(PD, d)
			    	}
			}
		}
	}
	return(PD)
}

# test this: 
X <- HB.idx(apses, H.N, H.O)
hist(X)

# ... and this looks much more like the distribution we are
# seeking.

# Why did we go along this detour for coding the 
# function? The point is that there are usually
# several ways to use or define datastructures and
# functions. Which one is the best way may not be 
# obvious until you understand the problem better.
# At first, we wrote a very generic function that 
# extracts atom coordinates to be able to compute
# with them. This is simple and elegant. But we
# recognized limitations in that we could not
# make more sophisticated selections that we needed
# to reflect our biological idea of hydrogen
# bonds. Thus we changed our datastructure
# and functions to accomodate our new requirements
# better. You have to be flexible and able to look
# at a task from different angles to succeed.

# Finally we can calculate alpha- and beta- structure
# bonds and compare them. In this section we'll explore
# different options for histogram plots.

H.N <- getAtomIndex(apses, H, "N")
H.O <- getAtomIndex(apses, H, "O")
dH <- HB.idx(apses, H.N, H.O)

E.N <- getAtomIndex(apses, E, "N")
E.O <- getAtomIndex(apses, E, "O")
dE <- HB.idx(apses, E.N, E.O)

# The plain histogram functions without parameters
# give us white stacks.

hist(dH)

# and ... 
hist(dE)

# We can see that the histrograms look different
# but that is better visualized by showing two plots
# in the same window. We use the par() function, for
# more flexible layout, look up the layout() function.
?par
?layout

opar <- par(no.readonly=TRUE)  # store current state
par(mfrow=c(2,1))  # set graphics parameters: 2 rows, one column

# plot two histograms
hist(dH)   
hist(dE)


# add color:
hist(dH, col="#DD0055") 
hist(dE, col="#00AA70")



# For better comparison, plot both in the 
# same window:

hist(dH, col="#DD0055") 
hist(dE, col="#00AA70", add=TRUE)

# ... oops, we dind't reset the graphics parameters.
# You can either close the window, a new window
# will open with default parameters, or ...
par(opar)      # ... reset the graphics parameters

hist(dH, col="#DD0055") 
hist(dE, col="#00AA70", add=TRUE)

# We see that the leftmost column of the sheet bonds
# overlaps the helix bonds. Not good. But we
# can make the colors transparent! We just need to
# add a fourth set of two hexadecimal-numbers to
# the #RRGGBB triplet. Lets use 2/3 transparent,
# in hexadecimal, 1/3 of 256 is x55 - i.e. an
# RGB triplet specied as #RRGGBB55 is only 33%
# opaque:

hist(dH, col="#DD005555") 
hist(dE, col="#00AA7055", add=TRUE)

# To finalize the plots, let's do two more things:
# Explicitly define the breaks, to make sure they
# match up - otherwise they would not need to...
# see for example: 

hist(dH, col="#DD005555") 
hist(dE[dE < 3], col="#00AA7055", add=TRUE)

# Breaks are a parameter in hist() that can
# either be a scaler, to define how many columns
# you want, or a vector, that defines the actual
# breakpoints.
brk=seq(2.4, 4.0, 0.1)

hist(dH, col="#DD005555", breaks=brk) 
hist(dE, col="#00AA7055", breaks=brk, add=TRUE)

# The last thing to do is to think about rescaling the plot.
# You notice that the y-axis is scaled in absolute frequency.
# That gives us some impression of the relative frequency,
# but it is of course skewed by observing relatively more
# or less of one type of secondary structure in a protein.
# As part of the hist() function we can rescale the values so
# that the sum over all is one: set the prameter freq=FALSE.

hist(dH, col="#DD005555", breaks=brk, freq=FALSE) 
hist(dE, col="#00AA7055", breaks=brk, freq=FALSE, add=TRUE)

# Adding labels and legend ...

hH <- hist(dH,
        freq=FALSE,
        breaks=brk, 
		col="#DD005550",
		xlab="(N,O) distance (Å)",
		ylab="Density",
		ylim=c(0,4),
		main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
        freq=FALSE,
        breaks=brk,
        col="#00AA7060",
        add=TRUE)

legend('topright',
		c(paste("alpha (N = ", sum(hH$counts), ")"),
		  paste("beta  (N = ", sum(hE$counts), ")")),
        fill = c("#DD005550", "#00AA7060"), bty = 'n',
        border = NA)
# ===========================================================
# With all the functions we have defined,
# it is easy to try this with a larger protein. In fact
# 3ugj is VERY large. The calculation will take a few
# minutes:

pdb <- read.pdb("3ugj")

H <- getSecondary(pdb$helix)
E <- getSecondary(pdb$sheet)

H.N <- getAtomIndex(pdb, H, "N")
H.O <- getAtomIndex(pdb, H, "O")
dH <- HB.idx(pdb, H.N, H.O)

E.N <- getAtomIndex(pdb, E, "N")
E.O <- getAtomIndex(pdb, E, "O")
dE <- HB.idx(pdb, E.N, E.O)

brk=seq(2.4, 4.0, 0.1)

hH <- hist(dH,
        freq=FALSE,
        breaks=brk, 
		col="#DD005550",
		xlab="(N,O) distance (Å)",
		ylab="Density",
		ylim=c(0,4),
		main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
        freq=FALSE,
        breaks=brk,
        col="#00AA7060",
        add=TRUE)

legend('topright',
		c(paste("alpha (N = ", sum(hH$counts), ")"),
		  paste("beta  (N = ", sum(hE$counts), ")")),
        fill = c("#DD005550", "#00AA7060"), bty = 'n',
        border = NA)

# It looks more and more that the distribution is 
# indeed different. Our sample is large, but derives
# from a single protein.
# To do database scale statistics, we should look 
# at many more proteins. To give you a sense of how,
# let's do this for just ten proteins, taken from
# the architecture level of the CATH database for 
# mixed alpha-beta proteins (see:
# http://www.cathdb.info/browse/browse_hierarchy_tree):

PDBarchitectures <- c("3A4R", "A")
names(PDBarchitectures) <- c("ID", "chain")
PDBarchitectures <- rbind(PDBarchitectures, c("1EWF","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2VXN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1I3K","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1C0P","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3QVP","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1J5U","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2IMH","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("3NVS","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1UD9","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1XKN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("1OZN","A"))
PDBarchitectures <- rbind(PDBarchitectures, c("2DKJ","A"))

dH <- c()
dE <- c()

for (i in 1:nrow(PDBarchitectures)) {
	pdb <- read.pdb(PDBarchitectures[i,1])
	chain <- PDBarchitectures[i,2]
	H <- getSecondary(pdb$helix)
	H.N <- getAtomIndex(pdb, H, "N", chain)
	H.O <- getAtomIndex(pdb, H, "O", chain)
	dH <- c(dH, HB.idx(pdb, H.N, H.O))
	
	E <- getSecondary(pdb$sheet)
	E.N <- getAtomIndex(pdb, E, "N", chain)
	E.O <- getAtomIndex(pdb, E, "O", chain)
	dE <- c(dE, HB.idx(pdb, E.N, E.O))
}

brk=seq(2.0, 4.0, 0.1)

hH <- hist(dH,
        freq=FALSE,
        breaks=brk, 
		col="#DD005550",
		xlab="(N,O) distance (Å)",
		ylab="Density",
		ylim=c(0,4),
		main="Helix and Sheet H-bond lengths")
hE <- hist(dE,
        freq=FALSE,
        breaks=brk,
        col="#00AA7060",
        add=TRUE)

legend('topright',
		c(paste("alpha (N = ", sum(hH$counts), ")"),
		  paste("beta  (N = ", sum(hE$counts), ")")),
        fill = c("#DD005550", "#00AA7060"), bty = 'n',
        border = NA)

# Nice. 
# That's all.
User:Boris/Temp2

Computing with PDB files

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Sections

Tools