# dbUtilities.R
#
# Purpose:  Utility functions for a Protein datamodel
# Version:  0.7.2
# Date:     Nov 2015
# Author:   Boris and class
#
# ToDo:     Add more tables.
#           Accept either taxID OR organism name
#             and pull the other from NCBI.
#           Allow search by either feature name or psID in
#             getFeatureFasta()
#           Write function to type-check and str_trim arguments.
#           Write function to renumber pdb.
#           Split code into ABCdbUtil, ABC3DUtil, ABCphyloUtil.
# Notes:    Cf. schema sketch at
#           http://steipe.biochemistry.utoronto.ca/abc/index.php/File:ProteinDataModel.1.jpg
#           Currently implements "protein",
#             "taxonomy", "proteinFeature" and "feature"
#             tables.
#
# V 0.7.2   bugfix in addFeatureToDB() ... this time for real.
# V 0.7.1   bugfix in addFeatureToDB()
# V 0.7     added format and selection options to getFeatureFasta()
#             to support more flexible output.
#           fixed bug in getSeq() ... wrong variable name.
#           fixed bug in makeNames() ... wrong variable name.
#           manually added "KilA" to the E. coli gene.
#           added a safeIter function to skip iteration over
#             an empty object.
#           added option to scan Prosite with sequence instead
#             of ID, in fetchPrositeFeatures(),
#           update Feature Data version to 1.1
#           align addFeatureToDB() function with new Feature
#             Data Format.
#           sanitize all relevant input with str_trim()
# V 0.6.2   fixed bug in AddProteinToDB() that mixed up list
#             input with dataframe output.
# V 0.6.1   fixed wrong argument in printArgs
#           fixed logic of removing tmp file in updateVerifiedFile
# V 0.6     added proteinFeature and feature tables, function
#             to fetch features from prositeScan and
#             function to get FASTA for feature ranges.
#           added function to merge old and new DB versions.
#           extensive maintenance.
#           update refDB with scanProsite features.
# V 0.5     added refDB database object with 48 reference
#             APSES domain proteins from 10 fungi and
#             E. coli as outgroup. Optionally load it
#             into memory upon sourcing this script.
#           generalize update function to handle all sources
#             and target, while making the update of
#             THIS file in its proper location the default.
# V 0.4.1:  issue: fails on incomplete data from NCBI
#             fixed. NCBI sent empty values for missing
#             genome coordinates. Set all missing values
#             to NA before calling addProteinToDB
# V 0.4:    added fetchProteinData() for data retrieval from
#             NCBI and UniProt
#           changed addProteinToDB() to accept either output of
#             fetchProteinData() or named arguments
#           make symbol names internally consistent
#           added DB version check
#           Test whether an entry for addProteinToDB()
#             already exists, allow to add duplicate or
#             replace existing entry.
#           Style maintenance:
#             use CamelCase consistently
#             keep variables and messages in the same line
#             of code when using paste()
# V 0.3.1:  maintenance update
# V 0.3:    replaced MD5 verification with digest() sha-1 of
#             of R-object, since MD5 of files on Windows
#             gave varying results.
# V 0.2:    Changed get functions to return vectors of
#             multiple matches.
#           added makeNames()
#           added biCodes()
#           added updateDbUtilities())
#
# ==========================================================

# ==== MESSAGE ============================================

cat("Loading db_utilities.R ...\n")

# ==== CONSTANTS =========================================
cat(" Constants:\n")

DBVERSION <- "Protein Data 1.0"
cat(paste(" DBVERSION: \"", DBVERSION, "\"\n", sep=""))

UTILVERSION <- "dbUtilities 0.7.2"
cat(paste(" UTILVERSION: \"", UTILVERSION, "\"\n", sep=""))

LOADREFDB <- TRUE
cat(paste(" LOADREFDB: \"", as.character(LOADREFDB), "\"\n", sep=""))

# ==== PACKAGES ==========================================
cat(" Packages:\n")

# Each package is installed only if it is not available yet and is then
# attached with library(), which fails loudly if the installation did
# not succeed.

# Needed for in memory sha-1 hash for verification of
# file updates.
if (!requireNamespace("digest", quietly = TRUE)) {
  install.packages("digest")
}
library(digest)
cat(" \"digest\" (hash functions)\n")

# Needed for fetching UniProt data via POST
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr")
}
library(httr)
cat(" \"httr\" (http GET and POST functions)\n")

# Needed to parse NCBI eutils XML responses
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)
cat(" \"XML\" (parsing XML trees)\n")

# Needed for str_trim()
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
cat(" \"stringr\" (convenient string manipulation)\n")

# ==== FUNCTIONS =========================================
cat(" Functions:\n")

# Utility functions to list arguments of functions contained
# in this file when it is being sourced.
printArgs <- function(fName) {
  # Prints the name of the function passed as fName, followed by
  # the comma separated names of its formal arguments in parentheses.
  fArgs <- formals(fName)
  fName <- deparse(substitute(fName))
  # paste() of an empty name vector collapses to "", so functions
  # without arguments print as "name()".
  cat(paste(" ", fName, "(",
            paste(makeArgText(fArgs), collapse = ", "),
            ")\n", sep=""))
}

makeArgText <- function(pl) {
  # Returns the names of the elements of the (pair)list pl.
  arg <- names(pl)
  return(arg)
}

safeIter <- function(idx) {
  # returns a vector 1:idx if idx is greater than
  # zero. Returns NULL otherwise. This avoids iterating
  # a loop once over a zero length object.
  # use: for (i in safeIter(length(idx))) { ... }
  if (length(idx) != 1 || !is.null(nrow(idx))) {
    stop("Value to iterate over is not a scalar.")
  }
  if (idx > 0) {
    return(1:idx)
  } else {
    return(NULL)
  }
}

# ==== createDB ============================================
createProteinDB <- function() {
  # Returns an empty list with DBVERSION and database
  # tables initialized
  db <- list()
  db$version <- DBVERSION
  db$taxonomy <- data.frame(id = numeric(),
                            organismName = character(),
                            stringsAsFactors = FALSE)
  db$protein <- data.frame(id = numeric(),
                           name = character(),
                           refSeqID = character(),
                           uniProtID = character(),
                           taxID = numeric(),
                           genomeXref = character(),
                           genomeFrom = numeric(),
                           genomeTo = numeric(),
                           seq = character(),
                           stringsAsFactors = FALSE)
  db$proteinFeature <- data.frame(id = numeric(),
                                  proteinID = character(),
                                  featureID = character(),
                                  start = numeric(),
                                  end = numeric(),
                                  stringsAsFactors = FALSE)
  db$feature <- data.frame(id = character(),
                           def = character(),
                           stringsAsFactors = FALSE)
  return(db)
}
printArgs(createProteinDB)

# ==== checkVersion ========================================
checkVersion <- function(db) {
  # stop if db does not have current version.
  # identical() is used so that a missing (NULL) or malformed version
  # entry produces the version error below and not an obscure
  # "argument is of length zero" failure of the comparison.
  if (!identical(db$version, DBVERSION)) {
    stop(paste("Database in argument has version \"", db$version,
               "\". Expecting \"", DBVERSION, "\" instead.", sep=""))
  }
}
printArgs(checkVersion)

# ==== in2seq ==============================================
in2seq <- function(s, uc = TRUE, lc = FALSE, noAmbig = TRUE) {
  # Sanitizes protein sequence input and converts it into a
  # sequence string. Case can be optionally changed,
  # default is to convert to upper case.
  # Letters that are not one-letter aa code - such as
  # ambiguity codes - throw an error, if not explicitly
  # permitted.
  s <- paste(unlist(s), collapse="")  # flatten whatever structure it has
  s <- gsub("[^a-zA-Z]", "", s)       # remove all non-letters
  if (noAmbig) {
    # first ambiguous letter, or character(0) if there is none
    ambChar <- regmatches(s, regexpr("[bjouxzBJOUXZ]", s))
    if (length(ambChar) > 0) {
      stop(paste("Input contains ambiguous letter: \"", ambChar[1], "\"",
                 sep=""))
    }
  }
  if (uc) { s <- toupper(s) }
  if (lc) { s <- tolower(s) }  # applied last: lc wins if both are TRUE
  return(s)
}
printArgs(in2seq)

# ==== in2vec ==============================================
in2vec <- function(s, ...) {
  # Sanitizes protein sequence input and expands it into a
  # vector of single characters. Arguments for in2seq are
  # passed through via the three-dots parameter syntax.
  s <- in2seq(s, ...)
  return(unlist(strsplit(s, "")))
}
printArgs(in2vec)

# ==== node2string =========================================
node2string <- function(doc, tag) {
  # Extracts tagged contents from an XML response.
  # The text content of every element named by "tag" is
  # returned in a vector of strings (NULL if there is no
  # such element).
  path <- paste("//", tag, "/text()", sep="")
  nodes <- getNodeSet(doc, path)
  return(unlist(lapply(nodes, toString.XMLNode)))
}
printArgs(node2string)

# ==== fetchProteinData ====================================
fetchProteinData <- function(refSeqID, verbose=TRUE) {
  # Retrieves data for a given protein refseq ID from
  # NCBI and UniProt. NCBI data is retrieved via the
  # eutils API.
  # cf. http://www.ncbi.nlm.nih.gov/books/n/helpeutils/
  # UniProt IDs are retrieved from the
  # ID mapping service via a POST call.
  # cf. http://www.uniprot.org/help/programmatic_access
  # Returns a list of data items. If verbose is TRUE the
  # retrieved items are also printed.
  data <- list()
  data$type <- "Protein Data V 1.0"
  data$refSeqID <- str_trim(refSeqID)

  # get UniProt ID for refseq
  URL <- "http://www.uniprot.org/mapping/"
  response <- POST(URL,
                   body = list(from = "P_REFSEQ_AC",
                               to = "ACC",
                               format = "tab",
                               query = data$refSeqID))
  if (response$status_code == 200) {  # 200: oK
    # the fourth whitespace separated token of the response is used
    data$uniProtID <- str_trim(unlist(strsplit(content(response),
                                               "\\s+"))[4])
    if (is.na(data$uniProtID)) {
      warning(paste("UniProt ID mapping service returned NA.",
                    "Check your RefSeqID."))
    }
  } else {
    data$uniProtID <- NA
    warning(paste("UniProt ID mapping not available:",
                  "server returned status", response$status_code))
  }

  # Base URL for NCBI eutils API calls
  eUtilsBase <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

  # search for GI of this refSeqID
  URL <- paste(eUtilsBase, "esearch.fcgi?",
               "db=protein",
               "&term=", data$refSeqID,
               sep="")
  data$GID <- as.integer(node2string(htmlParse(URL), "id"))
  # as.integer() of "nothing found" is integer(0), never NULL:
  # test the length to detect a failed search.
  if (length(data$GID) == 0) {
    stop("No GID found for this refSeqID. Check your input.")
  }

  # get NCBI summary data
  URL <- paste(eUtilsBase, "esummary.fcgi?",
               "db=protein",
               "&id=", data$GID,
               "&version=2.0",
               sep="")
  response <- htmlParse(URL)
  data$taxID <- as.integer(node2string(response, "taxid"))
  data$organismName <- str_trim(node2string(response, "organism"))

  # Fetch the sequence
  URL <- paste(eUtilsBase, "efetch.fcgi?",
               "db=protein",
               "&id=", data$GID,
               "&retmode=text",
               "&rettype=fasta",
               sep="")
  # retrieve FASTA, split into lines
  tmp <- node2string(htmlParse(URL), "p")
  tmp <- str_trim(tmp)
  tmp <- unlist(strsplit(tmp, "\\n"))
  data$FASTAheader <- tmp[1]
  data$seq <- in2seq(tmp[-1])

  # Fetch the crossreference to the gene DB
  URL <- paste(eUtilsBase, "elink.fcgi?",
               "dbfrom=protein",
               "&db=gene",
               "&id=", data$GID,
               sep="")
  data$geneID <- as.integer(node2string(htmlParse(URL), "linksetdb/id"))

  # Fetch the gene summary
  URL <- paste(eUtilsBase, "esummary.fcgi?",
               "&db=gene",
               "&id=", data$geneID,
               sep="")
  response <- htmlParse(URL)
  data$name <- str_trim(node2string(response, "name"))
  data$genomeXref <- str_trim(node2string(response, "chraccver"))
  data$genomeFrom <- as.integer(node2string(response, "chrstart")[1])
  data$genomeTo <- as.integer(node2string(response, "chrstop")[1])

  if (verbose) {
    cat(paste("Data retrieved:\n"))
    cat(paste(" $name: ", data$name, "\n"))
    cat(paste(" $refSeqID: ", data$refSeqID, "\n"))
    cat(paste(" $uniProtID: ", data$uniProtID, "\n"))
    cat(paste(" $GID: ", data$GID, "\n"))
    cat(paste(" $taxID: ", data$taxID, "\n"))
    cat(paste(" $organismName: ", data$organismName, "\n"))
    cat(paste(" $genomeXref: ", data$genomeXref, "\n"))
    cat(paste(" $geneID: ", data$geneID, "\n"))
    cat(paste(" $genomeFrom: ", data$genomeFrom, "\n"))
    cat(paste(" $genomeTo: ", data$genomeTo, "\n"))
    cat(paste(" $FASTAheader: ", data$FASTAheader, "\n"))
    len <- nchar(data$seq)
    cat(paste(" $seq: ",
              substr(data$seq, 1, 10),
              " ... ",
              substr(data$seq, len-10, len),
              " (", len, " AA)",
              "\n\n", sep=""))
  }
  return(data)
}
printArgs(fetchProteinData)

# ==== fetchPrositeFeatures ====================================
fetchPrositeFeatures <- function(database, extID, useSeq = TRUE,
                                 verbose=TRUE) {
  # Retrieves data from Scan Prosite using a query sequence.
  # UniProt or RefSeq ID or Name are passed as argument extID,
  # the corresponding sequence is retrieved from the protein
  # table and processed.
  #
  # This function screen-scrapes the tabular ScanProsite
  # output.
  # Returns a list:
  #   $type:  "Feature Data V 1.1"
  #   $pID:   protein table ID of query
  #   $extID: the ID the feature was reported for
  #   $start: Feature start
  #   $end:   Feature end
  #   $id:    Prosite ID of feature (prefixed with "ps|")
  #   $def:   Prosite name of feature
  # Note: useSeq is currently not used by the code below.

  # ToDo: is the first argument a database object or the string?
  #       If it's a string, useSeq must be FALSE to continue...
  #       If it's a database, verify it, then continue below
  extID <- str_trim(extID)

  # Identify the type of ext ID and fetch its internal id
  pN <- getTableID(database, "protein", name = extID)
  pU <- getTableID(database, "protein", uniProtID = extID)
  pR <- getTableID(database, "protein", refSeqID = extID)

  #ToDo.: If useSeq is FALSE, work with the prosite ID

  # The same protein may be found through more than one column
  # (e.g. its name equals its accession): count distinct proteins.
  pID <- unique(c(pN, pU, pR))
  if (length(pID) == 0) {
    stop(paste("ID \"", extID, "\" not found in database.", sep=""))
  } else if (length(pID) > 1) {
    stop(paste("ID \"", extID, "\" is not unique in database.", sep=""))
  }

  # Retrieve the sequence
  seq <- database$protein$seq[database$protein$id == pID]

  # prepare the output
  features <- list()
  features$type <- "Feature Data V 1.1"
  features$pID <- pID
  features$extID <- character()
  features$start <- numeric()
  features$end <- numeric()
  features$id <- character()
  features$def <- character()

  PREFIX <- "ps|"
  fasta <- paste(">", extID, "\n", seq, "\n", sep="")
  URL <- "http://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"

  if (verbose) { cat(paste("Scanning Prosite for ", extID, "... ")) }

  response <- POST(URL,
                   body = list(meta = "opt1",
                               meta1_protein = "opt1",
                               seq = fasta,
                               skip = "on",
                               output = "tabular"))

  lines <- unlist(strsplit(content(response,"text"), "\\n"))
  # keep the lines that start with the literal ID; a regular
  # expression would misread characters such as "." in the ID
  lines <- lines[substr(lines, 1, nchar(extID)) == extID]

  if (length(lines) > 0) {
    # fields are separated by tabs or by "|"
    tokens <- strsplit(lines, "\\t|\\|")
    field <- function(k) {
      # k-th field of every line, NA where a line has fewer fields
      vapply(tokens, function(tk) tk[k], character(1))
    }
    features$extID <- field(1)
    features$start <- as.numeric(as.integer(field(2)))
    features$end <- as.numeric(as.integer(field(3)))
    features$id <- paste(PREFIX, field(4), sep="")
    features$def <- field(5)
    if (verbose) { cat(paste(" ", length(lines), "feature(s).\n")) }
  } else {
    if (verbose) { cat(paste(" WARNING: No feature found.\n")) }
  }
  return(features)
}
printArgs(fetchPrositeFeatures)

# ==== addProteinToDB =============================================
addProteinToDB <- function(database, x, override = FALSE, replace = TRUE) {
  # Add a new protein entry to the database, with associated
  # taxonomy entry. x is a protein data list item produced by
  # fetchProteinData. If override is true, replace an existing
  # entry (replace: TRUE), or add a duplicate entry
  # (replace: FALSE)
  # Returns the updated database.
  if (missing(database)) {
    stop("\"database\" argument is missing with no default.")
  }
  checkVersion(database)

  if (missing(x)) {
    stop("No protein data list was passed as argument.")
  } else {
    # expect that x is list of data items
    if (is.null(x$type)) {
      stop("Protein data argument must be a list of data items.")
    }
    expectedType <- "Protein Data V 1.0"
    if (x$type != expectedType) {
      stop(paste("List of data items has type \"", x$type,
                 "\". Expecting \"", expectedType, "\" instead.", sep=""))
    }
  }

  if (length(x$taxID) == 0) {
    stop("taxID argument is missing with no default.")
  }
  if (!is.numeric(x$taxID)) {
    stop(paste("taxID \"", x$taxID, "\" is not numeric. Please correct.",
               sep=""))
  }

  # handle taxID
  if (!any(database$taxonomy$id == x$taxID, na.rm = TRUE)) {
    # new taxID
    if (length(x$organismName) == 0) {
      stop(paste("taxID", x$taxID,
                 "is not yet in database, but organismName",
                 "is missing with no default."))
    } else {
      # add this organism to the taxonomy table
      entry <- data.frame(id = x$taxID,
                          organismName = x$organismName,
                          stringsAsFactors = FALSE)
      database$taxonomy <- rbind(database$taxonomy, entry)
    }
  }

  # handle protein.
  # pID is 1 if the table is empty, max() + 1 otherwise.
  if (nrow(database$protein) == 0) {
    pID <- 1
  } else {
    pID <- max(database$protein$id) + 1
  }

  cleanCoord <- function(v) {
    # A coordinate that is absent, NA, 0 or 999999999 is treated
    # as missing. Testing for NA first keeps the comparison from
    # failing on missing values.
    if (length(v) == 0 || is.na(v[1]) || v[1] %in% c(0, 999999999)) {
      return(NA)
    }
    return(v)
  }

  # Ensure all of the values exist.
  if (length(x$name) == 0) { x$name <- NA }
  if (length(x$refSeqID) == 0) { x$refSeqID <- NA }
  if (length(x$uniProtID) == 0) { x$uniProtID <- NA }
  if (length(x$taxID) == 0) { x$taxID <- NA }
  if (length(x$genomeXref) == 0) { x$genomeXref <- NA }
  x$genomeFrom <- cleanCoord(x$genomeFrom)
  x$genomeTo <- cleanCoord(x$genomeTo)
  if (length(x$organismName) == 0) { x$organismName <- NA }

  # fix any sequence problems. A missing sequence stays NA: passing
  # NA through in2seq() would store the literal string "NA".
  if (length(x$seq) == 0 || (length(x$seq) == 1 && is.na(x$seq))) {
    x$seq <- NA_character_
  } else {
    x$seq <- in2seq(x$seq)
  }

  # prepare a dataframe row
  entry <- data.frame(id = pID,
                      name = x$name,
                      refSeqID = x$refSeqID,
                      uniProtID = x$uniProtID,
                      taxID = x$taxID,
                      genomeXref = x$genomeXref,
                      genomeFrom = x$genomeFrom,
                      genomeTo = x$genomeTo,
                      seq = x$seq,
                      stringsAsFactors = FALSE)
  # put columns in correct table order
  entry <- entry[ , colnames(database$protein)]

  # check whether refSeqID or uniProtID already exist in the
  # database. Overwrite or duplicate as requested.
  # (== with na.rm = TRUE: an NA ID never counts as a duplicate.)
  duplicate <- any(database$protein$refSeqID == entry$refSeqID,
                   na.rm = TRUE) ||
               any(database$protein$uniProtID == entry$uniProtID,
                   na.rm = TRUE)

  if (duplicate) {
    if (!override) {
      stop(paste("refSeqID and/or uniProtID already exist in database.\n",
                 " You must set \"override = TRUE\" to add duplicate entries",
                 "or replace existing entries."))
    } else if (replace) {
      pid <- database$protein$id[which(database$protein$refSeqID ==
                                       entry$refSeqID)]
      pid <- c(pid,
               database$protein$id[which(database$protein$uniProtID ==
                                         entry$uniProtID)])
      pid <- unique(pid)
      if (length(pid) == 1) {
        # replace existing unique entry
        entry$id <- pid
        database$protein[which(database$protein$id == entry$id), ] <- entry
      } else {
        stop(paste("More than one matching refSeqID and/or uniProtID found in database.\n",
                   " This function can't replace more than one entry.",
                   "First delete additional entries."))
      }
    } else {
      # do not replace existing but
      # add entry as new record to database
      database$protein <- rbind(database$protein, entry)
    }
  } else {
    # not duplicate
    database$protein <- rbind(database$protein, entry)
  }
  return(database)
}
printArgs(addProteinToDB)

# ==== addFeatureToDB =============================================
addFeatureToDB <- function(database, x) {
  # Add new proteinFeature table entries to the database, with associated
  # feature table entries. x is a feature data list item produced by
  # fetchPrositeFeatures.
  # Returns the updated database.
  if (missing(database)) {
    stop("\"database\" argument is missing with no default.")
  }
  checkVersion(database)

  if (!missing(x)) {
    # expect that x is list of feature items
    if (is.null(x$type)) {
      stop("Argument is not a feature list.")
    }
    expectedType <- "Feature Data V 1.1"
    if (x$type != expectedType) {
      stop(paste("List of feature data items has type \"", x$type,
                 "\". Expecting \"", expectedType, "\" instead.", sep=""))
    }
  } else {
    stop("Argument missing. Function needs a feature list to work with.")
  }

  len <- length(x$extID)
  if (len == 0) {
    cat("WARNING: skipping this empty list.\n")
  } else {
    testIntegrity(x$extID, len, is.character)
    testIntegrity(x$start, len, is.numeric)
    testIntegrity(x$end, len, is.numeric)
    testIntegrity(x$id, len, is.character)
    testIntegrity(x$def, len, is.character)

    pID <- x$pID

    # Iterate over all features in table
    for (i in safeIter(len)) {
      # get existing feature ID or add new feature to table
      fID <- getTableID(database, "feature", featureID = x$id[i])
      if (length(fID) == 0) {
        # new ID: add to table
        entry <- data.frame(id = x$id[i],
                            def = x$def[i],
                            stringsAsFactors = FALSE)
        database$feature <- rbind(database$feature, entry)
        fID <- x$id[i]
      }

      # make or create pfID for proteinFeature table
      if (length(database$proteinFeature$id) == 0) {
        pfID <- 1
      } else {
        pfID <- max(database$proteinFeature$id) + 1
      }

      # we may add the same feature to the same protein several
      # times, even with the same start and end coordinates. We
      # don't enforce uniqueness here (but may at a later point).

      # Make entry for proteinFeature table
      entry <- data.frame(id = pfID,
                          proteinID = pID,
                          featureID = fID,
                          start = x$start[i],
                          end = x$end[i],
                          stringsAsFactors = FALSE)
      database$proteinFeature <- rbind(database$proteinFeature,
                                       entry[colnames(database$proteinFeature)])
    }  # end for (i in safeIter(len))
  }  # end if (len == 0)
  return(database)
}
printArgs(addFeatureToDB)

testIntegrity <- function(x, len, fConfirm) {
  # utility function to test whether feature values exist and
  # are of the expected type. fConfirm is a predicate such as
  # is.character or is.numeric. Stops on the first violation.
  # The NA test comes before the comparison with "" so that the
  # short-circuiting || never has to evaluate an NA condition.
  if (length(x) == 0 || any(is.na(x)) || any(x == "") ||
      !any(fConfirm(x))) {
    stop(paste("Data integrity check failed:", paste(x, collapse=" ")))
  }
  if (length(x) != len) {
    stop(paste("Vector does not have expected length", len, ":",
               paste(x, collapse=" ")))
  }
}

# ==== setTableValue =======================================
setTableValue <- function(database, table,
                          id = NULL,
                          name = NULL,
                          refSeqID = NULL,
                          uniProtID = NULL,
                          taxID = NULL,
                          genomeXref = NULL,
                          genomeFrom = NULL,
                          genomeTo = NULL,
                          seq = NULL,
                          organismName = NULL) {
  # Set database values for existing entries.
  # table == "protein":  the row is selected by id; every non-NULL
  #                      attribute argument is written to it.
  # table == "taxonomy": the row is selected by taxID; organismName
  #                      is written if it is given and not empty.
  # Returns the updated database.
  if (missing(database) || missing(table)) {
    stop("Database or table is missing with no default.")
  }
  checkVersion(database)
  table <- str_trim(table)

  if (table == "protein") {
    if (is.null(id)) {
      stop("Protein id is missing with no default.")
    }
    row <- which(database$protein$id == id)
    if (length(row) == 0) {
      # without this check the assignments below silently do nothing
      stop(paste("Protein id", id, "was not found in protein table."))
    }
    if (!is.null(name)) {
      database$protein[row, "name"] <- as.character(name)
    }
    if (!is.null(refSeqID)) {
      database$protein[row, "refSeqID"] <- as.character(refSeqID)
    }
    if (!is.null(uniProtID)) {
      database$protein[row, "uniProtID"] <- as.character(uniProtID)
    }
    if (!is.null(taxID)) {
      # must be numeric ...
      if (!is.numeric(taxID)) {
        stop(paste("taxID", taxID, "is not numeric. Please correct."))
      }
      # must exist in taxonomy table ...
      if (!any(database$taxonomy$id == taxID, na.rm = TRUE)) {
        # new taxID
        stop(paste("taxID", taxID, "not found in taxonomy table.",
                   "Please update taxonomy table and try again."))
      }
      # all good, update it...
      database$protein[row, "taxID"] <- taxID
    }
    if (!is.null(genomeXref)) {
      database$protein[row, "genomeXref"] <- genomeXref
    }
    if (!is.null(genomeFrom)) {
      database$protein[row, "genomeFrom"] <- genomeFrom
    }
    if (!is.null(genomeTo)) {
      database$protein[row, "genomeTo"] <- genomeTo
    }
    if (!is.null(seq)) {
      database$protein[row, "seq"] <- in2seq(seq)
    }
  } else if (table == "taxonomy") {
    if (is.null(taxID)) {
      stop("taxID is missing with no default.")
    }
    if (!any(database$taxonomy$id == taxID, na.rm = TRUE)) {
      stop(paste(" Can't set values for this taxID.", taxID,
                 "was not found in taxonomy table."))
    }
    row <- which(database$taxonomy$id == taxID)
    # organismName defaults to NULL: test for that first, comparing
    # NULL with "" is not a valid if() condition.
    if (!is.null(organismName) && organismName != "") {
      database$taxonomy[row, "organismName"] <- organismName
    }
  } else {
    stop(paste("This function has no code to update table \"", table, "\".",
               "Please enter a valid table name."))
  }
  return(database)
}
printArgs(setTableValue)

# ==== getTableID =============================================
getTableID <- function(database, table,
                       name = NULL,
                       refSeqID = NULL,
                       uniProtID = NULL,
                       taxID = NULL,
                       organismName = NULL,
                       proteinID = NULL,
                       featureID = NULL,
                       featureDef = NULL) {
  # Get a vector of IDs from a database table from all rows
  # for which all of the requested attributes are true.
  # Note: if no restrictions are entered, ALL ids are returned.
  # Currently there is no code to select from genome coordinates,
  # or query for sequence substrings.
  if (missing(database) || missing(table)) {
    stop("Database or table is missing with no default.")
  }
  checkVersion(database)

  if (table == "protein") {
    sel <- rep(TRUE, nrow(database$protein))  # initialize
    if (!is.null(name)) {
      sel <- sel & database$protein[, "name"] %in% name
    }
    if (!is.null(refSeqID)) {
      sel <- sel & database$protein[, "refSeqID"] %in% refSeqID
    }
    if (!is.null(uniProtID)) {
      sel <- sel & database$protein[, "uniProtID"] %in% uniProtID
    }
    if (!is.null(taxID)) {
      sel <- sel & database$protein[, "taxID"] %in% taxID
    }
    sel <- database$protein$id[sel]  # get ids by selecting from vector
  } else if (table == "taxonomy") {
    sel <- rep(TRUE, nrow(database$taxonomy))  # initialize
    if (!is.null(taxID)) {
      sel <- sel & database$taxonomy[, "id"] %in% taxID
    }
    if (!is.null(organismName)) {
      sel <- sel & database$taxonomy[, "organismName"] %in% organismName
    }
    sel <- database$taxonomy$id[sel]  # get ids by selecting from vector
  } else if (table == "proteinFeature") {
    sel <- rep(TRUE, nrow(database$proteinFeature))  # initialize
    if (!is.null(proteinID)) {
      sel <- sel & database$proteinFeature[, "proteinID"] %in% proteinID
    }
    if (!is.null(featureID)) {
      sel <- sel & database$proteinFeature[, "featureID"] %in% featureID
    }
    sel <- database$proteinFeature$id[sel]  # get ids by selecting from vector
  } else if (table == "feature") {
    sel <- rep(TRUE, nrow(database$feature))  # initialize
    if (!is.null(featureID)) {
      sel <- sel & database$feature[, "id"] %in% featureID
    }
    if (!is.null(featureDef)) {
      sel <- sel & database$feature[, "def"] %in% featureDef
    }
    sel <- database$feature$id[sel]  # get ids by selecting from vector
  } else {
    stop(paste("This function has no code to select from table \"",
               table, "\".",
               "Please enter a valid table name."))
  }
  return(sel)
}
printArgs(getTableID)

# ==== getSeq ==============================================
getSeq <- function(database, ...) {
  # Retrieves all sequences for given id matches from the
  # protein table. Uppercase, to make Biostrings happy.
  # The ... arguments are passed on to getTableID().
  if (missing(database)) {
    stop("Database argument is missing with no default.")
  }
  checkVersion(database)

  IDs <- getTableID(database, table= "protein", ...)
  seq <- database$protein[database$protein$id %in% IDs, "seq"]
  return(toupper(seq))
}
printArgs(getSeq)

# ==== getFeatureFasta =====================================
getFeatureFasta <- function(database, fName, ids,
                            exactlyOne = FALSE,
                            outFormat = "raw",
                            width=60) {
  # Retrieves sequences for given fName matches in
  # multi FASTA format.
  # ids is a vector of unique identifiers from the protein
  #   table. If ids is missing, we retrieve feature
  #   sequences for all ids.
  # If exactlyOne is TRUE, we guarantee one sequence for
  #   each id. If an id does not have a requested feature,
  #   a pseudo sequence of ten hyphens is returned. If an
  #   id has more than one of a requested feature, only
  #   the first one is returned, with a warning.
  # outFormat = raw returns a raw string that can be displayed
  #   with cat(). Each header and each line of sequence is
  #   "\n" terminated.
  # outFormat = df returns a dataframe with head and seq
  #   columns. No "\n" is added to the strings.
  # width controls the number of characters in a line of
  #   raw string output.
  if (missing(database)) {
    stop("Database argument is missing with no default.")
  }
  if (missing(fName)) {
    stop("Feature name argument is missing with no default.")
  }
  checkVersion(database)
  if (missing(ids)) {
    # use all ids in the database
    ids <- database$protein$id
  }
  fName <- str_trim(fName)

  # get the Feature ID for the requested fName
  fID <- getTableID(database, table= "feature", featureDef = fName)
  if (length(fID) == 0) {
    stop(paste("No entry in feature table for feature Name \"", fName, "\"."))
  }
  if (length(fID) > 1) {
    stop(paste("Feature name \"", fName,
               "\" is not unique in the features table.\n",
               "I can't tell which one to use."))
  }

  # prepare empty "fasta" output object
  if (outFormat == "raw") {
    fasta <- character()
  } else if (outFormat == "df") {
    fasta <- data.frame(head = character(),
                        seq = character(),
                        stringsAsFactors = FALSE)
  } else {
    stop("Unknown format \"", outFormat, "\" requested.")
  }

  # process all protein IDs
  for (i in safeIter(length(ids))) {
    pID <- ids[i]
    if (!pID %in% database$protein$id) {
      stop(paste("ID \"", pID, "\" is not present in protein table."))
    }
    pRow <- which(database$protein$id == pID)

    # fetch the data that is the same for each proteinFeature
    tmp <- database$protein$taxID[pRow]
    tmp <- database$taxonomy[database$taxonomy$id == tmp, "organismName"]
    pCode <- biCode(tmp)
    pName <- database$protein$name[pRow]

    # get all proteinFeature ids that match pID AND fID
    pfIDs <- getTableID(database, table = "proteinFeature",
                        proteinID = pID, featureID = fID)

    if (length(pfIDs) > 1 && exactlyOne) {
      # there's more than one match but we requested to get only
      # one ... drop a warning and use only the first one.
      warning(" More than one ", fName, " annotation found for ", pName,
              " (pfIDs: ", paste(pfIDs, collapse=", "), ").\n",
              " Using only the first one.")
      pfIDs <- pfIDs[1]
    }

    # prepare a dataframe to collect the information
    fOut <- data.frame(start = numeric(),
                       end = numeric(),
                       seq = character(),
                       stringsAsFactors = FALSE)

    if (length(pfIDs) == 0) {
      # No annotation of this feature for this pID ...
      if (exactlyOne) {
        # but we requested to get at least one ...
        # make a dummy feature sequence
        fTmp <- data.frame(start = 0,
                           end = 0,
                           seq = "----------",
                           stringsAsFactors = FALSE)
        fOut <- rbind(fOut, fTmp)
      }
      # otherwise: do nothing
    } else {
      # Annotation exists, process it
      for (j in safeIter(length(pfIDs))) {
        # pfIDs holds table ids, not row numbers: look up the row
        # explicitly, the two need not coincide.
        pfRow <- match(pfIDs[j], database$proteinFeature$id)
        fStart <- database$proteinFeature$start[pfRow]
        fEnd <- database$proteinFeature$end[pfRow]
        fSeq <- substr(database$protein$seq[pRow], fStart, fEnd)
        fTmp <- data.frame(start = fStart,
                           end = fEnd,
                           seq = fSeq,
                           stringsAsFactors = FALSE)
        fOut <- rbind(fOut, fTmp)
      }
    }

    # process fOut according to the requested output format
    for (j in safeIter(nrow(fOut))) {
      start <- fOut$start[j]
      end <- fOut$end[j]
      header <- paste(">", pName, "_", pCode, " ", fName, " ",
                      start, ":", end, sep = "")
      seq <- fOut$seq[j]
      if (outFormat == "raw") {
        fasta <- paste(c(fasta, header, "\n", chopString(seq, width)),
                       sep = "", collapse ="")
      }
      if (outFormat == "df") {
        fasta <- rbind(fasta,
                       data.frame(head = header,
                                  seq = seq,
                                  stringsAsFactors = FALSE))
      }
    }
  }  # end for (i in safeIter(length(ids)))
  return(fasta)
}
printArgs(getFeatureFasta)

chopString <- function(s, w) {
  # Chop strings into lines of width w.
  # All lines are terminated with a newline "\n"
  pattern <- paste(".{1,", w, "}", sep="")
  tmp <- paste(unlist(str_match_all(s, pattern)[[1]]), collapse = "\n")
  return(paste(tmp, "\n", sep=""))
}

# ==== writeSeqSet =============================================
writeSeqSet <- function(seqSet, file, format = "mfa", blockSize = 50) {
  # Output a seqSet of class "MsaAAMultipleAlignment" or
  # "AAStringSet" to multi-FASTA format. FASTA headers are
  # taken from object names.
  # if format == "mfa", write a multi FASTA output
  # if format == "ali", write a Clustal W output
  # If file is missing, output goes to the console; otherwise
  # it is diverted to that file.
  MAXNAMEWID <- 15  # Maximum name width for "ali" format
  format <- str_trim(format)

  if (missing(seqSet)) {
    stop("Input object missing from arguments with no default.")
  }
  # Reject unsupported objects before any output is diverted.
  seqClass <- class(seqSet)[1]
  if (!seqClass %in% c("MsaAAMultipleAlignment", "AAStringSet")) {
    stop(paste("Input object of class", seqClass,
               "can't be handled by this function."))
  }

  if (!missing(file)) {
    file <- str_trim(file)
    sink(file)                    # divert output to file
    on.exit(sink(), add = TRUE)   # close the diversion, also on error
  }

  # Extract the raw data from the objects depending on
  # their respective class and put this
  # into a named vector of strings.
  if (seqClass == "MsaAAMultipleAlignment") {
    strings <- character(nrow(seqSet))
    for (i in seq_len(nrow(seqSet))) {
      strings[i] <- as.character(seqSet@unmasked[i])
      names(strings)[i] <- seqSet@unmasked@ranges@NAMES[i]
    }
  } else {
    # "AAStringSet"
    strings <- character(length(seqSet))
    for (i in seq_len(length(seqSet))) {
      strings[i] <- as.character(seqSet[i])
      names(strings)[i] <- seqSet@ranges@NAMES[i]
    }
  }

  if (format == "mfa") {
    for (i in seq_along(strings)) {
      # output FASTA header
      cat(paste(">", names(strings)[i], "\n", sep=""))
      # output sequence in blocks
      cat(chopString(strings[i], blockSize))
      cat("\n")  # output an empty line
    }
  } else if (format == "ali") {
    SEP <- paste(rep(" ", MAXNAMEWID), collapse="")
    cat("CLUSTAL W formatted alignment\n")
    cat("\n")
    pattern <- paste(".{1,", blockSize, "}", sep="")
    allBlocks <- character()
    for (i in seq_along(strings)) {
      # make labels for rownames: pad or truncate to MAXNAMEWID
      label <- substr(paste(names(strings)[i], SEP, sep = ""),
                      1, MAXNAMEWID)
      label <- paste(label, " ", sep="")
      # chop strings into blocks
      blocks <- unlist(str_match_all(strings[i], pattern)[[1]])
      dim(blocks) <- c(1, length(blocks))
      rownames(blocks) <- label
      allBlocks <- rbind(allBlocks, blocks)
    }
    # one alignment block per column, one sequence per row
    for (i in seq_len(NCOL(allBlocks))) {
      for (j in seq_len(nrow(allBlocks))) {
        cat(rownames(allBlocks)[j])
        cat(allBlocks[j, i])
        cat("\n")
      }
      cat("\n\n")
    }
  }
}
printArgs(writeSeqSet)

# ==== mergeDB =====================================
# This function is now retired. If it is reactivated: caution -
# can't simply merge protein table but need to construct unique IDs
# and update proteinID's in proteinFeature table.
# mergeDB <- function(old, new) {
#   # Merge records in protein and taxonomy table
#   # from old to new
#   oTx <- old$taxonomy$id
#   nTx <- new$taxonomy$id
#   missing <- !(oTx %in% nTx)
#   new$taxonomy <- rbind(new$taxonomy, old$taxonomy[missing, ])
#   oRef <- old$protein$refSeqID
#   nRef <- new$protein$refSeqID
#   missing <- !(oRef %in% nRef)
#   new$protein <- rbind(new$protein, old$protein[missing, ])
#   return(new)
# }
# printArgs(mergeDB)

# ==== biCode ==============================================
biCode <- function(s) {
  # Creates an uppercase five-letter code from a
  # binomial species name. Drops strain IDs etc.
  # Characters 4 and 5 of the name are overwritten with the first
  # two letters of the second word, then the first five characters
  # are returned in upper case.
  s <- str_trim(s)
  substr(s, 4, 5) <- substr(strsplit(s,"\\s+")[[1]][2], 1, 2)
  return (toupper(substr(s, 1, 5)))
}
printArgs(biCode)

# ==== makeNames ===========================================
makeNames <- function(database, width = 10, ...) {
  # Return a vector of meaningful names for database
  # proteins. Names are the last (width - six) characters
  # of the proteinName entry, plus a biCode, separated
  # by an underscore.
  # The ... arguments are passed on to getTableID() to select
  # the proteins.
  if (missing(database)) {
    stop("Database argument is missing with no default.")
  }
  checkVersion(database)

  ids <- getTableID(database, table= "protein", ...)
  vNames <- character(length(ids))
  # seq_along(): no iteration at all if no protein was selected
  for (i in seq_along(ids)) {
    # fetch the name
    n <- database$protein[database$protein$id == ids[i], "name"]
    # replace blanks with underscores
    n <- gsub("\\s+", "_", n)
    # take only the last (width - six) characters
    n <- substr(n, nchar(n) - (width - 6) + 1, nchar(n))
    # fetch the organismName
    tax <- database$protein[database$protein$id == ids[i], "taxID"]
    s <- database$taxonomy[database$taxonomy$id == tax, "organismName"]
    # make a five-letter code from the species name
    s <- biCode(s)
    # paste it together
    vNames[i] <- paste(n, s, sep="_")
  }
  return(vNames)
}
printArgs(makeNames)

# ==== updateVerifiedFile ==================================
updateVerifiedFile <- function(key,
                               fromPath = "http://steipe.biochemistry.utoronto.ca/abc/images/",
                               fromFile = "f/f9/DbUtilities.R",
                               toPath = PROJECTDIR,
                               toFile = "dbUtilities.R",
                               sourceDownloadedFile = TRUE) {
  # Fetch file from URL, verify it, save it, and source it.
  # Defaults are to download THIS file from the
  # course repository for update, and replace its
  # old copy locally.
  # key is the expected SHA-1 hash as produced by makeKey().
  # Requires package digest.
  # Note: toPath is pasted directly in front of toFile.
  sourceFile <- paste(fromPath, fromFile, sep="")
  targetFile <- paste(toPath, toFile, sep="")

  # download file to temporary file; remove it on every exit path
  tmpFile <- tempfile()
  on.exit(unlink(tmpFile), add = TRUE)
  download.file(sourceFile, tmpFile, method="auto")

  # verify hash
  sha1 <- digest(paste(readLines(tmpFile, warn=FALSE), collapse=" "),
                 file=FALSE, algo="sha1")
  if (sha1 != key) {
    stop("SHA-1 hash of downloaded file does not match the key passed to this function.")
  }

  # Install the verified file. file.copy() is used because renaming
  # can fail between different file systems; a failure must not go
  # unnoticed, or the stale local file would be sourced below.
  if (!file.copy(from = tmpFile, to = targetFile, overwrite = TRUE)) {
    stop(paste("Could not write verified file to \"", targetFile, "\".",
               sep=""))
  }
  if (sourceDownloadedFile) {
    # run the file...
    source(targetFile)
  }
}
printArgs(updateVerifiedFile)

# Undocumented function to verify the update key:
verifyKey <- function(key,
                      fromPath = "http://steipe.biochemistry.utoronto.ca/abc/images/",
                      fromFile = "f/f9/DbUtilities.R") {
  # Downloads the file, computes its SHA-1 digest in the same way as
  # updateVerifiedFile() and reports whether it matches key.
  sourceFile <- paste(fromPath, fromFile, sep="")
  tmpFile <- tempfile()
  download.file(sourceFile, tmpFile, method="auto")
  sha1 <- digest(paste(readLines(tmpFile, warn=FALSE), collapse=" "),
                 file=FALSE, algo="sha1")
  if (sha1 == key) {
    cat("\nSuccess: key matches digest of downloaded file.\n\n")
  } else {
    cat("\nPANIC: KEY DOESN'T MATCH DIGEST OF DOWNLOADED FILE.\n\n")
  }
  file.remove(tmpFile)
}

# ==== makeKey =============================================
makeKey <- function() {
  # Make a SHA-1 hash from the local version of this file
  # for verifying downloads.
  # The pattern is anchored so that files such as
  # "dbUtilities.R.bak" are not picked up as well.
  localFile <- dir(paste(PROJECTDIR), pattern="^dbUtilities\\.R$",
                   full.names=TRUE)
  if (length(localFile) != 1) {
    stop("Could not find exactly one local copy of dbUtilities.R.")
  }
  sha1 <- digest(paste(readLines(localFile, warn=FALSE), collapse=" "),
                 file=FALSE, algo="sha1")
  return(sha1)
}
printArgs(makeKey)

# ==== LOAD refDB REFERENCE DATABASE =======================
# 48 reference APSES domain proteins from 10 fungi and
# E. Coli as outgroup. ScanProsite annotations included.
if (LOADREFDB) { refDB <- structure(list(version = "Protein Data 1.0", taxonomy = structure(list( id = c(240176L, 283643L, 930090L, 671144L, 418459L, 237631L, 227321L, 367110L, 284812L, 559292L, 562L), organismName = c("Coprinopsis cinerea okayama7#130", "Cryptococcus neoformans var. neoformans B-3501A", "Bipolaris oryzae ATCC 44560", "Wallemia mellicola CBS 633.66", "Puccinia graminis f. sp. 
tritici CRL 75-36-700-3", "Ustilago maydis 521", "Aspergillus nidulans FGSC A4", "Neurospora crassa OR74A", "Schizosaccharomyces pombe 972h-", "Saccharomyces cerevisiae S288c", "Escherichia coli")), .Names = c("id", "organismName"), row.names = c(NA, 11L), class = "data.frame"), protein = structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48), name = c("CC1G_01306", "CNBB4890", "COCMIDRAFT_338", "WALSEDRAFT_68476", "PGTG_08863", "CC1G_00846", "WALSEDRAFT_59726", "UMAG_11222", "COCMIDRAFT_8533", "PGTG_02039", "AN3154.2", "AN6715.2", "NCU07587", "UMAG_05338", "NCU07246", "CNBD0840", "res1", "MBP1", "res2", "SWI4", "cdc10", "AN5836.2", "PHD1", "Asm-1", "SOK2", "CC1G_08099", "UMAG_15042", "COCMIDRAFT_103135", "PGTG_11943", "UMAG_04778", "WALSEDRAFT_68479", "PGTG_03082", "bqt4", "COCMIDRAFT_81480", "NCU06560", "AN5405.2", "AN0162.2", "PGTG_05590", "COCMIDRAFT_105954", "WALSEDRAFT_69819", "CNBB2840", "CC1G_13964", "COCMIDRAFT_96253", "CC1G_14426", "XBP1", "NCU06339", "UMAG_11055", "KilA"), refSeqID = c("XP_001837394", "XP_776961", "XP_007682304", "XP_006957790", "XP_003327086", "XP_001831299", "XP_006957051", "XP_011392621", "XP_007691662", "XP_003320997", "XP_660758", "XP_664319", "XP_962967", "XP_011392041", "XP_955821", "XP_776035", "NP_595496", "NP_010227", "NP_593032", "NP_011036", "NP_596132", "XP_663440", "NP_012881", "XP_960837", "NP_013729", "XP_001836714", "XP_011388143", "XP_007690905", "XP_003330006", "XP_011391646", "XP_006957792", "XP_003321545", "NP_596166", "XP_007682909", "XP_962267", "XP_663009", "XP_657766", "XP_003323688", "XP_007691967", "XP_006959479", "XP_777052", "XP_002911924", "XP_007688318", "XP_002911429", "NP_012165", "XP_962373", "XP_011390537", "WP_000200358"), uniProtID = c("A8NYC6", "F5HFJ0", "W6ZM86", "I4YDD8", "E3KED4", "A8N8X1", "I4YGC0", "A0A0D1DP35", "W6ZE71", "E3JX03", "Q5B8H6", 
"Q5AYB5", "Q7SBG9", "A0A0D1BWD8", "Q7RW59", "F5HBL5", "P33520", "P39678", "P41412", "P25302", "P01129", "P36011", "P36093", "Q1K6U0", "P53438", "A8NVH3", "A0A0D1CVS5", "W6ZGE9", "E3KMR2", "A0A0D1DQM4", "I4YDE0", "E3JYK1", "O60158", "W6ZKJ4", "Q7S9H5", "Q5B225", "Q5BH18", "E3K4V4", "W6Z1H5", "I4Y911", "F5HHR9", "D6RKH9", "W6ZCR2", "D6RMB0", "P40489", "Q7S9W7", "A0A0D1DZM8", "D7ZJS9"), taxID = c(240176L, 283643L, 930090L, 671144L, 418459L, 240176L, 671144L, 237631L, 930090L, 418459L, 227321L, 227321L, 367110L, 237631L, 367110L, 283643L, 284812L, 559292L, 284812L, 559292L, 284812L, 227321L, 559292L, 367110L, 559292L, 240176L, 237631L, 930090L, 418459L, 237631L, 671144L, 418459L, 284812L, 930090L, 367110L, 227321L, 227321L, 418459L, 930090L, 671144L, 283643L, 240176L, 930090L, 240176L, 559292L, 367110L, 237631L, 562L), genomeXref = c(NA, "NC_009178.1", NA, NA, NA, NA, NA, "NC_026499.1", NA, NA, NA, NA, "NC_026504.1", "NC_026496.1", "NC_026505.1", "NC_009180.1", "NC_003423.3", "NC_001136.10", "NC_003424.3", "NC_001137.3", "NC_003423.3", NA, "NC_001143.9", "NC_026505.1", "NC_001145.3", NA, "NC_026480.1", NA, NA, "NC_026494.1", NA, NA, "NC_003423.3", NA, "NC_026504.1", NA, NA, NA, NA, NA, "NC_009178.1", NA, NA, NA, "NC_001141.2", "NC_026504.1", "NC_026488.1", NA), genomeFrom = c(NA, 2043901L, NA, NA, NA, NA, NA, 150554L, NA, NA, NA, NA, 160232L, 247386L, 6093554L, 292497L, 1232587L, 352876L, 686542L, 385875L, 2763829L, NA, 356747L, 3980679L, 305592L, NA, 689244L, NA, NA, 235674L, NA, NA, 2840822L, NA, 1632913L, NA, NA, NA, NA, NA, 792098L, NA, NA, NA, 177249L, 2857075L, 486507L, NA), genomeTo = c(NA, 2046558L, NA, NA, NA, NA, NA, 152738L, NA, NA, NA, NA, 156399L, 250703L, 6097220L, 289386L, 1235586L, 355377L, 689178L, 382594L, 2760740L, NA, 357847L, 3977114L, 303235L, NA, 691118L, NA, NA, 237248L, NA, NA, 2842833L, NA, 1630182L, NA, NA, NA, NA, NA, 790646L, NA, NA, NA, 175306L, 2854774L, 485047L, NA), seq = 
c("MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVINTRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRYRQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACAMGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVDVAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINNHDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKASTRCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLENALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQVQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAAGCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG", "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKEKETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVMTDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLGIGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLTAGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAKYYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILEDERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKERDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVREQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQIAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ", 
"MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAAAVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADGNRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDVGVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSSSNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKKGETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTASHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLNDEEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELVREVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV", "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQPRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFERGRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDGLSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDELGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIALIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASDFGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKEIQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLEGTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC", 
"MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETYHLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVKVSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSNLARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASSQSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESSQIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNHDLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDEDGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPGGTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERDLDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERARRITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRESVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESLNEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR", "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQGGYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPSNLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYHSQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRPPSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSFDVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSKKSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLANKLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAATRELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVSFSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVESEAQVVDIGRVSGFMQKVRDGII", 
"MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSKVFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAIDGSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRVNHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLSKLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSPLLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILRNIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQEECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENANKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ", "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRSRRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARYADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQTALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADYGDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGPAGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTEIEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALSTSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTMAAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGAP", 
"MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRRSDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGSHAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSATRPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPTEPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNTALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCAIKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLMEVQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAIDQTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKEDDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRMLKEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD", "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKVAGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSSASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIPHSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENPNSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQLLELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKHLVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEIRSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQNGIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTCTNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNKLLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL", 
"MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDVFSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVAAMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDLLRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKSSRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETASSIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKLTSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLEQIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKFDVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA", "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSGFEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEKVQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQPQQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNGRYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPERFQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQKAVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTASQPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPAGSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRSIDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLPSAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEILMRDRGQDW", 
"MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVNATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDTPTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADFPNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPTEPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTALHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAVKGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVGADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTLHSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKTQSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVESEKPELEIARVRRFLGGVEGVVH", "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTTSALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNIPTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQRAQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSLRPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAADGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRITVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLADLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVTNSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETALGIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISALTRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQNLRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVGTSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGPDVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA", 
"MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAVPTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFSTGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMGDVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMKIGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIPNKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTVQNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAEMATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDENETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEARGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGGGQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV", "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQHFQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSARTKILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGLPGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTMSMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGLDNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHSALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAGVKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLDAGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNATEASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVPGLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVSVCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS", 
"MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVPSERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMSGIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTALHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASSKSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSSKDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNLAMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELADISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS", "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTSIRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA", 
"MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSSSTLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDKYEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRLSQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLISIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLLQSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTLNEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPSDFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLIAMSCGINPEDLSLEILDAVEEALTREK", "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQFSKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSPGTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFNNDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINSSNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMKQPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEYKQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITKSIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDNIGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQQQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPSKILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKVSSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSLEKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTILQFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA", 
"MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMELSCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQPLISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQNVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTSKDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLVTNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQDKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPRQKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQKSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVSLCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA", "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSISSQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGMINGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQRNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSLSNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQRGSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTPRTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPDSRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR", "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYAVPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKSIAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRSEKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEIATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS", 
"MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAGQSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMTRGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRKDSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQGNSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQSSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAVNSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADMPSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAHRRR", "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQSQQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQSMAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGPPSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSYNLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATGSNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSMHLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNISAAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNATMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQLPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK", "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNSKADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASSNVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYSHSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAVLRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQEDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSAPHRPW", 
"MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSAHHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVNGMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWEDEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGIADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLSSLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMTAAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQRSGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE", "PRHSKQTTNLRCRLFASSILSPCQASRRIPPHARLSGQLTRESLRQTSQPWTPTKPLSREHVNTVKLELPSISSVHARGPADTWYPSHYATKPAVSGERLPALPQIQSHPSTSSNYSSPRGDSISSGSVSGGSASSNTSYAASVNGQTTGFKTPSPKHTPQSLRRDSQSLNTQSVQSSPFGTTQEGYSFAPSGYNSMNQMQSYADVHQSHMATAAHAPASAPPSGLSHYSYPPQPSMMQSQHQYSQGPPGYPPYGYPGGVPSQIPASSSMNQAMVPSTLQLPAMSSGAPASSLPGSQSYQTQTFDHTGQVAPPGMKPRVTATLWEDEGSLCFQVEAKGVCVARREDNHMINGTKLLNVAGMTRGRRDGILKSEKTRHVVKIGPMHLKGVWIPFERALEFANKEKITEQLYPLFVHDIGALLYHPSNQTRSSVGSAAMAAVDRNRRPDPMQTHQRYLSGPAASQPPSLHHHHSMSNPIATAISQPPHAIQPHPSSGRPGIDRAHTFPTPPTSASSIMGMGNQGSSYEWNGNNVQNPQGGQPLSIDTGLSNARSVPTTPASTPPGAVQQGMSYASGQSFDGSRPMYSGPPSQPGQYTQGQPMMGYRQDGSYPKTEMAPPSRINDVPDEGEVKQPDGMMPQGHEQVAPPPQGTEGEHDHGNEYTHSNASYNGNRGPYGYPPNGPPGAMHPDHPHLSPEMTGSPHQNGSGRATPRSAATGQPQWSSGYPTPQRQAPPSSNLYNVMSDPRGASNGNATHDAYQGPGAVPQYATQGYPPTNGVNSGKRGRDDEEEDPYRPDSVQGDDMSGLKRRKTLEGGAVGGPYADPTPGLQRAHTMTAQRGRR", 
"MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMVHFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGPGEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQARYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPSMRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPNQIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTAGCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVLTSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ", "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNKVNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMSQKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYGLESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAGYGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPDPESAQLFTIHDFGSDPFYAEQVERG", "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPFERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFSPQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSSSSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR", "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAVSTGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLPPTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSLFAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGKRDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHPRHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSAQANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGDAAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKRLIMEWNPSC", 
"MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSAFPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEGEPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPSSSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVNKGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITKLESEVYYEKRKVRALGGIAIGLGVGAILPFLF", "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRNRMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWIEALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKPDATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVETNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAKKAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV", "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSPPSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDPSDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATTVTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRVDEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAETEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYVANVL", "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQDFRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKMLNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIATEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSSPEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIPSSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYRDDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHDATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA", 
"MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSPQQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRALLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATLQSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISVEMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRERVRNRALMGVTAAFALAKPALVLLEA", "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYTKKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISASSFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRSPTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTNDENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTNTLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGGGGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA", "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQIYPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLRENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEETTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRPRSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCVGDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS", "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKIESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSSPAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPLKKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQNPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF", 
"MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANGTHHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLVQLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVTEASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVAVADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL", "MDAAIPTPRLQRNNTITIPKPSLVLRTPNQPKSKSKHIADDDSNNQPLVPSQTRQSHSIVPKPQDQIPEFPPPHVILHKDDAGSKVFHALARSLLSVDNRATTVKDLADLAVNNGLGCQNSSAATQAITTYLRMHNERCEADHDQPLLLSHNMSGTDADDDLIPALYSLQGGNPKKLCPNRKTNFRKNTTVWYLSRATGAACPFARAGIRLCDYDVLTEEDPEKEHKRRRSKHFDSISAGQKRKRPLRSCVASGLASDSESEKGEDKRPQKRLTLRIKLNGAFTPRPQPERSQVSSDDDSSDEEEPMEVDNSDRESEAPESKKEEEEEWRLPPYPRRSISIPCYTPSYEGAYPQFPLHNHYHDPFRRSPSLAFSSGSPPPDSEDEVDDFHITMTRTDDFPEDFSSESESEGETQFESPGPRSPSAPPLPSTSITVKEEPRDLQSMLDAWDDLDAGLTEPNVVRVEAGPLALKSEPLDLWDWDSEPTARIKQEDLSFDSLFPSDSAFSTPSLSSPSTSSRLTPEFTGSQSISHDDVQDSPSRSNTVRLRSKTVPVFSTPSSNDTPASLSVPPPPSLNARSNTISGESAESPLLPSSSSLPPSIASLIQSMNTLSAAVSPSSLVLSPVTPPSGSDAVVVHTCQPCSPPITATQIEDISVYQMVLGSFHFLRRIDTDFVNLSPIAAYNKSPFPVITTIPNATPIKGSPTVSGIWVPLSAAQAYLRDHPAEGSEFDIFLSDQLYERFPSALQDFVKSNVPTRSLNQFGRHFGSTLQQFTQHPPVPLTPNEVLARNPPQLQLQQITTPSPTSNAYTLSASLSISEKHHAAMIEPPLNAAEQEIFELCVVPDWDRDVDSGSSAPGSATPGPNAEPRGSRGESDMQVDEQECTTLEAGDSDSSSLTSLSSSPEVSGEDDLMKEGAGPSSPTLPAPPQSLKPSPSSSDEMETALPPASSDLDVGSIVKEGTKETSTGVSANEDSSTTVVPQRKRKGSSSRPNRPAPLRRSKRVAEIAAHHNPSSPSTSTPASVNTRSRRRGSRNSLS", "MTFDHTGQIAPPGAEPRVTADLWEEEGTRYFQVEARGVCVARREDNHMINGTKLLSAAGITRSRRDGILKSEKTRHVVKTGPMYL", "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEAEEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPPTNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRTKSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSKKPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTAITYLPNFL", 
"MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAAQNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASRENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAEFEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQSYKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQNVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSNDKNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTYDHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFKFKTNSKQ", "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDTMRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLRGISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSHSNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSPRPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQDQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQTSLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTTTATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDADDEGDYEHEQQYRRKRRRLLLVGRAKSF", "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPKLPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRFLDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEVAEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVRVESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRVVRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHLASILPW", 
"MTSFQLSLISREIDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQWLSPKFAVQVSRWVREWMSGERTTAEMPVHLKRYMVNRSRIPHTHFSILNELTFNLVAPLEQAGYTLPEKMVPDISQGRVFSQWLRDNRNVEPKTFPTYDHEYPDGRVYPARLYPNEYLADFKEHFNNIWLPQYAPKYFADRDKKALALIEKIMLPNLDGNEQF" )), .Names = c("id", "name", "refSeqID", "uniProtID", "taxID", "genomeXref", "genomeFrom", "genomeTo", "seq"), row.names = c(NA, 48L), class = "data.frame"), proteinFeature = structure(list( id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107), proteinID = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 44, 45, 46, 47, 48), featureID = c("ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS51299", "ps|PS50297", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", 
"ps|PS51299", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50297", "ps|PS50088", "ps|PS50297", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50297", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS50088", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS00152", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51299", "ps|PS51301"), start = c(6, 270, 270, 389, 17, 328, 328, 447, 9, 266, 266, 387, 83, 321, 321, 441, 90, 442, 442, 561, 17, 409, 409, 529, 6, 250, 250, 369, 7, 245, 245, 364, 56, 397, 397, 538, 43, 344, 344, 465, 9, 260, 260, 100, 440, 50, 396, 396, 536, 163, 590, 715, 14, 268, 268, 390, 114, 477, 477, 618, 6, 236, 236, 351, 5, 394, 394, 512, 8, 247, 247, 368, 37, 520, 520, 635, 641, 66, 356, 356, 483, 129, 186, 116, 414, 36, 233, 318, 465, 48, 35, 4, 273, 38, 66, 62, 103, 64, 125, 101, 26, 62, 34, 282, 1, 105, 7), end = c(112, 302, 426, 421, 123, 360, 479, 479, 115, 298, 424, 419, 192, 353, 473, 473, 196, 474, 598, 593, 125, 441, 561, 561, 112, 282, 406, 401, 113, 277, 401, 396, 163, 429, 580, 570, 149, 376, 511, 497, 115, 292, 302, 207, 516, 161, 428, 568, 568, 269, 747, 747, 123, 300, 422, 422, 222, 509, 650, 650, 112, 268, 268, 394, 111, 426, 549, 544, 115, 279, 400, 400, 147, 552, 552, 678, 673, 173, 382, 515, 515, 235, 292, 222, 520, 148, 339, 424, 474, 160, 147, 111, 380, 147, 176, 169, 221, 176, 241, 219, 145, 178, 148, 395, 62, 267, 110)), .Names = c("id", "proteinID", "featureID", "start", "end"), row.names = c(NA, 107L), class = "data.frame"), feature = structure(list(id = 
c("ps|PS51299", "ps|PS50088", "ps|PS50297", "ps|PS00152", "ps|PS51301"), def = c("HTH_APSES", "ANK_REPEAT", "ANK_REP_REGION", "ATPASE_ALPHA_BETA", "KILA_N" )), .Names = c("id", "def"), row.names = c(NA, 5L), class = "data.frame")), .Names = c("version", "taxonomy", "protein", "proteinFeature", "feature")) cat("\nLoaded \"refDB\" reference database object\n") } # ==== MESSAGE ============================================ cat("\n") # ==== TESTS ============================================= # TBD # [END]