BioPerl example FASTA

From "A B C"
Jump to navigation Jump to search

BioPerl example: retrieving sequence data


The contents of this page has recently been imported from an older version of this Wiki. This page may contain outdated information, information that is irrelevant for this Wiki, information that needs to be differently structured, outdated syntax, and/or broken links. Use with caution!


The following short script containins a subroutine that retrieves sequence data from GenBank, SwissProt or EMBL.



 

Contents

#!/usr/bin/perl
use strict;
use warnings;

###########################################################################
# This script retrieves sequence data from GenBank, SwissProt and EMBL.
#
# Usage: 	retrieve.pl '<database type>|<sequence id>'
# Examples: 	retrieve.pl 'gi|22532774'    # GenBank, gi
# 		retrieve.pl 'gb|U41105.2'    # GenBank, accession number
# 		retrieve.pl 'spi|BOLA_HAEIN' # Swissprot, ID
# 		retrieve.pl 'spa|P43780'     # Swissprot, accession number
# 		retrieve.pl 'emi|BUM'        # EMBL, ID
#		retrieve.pl 'ema|J02231'     # EMBL, accession number 
# Description:	The script gives an example of how to use the subroutine
# 		to look at the sequence description or sequence, and to
# 		create a FASTA format file.
############################################################################

use Bio::DB::GenBank;
use Bio::DB::SwissProt;
use Bio::DB::EMBL;

my $full_id = $ARGV[0];
my $seq_obj_ref = &getSequence($full_id); 

# Using the output:
if ($seq_obj_ref == '0') { print "Failed.\n"; }
print $$seq_obj_ref->desc, "\n";
print $$seq_obj_ref->seq, "\n";

# To create a FASTA file:
my $out = Bio::SeqIO->new(-format => 'Fasta',
			  -file   => '>outputfile.fa');
$out->write_seq($$seq_obj_ref);

sub getSequence {
 	my $error_desc;
 	my ($db_obj, $seq_obj);
 
 	if ($full_id =~ /gi\|(.*)/) {
 		$db_obj = Bio::DB::GenBank->new;
 		$seq_obj = $db_obj->get_Seq_by_gi($1);
	} elsif ($full_id =~ /gb\|(.*)/) {
		$db_obj = Bio::DB::GenBank->new;
		$seq_obj = $db_obj->get_Seq_by_acc($1);
	} elsif ($full_id =~ /spi\|(.*)/) {
		$db_obj = Bio::DB::SwissProt->new;
		$seq_obj = $db_obj->get_Seq_by_id($1);
	} elsif ($full_id =~ /spa\|(.*)/) {
		$db_obj = Bio::DB::SwissProt->new;
		$seq_obj = $db_obj->get_Seq_by_acc($1);
	} elsif ($full_id =~ /emi\|(.*)/) {
		$db_obj = Bio::DB::EMBL->new;
		$seq_obj = $db_obj->get_Seq_by_id($1);
	} elsif ($full_id =~ /ema\|(.*)/) {
		$db_obj = Bio::DB::EMBL->new;
		$seq_obj = $db_obj->get_Seq_by_acc($1);
	} else {
		$error_desc = "Input type not recognized.";
		return 0;
	}	

	unless (defined($seq_obj)) {
		$error_desc = "Sequence retrieval failed.";
		return 0;
	}

	return \$seq_obj;
}


   

Further reading and resources