#!/usr/bin/perl -w =pod =head1 NAME bp_blastp_seq.pl =head1 SYNOPSIS bp_blastp_seq.pl --matrix BLOSUM62 --expect 5.0 protein_sequence_file =head1 OPTIONS -h short help --help include description -s/--matrix scoring matrix (BLOSUM45, BLOSUM62, BLOSUM80, PAM70, PAM30) -E/--expect E()-value threshold (default 5.0) --db database (human refseq by default) =head1 DESCRIPTION Ccompares the sequence (in FASTA format) in the file specfied and compares that sequence to human refseq proteins (default) at the NCBI BLASTP site, reporting the search parameter and the high scoring sequences. =head1 AUTHOR William R. Pearson, wrp@virginia.edu =cut use strict; use Bio::SeqIO; use Bio::Tools::Run::RemoteBlast; use Pod::Usage; use Getopt::Long; my ($acc,$database,$expect,$matrix) = ('', 'GP/9606.9558/RefSeq_protein', 5.0, ""); use vars qw($help $shelp); my %matrix_gaps = ( BLOSUM45 => "15 2", BLOSUM62 => "11 1", PAM70 => "10 1", PAM30 => "9 1", ); pod2usage(1) unless @ARGV; GetOptions("db:s"=>\$database, "E:f" => \$expect, "expect:f" => \$expect, "s:s" => \$matrix, "matrix:s" => \$matrix, "h|?" => \$shelp, "help" => \$help, ); pod2usage(1) if $shelp; pod2usage(exitstatus => 0, verbose => 2) if $help; my $file = shift @ARGV; my $seqio = Bio::SeqIO->new(-file => $file, -format => "fasta"); my $query_seq = $seqio->next_seq; # then setup a blast search my (@params, $remote_blast_object, $blast_file, $r, $rc); my $sleep_time = 2; @params = (-prog=>'blastp', -data=>$database, -expect=>$expect); if ($matrix && $matrix_gaps{$matrix}) { $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = $matrix; $Bio::Tools::Run::RemoteBlast::HEADER{'GAPCOSTS'} = $matrix_gaps{$matrix}; } # (possibly select taxon) # not used -- GP/9606.9558/RefSeq_protein does the correct select, and returns the correct database size # $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'txid9606 [ORGN]'; # E()-values and database sizes are often incorrect when ENTREZ_QUERY is used. $remote_blast_object = Bio::Tools::Run::RemoteBlast->new(@params); $r = $remote_blast_object->submit_blast($query_seq); while ( my @rids = $remote_blast_object->each_rid ) { foreach my $rid ( @rids ) { $rc = $remote_blast_object->retrieve_blast($rid); if( !ref($rc) ) { # $rc not a reference => error or job not yet finished if( $rc < 0 ) { $remote_blast_object->remove_rid($rid); print "Error return code for BlastID code $rid ... \n";} sleep $sleep_time; if ($sleep_time < 120) {$sleep_time *= 2;} } else { my $result = $rc->next_result(); my $filename = $result->query_name()."\.out"; $remote_blast_object->save_output($filename); $remote_blast_object->remove_rid($rid); # print description of search print "#Query: Name: ", $result->query_name(), "; Length: ",$result->query_length() . "\n"; print "#Database: Name: \"$database\"; Entries: ". $result->database_entries() . "; Residues: " . $result->database_letters() . "\n"; print "#Parameters: "; for my $param ( $result->available_parameters() ) { print " $param: ". $result->get_parameter($param)."; "; } print "\n"; # print list of hits: print "#Hits:\tACC \tE()\tbits\tf_id\ta_len\tdescr\n"; while ( my $hit = $result->next_hit ) { my $hit_descr = $hit->description(); # simplify concatenated descriptions if ($hit_descr =~ m/\001/) {($hit_descr) = split(/\001/,$hit_descr);} # clean up SwissProt results $hit_descr =~ s/RecName: Full=//g; $hit_descr =~ s/AltName: Full=//g; my @hsps = (); # get HSPs while ( my $hsp = $hit->next_hsp ) { push @hsps, {e_val=>$hsp->evalue(), bits => $hsp->bits(), f_id => $hsp->frac_identical(), q_start => $hsp->start('query'), q_end => $hsp->end('query'), s_start => $hsp->start('sbjct'), s_end => $hsp->end('sbjct'), a_len => $hsp->end('query') - $hsp->start('query') + 1, } } # clean up the target accession my $l_acc = $hit->name; $l_acc =~ s/ref\|//; $l_acc =~ s/\.\d+\|//; print $l_acc, "\t"; for my $hsp_p ( @hsps ) { printf("%.2g\t%.1f\t%.3f\t%d",$hsp_p->{e_val},$hsp_p->{bits}, $hsp_p->{f_id}, $hsp_p->{a_len}); } print "\t$hit_descr\n"; } } } } __END__