#!/usr/bin/perl -w

=pod

=head1 NAME

 bp_blastp.pl

=head1 SYNOPSIS

 bp_blastp.pl --matrix BLOSUM62 --expect 5.0 protein_accession

=head1 OPTIONS

 -h	short help
 --help include description
 -s/--matrix scoring matrix (BLOSUM45, BLOSUM62, BLOSUM80, PAM70, PAM30)
 -E/--expect E()-value threshold (default 5.0)
 --db database (human refseq by default)

=head1 DESCRIPTION

C extracts the protein sequence specified by accession
(e.g. C or C) and compares that sequence to human
refseq proteins (default) at the NCBI BLASTP site, reporting the
search parameter and the high scoring sequences.

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut

use strict;
use Bio::DB::SwissProt;
use Bio::DB::GenPept;
use Bio::Tools::Run::RemoteBlast;

use Pod::Usage;
use Getopt::Long;

my ($acc,$database,$expect,$matrix) = ('', 'GP/9606.9558/RefSeq_protein', 5.0, "");
use vars qw($help $shelp);

my %matrix_gaps = ( BLOSUM45 => "15 2",
		    BLOSUM62 => "11 1",
		    PAM70 => "10 1",
		    PAM30 => "9 1",
    );

pod2usage(1) unless @ARGV;

GetOptions("db:s"=>\$database,
	   "E:f" => \$expect,
	   "expect:f" => \$expect,
	   "s:s" => \$matrix,
	   "matrix:s" => \$matrix,
	   "h|?" => \$shelp,
	   "help" => \$help,
    );

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;

$acc = shift @ARGV;

my ($query_seq, $gp);

if ($acc =~ m/[NXY]P_\d+/) {
  $gp = new Bio::DB::GenPept;
  $query_seq = $gp->get_Seq_by_acc($acc);
}
elsif ($acc =~ m/_/) {
  $gp = new Bio::DB::SwissProt;
  $query_seq = $gp->get_Seq_by_id($acc);
}
else {
  $gp = new Bio::DB::GenPept;
  $query_seq = $gp->get_Seq_by_acc($acc);
}

die "Cannot find $acc" unless $query_seq;

# then setup a blast search
my (@params, $remote_blast_object, $blast_file, $r, $rc);
my $sleep_time = 2;

@params = (-prog=>'blastp', -data=>$database, -expect=>$expect);

if ($matrix && $matrix_gaps{$matrix}) {
  $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = $matrix;
  $Bio::Tools::Run::RemoteBlast::HEADER{'GAPCOSTS'} = $matrix_gaps{$matrix};
}

# (possibly select taxon)
# not used -- GP/9606.9558/RefSeq_protein does the correct select, and returns the correct database size
# $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'txid9606 [ORGN]';
# E()-values and database sizes are often incorrect when ENTREZ_QUERY is used.

$remote_blast_object = Bio::Tools::Run::RemoteBlast->new(@params);

$r = $remote_blast_object->submit_blast($query_seq);

while ( my @rids = $remote_blast_object->each_rid ) {
  foreach my $rid ( @rids ) {
    $rc = $remote_blast_object->retrieve_blast($rid);
    if( !ref($rc) ) {    # $rc not a reference => error or job not yet finished
        if( $rc < 0 ) { $remote_blast_object->remove_rid($rid);
             print "Error return code for BlastID code $rid ... \n";}
        sleep $sleep_time; if ($sleep_time < 120) {$sleep_time *= 2;}
    }
    else {
      my $result = $rc->next_result();

      my $filename = $result->query_name()."\.out";
      $remote_blast_object->save_output($filename);
      $remote_blast_object->remove_rid($rid);
# print description of search
      print "#Query: Name: ", $result->query_name(), "; Length: ",$result->query_length() . "\n";
      print "#Database: Name: \"$database\"; Entries: ". $result->database_entries() . "; Residues: " . $result->database_letters() . "\n";
      print "#Parameters: ";
      for my $param ( $result->available_parameters() ) {
	  print "  $param: ". $result->get_parameter($param)."; ";
      }
      print "\n";
# print list of hits:
      print "#Hits:\tACC     \tE()\tbits\tf_id\ta_len\tdescr\n";
      while ( my $hit = $result->next_hit ) {
	my $hit_descr = $hit->description();
	# simplify concatenated descriptions
	if ($hit_descr =~ m/\001/) {($hit_descr) =  split(/\001/,$hit_descr);}
	# clean up SwissProt results
	$hit_descr =~ s/RecName: Full=//g;
	$hit_descr =~ s/AltName: Full=//g;
	my @hsps = ();
	# get HSPs
	while ( my $hsp = $hit->next_hsp ) {
	  push @hsps, {e_val=>$hsp->evalue(),
		       bits => $hsp->bits(),
		       f_id => $hsp->frac_identical(),
		       q_start => $hsp->start('query'),
		       q_end => $hsp->end('query'),
		       s_start => $hsp->start('sbjct'),
		       s_end => $hsp->end('sbjct'),
		       a_len => $hsp->end('query') - $hsp->start('query') + 1,
		      }
	}
	# clean up the target accession
	my $l_acc = $hit->name;
	$l_acc =~ s/ref\|//;
	$l_acc =~ s/\.\d+\|//;

	print $l_acc, "\t";
	for my $hsp_p ( @hsps ) {
	    printf("%.2g\t%.1f\t%.3f\t%d",$hsp_p->{e_val},$hsp_p->{bits}, $hsp_p->{f_id},
		   $hsp_p->{a_len});
	}
	print "\t$hit_descr\n";
      }
    }
  }
}


__END__