#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use Getopt::Long;
use Pod::Usage;
# Download every RefSeq record for one taxon from NCBI Entrez as FASTA.
#
# Flow: parse options -> esearch (stores the hit list on the Entrez history
# server) -> efetch the stored hit list in fixed-size batches -> print each
# batch to STDOUT.
#
# Exits through pod2usage() on bad/missing arguments and dies with a
# message on any failed HTTP request or unparseable esearch reply.

# Option state: --db selects the Entrez database, -h/-? prints the short
# usage and --help prints the full manual.
my ($db, $shelp, $help) = ("protein", 0, 0);
GetOptions('db:s' => \$db,
    'help' => \$help,
    'h|?' => \$shelp,
) or pod2usage(2);

# 'db:s' makes the value optional, so a bare "--db" leaves an empty string;
# fall back to the default rather than sending "db=" to the server.
$db = "protein" unless defined $db && length $db;

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;
pod2usage(1) unless @ARGV;

my $taxon = shift @ARGV;

# NCBI serves the E-utilities over HTTPS; see the E-utilities documentation.
my $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
my $query = "srcdb_refseq[prop]+AND+$taxon" . "[orgn]";
my $esearch_url = $base . "esearch.fcgi?db=$db&term=$query&usehistory=y";

# Post the esearch URL. LWP::Simple::get returns undef on any failure.
my $esearch_result = get($esearch_url);
die "esearch request failed: $esearch_url\n" unless defined $esearch_result;

# Pull the hit count and the history-server handle out of the XML reply.
# Each value is matched against its own element so that unrelated digits
# elsewhere in the document cannot be captured by mistake.  The first
# <Count> element is used; NOTE(review): this relies on the total count
# preceding any per-term counts in the reply -- confirm against the
# esearch output format.
my ($count)    = $esearch_result =~ m{<Count>(\d+)</Count>};
my ($querykey) = $esearch_result =~ m{<QueryKey>(\d+)</QueryKey>};
my ($webenv)   = $esearch_result =~ m{<WebEnv>([^<\s]+)</WebEnv>};

die "No sequences" if !defined $count || $count < 1;
die "esearch reply did not contain QueryKey/WebEnv\n"
    unless defined $querykey && defined $webenv;

# Fetch the stored result set in batches of $retmax records.
my $retmax = 400;
for (my $retstart = 0; $retstart < $count; $retstart += $retmax) {
    my $efetch_url = $base . "efetch.fcgi?"
        . "retstart=$retstart&retmax=$retmax"
        . "&db=$db&query_key=$querykey&WebEnv=$webenv"
        . "&rettype=fasta&retmode=text";

    # Stop on a failed batch instead of silently emitting a FASTA file
    # with a hole in it.
    my $fasta = get($efetch_url);
    die "efetch request failed at retstart=$retstart: $efetch_url\n"
        unless defined $fasta;

    print $fasta;
}
__END__
=pod

=head1 NAME

down_genome_refseq.pl

=head1 SYNOPSIS

 down_genome_refseq.pl taxon_id

=head1 OPTIONS

 -h      short help
 --help  include description
 --db    database: protein (default) | nucleotide

=head1 DESCRIPTION

C<down_genome_refseq.pl> downloads the set of RefSeq proteins
belonging to a taxon.

C<down_genome_refseq.pl 9606>
downloads the set of human RefSeq proteins in FASTA format.

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut