#!/usr/bin/perl -w

# down_genome_refseq.pl
#
# Download every RefSeq sequence for one taxon from NCBI and print the
# records to STDOUT in FASTA format.
#
# The work is done in two E-utilities steps:
#   1. ESearch (with usehistory=y) stores the matching ids on the NCBI
#      history server and reports the hit count plus a QueryKey/WebEnv pair.
#   2. EFetch is called repeatedly with that QueryKey/WebEnv pair, $retmax
#      records at a time, until all $count records have been printed.

use strict;
use LWP::Simple;
use Getopt::Long;
use Pod::Usage;

# Option defaults: search the protein database, no help requested.
my ($db, $shelp, $help) = ("protein", 0, 0);

GetOptions(
    'db:s' => \$db,
    'help' => \$help,
    'h|?'  => \$shelp,
);

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;
pod2usage(1) unless @ARGV;

my $taxon = shift @ARGV;

# The taxon is interpolated into a URL query string, so whitespace (as in
# a two-word organism name) must be sent as '+'.
(my $taxon_term = $taxon) =~ s/\s+/+/g;

# NCBI serves the E-utilities over HTTPS.
my $base  = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
my $query = "srcdb_refseq[prop]+AND+$taxon_term" . "[orgn]";

# Step 1: post the ESearch URL; the hits are kept on the history server.
my $esearch_url    = $base . "esearch.fcgi?db=$db&term=$query&usehistory=y";
my $esearch_result = get($esearch_url);

# LWP::Simple::get returns undef on any failure.
die "ESearch request failed: $esearch_url\n"
    unless defined $esearch_result;

# The first <Count> element of the reply is the total number of hits.
my ($count) = $esearch_result =~ m{<Count>(\d+)</Count>};
die "Could not find a hit count in the ESearch reply for '$taxon'\n"
    unless defined $count;
die "No sequences" if ($count < 1);

# QueryKey and WebEnv identify the stored result set for EFetch.
my ($querykey, $webenv) = $esearch_result =~ m{
    <QueryKey> (\d+)     </QueryKey> .*?
    <WebEnv>   ([^<\s]+) </WebEnv>
}xs;
die "Could not find QueryKey/WebEnv in the ESearch reply for '$taxon'\n"
    unless defined $querykey && defined $webenv;

# Step 2: fetch the stored result set in batches of $retmax records.
my $retmax = 400;

for (my $retstart = 0; $retstart < $count; $retstart += $retmax) {
    my $efetch_url = $base . "efetch.fcgi?"
        . "retstart=$retstart&retmax=$retmax"
        . "&db=$db&query_key=$querykey&WebEnv=$webenv"
        . "&rettype=fasta&retmode=text";

    my $fasta = get($efetch_url);

    # Stop rather than silently emit an incomplete sequence set.
    die "EFetch request failed for records starting at $retstart\n"
        unless defined $fasta;

    print $fasta;
}

__END__

=pod

=head1 NAME

down_genome_refseq.pl

=head1 SYNOPSIS

down_genome_refseq.pl taxon_id

=head1 OPTIONS

 -h      short help
 --help  include description
 --db    database: protein (default) | nucleotide

=head1 DESCRIPTION

C<down_genome_refseq.pl> downloads the set of RefSeq proteins belonging
to a taxonomy.  Given a human taxon argument, C<down_genome_refseq.pl>
downloads the set of human RefSeq proteins in FASTA format.

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut