#!/usr/bin/perl

#  fasta_clean.pl -- edits a fasta file descriptor line
#  used to modify fasta file descriptors from UCSC downloads to work
#  with MEME, or other programs that need distinct identifiers
#  
#  fasta_clean.pl --meme_chr fasta.file > clean_fasta_file
#
#  takes a UCSC sequence download file and either (1) changes all the
#  whitespace characters in each descriptor line to '_', or (2), with
#  --meme_chr, removes the text that does not specify the chromosome location
#
#  thus, starting with
#  >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower
#  fasta_clean.pl produces:
#  >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower
#  fasta_clean.pl --meme_chr input_file   produces:
#  >chr2:67108861-67108870

use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;

# Command-line flags:
#   $ucsc_clean  (--meme_chr)  reduce each header to its chromosome range
#   $help        (--help)      print the full POD documentation and exit
#   $shelp       (-h)          print the short usage message and exit
my ($ucsc_clean, $help, $shelp) = (0, 0, 0);

# GetOptions returns false on an unknown or malformed option; stop with a
# usage message instead of silently running in the default mode.
GetOptions(
    "h"        => \$shelp,
    "help"     => \$help,
    "meme_chr" => \$ucsc_clean,
) or pod2usage(2);

pod2usage(1) if $shelp;
pod2usage(-exitval => 0, -verbose => 2) if $help;

# With no file arguments, continue only when STDIN is a regular file or a
# pipe; otherwise there is nothing to read, so show the usage message.
pod2usage(1) unless (@ARGV || -f STDIN || -p STDIN);

while (my $line = <>) {
    # Remove the line terminator, whether LF or CRLF.  A plain chomp would
    # leave the "\r" of a CRLF file behind, and the default mode would then
    # turn it into a spurious trailing '_' on every identifier.
    $line =~ s/[\r\n]+\z//;

    # Only descriptor lines (starting with '>') are edited; sequence lines
    # pass through unchanged.
    $line = clean_header($line, $ucsc_clean) if $line =~ m/^>/;

    print "$line\n";
}

exit(0);

# clean_header($header, $chr_only)
#
# Returns the edited form of one FASTA descriptor line (without its line
# terminator).
#   $header    the descriptor line, beginning with '>'
#   $chr_only  true:  replace the whole line with '>' followed by the value
#                     of its "range=" field; the line is returned unchanged
#                     if it has no "range=" field after the first word
#              false: replace every whitespace character with '_'
sub clean_header {
    my ($header, $chr_only) = @_;

    if ($chr_only) {
        # (\S+) consumes the entire range value, so whatever follows is
        # either whitespace or the end of the line; ".*" covers both, which
        # also handles headers where "range=" is the last field.
        $header =~ s/^>\S+\s+range=(\S+).*$/>$1/;
    }
    else {
        $header =~ s/\s/_/g;
    }

    return $header;
}

=pod

=head1 NAME

fasta_clean.pl - edit the descriptor lines of a UCSC fasta download

=head1 SYNOPSIS

 fasta_clean.pl [--meme_chr] sequence_UCSC.txt > clean_fasta_file

=head1 OPTIONS

 -h	short help
 --help include description
 --meme_chr edit description to only provide chromosome location

=head1 DESCRIPTION

C<fasta_clean.pl> takes a UCSC sequence download file and either (1) changes
all the whitespace characters in each descriptor line to '_', or (2), with
--meme_chr, removes the text that does not specify the chromosome location.
Thus, starting with:

  >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower

C<fasta_clean.pl> produces:

  >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower

C<fasta_clean.pl --meme_chr input_file> produces:

  >chr2:67108861-67108870

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut