#!/usr/bin/perl
# fasta_clean.pl -- edits a fasta file descriptor line
# used to modify fasta file descriptors from UCSC downloads to work
# with MEME, or other programs that need distinct identifiers
#
# fasta_clean.pl --meme_chr fasta.file > clean_fasta_file
#
# takes a UCSC sequence download file and either (1) changes all the spaces
# in each descriptor line to '_', or (2), with --meme_chr, removes the text
# that does not specify the chromosome location
#
# thus, starting with
# >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower
# fasta_clean.pl produces:
# >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower
# fasta_clean.pl --meme_chr input_file produces:
# >chr2:67108861-67108870
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
# Command-line flags: --meme_chr selects the "chromosome range only" header
# rewrite; -h and --help request the short and the full usage text.
my ($ucsc_clean, $help, $shelp) = (0,0,0);

# GetOptions returns false on an unknown or malformed option.  Stop with a
# usage message (exit status 2) instead of warning and then silently running
# in a mode the caller did not ask for.
GetOptions(
    "h" => \$shelp,
    "help" => \$help,
    "meme_chr" => \$ucsc_clean,
) or pod2usage(2);

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;

# With no file arguments, input must come from a redirected file or a pipe;
# otherwise show usage rather than reading from STDIN.
pod2usage(1) unless (@ARGV || -f STDIN || -p STDIN );

# Copy input to output line by line; only descriptor lines (those starting
# with '>') are rewritten, sequence lines pass through unchanged.
while (my $line = <>) {
    chomp($line);
    $line = clean_header($line, $ucsc_clean) if $line =~ m/^>/;
    print "$line\n";
}

exit(0);

# clean_header($header, $chr_only)
#
# Rewrite one chomped FASTA descriptor line and return the result.
#
#   $header    the descriptor line, starting with '>'
#   $chr_only  true:  reduce ">name range=LOCATION ..." to ">LOCATION";
#                     a header without a "range=" field is returned with
#                     only its trailing whitespace removed
#              false: replace every whitespace character with '_'
sub clean_header {
    my ($header, $chr_only) = @_;

    # Remove trailing whitespace first.  This covers the "\r" that chomp
    # leaves behind on CRLF input, which would otherwise turn into a
    # trailing '_' on the identifier in the default mode.
    $header =~ s/\s+\z//;

    if ($chr_only) {
        # Anything after the range value is optional, so a header that ends
        # right after "range=..." is rewritten too.
        $header =~ s/^>\S+\s+range=(\S+).*$/>$1/;
    }
    else {
        $header =~ s/\s/_/g;
    }

    return $header;
}
=pod

=head1 NAME

fasta_clean.pl - edit the descriptor lines of a UCSC FASTA download

=head1 SYNOPSIS

fasta_clean.pl --meme_chr sequence_UCSC.txt > clean_fasta_file

=head1 OPTIONS

 -h          short help
 --help      include description
 --meme_chr  edit description to only provide chromosome location

=head1 DESCRIPTION

C<fasta_clean.pl> takes a UCSC sequence download file and either (1)
changes all the spaces in each descriptor line to '_', or (2), with
--meme_chr, removes the text that does not specify the chromosome location.

Thus, starting with:

 >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower

C<fasta_clean.pl> produces:

 >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower

C<fasta_clean.pl --meme_chr> produces:

 >chr2:67108861-67108870

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut