#!/usr/bin/perl

# fasta_clean.pl -- edits a fasta file descriptor line
#
# Used to modify fasta file descriptors from UCSC downloads to work
# with MEME, or other programs that need distinct identifiers.
#
#   fasta_clean.pl --meme_chr fasta.file > clean_fasta_file
#
# Takes a UCSC sequence download file and either
#   (1) changes all the whitespace in each descriptor line to '_', or
#   (2) with --meme_chr, removes the text that does not specify the
#       chromosome location.
#
# Thus, starting with
#   >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower
# fasta_clean.pl produces:
#   >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower
# fasta_clean.pl --meme_chr input_file produces:
#   >chr2:67108861-67108870

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

# Run the filter only when executed as a program; when the file is loaded
# with require/do, only the subroutines are defined and no input is read.
exit main() unless caller;

# main() -- parse the command line, then copy the input (files named in
# @ARGV, or STDIN) to STDOUT, rewriting each descriptor line.
# Returns the process exit status (0 on success).
sub main {
    my ($ucsc_clean, $help, $shelp) = (0, 0, 0);

    # An unrecognised option is a usage error; do not silently continue
    # in the wrong mode.
    GetOptions(
        "h"        => \$shelp,
        "help"     => \$help,
        "meme_chr" => \$ucsc_clean,
    ) or pod2usage(2);

    pod2usage(1) if $shelp;
    pod2usage(exitstatus => 0, verbose => 2) if $help;

    # With no file arguments, require STDIN to be a file or a pipe so the
    # script does not sit waiting on a terminal.
    pod2usage(1) unless (@ARGV || -f STDIN || -p STDIN);

    while (my $line = <>) {
        chomp($line);

        # Only descriptor lines (starting with '>') are edited; sequence
        # lines pass through unchanged.
        $line = clean_header($line, $ucsc_clean) if $line =~ m/^>/;

        print "$line\n";
    }

    return 0;
}

# clean_header($header, $meme_chr) -- rewrite one descriptor line.
#
#   $header    descriptor line starting with '>', without its newline
#   $meme_chr  true:  reduce the line to '>' plus the value of range=
#              false: replace every whitespace character with '_'
#
# Trailing whitespace (including a carriage return left over from a CRLF
# file) is removed first, so it cannot become a trailing '_' or be kept
# as part of the identifier.  With $meme_chr set, a header that has no
# "range=" field after its first token is returned otherwise unchanged.
#
# Returns the rewritten descriptor line.
sub clean_header {
    my ($header, $meme_chr) = @_;

    $header =~ s/\s+\z//;

    if ($meme_chr) {
        # The range value need not be followed by further fields.
        $header =~ s/^>\S+\s+range=(\S+).*$/>$1/;
    }
    else {
        $header =~ s/\s/_/g;
    }

    return $header;
}

=pod

=head1 NAME

fasta_clean.pl - make UCSC fasta descriptor lines usable as distinct identifiers

=head1 SYNOPSIS

 fasta_clean.pl sequence_UCSC.txt > clean_fasta_file
 fasta_clean.pl --meme_chr sequence_UCSC.txt > clean_fasta_file

=head1 OPTIONS

 -h          short help
 --help      include description
 --meme_chr  edit description to only provide chromosome location

=head1 DESCRIPTION

C<fasta_clean.pl> takes a UCSC sequence download file and either (1)
changes all the spaces to '_', or, with C<--meme_chr>, removes the text
that does not specify the chromosome location.

Thus, starting with:

 >mm9_ct_UserTrack_3545_0 range=chr2:67108861-67108870 5'pad=0 3'pad=0 strand=+ repeatMasking=lower

C<fasta_clean.pl> produces:

 >mm9_ct_UserTrack_3545_0_range=chr2:67108861-67108870_5'pad=0_3'pad=0_strand=+_repeatMasking=lower

C<fasta_clean.pl --meme_chr> produces:

 >chr2:67108861-67108870

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut