Friday, March 21, 2014

Perl script to get fasta sequences longer than a certain length

 #!/usr/bin/perl  
 use strict;  
 use warnings;  
 # Input parameters  
 open FASTA, $ARGV[0] or die $!;  
 my $seqst_temp="";  
 my $seqs = {GENENAME =>my $genename,LEN =>my $qcom};  
 my $iso="";  
 my $minval=$ARGV[1];  
 while($line = <FASTA>){  
 if($line=~ /^>/){  
 if($header){  
 $seqs{$seqst_temp}{GENENAME}=$header;  
 $seqs{$seqst_temp}{LEN}=length $seqst_temp;  
 }  
 chomp $line;  
 $header="";  
 $header=$line;  
 $seqst_temp="";  
 }  
 else{  
 $line =~ s/[\n\t\f\r_0-9\s]//g;  
 $seqst_temp .= $line;  
 }  
 }#end of while loop  
 if($header){  
 $seqs{$seqst_temp}{GENENAME}=$header;  
 $seqs{$seqst_temp}{LEN}=length $seqst_temp;  
 }  
 close FASTA;  
 foreach $iso (sort keys %seqs) {  
      if($seqs{$iso}{LEN} > $minval){  
      print "$seqs{$iso}{GENENAME}\n";  
      print "$iso\n";  
      }  
 }  

Example: perl getlongerthan.pl genome.fasta 100 > longerthan.100.fasta