Monday, August 11, 2014

Get overlapping sequences from multifasta file

 #!/usr/bin/perl
 # Split every sequence of a multi-FASTA file into overlapping windows.
 #
 # Usage: perl this_script.pl input.fasta > windows.fasta
 #
 # For each record the script prints one FASTA entry per window:
 #     <original header line>:<start>-<end>
 #     <bases start .. end-1>
 # where <start> is a 0-based offset into the sequence and <end> is
 # <start> plus the window size (i.e. exclusive). Records are emitted in
 # sorted header order.
 #
 # Windowing rules (see window_ranges):
 #   * windows are WINDOW_LENGTH bases long and start every WINDOW_STEP bases;
 #   * if the window that would follow has fewer than WINDOW_LENGTH bases
 #     left, the current window is stretched to the end of the sequence
 #     and becomes the last one;
 #   * a sequence shorter than WINDOW_LENGTH is emitted whole as one window,
 #     so no input bases are silently dropped.
 use strict;
 use warnings;

 use constant WINDOW_LENGTH => 1000;    # bases per window
 use constant WINDOW_STEP   => 500;     # distance between window starts

 # read_fasta($fh) -> \%sequence_for
 #
 # Reads FASTA records from the open filehandle $fh and returns a hash
 # reference mapping each header line (leading '>' kept, line ending
 # removed) to its concatenated sequence. Whitespace, digits and
 # underscores are stripped from sequence lines. Lines appearing before
 # the first header are ignored. If a header occurs more than once, the
 # last record wins and a warning is written to STDERR.
 sub read_fasta {
     my ($fh) = @_;

     my %sequence_for;
     my $header;

     while ( my $line = <$fh> ) {
         if ( $line =~ /^>/ ) {
             # Strip LF or CRLF so a stray "\r" cannot end up inside the
             # header that is later printed in front of ":start-end".
             $line =~ s/\r?\n\z//;
             $header = $line;
             warn "Duplicate header '$header'; keeping the last record\n"
                 if exists $sequence_for{$header};
             $sequence_for{$header} = '';
             next;
         }

         next unless defined $header;    # sequence data before any header
         $line =~ s/[\s_0-9]//g;
         $sequence_for{$header} .= $line;
     }

     return \%sequence_for;
 }

 # window_ranges($seq_length [, $window_length [, $step]]) -> list of [start, length]
 #
 # Computes the windows for a sequence of $seq_length bases.
 # $window_length and $step default to WINDOW_LENGTH and WINDOW_STEP.
 # Returns an empty list for an empty sequence. Dies if $window_length
 # or $step is smaller than 1 (a zero step would never terminate).
 sub window_ranges {
     my ( $seq_length, $window_length, $step ) = @_;
     $window_length = WINDOW_LENGTH unless defined $window_length;
     $step          = WINDOW_STEP   unless defined $step;

     die "window length and step must both be at least 1\n"
         if $window_length < 1 || $step < 1;

     return if $seq_length <= 0;
     return ( [ 0, $seq_length ] ) if $seq_length < $window_length;

     my @ranges;
     my $start = 0;

     # ">=" (not ">") so that a remainder of exactly one window is still
     # emitted; otherwise the tail of the sequence would be lost whenever
     # ($seq_length - $window_length) is a multiple of $step.
     while ( $seq_length - $start >= $window_length ) {
         if ( $seq_length - ( $start + $step ) < $window_length ) {
             # The next window would be too short: absorb the tail here.
             push @ranges, [ $start, $seq_length - $start ];
             last;
         }
         push @ranges, [ $start, $window_length ];
         $start += $step;
     }

     return @ranges;
 }

 # main(@args) -> exit status
 #
 # $args[0] is the path of the multi-FASTA file. Windows are printed to
 # STDOUT. Dies with a usage message when no path is given and with the
 # system error when the file cannot be opened.
 sub main {
     my @args = @_;

     die "Usage: $0 <multifasta-file>\n" unless @args;
     my $path = $args[0];

     # Three-argument open: the file name is never interpreted as a mode
     # or a command, unlike the two-argument form.
     open my $fh, '<', $path or die "Cannot open '$path': $!\n";
     my $sequence_for = read_fasta($fh);
     close $fh;

     for my $header ( sort keys %{$sequence_for} ) {
         my $sequence = $sequence_for->{$header};
         for my $range ( window_ranges( length $sequence ) ) {
             my ( $start, $length ) = @{$range};
             printf "%s:%d-%d\n", $header, $start, $start + $length;
             print substr( $sequence, $start, $length ), "\n";
         }
     }

     return 0;
 }

 # Run only when executed as a script, so the subs above can be loaded
 # (require/do) without side effects.
 exit main(@ARGV) unless caller;

 1;

No comments: