Thursday, September 25, 2014

Genome base pair correction tool

High rates of sequencing errors in Next-generation sequencing methods can percolate down into genome assemblies. Presence of incorrect bases in genome assemblies can have an impact on analysis that use the genome assembly for functional analysis.While programs like REAPR are able to correct genome assemblies based on mapped reads, it is focused on identifying large scale structural errors in genome assemblies. 

Draft genome assemblies are being generated for a large number of species using Next-generation sequencing methods. Here we present BCD (Base Correction Don), a tool that conservatively identifies single base-pair level errors in genome assembly sequences based on paired-end reads mapped to the genome.

 #!/usr/bin/perl  
 #use strict;  
 use warnings;  
 use List::Util qw(min max);  
 #perl fixbases.pl example.fasta example.coverage > example.corrected.fasta  
 # Input parameters  
 my $genome = $ARGV[0];  
 my $coverage = $ARGV[1];  
 my $totalcount=0;  
 my $minreadcoverage=20;# minimum number of reads required at site to consider it  
 my $maxreadcoverage=200;  
 my $maxindelcount=5;  
 open FASTA1, $genome or die $!;  
 open LOGS, ">test.bed" or die $!;  
 my $seqst_temp="";  
 my %seqs = ();  
 while($line = <FASTA1>){  
     if($line=~ /^>/){  
      if($header){$seqs{$header}=$seqst_temp;}  
      chomp $line;$header="";$header=$line;$seqst_temp="";  
     }  
     else{$line =~ s/[\n\t\f\r_0-9\s]//g;$seqst_temp .= $line;}  
  }#end of while loop  
     if($header){$seqs{$header}=$seqst_temp;}  
 close FASTA1;  
 open FASTA2, $coverage or die $!;  
 while($line = <FASTA2>){  
 chomp $line;$line=$line . "\t}";  
 my @jelly=split(/\s+/,$line);  
 splice @jelly, 4, 0, 'SnowWhite', 'Humbert';  
 my $scaf=$jelly[0];  
 my $pos=$jelly[1];  
 my $oldbase=$jelly[2];  
 my $totalreads=$jelly[3];  
 my @jellyA=split(/\:/,$jelly[7]);#A  
 my @jellyC=split(/\:/,$jelly[8]);#C  
 my @jellyG=split(/\:/,$jelly[9]);#G  
 my @jellyT=split(/\:/,$jelly[10]);#T  
 my @jellyN=split(/\:/,$jelly[11]);#N  
 my $indelcount=0;  
 if(!($jelly[12]=~ /\}/)){  
 my @jellyindel=split(/\:/,$jelly[12]);#indel  
 $indelcount=$jellyindel[1];  
 }  
 my @counts;  
 push @counts,$jellyA[1];  
 push @counts,$jellyC[1];  
 push @counts,$jellyG[1];  
 push @counts,$jellyT[1];  
 push @counts,$jellyN[1];  
 push @counts,$indelcount;  
     if(($totalreads>$minreadcoverage)&&($indelcount<$maxindelcount)&&($totalreads<$maxreadcoverage)){$totalcount++;  
 my $maxbase= max @counts;  
 my $maxbasepercent=$maxbase*0.1;  
 my $maxbaseval=$oldbase;  
 if($jellyA[1]==$maxbase){$maxbaseval="A";}  
 elsif($jellyC[1]==$maxbase){$maxbaseval="C";}  
 elsif($jellyG[1]==$maxbase){$maxbaseval="G";}  
 elsif($jellyT[1]==$maxbase){$maxbaseval="T";}  
 elsif($jellyN[1]==$maxbase){$maxbaseval="N";}  
 else{$maxbaseval=$oldbase;}  
 my $maxbaseval2=lc $maxbaseval;  
 if((($oldbase=~ /[Aa]/)&&($jellyA[1]==0))||(($oldbase=~ /[Cc]/)&&($jellyC[1]==0))||(($oldbase=~ /[Gg]/)&&($jellyG[1]==0))||(($oldbase=~ /[Tt]/)&&($jellyT[1]==0))){  
 print LOGS "$scaf\t$pos\t$pos\t$maxbaseval\t$oldbase.\t$jellyA[1]\t$jellyC[1]\t$jellyG[1]\t$jellyT[1]\t$jellyN[1]\tzero\t$totalreads\n";  
 my $key=">" . $scaf;  
 my $prepos=$pos-1;#1 based positions  
 substr($seqs{$key},$prepos,1)= $maxbaseval;  
 }  
 elsif((($oldbase=~ /[Aa]/)&&($jellyA[1]<$maxbasepercent))||(($oldbase=~ /[Cc]/)&&($jellyC[1]<$maxbasepercent))||(($oldbase=~ /[Gg]/)&&($jellyG[1]<$maxbasepercent))||(($oldbase=~ /[Tt]/)&&($jellyT[1]<$maxbasepercent))){  
 print LOGS "$scaf\t$pos\t$pos\t$maxbaseval\t$oldbase.\t$jellyA[1]\t$jellyC[1]\t$jellyG[1]\t$jellyT[1]\t$jellyN[1]\tpercent\t$totalreads\n";  
 my $key=">" . $scaf;  
 my $prepos=$pos-1;#1 based positions  
 substr($seqs{$key},$prepos,1)= $maxbaseval;  
 }  
     }  
  }#end of while loop  
 close FASTA2;  
 my $iso;  
 foreach $iso (sort keys %seqs) {  
 print "$iso\n$seqs{$iso}\n";  
 }  
 #scaffold_0   50   c    45   GENOMEseq    {    =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:1:29.00:44.00:29.00:0:1:0.06:0.00:0.00:0:0.00:100.00:0.03 G:43:28.49:64.21:27.42:20:23:0.59:0.04:234.35:20:0.62:97.95:0.44    T:1:29.00:68.00:29.00:1:0:0.92:0.16:659.00:1:0.53:98.00:0.53  N:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00   }  
 #scaffold_0   1    g    43   =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00   G:40:33.40:59.08:31.00:19:21:0.00:0.03:0.00:0:0.00:0.00:0.00  T:2:31.00:33.00:31.00:0:2:0.00:0.05:0.00:0:0.00:0.00:0.00    N:1:60.00:33.00:25.00:1:0:0.00:0.05:0.00:0:0.00:0.00:0.00  
 print LOGS "$totalcount\n";  
 close LOGS;  

All paired end data used for performing the genome assembly were mapped back to the genome assembly using bwa read mapper with default settings. Number of reads supporting each of the four bases at individual positions was tabulated using the program bam-readcount. The above script was then used to correct the genome for single base pair errors.