Thursday, January 20, 2011

Minimus2 larger datasets with Nucmer

While Blat is much more faster than Nucmer, its results can be very different from the results from Nucmer. A version of Minimus2 which splits the data before feeding it to Nucmer and merging it back before processing further is able to give a result which is almost always identical to the results from Nucmer. The changes required to be done are given below:

NUCMER = nucmer
DELTAFILTER = delta-filter
SHOWCOORDS = show-coords
RUNNUCMER = perl runnucmer.pl
#------------------------------------------------------------------------------#

## Building AMOS bank & Dumping reads
10: rm -fr $(BANK)
11: $(BINDIR)/bank-transact -c -z -b $(BANK) -m $(TGT)
12: $(BINDIR)/dumpreads $(BANK) -M $(REFCOUNT) > $(REFSEQ)
13: $(BINDIR)/dumpreads $(BANK) -m $(REFCOUNT) > $(QRYSEQ)

## Getting overlaps
20: $(RUNNUCMER) $(OVERLAP) $(REFSEQ) $(QRYSEQ) $(PREFIX)
21: $(SHOWCOORDS) -H -c -l -o -r -I $(MINID) $(ALIGN) | $(BINDIR)/nucmerAnnotate | egrep 'BEGIN|END|CONTAIN|IDENTITY' > $(COORDS)
22: $(BINDIR)/nucmer2ovl -ignore $(MAXTRIM) -tab $(COORDS) | $(BINDIR)/sort2 > $(OVLTAB)


The code for runnucmer.pl is given below:

#!/usr/bin/perl

$allowedsize=100000000;

$overlap=$ARGV[0];
$mainref=$ARGV[1];
$mainqry=$ARGV[2];
$mainpre=$ARGV[3];
$reflen=0;
$refcount=1;

$qrylen=0;
$qrycount=1;

#open first ref file
open FILE1, ">>",$mainpre."ref".$refcount or die $!;

#reading the refrence sequences
open FILE2, "<",$mainref or die $!;


$seqst_temp="";
$header = ;#reading the first header

while($z = ){
if($z=~ />/)
{#next sequence
$reflen=$reflen+length $seqst_temp;
if($reflen < $allowedsize)
{
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
else
{
$refcount++;
$reflen=0;
close FILE1;
open FILE1, ">>",$mainpre."ref".$refcount or die $!;
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
$header=$z;
$seqst_temp="";
}
else
{
$z =~ s/[\n\t\f\r_0-9\s]//g;
$seqst_temp .= $z;
}
}#end of File while loop

if($reflen < $allowedsize)
{
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
else
{
$refcount++;
$reflen=0;
close FILE1;
open FILE1, ">>",$mainpre."ref".$refcount or die $!;
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
$header=$z;
$seqst_temp="";



close FILE1;
close FILE2;

#open first qry file
open FILE1, ">>",$mainpre."qry".$qrycount or die $!;

#reading the refrence sequences
open FILE2, "<",$mainqry or die $!;


$seqst_temp="";
$header = ;#reading the first header

while($z = ){
if($z=~ />/)
{#next sequence
$qrylen=$qrylen+length $seqst_temp;
if($qrylen < $allowedsize)
{
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
else
{
$qrycount++;
close FILE1;
$qrylen=0;
open FILE1, ">>",$mainpre."qry".$qrycount or die $!;
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
$header=$z;
$seqst_temp="";
}
else
{
$z =~ s/[\n\t\f\r_0-9\s]//g;
$seqst_temp .= $z;
}
}#end of File while loop

if($qrylen < $allowedsize)
{
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
else
{
$qrycount++;
close FILE1;
$qrylen=0;
open FILE1, ">>",$mainpre."qry".$qrycount or die $!;
print FILE1 $header;
print FILE1 $seqst_temp;
print FILE1 "\n";
}
$header=$z;
$seqst_temp="";

close FILE1;
close FILE2;

for($i=1;$i<=$qrycount;$i++){
for($j=1;$j<=$refcount;$j++){
$refseq=$mainpre."ref".$j;
$qryseq=$mainpre."qry".$i;
$prefix=$mainpre."nuc".$j."".$i;
system("/bubo/home/h7/nagarjun/glob/tools/MUMmer3.22/nucmer -maxmatch -c $overlap $refseq $qryseq -p $prefix");
}
}


#merging delta files
open FILE1, ">>",$mainpre.".delta" or die $!;
for($i=1;$i<=$qrycount;$i++){
for($j=1;$j<=$refcount;$j++){
open FILE2, "<",$mainpre."nuc".$j."".$i.".delta" or die $!;
if(($i==1)&&($j==1)){while($a=){print FILE1 $a;}}
else{$a=;$a=;while($a=){print FILE1 $a;}}
}
}

1 comment:

Daniel Brami said...

Hi Nagarjun,

I am curious to try your contig assembly pipeline but there are far too many errors int the perl code for me to do a simple cut and paste.
Would you be so kind as to sending the functionnal perl script as an email attachment?

Regards,

Daniel Brami
dbrami "AT" jcvi DOT org