Nagarjun's blog: March 2010

Tuesday, March 30, 2010

Blast - XML output parser

The below script parses the output from blast with XML as option and stores the values in an array.

#!/usr/bin/perl
#---blast output file
my $infile = "blastout";
open(IN, "$infile");

while((my $line = ) && ($hitnumber[0] < $taketill)){ #reading the first $taketill numbers
chomp($line);
if ($line =~ /\/) {#reading hit number
push(@hitnums,split(/\<\/Hit_num\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading definiton
push(@hitdefs,split(/\<\/Hit_def\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading length
push(@hitlens,split(/\<\/Hit_len\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading bitscore
push(@bitscores,split(/\<\/Hsp_bit-score\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {
push(@scores,split(/\<\/Hsp_score\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading evalue
push(@evalues,split(/\<\/Hsp_evalue\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading query match begin
push(@qfroms,split(/\<\/Hsp_query-from\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading query match end
push(@qtos,split(/\<\/Hsp_query-to\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading hit match begin
push(@hfroms,split(/\<\/Hsp_hit-from\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading hit match end
push(@htos,split(/\<\/Hsp_hit-to\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading query frame
push(@qframes,split(/\<\/Hsp_query-frame\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading hit frame
push(@hframes,split(/\<\/Hsp_hit-frame\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading identities
push(@identities,split(/\<\/Hsp_identity\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading positives
push(@positives,split(/\<\/Hsp_positive\>/,(split(/\/, $line))[1]));
}
if ($line =~ /\/) {#reading alignment length
push(@algnlens,split(/\<\/Hsp_align-len\>/,(split(/\/, $line))[1]));
}
}

Friday, March 19, 2010

Strand specific translation of DNA to aminoacid sequence in PERL

PERL script to read the annotation file and update translated ORF's it into the mysql database and writing it into a multifasta file.

use DBI;

$dbh = DBI->connect('DBI:mysql:meta', 'root', 'password'
) || die "Could not connect to database: $DBI::errstr";

my ($infile) = @ARGV;
open(IN, "$infile");
my $outfile ="multifasta";
open(OUT,">>$outfile");

my $jcvi_read,$strand,$start,$stop,@seq,$ofset,$leng,$wrtseq;

while(my $line = ){
chomp($line);

my @a = split(/ /, $line);
my @c = split(/\s+/, $line);
my @b = split(/\_/, $a[0]);
$jcvi_read=$b[2];
$start = $c[3];
$stop = $c[4];
$strand = $c[6];
$pep = $c[8];

$sth = $dbh->prepare("SELECT sequence FROM metadata WHERE jcvi_read='$jcvi_read'");
$sth->execute();

@seq = $sth->fetchrow_array();

$ofset=$start - 1;
$leng=$stop-$start;
$wrtseq=substr $seq[0], $ofset, $leng;

$sth->finish();

#function to translate DNA to Amino acid based on standard genetic code
sub codon2aa{
my($codon)=@_;
$codon= uc $codon;
my(%genetic_code) = (
'TCA'=>'S', #Serine
'TCC'=>'S', #Serine
'TCG'=>'S', #Serine
'TCT'=>'S', #Serine
'TTC'=>'F', #Phenylalanine
'TTT'=>'F', #Phenylalanine
'TTA'=>'L', #Leucine
'TTG'=>'L', #Leucine
'TAC'=>'Y', #Tyrosine
'TAT'=>'Y', #Tyrosine
'TAA'=>'_', #Stop
'TAG'=>'_', #Stop
'TGC'=>'C', #Cysteine
'TGT'=>'C', #Cysteine
'TGA'=>'_', #Stop
'TGG'=>'W', #Tryptophan
'CTA'=>'L', #Leucine
'CTC'=>'L', #Leucine
'CTG'=>'L', #Leucine
'CTT'=>'L', #Leucine
'CCA'=>'P', #Proline
'CAT'=>'H', #Histidine
'CAA'=>'Q', #Glutamine
'CAG'=>'Q', #Glutamine
'CGA'=>'R', #Arginine
'CGC'=>'R', #Arginine
'CGG'=>'R', #Arginine
'CGT'=>'R', #Arginine
'ATA'=>'I', #Isoleucine
'ATC'=>'I', #Isoleucine
'ATT'=>'I', #Isoleucine
'ATG'=>'M', #Methionine
'ACA'=>'T', #Threonine
'ACC'=>'T', #Threonine
'ACG'=>'T', #Threonine
'ACT'=>'T', #Threonine
'AAC'=>'N', #Asparagine
'AAT'=>'N', #Asparagine
'AAA'=>'K', #Lysine
'AAG'=>'K', #Lysine
'AGC'=>'S', #Serine#Valine
'AGT'=>'S', #Serine
'AGA'=>'R', #Arginine
'AGG'=>'R', #Arginine
'CCC'=>'P', #Proline
'CCG'=>'P', #Proline
'CCT'=>'P', #Proline
'CAC'=>'H', #Histidine
'GTA'=>'V', #Valine
'GTC'=>'V', #Valine
'GTG'=>'V', #Valine
'GTT'=>'V', #Valine
'GCA'=>'A', #Alanine
'GCC'=>'A', #Alanine
'GCG'=>'A', #Alanine
'GCT'=>'A', #Alanine
'GAC'=>'D', #Aspartic Acid
'GAT'=>'D', #Aspartic Acid
'GAA'=>'E', #Glutamic Acid
'GAG'=>'E', #Glutamic Acid
'GGA'=>'G', #Glycine
'GGC'=>'G', #Glycine
'GGG'=>'G', #Glycine
'GGT'=>'G', #Glycine
);

if(exists $genetic_code{$codon}){
return $genetic_code{$codon};

}

else{

return 'X';

}
}
if($strand == '-')
#reverse complementing -ve strands
{$wrtseq=reverse $wrtseq;
$wrtseq=~ tr/ATGC/TACG/;}

my $dna=$wrtseq;
#my $dna="TCATTCTCATTC";
my $protein='';
my $codon3;
for(my $i=0; $i<(length($dna)-2); $i+=3){ $codon3=substr($dna,$i,3); $protein.= codon2aa($codon3); } print $pep."\n"; $dbh->do("INSERT into iddata (jcvi_read,strand,start,stop,protein,orf_id) VALUES('$jcvi_read','$strand','$start','$stop','$protein','$pep')");

print OUT ">".$jcvi_read."|".$pep."\n";
print OUT $protein."\n";
}
$dbh->disconnect();

Thursday, March 4, 2010

Updating mySql database from Perl

Perl script to read multifasta file calculate GC content,GC Skew and insert into mySql database.

#!/usr/bin/perl

use DBI;

$dbh = DBI->connect('DBI:mysql:meta', 'root', 'password'
) || die "Could not connect to database: $DBI::errstr";

my ($infile) = @ARGV;
open(IN, "$infile");

my $jcvi_read,$leng,$gccont=0,$gcskew=0,$seq="";

while(my $line = ){
chomp($line);
if ($line =~ /^\>/) {#reading header line

if($jcvi_read !=0){
$gccont=($gccont/$leng)*100;

$dbh->do("INSERT into metadata (jcvi_read,full_length,gc_content,gc_skew_orient,sequence) VALUES('$jcvi_read','$leng','$gccont','$gcskew','$seq')");#inserting the data into mySql Database
}
$gccont=0;
$seq="";
my @a = split(/\//, $line);
my @b = split(/\_/, $a[0]);
$jcvi_read = $b[2];
my @c = split(/\=/, $a[1]);
$leng = $c[1];
# print $jcvi_read."\n";
}
else{#reading and analysing the sequence
$seq = $seq.$line;
my @char =split(//,$line);
foreach(@char){
if($_ eq "G"|| $_ eq "C"){$gccont++;}# Calculating GC content
if($_ eq "G"){$gcskew++;}#Calcualting GC skew
if($_ eq "C"){$gcskew--;}
}
}
#print $gcskew."\n";
}
$dbh->disconnect();