Friday, March 19, 2010

Strand specific translation of DNA to aminoacid sequence in PERL

PERL script to read the annotation file and update translated ORF's it into the mysql database and writing it into a multifasta file.

use DBI;

$dbh = DBI->connect('DBI:mysql:meta', 'root', 'password'
) || die "Could not connect to database: $DBI::errstr";

my ($infile) = @ARGV;
open(IN, "$infile");
my $outfile ="multifasta";
open(OUT,">>$outfile");

my $jcvi_read,$strand,$start,$stop,@seq,$ofset,$leng,$wrtseq;

while(my $line = ){
chomp($line);

my @a = split(/ /, $line);
my @c = split(/\s+/, $line);
my @b = split(/\_/, $a[0]);
$jcvi_read=$b[2];
$start = $c[3];
$stop = $c[4];
$strand = $c[6];
$pep = $c[8];


$sth = $dbh->prepare("SELECT sequence FROM metadata WHERE jcvi_read='$jcvi_read'");
$sth->execute();

@seq = $sth->fetchrow_array();

$ofset=$start - 1;
$leng=$stop-$start;
$wrtseq=substr $seq[0], $ofset, $leng;



$sth->finish();

#function to translate DNA to Amino acid based on standard genetic code
sub codon2aa{
my($codon)=@_;
$codon= uc $codon;
my(%genetic_code) = (
'TCA'=>'S', #Serine
'TCC'=>'S', #Serine
'TCG'=>'S', #Serine
'TCT'=>'S', #Serine
'TTC'=>'F', #Phenylalanine
'TTT'=>'F', #Phenylalanine
'TTA'=>'L', #Leucine
'TTG'=>'L', #Leucine
'TAC'=>'Y', #Tyrosine
'TAT'=>'Y', #Tyrosine
'TAA'=>'_', #Stop
'TAG'=>'_', #Stop
'TGC'=>'C', #Cysteine
'TGT'=>'C', #Cysteine
'TGA'=>'_', #Stop
'TGG'=>'W', #Tryptophan
'CTA'=>'L', #Leucine
'CTC'=>'L', #Leucine
'CTG'=>'L', #Leucine
'CTT'=>'L', #Leucine
'CCA'=>'P', #Proline
'CAT'=>'H', #Histidine
'CAA'=>'Q', #Glutamine
'CAG'=>'Q', #Glutamine
'CGA'=>'R', #Arginine
'CGC'=>'R', #Arginine
'CGG'=>'R', #Arginine
'CGT'=>'R', #Arginine
'ATA'=>'I', #Isoleucine
'ATC'=>'I', #Isoleucine
'ATT'=>'I', #Isoleucine
'ATG'=>'M', #Methionine
'ACA'=>'T', #Threonine
'ACC'=>'T', #Threonine
'ACG'=>'T', #Threonine
'ACT'=>'T', #Threonine
'AAC'=>'N', #Asparagine
'AAT'=>'N', #Asparagine
'AAA'=>'K', #Lysine
'AAG'=>'K', #Lysine
'AGC'=>'S', #Serine#Valine
'AGT'=>'S', #Serine
'AGA'=>'R', #Arginine
'AGG'=>'R', #Arginine
'CCC'=>'P', #Proline
'CCG'=>'P', #Proline
'CCT'=>'P', #Proline
'CAC'=>'H', #Histidine
'GTA'=>'V', #Valine
'GTC'=>'V', #Valine
'GTG'=>'V', #Valine
'GTT'=>'V', #Valine
'GCA'=>'A', #Alanine
'GCC'=>'A', #Alanine
'GCG'=>'A', #Alanine
'GCT'=>'A', #Alanine
'GAC'=>'D', #Aspartic Acid
'GAT'=>'D', #Aspartic Acid
'GAA'=>'E', #Glutamic Acid
'GAG'=>'E', #Glutamic Acid
'GGA'=>'G', #Glycine
'GGC'=>'G', #Glycine
'GGG'=>'G', #Glycine
'GGT'=>'G', #Glycine
);


if(exists $genetic_code{$codon}){
return $genetic_code{$codon};

}

else{

return 'X';



}
}
if($strand == '-')
#reverse complementing -ve strands
{$wrtseq=reverse $wrtseq;
$wrtseq=~ tr/ATGC/TACG/;}

my $dna=$wrtseq;
#my $dna="TCATTCTCATTC";
my $protein='';
my $codon3;
for(my $i=0; $i<(length($dna)-2); $i+=3){ $codon3=substr($dna,$i,3); $protein.= codon2aa($codon3); } print $pep."\n"; $dbh->do("INSERT into iddata (jcvi_read,strand,start,stop,protein,orf_id) VALUES('$jcvi_read','$strand','$start','$stop','$protein','$pep')");

print OUT ">".$jcvi_read."|".$pep."\n";
print OUT $protein."\n";
}
$dbh->disconnect();

6 comments:

Anonymous said...

Correctly your article helped me very much in my college assignment. Hats incorrect to you send, will look forward in behalf of more interdependent articles soon as its one of my choice question to read.

Anonymous said...

Isoleucine is "I" not "T"

Author said...

thanks for the correction. Changed it

nikki said...

hey can u please post simplified version of this script ...i'm not able to understand the last part n the place where u have not specified anything like return x and if($strand==-)

nikki said...

hey can you please send simplified version of this script ...i'm not able to understand the last part ...

nikki said...

can u please send simplified version of this script