Nagarjun's blog: Strand specific translation of DNA to aminoacid sequence in PERL

Friday, March 19, 2010

Strand specific translation of DNA to aminoacid sequence in PERL

PERL script to read the annotation file and update translated ORF's it into the mysql database and writing it into a multifasta file.

use DBI;

$dbh = DBI->connect('DBI:mysql:meta', 'root', 'password'
) || die "Could not connect to database: $DBI::errstr";

my ($infile) = @ARGV;
open(IN, "$infile");
my $outfile ="multifasta";
open(OUT,">>$outfile");

my $jcvi_read,$strand,$start,$stop,@seq,$ofset,$leng,$wrtseq;

while(my $line = ){
chomp($line);

my @a = split(/ /, $line);
my @c = split(/\s+/, $line);
my @b = split(/\_/, $a[0]);
$jcvi_read=$b[2];
$start = $c[3];
$stop = $c[4];
$strand = $c[6];
$pep = $c[8];

$sth = $dbh->prepare("SELECT sequence FROM metadata WHERE jcvi_read='$jcvi_read'");
$sth->execute();

@seq = $sth->fetchrow_array();

$ofset=$start - 1;
$leng=$stop-$start;
$wrtseq=substr $seq[0], $ofset, $leng;

$sth->finish();

#function to translate DNA to Amino acid based on standard genetic code
sub codon2aa{
my($codon)=@_;
$codon= uc $codon;
my(%genetic_code) = (
'TCA'=>'S', #Serine
'TCC'=>'S', #Serine
'TCG'=>'S', #Serine
'TCT'=>'S', #Serine
'TTC'=>'F', #Phenylalanine
'TTT'=>'F', #Phenylalanine
'TTA'=>'L', #Leucine
'TTG'=>'L', #Leucine
'TAC'=>'Y', #Tyrosine
'TAT'=>'Y', #Tyrosine
'TAA'=>'_', #Stop
'TAG'=>'_', #Stop
'TGC'=>'C', #Cysteine
'TGT'=>'C', #Cysteine
'TGA'=>'_', #Stop
'TGG'=>'W', #Tryptophan
'CTA'=>'L', #Leucine
'CTC'=>'L', #Leucine
'CTG'=>'L', #Leucine
'CTT'=>'L', #Leucine
'CCA'=>'P', #Proline
'CAT'=>'H', #Histidine
'CAA'=>'Q', #Glutamine
'CAG'=>'Q', #Glutamine
'CGA'=>'R', #Arginine
'CGC'=>'R', #Arginine
'CGG'=>'R', #Arginine
'CGT'=>'R', #Arginine
'ATA'=>'I', #Isoleucine
'ATC'=>'I', #Isoleucine
'ATT'=>'I', #Isoleucine
'ATG'=>'M', #Methionine
'ACA'=>'T', #Threonine
'ACC'=>'T', #Threonine
'ACG'=>'T', #Threonine
'ACT'=>'T', #Threonine
'AAC'=>'N', #Asparagine
'AAT'=>'N', #Asparagine
'AAA'=>'K', #Lysine
'AAG'=>'K', #Lysine
'AGC'=>'S', #Serine#Valine
'AGT'=>'S', #Serine
'AGA'=>'R', #Arginine
'AGG'=>'R', #Arginine
'CCC'=>'P', #Proline
'CCG'=>'P', #Proline
'CCT'=>'P', #Proline
'CAC'=>'H', #Histidine
'GTA'=>'V', #Valine
'GTC'=>'V', #Valine
'GTG'=>'V', #Valine
'GTT'=>'V', #Valine
'GCA'=>'A', #Alanine
'GCC'=>'A', #Alanine
'GCG'=>'A', #Alanine
'GCT'=>'A', #Alanine
'GAC'=>'D', #Aspartic Acid
'GAT'=>'D', #Aspartic Acid
'GAA'=>'E', #Glutamic Acid
'GAG'=>'E', #Glutamic Acid
'GGA'=>'G', #Glycine
'GGC'=>'G', #Glycine
'GGG'=>'G', #Glycine
'GGT'=>'G', #Glycine
);

if(exists $genetic_code{$codon}){
return $genetic_code{$codon};

}

else{

return 'X';

}
}
if($strand == '-')
#reverse complementing -ve strands
{$wrtseq=reverse $wrtseq;
$wrtseq=~ tr/ATGC/TACG/;}

my $dna=$wrtseq;
#my $dna="TCATTCTCATTC";
my $protein='';
my $codon3;
for(my $i=0; $i<(length($dna)-2); $i+=3){ $codon3=substr($dna,$i,3); $protein.= codon2aa($codon3); } print $pep."\n"; $dbh->do("INSERT into iddata (jcvi_read,strand,start,stop,protein,orf_id) VALUES('$jcvi_read','$strand','$start','$stop','$protein','$pep')");

print OUT ">".$jcvi_read."|".$pep."\n";
print OUT $protein."\n";
}
$dbh->disconnect();

6 comments:

Anonymous said...: Correctly your article helped me very much in my college assignment. Hats incorrect to you send, will look forward in behalf of more interdependent articles soon as its one of my choice question to read.; March 22, 2010 at 5:27 PM
Anonymous said...: Isoleucine is "I" not "T"; April 8, 2010 at 2:26 PM
Author said...: thanks for the correction. Changed it; April 11, 2010 at 6:24 PM
nikki said...: hey can u please post simplified version of this script ...i'm not able to understand the last part n the place where u have not specified anything like return x and if($strand==-); December 6, 2011 at 6:08 AM
nikki said...: hey can you please send simplified version of this script ...i'm not able to understand the last part ...; December 6, 2011 at 6:09 AM
nikki said...: can u please send simplified version of this script; December 6, 2011 at 6:10 AM