#!/usr/bin/perl
my $gtf1 = $ARGV[0];
my %scaf=();
my $start=();
my %end=();
my %multi=();
open(GTF1, $gtf1);
while($z=<GTF1>){
chomp $z;
my @parts=split(/\t/,$z);
my $start=$parts[3];
my $end=$parts[4];
my @parts2=split(/\s/,$parts[8]);
my $gene=$parts2[1];
$gene=~s/[\;\"]//g;
##print "$gene\t$start\t$end\n";
if(!(exists $scaf{$gene})){$scaf{$gene}=$parts[0];}
elsif(!($scaf{$gene}=~m/$parts[0]/)){$multi{$gene}=1;}
if(!(exists $start{$gene})){$start{$gene}=$start;$end{$gene}=$end;}
else{$end{$gene}=$end;}
}
close(GTF1);
foreach my $keygene(sort keys %scaf){
if($start{$keygene}>$end{$keygene}){$temp=$start{$keygene};$start{$keygene}=$end{$keygene};$end{$keygene}=$temp;}
if(!(exists $multi{$keygene})){print "$scaf{$keygene}\t$start{$keygene}\t$end{$keygene}\t$keygene\n";}
else{print "Error:Gene $keygene on mutiple scaffolds\n";}
}
Sunday, May 26, 2013
Get gene start and end from GTF file
In a GTF file that does not have CDS records annotated, its tricky to get the start and end coordinates of a gene. This script does just that and prints a nice BED format file. It also report the genes that occur on multiple scaffolds or chromosomes as errors. These can be greped out using grep -v "Error".
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment