Perl script that reads through a VCF file and prints it out in the "calls" format.
Although i could not find any proper documentation for the calls format, it can be described as diploid genotype calls represented using ambiguous DNA bases for each individual with the first line being the individual id's or names and the following lines being the genotypes at different positions.
Although i could not find any proper documentation for the calls format, it can be described as diploid genotype calls represented using ambiguous DNA bases for each individual with the first line being the individual id's or names and the following lines being the genotypes at different positions.
#!/usr/bin/perl
use List::Util qw[min max];
# Input parameters
my $vcf_file = $ARGV[0];
#Go through fasta file, extract sequences
open(IN, $vcf_file);
while($z=<IN>) {
if($z=~m/^##/){}
elsif($z=~m/^#/){
print "scaffold\tposition";
chomp $z;
@values=split(/\t/,$z);
$totind=scalar @values;
for($i=9;$i<$totind;$i++){
print "\t$values[$i]";
}
print "\n";
}
else{
chomp $z;
@values=split(/\t/,$z);
$refbase=$values[3];
$altbase=$values[4];
if(length $altbase < 2){#skip tri-allelic sites
print "$values[0]\t$values[1]";
$totind=scalar @values;
for($i=9;$i<$totind;$i++){
@parts=split(/\:/,$values[$i]);
$parts[0] =~ s/\///g;$parts[0] =~ s/0/$refbase/g;$parts[0] =~ s/1/$altbase/g;
$parts[0] =~ s/[A][A]/A/ig;$parts[0] =~ s/[T][T]/T/ig;$parts[0] =~ s/[C][C]/C/ig;$parts[0] =~ s/[G][G]/G/ig;$parts[0] =~ s/[.][.]/N/g;
$parts[0] =~ s/[AT][AT]/W/ig;$parts[0] =~ s/[GT][GT]/K/ig;$parts[0] =~ s/[AC][AC]/M/ig;$parts[0] =~ s/[AG][AG]/R/ig;$parts[0] =~ s/[CT][CT]/Y/ig;$parts[0] =~ s/[CG][CG]/S/ig;
print "\t$parts[0]";
}
print "\n";
}
}
}#end of file while
close IN;
#perl vcftocalls.pl genotyped.vcf > genotyped.calls
No comments:
Post a Comment