#!/usr/local/bin/perl -w
use strict;
use Getopt::Long;

# v4: used for getting sample specific counts for DP, GQ, A,C,G,T,IR
# v3: used for getting fasta iub codes
# v2: sample specific info

my $usage = qq(Desc: Converts vcf file to fasta

Syntax: zcat <input_vcf_file> | perl $0 <OPTIONS>

OPTIONS are:
--fasta_sample_prefix <out_prefix>

--norun: outputs flags only

Example: 
zcat <yourvcf> | perl $0  --fasta_sample_prefix out
\n);

die($usage) if (@ARGV<1);

# default args
my $verbose=0;
#my $fmapq=();
my $fasta_sample_prefix="";
my $output_prefix="";
my $cmd = "";
my $norun=0;
my $stpos = 1;
my $enpos = 0;
GetOptions ( 'verbose' => \$verbose,
			 'norun' => \$norun,
			 'cmd=s' => \$cmd,
			 'stpos=i' => \$stpos,
			 'enpos=i' => \$enpos,
			 'fasta_sample_prefix=s' => \$fasta_sample_prefix,
			 'output_prefix=s' => \$output_prefix,
);


my %bases2iub = (
				 'A.' => 'A',
				 'C.' => 'C',
				 'G.' => 'G',
				 'T.' => 'T',
				 'AA' => 'A',
				 'CC' => 'C',
				 'GG' => 'G',
				 'TT' => 'T',
				 'AC' => 'M',
				 'AG' => 'R',
				 'AT' => 'W',
				 'CA' => 'M',
				 'CG' => 'S',
				 'CT' => 'Y',
				 'GA' => 'R',
				 'GC' => 'S',
				 'GT' => 'K',
				 'TA' => 'W',
				 'TC' => 'Y',
				 'TG' => 'K',
				 '..' => 'N',
);

#my @fmapq_set = split(/,/, $fmapq );

print "verbose: .$verbose.\n";
print "cmd: .$cmd.\n";
print "stpos: .$stpos.\n";
print "enpos: .$enpos.\n";
print "Output_prefix: .$fasta_sample_prefix.\n";
print "norun = .$norun.\n";
exit 0 if $norun;

my @cols = ();
my $nsamples = 0;

my $fchr=0;
my $fpos=1;
my $fref=3;
my $falt=4;
my $fqual=5; # do not use this field
my $fjunk2=6; # do not use this field
my $finfo=7;
my $ffmt=8;

my $debug=0;

my $nl=0;
my $nsites_not_snp=0;
my $pos=0;
my $this_pos=$stpos-1;
my $last_pos=0;

# setup filehandles
my @fho;
my $s_off = 9; # sample field offset
my $last_line = "";
while (<>) {
  next if /^##/;
  #print "$_\n";
  if (/^#CHROM/) { 
	@cols=split(/\s+/,$_);
	$nsamples = $#cols-8;
	# build filehandles
	for (my $i=$s_off;$i<=$#cols;$i++) {
	  #my $of=$fasta_sample_prefix.":".$cols[$i];
	  my $of=$fasta_sample_prefix;
	  local *FILE;
	  print "opening file: $of\n";
	  open( FILE, ">$of" );
	  if ( $output_prefix ne "" ) {
		print FILE ">$output_prefix\n";
		#print FILE "\n";
	  }
	  push @fho, *FILE;
	}
	if ( $verbose ) {
	  print "# There are $nsamples samples\n";
	  for (my $i=0;$i<=$#cols;$i++) {
		print "$i $cols[$i]\n";
	  }
	}
	next;
  };
  $nl++; # number of lines;
  if ($debug && $nl<10) {print "# SNP $nl\n"};
  
  # is this a snp?
  my @v=split(/\t/,$_);
  
  my $ref=$v[$fref];
  my $alt=$v[$falt];
  
  if (length($ref)!=1 || length($alt)!=1) {
	$nsites_not_snp++;
	if ($debug && $nl<10) {
	  print "skipping line: $_\n"
	}
	next;
  };

  # info
#  my %info=();
#  for my $x (split(/;/,$v[$finfo])) {
#	my @temp=split(/=/,$x);
#	if ($#temp == 0) { $info{$x}=1}
#	else { $info{$temp[0]} = $temp[1]};
#  }
#
#  if ($debug && $nl<10) {
#	print "$_\n";
#	while ((my $k,my $v)=each(%info)) {
#	  print $k."=>".$v."\n";
#	}
#  }
#



  # do the samples

#  # fmt
#  my %fmt=();
#  for my $x (split(/;/,$v[$ffmt])) {
#	my @temp=split(/=/,$x);
#	if ($#temp == 0) { $fmt{$x}=1}
#	else { $fmt{$temp[0]} = $temp[1]};
#  }
#
#  if ($debug && $nl<10) {
#	print "# .. sample specifics\n";
#  }
#
  # get position
  $last_pos = $this_pos;
  $this_pos = $v[$fpos];

  if ( $last_pos == $this_pos )  {
	print "*Warning: found multiple snp calls on adjacent lines - quitting";
	print "last_line,line =\nlast: $last_line\nthis: $_\n\n";
	next;
  }
  #next if $last_pos == $this_pos; # ignore multiple snp calls 

  my $diff_pos = $this_pos - $last_pos - 1;

  # build data for each sample
  my @fmta=split(/:/,$v[$ffmt]);
  for (my $s=0;$s<$nsamples;$s++) {
	my $file = $fho[ $s ];
	my @sd=split(/:/,$v[$s+$s_off]); # sample data
	my %fmt;
	for (my $k=0;$k<=$#fmta;$k++) {
	  my $key=$fmta[$k];
	  $fmt{$key}=$sd[$k];
	  last if $key eq "GT";
	}

	#print "gt: $fmt{GT}\n";
	$fmt{GT} =~ s/\s+$//g;
	my $bases = substr($fmt{GT},0,1).substr($fmt{GT},-1,1);
	$bases =~ s/0/$ref/g;
	$bases =~ s/1/$alt/g;

	#print "$bases : ";
	if ( ! exists $bases2iub{$bases} ) {
	  print "yikes; could not find iub for base combination: .$bases. \n for line: $_\n";
	  die;
	}
	my $iub = $bases2iub{ $bases };
	#print "$iub\n";

	if ( $diff_pos >= 1 ) {
		print $file "-" x $diff_pos;
	  }

	if ($v[6] eq "LowQual" ) {$iub = "N" };
	print $file "$iub";
  }
  #$this_pos += $diff_pos+1;

  # now process the data
  $last_line=$_;
}

# tidy: add trailing characters
$last_pos = $this_pos;
if ( $enpos != 0 ) {
  print "adding trailing characters: \n";
  
  my $diff_pos = $enpos - $last_pos -1;
  $diff_pos ++ if $nl==0;
  print "(enpos, lastpos, diffpos) = ($enpos, $last_pos, $diff_pos)\n ";
  if ( $diff_pos >= 1 ) {
	for (my $s=0;$s<$nsamples;$s++) {
	  my $file = $fho[ $s ];
	  print $file "-" x $diff_pos;
	}
  }
}

# close
print "# closing files\n";
foreach my $file (@fho ) {
  close $file;
}
