#!/usr/bin/perl

#    This library is free software; you can redistribute it and/or
#    modify it under the terms of the GNU Lesser General Public
#    License as published by the Free Software Foundation; either
#    version 2.1 of the License, or (at your option) any later version.
#
#    This library is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public
#    License along with this library ('COPYING'); if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use strict;
use warnings;
use XML::Twig;

=head1 NAME

mipe2snps.pl - Generates list of SNPs from a MIPE file
  included in output: PCR ID, SNP ID, SNP pos, SNP amb, allele freqs, number of animals genotyped, SNP rank, flanking seqs, SNP remarks
  based on MIPE version v1.1
  arguments: * mipe_file
             * (optional) list of PCR IDs

=head1 SYNOPSIS

mipe2snps.pl your_file.mipe <pcr_id1> <pcr_id2>

=head1 ADDITIONAL INFO

See http://mipe.sourceforge.net

=head1 AUTHOR

Jan Aerts (jan.aerts@bbsrc.ac.uk)

=cut

my %amb_codes = ( M => ['A', 'C']
                , R => ['A', 'G']
		, W => ['A', 'T']
		, S => ['C', 'G']
		, Y => ['C', 'T']
		, K => ['G', 'T']
		, V => ['A', 'C', 'G']
		, H => ['A', 'C', 'T']
		, D => ['A', 'G', 'T']
		, B => ['C', 'G', 'T']
		, N => ['A', 'C', 'G', 'T']
		);

my ( $file, @pcr_ids ) = @ARGV;
if ( not defined $file ) { die "Please provide filename\n" };
my $twig = XML::Twig->new( TwigHandlers => { pcr => \&pcr }
                         , pretty_print => 'indented' );
$twig->parsefile($file);
exit;

sub pcr {
  my ( $twig, $pcr ) = @_;

  my $to_include = 0;
  my $pcr_id = $pcr->{att}->{id};
  if ( scalar @pcr_ids > 0 ) {
    $to_include = 0;
    foreach ( @pcr_ids ) {
      if ( $pcr_id =~ /$_/i ) {
        $to_include = 1;
      }
    }
  } else {
    $to_include = 1;
  }
  
  if ( $to_include ) {
    if ( defined $pcr->first_child('use') ) {
      my $use_seq;
      my $snp_seq;
      if ( defined $pcr->first_child('use')->first_child('seq') ) {
        $use_seq = $pcr->first_child('use')->first_child('seq')->text;
      }
      my @snps = $pcr->first_child('use')->children('snp');
      foreach my $snp ( @snps ) {
        my $snp_id = $snp->{att}->{id};
        my $snp_amb = ( defined $snp->first_child('amb') ) ? $snp->first_child('amb')->text : 'NO AMB';
        my $snp_pos = ( defined $snp->first_child('pos') ) ? $snp->first_child('pos')->text : 'NO POS';
        my $snp_pos_design = ( defined $snp->first_child('pos_design') ) ? $snp->first_child('pos_design')->text : 'NO POS_DESIGN';
        my $snp_pos_source = ( defined $snp->first_child('pos_source') ) ? $snp->first_child('pos_source')->text : 'NO POS_SOURCE';
        my $snp_rank = ( defined $snp->first_child('rank') ) ? $snp->first_child('rank')->text : 'NO RANK';
        my @snp_remarks = $snp->children('remark');
        my $snp_remark = 'NO REMARK';
        my %occurences;
	my $snp_minorallelefreq = 'NO FREQS';
	my $number_of_samples = 0;
        if ( defined $pcr->first_child('use')->first_child('sample') ) {
	  my @samples = $pcr->first_child('use')->children('sample');
	  foreach my $sample ( @samples ) {
	    my @genotypes = $sample->children('genotype');
	    foreach my $genotype ( @genotypes ) {
	      if ( $genotype->first_child('snp_id')->text eq $snp_id ) {
                $number_of_samples++;
	        $occurences{$genotype->first_child('amb')->text}++;
	      }
	    }
	  }
          my %freqs;
          $snp_minorallelefreq = '';
          foreach my $allele ( keys %occurences ) {
            if ( $allele eq 'A' or $allele eq 'C' or $allele eq 'G' or $allele eq 'T' ) {
              $freqs{$allele} = 2*$occurences{$allele};
            } else{
              foreach my $code ( keys %amb_codes ) {
                if ( $allele eq $code ) {
                  foreach my $key ( @{$amb_codes{$code}} ) {
                    $freqs{$key} += $occurences{$allele};
                  }
                }
              }
            }
          }
          my $total_alleles;
          foreach my $allele ( keys %freqs ) {
            $total_alleles += $freqs{$allele};
          }
          foreach my $allele ( keys %freqs ) {
            $freqs{$allele} /= $total_alleles;
          }
          foreach my $allele ( sort { $freqs{$a} <=> $freqs{$b} } keys %freqs ) {
            $snp_minorallelefreq .= $allele . '(' . sprintf("%.2f", $freqs{$allele}) . ');'
          }
          chop $snp_minorallelefreq;
	}

        if ( defined $use_seq and defined $snp->first_child('pos') ) {
          my $_amb;
          $snp_seq = substr($use_seq, $snp_pos - 11, 10);
          if ( defined $snp->first_child('amb') ) {
            $_amb = $snp->first_child('amb')->text;
          } else {
            $_amb = 'N';
          }
          $snp_seq .= $_amb;
          $snp_seq .= substr($use_seq, $snp_pos, 10);
        } else {
          $snp_seq = 'UNKNOWN';
        }


        if ( scalar @snp_remarks > 0 ) {
          my @remark_text;
          foreach ( @snp_remarks ) {
  	  push @remark_text, $_->text;
  	}
  	$snp_remark = join('; ', @remark_text);
        }
        print $pcr_id, "\t", $snp_id, "\t", $snp_pos, "\t", $snp_pos_design, "\t", $snp_pos_source, "\t", $snp_amb, "\t", $snp_minorallelefreq, "\t", $number_of_samples, "\t", $snp_rank, "\t", $snp_seq, "\t", $snp_remark, "\n";
      }
    }
  }
}
