#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
# $Id: search2gff.PLS 15088 2008-12-04 02:49:09Z bosborne $

# Author:      Jason Stajich <jason-at-bioperl-dot-org>
# Description: Turn SearchIO parseable report(s) into a GFF report
#
=head1 NAME

search2gff - Turn SearchIO parseable reports(s) into a GFF report

=head1 SYNOPSIS

Usage:
  search2gff [-o outputfile] [-f reportformat] [-i inputfilename]  OR file1 file2 ..

=head1 DESCRIPTION

This script will turn a protein Search report (BLASTP, FASTP, SSEARCH, 
AXT, WABA) into a GFF File.

The options are:

   -i infilename      - (optional) inputfilename, will read
                        either ARGV files or from STDIN
   -o filename        - the output filename [default STDOUT]
   -f format          - search result format (blast, fasta,waba,axt)
                        (ssearch is fasta format). default is blast.
   -t/--type seqtype  - if you want to see query or hit information
                        in the GFF report
   -s/--source        - specify the source (will be algorithm name
                        otherwise like BLASTN)
   --method           - the method tag (primary_tag) of the features
                        (default is similarity)
   --scorefunc        - a string or a file that when parsed evaluates
                        to a closure which will be passed a feature
                        object and that returns the score to be printed
   --locfunc          - a string or a file that when parsed evaluates
                        to a closure which will be passed two
                        features, query and hit, and returns the
                        location (Bio::LocationI compliant) for the
                        GFF3 feature created for each HSP; the closure
                        may use the clone_loc() and create_loc()
                        functions for convenience, see their PODs
   --onehsp           - only print the first HSP feature for each hit
   -p/--parent        - the parent to which HSP features should refer
                        if not the name of the hit or query (depending
                        on --type)
   --target/--notarget - whether to always add the Target tag or not
   -h                 - this help menu
   --version          - GFF version to use (put a 3 here to use gff 3)
   --component        - generate GFF component fields (chromosome)
   -m/--match         - generate a 'match' line which is a container
                        of all the similarity HSPs
   --addid            - add ID tag in the absence of --match
   -c/--cutoff        - specify an evalue cutoff

Additionally specify the filenames you want to process on the
command-line.  If no files are specified then STDIN input is assumed.
You specify this by doing: search2gff E<lt> file1 file2 file3

=head1 AUTHOR

Jason Stajich, jason-at-bioperl-dot-org

=head1 Contributors

Hilmar Lapp, hlapp-at-gmx-dot-net

=cut

use strict;
use Bio::Tools::GFF;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations
use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations
use Storable qw(dclone);   # for cloning location objects
use Bio::Factory::FTLocationFactory;

my ($output,       # output file (if not stdout)
    $input,        # name of the input file
    $format,       # format of the input file, defauly is blast
    $type,         # 'query' or 'hit' 
    $cutoff,       # cut-off value for e-value filter
    $sourcetag,    # explicit source tag (will be taken from program
                   # otherwise
    $methodtag,    # primary tag (a.k.a. method), default 'similarity'
    $gffver,       # GFF version (dialect) to write  
    $scorefunc,    # closure returning the score for a passed feature
    $locfunc,      # closure returning a location object for a passed
                   # query and hit feature
    $addid,        # flag: whether to always add the ID for $match == 0
    $parent,       # the name of the parent to use; if set and $match == 0
                   # will always add the target
    $comp,         # flag: whether to print a component feature
    $addtarget,    # flag: whether to always add the Target tag, default
                   # is true
    $match,        # flag: whether to print match lines as containers
    $onehsp,       # flag: whether to consider only the first HSP for a hit
    $quiet,        # flag: run quietly
    $help);        # flag: show help screen

# set defaults:
$format    = 'blast'; 
$type      = 'query';
$gffver    = 2;
$methodtag = "similarity";
$addtarget = 1;

GetOptions(
	   'i|input:s'  => \$input,
	   'component'  => \$comp,
	   'm|match'    => \$match,
	   'o|output:s' => \$output,
	   'f|format:s' => \$format,
	   's|source:s' => \$sourcetag,
           'method=s'   => \$methodtag,
           'addid'      => \$addid,
           'scorefunc=s'=> \$scorefunc,
           'locfunc=s'  => \$locfunc,
           'p|parent=s' => \$parent,
           'target!'    => \$addtarget,
           'onehsp'     => \$onehsp,
	   't|type:s'   => \$type,
	   'c|cutoff:s' => \$cutoff,
	   'v|version:i'=> \$gffver,
           'q|quiet'    => \$quiet,
	   'h|help'     => sub{ exec('perldoc',$0);
				exit(0)
				},
	   );
$type = lc($type);
if( $type =~ /target/ ) { $type = 'hit' }
elsif( $type ne 'query' && $type ne 'hit' ) {
    die("seqtype must be either 'query' or 'hit'");
}

# custom or default function returning the score
$scorefunc = defined($scorefunc) ? parse_code($scorefunc) : sub {shift->score};

# custom or default function returning the location
$locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };

# if --match is given then $addid needs to be disabled
$addid = undef if $addid && $match;

# if no input is provided STDIN will be used
my $parser = new Bio::SearchIO(-format => $format,
			       -verbose => $quiet ? -1 : 0,
			       -file   => $input);

my $out;
if( defined $output ) {
    $out = new Bio::Tools::GFF(-gff_version => $gffver,
			       -file => ">$output");
} else { 
    $out = new Bio::Tools::GFF(-gff_version => $gffver); # STDOUT
}
my (%seen_hit,%seen);
my $other = $type eq 'query' ? 'hit' : 'query';

while( my $result = $parser->next_result ) {
    my $qname = $result->query_name;
    if ( $comp && $type eq 'query' && 
	 $result->query_length ) {
	$out->write_feature(Bio::SeqFeature::Generic->new
			    (-start       => 1,
			     -end         => $result->query_length,
			     -seq_id      => $qname,
			     -source_tag  => 'chromosome',
			     -primary_tag => 'Component',
			     -tag         => {
				 'Sequence' => $qname
				 }));
    }
    while( my $hit = $result->next_hit ) {
	next if( defined $cutoff && $hit->significance > $cutoff);
	my $acc = $qname;
	if( $seen{$qname."-".$hit->name}++ ) {
	    $acc = $qname."-".$seen{$qname.'-'.$hit->name};
	}
	
	if( $comp && $type eq 'hit' && $hit->length &&
	    ! $seen_hit{$hit->name}++ ) {
	    $out->write_feature(Bio::SeqFeature::Generic->new
				(-start       => 1,
				 -end         => $hit->length,
				 -seq_id      => $hit->name,
				 -source_tag  => 'chromosome',
				 -primary_tag => 'Component',
				 -tag         => {
				     'Sequence' => $hit->name
				     }));
	}
	my (%min,%max,$seqid,$name,$st);
	while( my $hsp = $hit->next_hsp ) {
	    my $feature = new Bio::SeqFeature::Generic;
	    my ($proxyfor,$otherf);
	    if( $type eq 'query' ) {
		($proxyfor,$otherf) = ($hsp->query,
				      $hsp->hit);
		$name  ||= $hit->name;
	    } else {
		($otherf,$proxyfor) = ($hsp->query,
				      $hsp->hit);
		$name ||= $acc;
	    }
	    $proxyfor->score($hit->bits) unless( $proxyfor->score );
	    if (($gffver == 3) && ($match || $parent)) {
		$feature->add_tag_value('Parent', $parent || $name);
	    }
	    
	    $min{$type} = $proxyfor->start 
                unless defined $min{$type} && $min{$type} < $proxyfor->start;
	    $max{$type} = $proxyfor->end 
                unless defined $max{$type} && $max{$type} > $proxyfor->end;
	    $min{$other} = $otherf->start 
                unless defined $min{$other} && $min{$other} < $otherf->start;
	    $max{$other} = $otherf->end 
                unless defined $max{$other} && $max{$other} > $otherf->end;
	    if ($addtarget || $match) {
                $feature->add_tag_value('Target', 'Sequence:'.$name);
                $feature->add_tag_value('Target', $otherf->start);
                $feature->add_tag_value('Target', $otherf->end);
            }
            if ($addid) {
                $feature->add_tag_value('ID', $name);
            }

	    $feature->location(&$locfunc($proxyfor,$otherf));
	    #  strand for feature is always going to be product of
	    #  query & hit strands so that target can always be just
	    #  '+'
	    $feature->strand ( $proxyfor->strand * $otherf->strand);
	    if( $sourcetag ) { 
		$feature->source_tag($sourcetag);
	    } else {
		$feature->source_tag($proxyfor->source_tag);
	    }
	    $feature->score(&$scorefunc($proxyfor));
	    $feature->frame($proxyfor->frame);
	    $feature->seq_id($proxyfor->seq_id );
	    $feature->primary_tag($methodtag);
            # add annotation if encoded in the query description
            my $desc = $result->query_description;
            while ($desc =~ /\/([^=]+)=(\S+)/g) {
                $feature->add_tag_value($1,$2);
            }
	    $seqid ||= $proxyfor->seq_id;
	    $out->write_feature($feature);
	    $st ||= $sourcetag || $proxyfor->source_tag;
            last if $onehsp;
	}
	if( $match ) { 
	    
	    my $matchf = Bio::SeqFeature::Generic->new
		(-start => $min{$type},
		 -end   => $max{$type},
		 -strand=> $hit->strand($type)*$hit->strand($other),
		 -primary_tag => 'match',
		 -source_tag  => $st,
		 -score => $hit->bits,
		 -seq_id => $seqid);
	    if( $gffver == 3 ) { 
		$matchf->add_tag_value('ID', $name);
	    }
	    $matchf->add_tag_value('Target', "Sequence:$name");
	    $out->write_feature($matchf);
	}
    }
}

sub parse_code{
    my $src = shift;
    my $code;

    # file or subroutine?
    if(-r $src) {
        if(! (($code = do $src) && (ref($code) eq "CODE"))) {
            die "error in parsing code block $src: $@" if $@;
            die "unable to read file $src: $!" if $!;
            die "failed to run $src, or it failed to return a closure";
        }
    } else {
        $code = eval $src;
        die "error in parsing code block \"$src\": $@" if $@;
        die "\"$src\" fails to return a closure"
            unless ref($code) eq "CODE";
    }
    return $code;
}

=head2 clone_loc

 Title   : clone_loc
 Usage   : my $l = clone_loc($feature->location);
 Function: Helper function to simplify the task of cloning locations
           for --locfunc closures.

           Presently simply implemented using Storable::dclone().
 Example :
 Returns : A L<Bio::LocationI> object of the same type and with the
           same properties as the argument, but physically different.
           All structured properties will be cloned as well.
 Args    : A L<Bio::LocationI> compliant object


=cut

sub clone_loc{
    return dclone(shift);
}

=head2 create_loc

 Title   : create_loc
 Usage   : my $l = create_loc("10..12");
 Function: Helper function to simplify the task of creating locations
           for --locfunc closures. Creates a location from a feature-
           table formatted string.

 Example :
 Returns : A L<Bio::LocationI> object representing the location given
           as formatted string.
 Args    : A GenBank feature-table formatted string.


=cut

sub create_loc{
    return Bio::Factory::FTLocationFactory->from_string(shift);
}
