#!/usr/bin/env ruby

localdb = "sp"        
# database name from which homologues are collected 
# by locally installed blast. Leave this if you do 
# not use the '-l' option.

mafftpath = "mafft"   
# path of mafft. "/usr/local/bin/mafft"
# if mafft is in your command path, "mafft" is ok.

blastpath = "blastall"   
# path of blastall. 
# if blastall is in your command path, "blastall" is ok.

# mafft-homologs.rb  v. 2.1 aligns sequences together with homologues 
# automatically collected from SwissProt via NCBI BLAST.
#
# mafft > 5.58 is required
#
# Usage:
#   mafft-homologs.rb [options] input > output
# Options:
#   -a #      the number of collected sequences (default: 50)
#   -e #      threshold value (default: 1e-10)
#   -o "xxx"  options for mafft 
#             (default: " --op 1.53 --ep 0.123 --maxiterate 1000")
#   -l        locally carries out blast searches instead of NCBI blast
#             (requires locally installed blast and a database)
#   -f        outputs collected homologues also (default: off)
#   -w        entire sequences are subjected to BLAST search 
#             (default: well-aligned region only)

require 'getopts'
require 'tempfile'

# mktemp
temp_vf = Tempfile.new("_vf").path
temp_if = Tempfile.new("_if").path
temp_pf = Tempfile.new("_pf").path
temp_af = Tempfile.new("_af").path
temp_qf = Tempfile.new("_qf").path
temp_bf = Tempfile.new("_bf").path
temp_rid = Tempfile.new("_rid").path
temp_res = Tempfile.new("_res").path


system( mafftpath + " --help > #{temp_vf} 2>&1" )
pfp = File.open( "#{temp_vf}", 'r' )
while pfp.gets
	break if $_ =~ /MAFFT v/
end
pfp.close
if( $_ ) then
	mafftversion = sub( /^\D*/, "" ).split(" ").slice(0).strip.to_s
else
	mafftversion = "0"
end
if( mafftversion < "5.58" ) then
	puts ""
	puts "======================================================"
	puts "Install new mafft (v. >= 5.58)"
	puts "======================================================"
	puts ""
	exit
end

srand ( 0 )

def readfasta( fp, name, seq )
	nseq = 0
	tmpseq = ""
	while fp.gets
		if $_ =~ /^>/ then
			name.push( $_.sub(/>/,"").strip )
			seq.push( tmpseq ) if nseq > 0
			nseq += 1
			tmpseq = ""
		else
			tmpseq += $_.strip
		end
	end
	seq.push( tmpseq )
	return nseq
end

nadd = 50
eval = 1e-10
local = 0
fullout = 0
entiresearch = 0
corewin = 50
corethr = 0.3
mafftopt = " --op 1.53 --ep 0.123 --localpair --maxiterate 1000 --reorder "
if getopts( "s", "f", "w", "l", "h", "e:", "a:", "o:", "c:", "d:" ) == nil ||  ARGV.length == 0 || $OPT_h then
	puts "Usage: #{$0} [-h -l -e# -a# -o\"[options for mafft]\"] input_file"
	exit
end

if $OPT_c then
	corewin = $OPT_c.to_i
end
if $OPT_d then
	corethr = $OPT_d.to_f
end
if $OPT_w
	entiresearch = 1
end
if $OPT_f
	fullout = 1
end
if $OPT_s
	fullout = 0
end
if $OPT_l
	local = 1
end
if $OPT_e then
	eval = $OPT_e.to_f
end
if $OPT_a then
	nadd = $OPT_a.to_i
end
if $OPT_o then
	mafftopt += " " + $OPT_o + " "
end

system "cat " + ARGV.to_s + " > #{temp_if}"
ar = mafftopt.split(" ")
nar = ar.length
for i in 0..(nar-1)
	if ar[i] == "--seed" then
		system "cat #{ar[i+1]} >> #{temp_if}"
	end
end

nseq = 0
ifp = File.open( "#{temp_if}", 'r' )
	while ifp.gets
		nseq += 1 if $_ =~ /^>/
	end
ifp.close

if nseq >= 100 then
	STDERR.puts "The number of input sequences must be <100."
	exit
elsif nseq == 1 then
	system( "cp #{temp_if}"  + " #{temp_pf}" )
else
	STDERR.puts "Performing preliminary alignment .. "
	if entiresearch == 1 then
#		system( mafftpath + " --maxiterate 1000 --localpair #{temp_if} > #{temp_pf}" )
		system( mafftpath + " --maxiterate 0 --retree 2 #{temp_if} > #{temp_pf}" )
	else
		system( mafftpath + " --maxiterate 1000 --localpair --core --coreext --corethr #{corethr.to_s} --corewin #{corewin.to_s} #{temp_if} > #{temp_pf}" )
	end
end

pfp = File.open( "#{temp_pf}", 'r' )
inname = []
inseq = []
slen = []
act = []
nin = 0
nin = readfasta( pfp, inname, inseq )
for i in 0..(nin-1)
	slen.push( inseq[i].gsub(/-/,"").length )
	act.push( 1 )
end
pfp.close

pfp = File.open( "#{temp_if}", 'r' )
orname = []
orseq = []
nin = 0
nin = readfasta( pfp, orname, orseq )
pfp.close

allen = inseq[0].length
for i in 0..(nin-2)
	for j in (i+1)..(nin-1)
		next if act[i] == 0
		next if act[j] == 0
		pid = 0.0
		total = 0
		for a in 0..(allen-1)
			next if inseq[i][a,1] == "-" || inseq[j][a,1] == "-"
			total += 1
			pid += 1.0 if inseq[i][a,1] == inseq[j][a,1]
		end
		pid /= total
#		puts "#{i.to_s}, #{j.to_s}, #{pid.to_s}"
		if pid > 0.5 then
			if slen[i] < slen[j]
				act[i] = 0 
			else
				act[j] = 0 
			end
		end
	end
end
#p act


afp = File.open( "#{temp_af}", 'w' )

STDERR.puts "Searching .. \n"
ids = []
add = []
sco = []
for i in 0..(nin-1)
	inseq[i].gsub!(/-/,"")
	afp.puts ">" + orname[i]
	afp.puts orseq[i]

#	afp.puts ">" + inname[i]
#	afp.puts inseq[i]

	STDERR.puts "Query (#{i+1}/#{nin})\n" + inname[i]
	if act[i] == 0 then
		STDERR.puts "Skip.\n\n"
		next 
	end

	if local == 0 then
		command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?QUERY=" + inseq[i] + "&DATABASE=swissprot&HITLIST_SIZE=" + nadd.to_s + "&FILTER=L&EXPECT='" + eval.to_s + "'&FORMAT_TYPE=TEXT&PROGRAM=blastp&SERVICE=plain&NCBI_GI=on&PAGE=Proteins&CMD=Put' > #{temp_rid}"
		system command
	
		ridp = File.open( "#{temp_rid}", 'r' )
		while ridp.gets
			break if $_ =~ / RID = (.*)/
		end
		ridp.close
		rid = $1.strip
		STDERR.puts "Submitted to NCBI. rid = " + rid
	
		STDERR.printf "Waiting "
		while 1 
			STDERR.printf "."
			sleep 10
			command = "lynx -source 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi?RID=" + rid + "&DESCRIPTIONS=500&ALIGNMENTS=" + nadd.to_s + "&ALIGNMENT_TYPE=Pairwise&OVERVIEW=no&CMD=Get&FORMAT_TYPE=XML' > #{temp_res}"
			system command
			resp = File.open( "#{temp_res}", 'r' )
#			resp.gets
#			if $_ =~ /WAITING/ then
#				resp.close
#				next
#			end
			while( resp.gets )
				break if $_ =~ /QBlastInfoBegin/
			end
			resp.gets
			if $_ =~ /WAITING/ then
				resp.close
				next
			else
				resp.close
				break
			end
		end
	else
#		puts "Not supported"
#		exit
		qfp = File.open( "#{temp_qf}", 'w' )
			qfp.puts "> "
			qfp.puts inseq[i]
		qfp.close
		command = blastpath + "  -p blastp  -e #{eval} -b 1000 -m 7 -i #{temp_qf} -d #{localdb} > #{temp_res}"
		system command
		resp = File.open( "#{temp_res}", 'r' )
	end
	STDERR.puts " Done.\n\n"

	resp = File.open( "#{temp_res}", 'r' )
	while 1
		while resp.gets
			break if $_ =~ /<Hit_id>(.*)<\/Hit_id>/ || $_ =~ /(<Iteration_stat>)/
		end
		id = $1
		break if $_ =~ /<Iteration_stat>/
#		p id
		while resp.gets
			break if $_ =~ /<Hsp_bit-score>(.*)<\/Hsp_bit-score>/
		end
		score = $1.to_f
#		p score

		known = ids.index( id )
		if known != nil then
			if sco[known] >= score then
				next
			else
				ids.delete_at( known )
				add.delete_at( known )
				sco.delete_at( known )
			end
		end
		while resp.gets
			break if $_ =~ /<Hsp_hseq>(.*)<\/Hsp_hseq>/
		end
#		break if $1 == nil
		target = $1.sub( /-/, "" )
#		p target
#		STDERR.puts "adding 1 seq"
		ids.push( id )
		sco.push( score )
		add.push( target )
	end
	resp.close
end

n = ids.length

outnum = 0
while n > 0 && outnum < nadd
	m = rand( n )
	afp.puts ">_addedbymaffte_" + ids[m]
	afp.puts add[m]
	ids.delete_at( m )
	add.delete_at( m )
	n -= 1
	outnum += 1
end
afp.close

STDERR.puts "Performing alignment .. "
system( mafftpath + mafftopt + " #{temp_af} > #{temp_bf}" )
STDERR.puts "done."

bfp = File.open( "#{temp_bf}", 'r' )
outseq = []
outnam = []
readfasta( bfp, outnam, outseq )
bfp.close

outseq2 = []
outnam2 = []

len = outseq.length
for i in 0..(len-1)
#	p outnam[i]
	if fullout == 0 && outnam[i] =~ /^_addedbymaffte_/ then
		next
	end
	outseq2.push( outseq[i] )
	outnam2.push( outnam[i].sub( /^_addedbymaffte_/, "_ho_" ) )
end

nout = outseq2.length
len = outseq[0].length
p = len
while p>0
	p -= 1
    allgap = 1
    for j in 0..(nout-1)
		if outseq2[j][p,1] != "-" then
			allgap = 0
			break
		end
    end
    if allgap == 1 then
        for j in 0..(nout-1)
            outseq2[j][p,1] = ""
        end
    end
end
for i in 0..(nout-1)
	puts ">" + outnam2[i]
	puts outseq2[i].gsub( /.{1,60}/, "\\0\n" )
end


#system( "rm -rf #{temp_if} #{temp_vf} #{temp_af} #{temp_bf} #{temp_pf} #{temp_qf} #{temp_res} #{temp_rid}" )
