#!/usr/bin/perl -w

my $window = 200;
my $step = 50;
my $pfolddir = glob "~/pfold/bin";
my $grammar = "$pfolddir/article.grm";
my $tmpfasta = "TMPFASTA$$";
my $verbose = 0;

sub cleanup {
    unlink $tmpfasta if -e $tmpfasta;
    unlink "$tmpfasta.col" if -e "$tmpfasta.col";
}
END { cleanup() }
$SIG{"INT"} = sub { cleanup() };
$SIG{"KILL"} = sub { cleanup() };

my $newick2col = "newick2col";
my $fasta2col = "fasta2col";
my $scfg = "scfg";
# my $tkfalign = "tkfalign";

my $usage = "\n";
$usage .= "Usage: $0 <Newick tree file> <FASTA alignment file>\n";
$usage .= "           [-v]  verbose\n";
$usage .= "      [-w <bp>]  window size; default is $window\n";
$usage .= "      [-s <bp>]  step size; default is $step\n";
$usage .= "     [-p <dir>]  PFOLD binaries directory; default is '$pfolddir'\n";
$usage .= "[-g <filename>]  grammar filename; default is '$grammar'\n";
$usage .= "[-t <filename>]  temporary FASTA filename; default is '$tmpfasta'\n";
$usage .= "\n";

my @argv;
while (@ARGV) {
    my $arg = shift;
    if ($arg =~ /^-/) {
	if ($arg eq "-v") { $verbose = 1 }
	elsif ($arg eq "-w") { defined ($window = shift) or die $usage }
	elsif ($arg eq "-s") { defined ($step = shift) or die $usage }
	elsif ($arg eq "-p") { defined ($pfolddir = shift) or die $usage }
	elsif ($arg eq "-g") { defined ($grammar = shift) or die $usage }
	elsif ($arg eq "-t") { defined ($tmpfasta = shift) or die $usage }
	else { die $usage }
    } else {
	push @argv, $arg;
    }
}
die $usage unless @argv == 2;
my ($newick, $fasta) = @argv;

# convert Newick to col
system "$pfolddir/$newick2col $newick >$newick.col";

# read FASTA file
my %seq;
my @name;
my $name;
open FASTA, "<$fasta" or die "Couldn't open '$fasta': $!";
while (<FASTA>) {
    if (/^\s*>\s*(\S+)/) {
	$name = $1;
	push @name, $name;
    } else {
	s/\s//g;
	die "Bad FASTA file" if /\S/ && !defined $name;
	$seq{$name} .= $_;
    }
}
close FASTA;

# check all seqs are same length
my $length;
foreach my $name (@name) {
    if (defined $length) {
	die "Sequences not all same length" unless $length == length $seq{$name};
    } else {
	$length = length $seq{$name};
    }
}

# scan window through file
for (my $pos = 0; $pos < $length - $window; $pos += $step) {
    # create temporary FASTA & col files
    open TMPFASTA, ">$tmpfasta" or die "Couldn't open '$tmpfasta': $!";
    my $subseqlen;
    foreach my $name (@name) {
	my $subseq = substr ($seq{$name}, $pos, $window);
	$subseqlen = length $subseq;
	print TMPFASTA ">$name\n", $subseq, "\n";
    }
    close TMPFASTA or die "Couldn't open '$tmpfasta': $!";
    warn "[column $pos+$subseqlen]\n" if $verbose;
    system "$pfolddir/$fasta2col $tmpfasta >$tmpfasta.col";
    # run PFOLD
    my $command = "$pfolddir/$scfg --treefile $newick.col $grammar $tmpfasta.col";
    warn "[running '$command']\n" if $verbose;
    open PFOLD, "$command|" or die "Couldn't run '$command': $!";
    my $score;
    while (<PFOLD>) {
	warn $_ if $verbose;
	if (/-log\(InsideOddsRatio\)= (\S+)/) {
	    $score = $1;
	    last;
	}
    }
    close PFOLD;
    # print output
    my $prob;
    if (defined $score) {
	$prob = 1/(1+2**$score);
    } else {
	$prob = $score = "<unparseable>";
    }
    print $pos, " ", $pos+$subseqlen-1, " ", $score, " ", $prob, "\n";
}
