#!/usr/bin/perl -w

my $specname = "DroMel_3_1";
my $pfoldsuffix = ".pfold";
my $alignsuffix = ".fa";
my $mapfile = "map";
my $NA = "NA";   # text used to indicate missing sequence in mapfiles

my $source = "pfold";
my $feature = "RNA";

my $usage = "Usage: $0 <pfoldscan directory>\n";
$usage .= "      [-s <species name>] species name in alignment file (default is '$specname')\n";
$usage .= "      [-m <mapfile name>] name of mapfile in alignment directory (default is '$mapfile')\n";

my @argv;
while (@ARGV) {
    my $arg = shift;
    if ($arg =~ /^-/) {
	if ($arg eq "-s") { defined ($specname = shift) or die $usage }
	elsif ($arg eq "-m") { defined ($mapfile = shift) or die $usage }
	else { die $usage }
    } else {
	push @argv, $arg;
    }
}

die $usage unless @argv == 1;
my ($scandir) = @argv;

# loop through mapfile
open MAP, "<$scandir/$mapfile" or die "Couldn't open '$mapfile': $!";
while (<MAP>) {
    # read in mapfile line
    my ($n, @map) = split;

    # make filenames
    my $pfoldfile = "$scandir/$n$pfoldsuffix";
    my $alignfile = "$scandir/$n$alignsuffix";
    next unless -e $pfoldfile;  # skip if ".pfold" file doesn't exist

    # read in species row from alignment file
    my @col;
    open ALIGN, "<$alignfile" or die "Couldn't open '$alignfile': $!";
    my $specflag = 0;
    my $specnum = -1;   # the position of our species in the FASTA file determines which fields to look at in the map file
    my $row = "";
    while (<ALIGN>) {
	if (/^\s*>(\S+)/) {
	    my $rowname = $1;
	    last if $specflag;
	    $specflag = 1 if $rowname eq $specname;
	    ++$specnum;
	} elsif ($specflag) {
	    chomp;
	    s/\s//g;
	    $row .= $_;
	}
    }
    close ALIGN;
    unless ($specflag) { warn "Couldn't find '$specname' in '$alignfile'; skipping\n"; next }

    # get coords for our species
    my ($name, $start, $end, $strand) = @map[$specnum*4..$specnum*4+3];
    next if $strand eq $NA;  # skip if no data for our species

    # map alignment rows to sequence coords
    my @col2seqpos;
    my $step = $strand . '1';
    my ($seqpos, $endpos) = $step > 0 ? ($start, $end) : ($end, $start);
    for (my $col = 0; $col < length($row); ++$col) {
	push @col2seqpos, $seqpos;
	my $char = substr ($row, $col, 1);
	$seqpos += $step unless $char eq '-' || $char eq '.';
    }
    die "seqpos($seqpos) != endpos($endpos)\n[start=$start, end=$end, strand=$strand]\nDied" unless $seqpos == $endpos;

    # loop through pfoldscan file
    open PFOLDSCAN, "<$pfoldfile" or die "Couldn't open '$pfoldfile': $!";
    while (<PFOLDSCAN>) {
	next if /unparseable/;
	my ($startcol, $endcol, $score, $prob) = split;
	my ($startpos, $endpos) = @col2seqpos[$step > 0 ? ($startcol, $endcol) : ($endcol, $startcol)];

	# output a GFF line
	print join ("\t", $name, $source, $feature, $startpos, $endpos, $score, $strand, '.', "Probability $prob Alignment $alignfile"), "\n";
    }
    close PFOLDSCAN;
}
close MAP;
