#!/usr/bin/perl -w my $specname = "DroMel_3_1"; my $pfoldsuffix = ".pfold"; my $alignsuffix = ".fa"; my $mapfile = "map"; my $NA = "NA"; # text used to indicate missing sequence in mapfiles my $source = "pfold"; my $feature = "RNA"; my $usage = "Usage: $0 \n"; $usage .= " [-s ] species name in alignment file (default is '$specname')\n"; $usage .= " [-m ] name of mapfile in alignment directory (default is '$mapfile')\n"; my @argv; while (@ARGV) { my $arg = shift; if ($arg =~ /^-/) { if ($arg eq "-s") { defined ($specname = shift) or die $usage } elsif ($arg eq "-m") { defined ($mapfile = shift) or die $usage } else { die $usage } } else { push @argv, $arg; } } die $usage unless @argv == 1; my ($scandir) = @argv; # loop through mapfile open MAP, "<$scandir/$mapfile" or die "Couldn't open '$mapfile': $!"; while () { # read in mapfile line my ($n, @map) = split; # make filenames my $pfoldfile = "$scandir/$n$pfoldsuffix"; my $alignfile = "$scandir/$n$alignsuffix"; next unless -e $pfoldfile; # skip if ".pfold" file doesn't exist # read in species row from alignment file my @col; open ALIGN, "<$alignfile" or die "Couldn't open '$alignfile': $!"; my $specflag = 0; my $specnum = -1; # the position of our species in the FASTA file determines which fields to look at in the map file my $row = ""; while () { if (/^\s*>(\S+)/) { my $rowname = $1; last if $specflag; $specflag = 1 if $rowname eq $specname; ++$specnum; } elsif ($specflag) { chomp; s/\s//g; $row .= $_; } } close ALIGN; unless ($specflag) { warn "Couldn't find '$specname' in '$alignfile'; skipping\n"; next } # get coords for our species my ($name, $start, $end, $strand) = @map[$specnum*4..$specnum*4+3]; next if $strand eq $NA; # skip if no data for our species # map alignment rows to sequence coords my @col2seqpos; my $step = $strand . '1'; my ($seqpos, $endpos) = $step > 0 ? ($start, $end) : ($end, $start); for (my $col = 0; $col < length($row); ++$col) { push @col2seqpos, $seqpos; my $char = substr ($row, $col, 1); $seqpos += $step unless $char eq '-' || $char eq '.'; } die "seqpos($seqpos) != endpos($endpos)\n[start=$start, end=$end, strand=$strand]\nDied" unless $seqpos == $endpos; # loop through pfoldscan file open PFOLDSCAN, "<$pfoldfile" or die "Couldn't open '$pfoldfile': $!"; while () { next if /unparseable/; my ($startcol, $endcol, $score, $prob) = split; my ($startpos, $endpos) = @col2seqpos[$step > 0 ? ($startcol, $endcol) : ($endcol, $startcol)]; # output a GFF line print join ("\t", $name, $source, $feature, $startpos, $endpos, $score, $strand, '.', "Probability $prob Alignment $alignfile"), "\n"; } close PFOLDSCAN; } close MAP;