#!/usr/bin/env perl $lagandir = $ENV{LAGAN_DIR}; # Status # -- extension problems if (@ARGV < 2) { print ("usage:\n rechaos seqfile1 seqfile2 [-chaos \"chaos flags\"] [-recurse \"(wl1,nd1,co1),(wl2,nd2,co2),...\"] [-out \"filename\"] [-lazy] [-maskedonly] [-debug] [-translate] [-fastreject]\n"); exit(1); } #$recurfl = "(12,0,25,0)x,(13,1,30,0)x,(8,1,30,0)x,(7,1,30,0)x"; $recurfl = "(12,0,25,0)x,(13,1,30,0)x,(4,0,4,3000)xt,(8,1,30,0)x,(7,1,30,0)x"; #$recurfl = "(12,0,10,200)x,(12,0,10,150)x,(3,0,10,150)xt,(8,0,10,150)x,(12,0,25,0),(13,1,30,0),(3,0,30,0)t,(8,1,30,0),(7,1,25,0)"; $minbox = 0; $seq1 = $ARGV[0]; $seq2 = $ARGV[1]; $tofile = 0; $masker = 1; $lazycheck = 0; $fastreject = 0; $frminlevel = 0; $frmaxlevel = 3; @frseq1 = (150000, 50000, 30000, 15000); @frseq2 = (150000, 50000, 30000, 15000); #@frseq1 = (70000, 60000, 60000, 20000); #@frseq2 = (70000, 60000, 60000, 20000); $sentinelleft = 1.1; $sentinelright = 1.2; $gfc = " "; $dounmasked = 1; $filename = ""; $debug = 0; $anchparams = ""; $translate = 0; sub max { my ($a, $b) = @_; return $a if ($a > $b); return $b; } sub min { my ($a, $b) = @_; return $a if ($a < $b); return $b; } $i = 2; while ($i < @ARGV) { if ($ARGV[$i] =~ /-chaos/) { $chaosfl = $chaosfl." ".$ARGV[++$i]; } elsif ($ARGV[$i] =~ /-ext/) { $chaosfl = $chaosfl." -ext "; } elsif ($ARGV[$i] =~ /-recurse/) { $recurfl = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-lazy/) { $lazycheck = 1; } elsif ($ARGV[$i] =~ /-nomask/) { $masker = 0; } elsif ($ARGV[$i] =~ /-out/) { $tofile = 1; $filename = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-maskedonly/) { $dounmasked = 0; } elsif ($ARGV[$i] =~ /-fastreject/) { $fastreject = 1; } elsif ($ARGV[$i] =~ /-debug/) { $debug = 1; } elsif ($ARGV[$i] =~ /-translate/) { $translate = 1; } elsif ($ARGV[$i] =~ /-gfc/) { $gfc = " -gfc "; } elsif ($ARGV[$i] =~ /-gap/){ $anchparams = $anchparams." -gap ".$ARGV[++$i]; $anchparams = $anchparams." ".$ARGV[++$i]; } else { die ("Unrecognized option $ARGV[$i]\n"); } $i++; } if ($lazycheck) { if (-f $filename) { print STDERR "Output file already exists, lazy mode exit!\n"; exit (0); } } $extracase1 = 0; $extracase2 = 0; if (-e "$seq1.masked") { $extra1 = $seq1; $seq1 = "$seq1.masked"; $extracase1 = 1; } if (-e "$seq2.masked") { $extra2 = $seq2; $seq2 = "$seq2.masked"; $extracase2 = 1; } if (! $dounmasked){ $extracase1 = 0; $extracase2 = 0; } #open(SEQ1, "$seq1"); #open(SEQ2, "$seq2"); #$line1 = ; #while ($line1 = ) { # chomp $line1; # $seq1len += length($line1); #} # #$line2 = ; #while ($line2 = ) { # chomp $line2; # $seq2len += length($line2); #} $seq1len = `$lagandir/utils/getlength $seq1`; chomp $seq1len; $seq2len = `$lagandir/utils/getlength $seq2`; chomp $seq2len; $b1[0] = $b2[0] = 1; $e1[0] = $seq1len; $e2[0] = $seq2len; $cumanchs = 0; $clipleft1 = 0; $clipleft2 = 0; $clipright1 = $seq1len + 1; $clipright2 = $seq2len + 1; $app_str = ""; $i = 0; while (1) { $goodanchs = 0; $totalanchs = 0; $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if (! $stillmore) { if ($extracase1 || $extracase2) { if ($extracase1) { $seq1 = $extra1; $extracase1 = 0; } if ($extracase2) { $seq2 = $extra2; $extracase2 = 0; } } else { last; } } else { $wordlen = $1; $degeneracy = $2; $cutoff = $3; $extcutoff = $4; $tail = $5; $extraparams = ""; $extraparams = "-t ".$extraparams if ((index ($tail, "t") != -1) && ($translate)); $extraparams = $extraparams." -rsc $extcutoff" if (index ($tail, "x") != -1); } $recurfl = $6; next if ((index ($tail, "t") != -1) && (!$translate)); print STDERR "Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff, $extcutoff) $tail\n"; # PRINT OUT LIST OF REGIONS TO ALIGN open (PFILE, ">$$.anchs.pairs"); for ($j = 0; $j < @b1; $j++) { print PFILE "-s1 $b1[$j] $e1[$j] -s2 $b2[$j] $e2[$j]\n"; } close (PFILE); # print STDERR "PAIRS hits\n"; # print STDERR `cat $$.anchs.pairs`; # print STDERR "-----------------\n"; # print STDERR `cat $$.anchs.pairs`; # print STDERR "-----------------\n"; # print STDERR "$lagandir/chaos $seq1 $seq2 -wl $wordlen -nd $degeneracy -co $cutoff $extraparams $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp"; # PERFORM THE ALIGNMENTS USING CHAOS `$lagandir/chaos $seq1 $seq2 $extraparams -wl $wordlen -nd $degeneracy -co $cutoff $gfc $chaosfl -pairs $$.anchs.pairs > $$.anchtemp`; if ($?) { exit(1); } # ADD IN BOUNDARIES $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if ($fastreject || $stillmore || $extracase1 || $extracase2){ $temp1 = $seq1len + 1; $temp2 = $seq2len + 1; $app_str = $app_str."seq1 0 $clipleft1; seq2 0 $clipleft2; score=$sentinelleft (+)\n"; $app_str = $app_str."seq1 $clipright1 $temp1; seq2 $clipright2 $temp2; score=$sentinelright (+)\n"; } # APPEND HITS FROM $app_str TO LOCAL ALIGNMENT LIST open (OFILE, ">>$$.anchtemp"); print OFILE $app_str; close (OFILE); # `wc $$.anchtemp` =~ /(\d+)/x; # $totalanchs = $totalanchs + $1; # print STDERR "CHAOS hits\n"; # print STDERR `cat $$.anchtemp`; # FIND MAXIMAL-SCORING CONSISTENT CHAIN `$lagandir/anchors $$.anchtemp $gfc $anchparams | sort -n +1 > $$.anchs.sorted`; if ($?) { exit(1); } # IF WE'RE DONE, THEN QUIT! $stillmore = ($recurfl =~ /\((\d+)\,(\d+)\,(\d+)\,(\d+)\)(\w*)(.*)/); if (!$stillmore && !$extracase1 && !$extracase2) { last; } # `wc $$.anchs` =~ /(\d+)/x; # print STDERR "ANCHS hits\n"; # print STDERR `cat $$.anchs.sorted`; # $goodanchs = $goodanchs + $1; # if ($?) { exit(1); } # READ SORTED ANCHORS TO @anchors open(SFILE, "$$.anchs.sorted"); @anchors = ; close(SFILE); @b1new = 0; @b2new = 0; @e1new = 0; @e2new = 0; @scores = 0; $app_str = ""; # FOR EACH UNALIGNED REGION $area = 0; $maxarea = 0; $k = 0; for ($m = 0; $m < @anchors; $m++){ # SAVE OLD ANCHORS (SKIP FIRST AND LAST FAKE ANCHORS) if ($m >= 1 && $m < @anchors - 1){ $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $score = $5; chomp $score; $app_str = $app_str."seq1 $1 $2; seq2 $3 $4; score=$score (+)\n"; } if ($m == 0){ next; } # DETERMINE REGION BOUNDARIES $anchors[$m-1] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $gap1begin = $2 + 1; $gap2begin = $4 + 1; $prevanchorscore = $5; chomp $prevanchorscore; $anchors[$m] =~ /\((\d+) (\d+)\)=\((\d+) (\d+)\) (.*)/; $gap1end = $1 - 1; $gap2end = $3 - 1; $nextanchorscore = $5; chomp $nextanchorscore; # CHECK IF RECURSION NEEDED $boxarea = ($gap1end - $gap1begin + 1) * ($gap2end - $gap2begin + 1); $area = $area + $boxarea; $maxarea = $boxarea if ($boxarea > $maxarea); if ($boxarea >= $minbox){ # FAST REJECT if ($fastreject && ($i >= $frminlevel) && ($i <= $frmaxlevel)){ # SKIP MARKED ENDS OF ALIGNMENT if ($nextanchorscore == $sentinelleft || $prevanchorscore == $sentinelright){ next; } # TRIM NEW ENDS OF ALIGNMENT if ($prevanchorscore == $sentinelleft){ # if ($boxarea > $frseq1[$i] * $frseq2[$i]){ if (($gap1end - $gap1begin > $frseq1[$i]) || ($gap2end - $gap2begin > $frseq2[$i])){ if (@anchors == 2){ exit(3); } $clipleft1 = max ($gap1begin-1, $gap1end - $frseq1[$i]); $clipleft2 = max ($gap2begin-1, $gap2end - $frseq2[$i]); $gap1begin = $clipleft1 + 1; $gap2begin = $clipleft2 + 1; } } elsif ($nextanchorscore == $sentinelright){ # if ($boxarea > $frseq1[$i] * $frseq2[$i]){ if (($gap1end - $gap1begin > $frseq1[$i]) || ($gap2end - $gap2begin > $frseq2[$i])){ if (@anchors == 2){ exit(3); } $clipright1 = min ($gap1end+1, $gap1begin + $frseq1[$i]); $clipright2 = min ($gap2end+1, $gap2begin + $frseq2[$i]); $gap1end = $clipright1 - 1; $gap2end = $clipright2 - 1; } } } # ADD REGION if ($gap1begin < $gap1end && $gap2begin < $gap2end){ $b1new[$k] = $gap1begin; $b2new[$k] = $gap2begin; $e1new[$k] = $gap1end; $e2new[$k] = $gap2end; $k++; } } } @b1 = @b1new; @b2 = @b2new; @e1 = @e1new; @e2 = @e2new; if ($debug) { print STDERR "Level $i Summary:\n"; print STDERR " Using $seq1 $seq2 ($wordlen, $degeneracy, $cutoff)\n"; if ($totalanchs == 0) { $percentage = 0; } else { $percentage = $goodanchs / $totalanchs * 100.0; } print STDERR " $goodanchs good out of $totalanchs total anchors ($percentage%)\n"; $area = $area / 1000000; $maxarea = $maxarea / 1000000; print STDERR " Total area left = $area (max = $maxarea)\n"; } $cumanchs = $cumanchs + $goodanchs; $i++; } $res = `sort -nr +1 $$.anchs.sorted`; if ($?) { exit(1); } `rm $$.*`; if($tofile) { open(OUTFILE, ">$filename"); print OUTFILE "$res"; close OUTFILE; } else { print "$res"; } print STDERR "$cumanchs cumulative anchors\n"