#!/usr/local/bin/perl

=head1 NAME

LDmaps.pl - use haploview to generate LD maps

=head1 SYNOPSIS

LDmaps.pl file

=head1 DESCRIPTION

Specify a tabix indexed vcf file, get the data for specific region in ped format and generate an LD map using haploview command line options.

=head1 OPTIONS

=over

=item B<coord>

Speciy the coordinates to use in tabix format (e.g. 1:100000-200000)

=item B<file>

Specify a file location to use. Can be a URL.

If not specified use http://www.ebi.ac.uk/~dunham/all_called_together_final_noindels_nomono_triocaller_rsq03_erate01_filtered.recode.vcf.gz be default.

=item B<sample>

Specify a sample file. Should list two columns

sample\tpopulation

By default uses a file calles samples in the directory from where hte script is started.

=item B<pop>

Give the population. By default medaka :-)

=item B<nopng>

Do not export png of LD structure

=item B<ld>

Export LD file. Overrides default output of block text files.

=item B<win>

Specify a window size to generate the LD map over. If specified will walk through the regions specified by B<coord> shifting by shift at a time and make a plot.

=item B<maf>

Specify the maf threshold for haploview -blockMAFThresh

=item B<bif>

Specify the threshold for -blockInformFrac <thresh> 0.95 by default

=item B<shift>

The fraction by which to shift the window each time. By default is set to 1/2 i.e. half a window at a time. Fo no overlapping use 1.

=item B<nohaplo>

Don't run haploview - useful if already run and want to reprocess the output files.

=back

=head1 LICENCE

This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.

=head1 AUTHOR

Ian Dunham, Ensembl Functional Genomics

=head1 CONTACT

Ian Dunham <dunham@ebi.ac.uk>

=cut

use 5.012;
#use warnings;
use Getopt::Long;
use Sort::Naturally;
use Cwd;

my ($coord, $win, $file, $sample, $shift, $pop, $nopng, $ld, $maf, $bif, $nohaplo);

GetOptions (
    'win=i'       => \$win,
    'coord=s'     => \$coord,
    'file=s'      => \$file,
    'sample=s'    => \$sample,
    'pop=s'       => \$pop,
    'nopng'       => \$nopng,
    'ld'          => \$ld,
    'shift'       => \$shift,
    'maf=f'       => \$maf,
    'bif=f'       => \$bif,
    'nohaplo'     => \$nohaplo,
);

unless (defined $shift){
    $shift = 0.5;
}

unless (defined $file){
    $file = "all_called_together_final_noindels_nomono_triocaller_rsq03_erate01_filtered.recode.vcf.gz";
    unless (-e $file){
        die "You need to specify a file that exists.";
    }
}

unless (defined $pop){
    $pop = "medaka";
}

unless (defined $sample){
    $sample = "samples";
    die "There is no sample file" unless -e $sample;
}

if (!defined $coord or $coord !~ /^\d+:\d+-\d+$/){
    die "You must specify a valid region using -coord."  ;
}

my ($chr, $beg, $end ) = split /:|-/, $coord;

my @regions;
#Create a set of regions;
# Ideally one would do this with a constant number of markers - process the vcf to do that
if (defined $win){
    my $first = $beg;
    my $last;
    while ($last < $end){
        $last = $first + $win -1;
        push @regions, "$chr:$first-$last";
        $first += int($win * $shift); # shift the window by $shift fraction
    }
}
else {
    push @regions, $coord;
}
unless (defined $nohaplo){
    foreach my $region (@regions){
        my ($chr, $beg, $end ) = split /:|-/, $region;
        my $outfile = $chr. "_" . $beg . "-" . $end;
        system "vcf_to_ped_convert.pl -vcf $file -sample_panel_file $sample -population $pop -region $region";
        if (defined $nopng){
            if (defined $ld){
                if (defined $maf and defined $bif){
                    system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -dprime -blockoutput GAB -blockMAFThresh $maf -blockInformFrac $bif"; # -blockInformFrac <thresh> 0.95 by default
                }
            elsif (defined $bif){
                    system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -dprime -blockoutput GAB -blockInformFrac $bif"; # -blockInformFrac <thresh> 0.95 by default
                }
                elsif (defined $maf){
                    system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -dprime -blockoutput GAB -blockMAFThresh $maf";
            }
                else {
                    system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -dprime -blockoutput GAB";
                }
            }
            else{
                system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -blockoutput GAB";
            }
        }
        else{
            system "java -jar ~/bin/Haploview.jar -nogui -pedfile $outfile.ped -memory 10000 -compressedpng -blockoutput GAB";
        }
    }
}

my $dir = getcwd;
opendir my $dh, $dir or die "Cannot open directory $dir\n";
#say $chr;
my @files = grep {/^$chr.+\.GABRIELblocks$/} readdir $dh;
my (%blocks, %LD);

foreach my $file (@files){
    (my $lookup = $file) =~ s/ped\.GABRIELblocks/info/;
    open my $lfh, "<", $lookup or die "Cannot open file $lookup\n";
    my %lookup;
    my $i = 1;
    while (<$lfh>){
        chomp;
        my ($str, $pos) = split /\t/, $_;
        ($chr, undef) = split /:/, $str;
        $lookup{$i} = $pos;
        $i++;
    }
    close $lfh;
    open my $fh, "<", $file or die "Cannot open file $file\n";
    while (<$fh>){
        next unless /BLOCK/;
         # format is BLOCK 1.  MARKERS: 72 73 74
         #markers are numbered relative to the file - need to lookup actual position from .info file above
        my ($block, @markers) = split /\.\s+MARKERS:\s/, $_;
        @markers = split(/\s+/,join(',',@markers));
        my $beg = $lookup{$markers[0]}; #lookup the marker position
        my $end = $lookup{$markers[-1]};
        my $length = $end - $beg +1;
        $blocks{"$beg-$end"}{'LENGTH'} = $length;
        $blocks{"$beg-$end"}{'COUNT'} = scalar @markers;
    }
    close $fh;
    (my $ldfile = $file) =~ s/GABRIELblocks/LD/;
    if (-e $ldfile){
        open $fh, "<", $ldfile or die "Cannot open file $ldfile\n";
        while (<$fh>){
            next if /^L1/;
            chomp;
            my ($L1, $L2, @rest) = split /\t/, $_;
            my $beg = $lookup{$L1}; #lookup the marker position
            my $end = $lookup{$L2};
            $LD{$chr}{$beg}{$end} = join("\t", @rest);
        }
    }
}

open my $fh, ">", "$chr.blocks.sizes" or die "Cannot open $chr.blocks.sizes";
foreach my $location (nsort keys %blocks){
    say $fh join("\t", "$chr:$location", $blocks{$location}{'LENGTH'}, $blocks{$location}{'COUNT'});
}
open my $ofh, ">", "$chr.LD" or die "Cannot open file $chr.LD\n";
say $ofh "L1\tL2\tDprime\tLOD\trsq\tCIlow\tCIhi\tTint";
foreach my $chr (nsort keys %LD){
    foreach my $L1 (nsort keys %{$LD{$chr}}){
        foreach my $L2 (nsort keys %{$LD{$chr}{$L1}}){
            say $ofh join("\t", $L1, $L2, $LD{$chr}{$L1}{$L2});
        }
    }
}
