#!/usr/bin/perl

# This file gives an example how to access and manipulate prank XML alignment
# files. The code given here is not professional and may contain bugs. In this
# example case it seems to work fine. The code can be freely copied and used
# elsewhere.

# See the documentation for libXML to learn more of the XML module.
# A tutorial is available at http://www.xml.com/pub/a/2001/11/14/xml-libxml.html

use XML::LibXML;

# Node.pm is a small module for manipulating the tree.
# Some functions are used and explained here, the rest should be self-explanatory

use Node;

# Here we call the XML module and read the file '20prim.xml'.

my $parser = XML::LibXML->new();
my $tree = $parser->parse_file("20prim.xml");
my $root = $tree->getDocumentElement;

# We get the newick tree.. 

$newick = ${$root->getElementsByTagName('newick')}[0]->getFirstChild->getData;

# .. the root node using Node.pm

my $rootNode = Node->new();
$rootNode = $rootNode->parseTree($newick);

# The tree is using id's, so we need a mapping between id's and real names.
# This mapping is stored as '%nameToLeaf' hash. Getting and storing the sequence 
# data is also trivial.

my %nameToLeaf = ();
my %seqsByName = ();
foreach my $lid (@{$root->getElementsByTagName('leaf')}) {
	$nameToLeaf{$lid->getAttribute('name')} = $lid->getAttribute('id');
	my $seq = $lid->findvalue('sequence');
	$seq =~ s/\s//g;
	$seqsByName{$lid->getAttribute('name')} = $seq;
}

# Posterior probabilities are named according to the nodes id and the process id.
# We store them in a hash ($postprob{$node}{$prob}) as raw data (string ot numbers).

%postprob = ();
foreach my $nid (@{$root->getElementsByTagName('node')}) {
	my $node = $nid->getAttribute('id');
	foreach my $pid (@{$nid->getElementsByTagName('probability')}) {
		my $prob = $pid->getAttribute('id');
		my $data = $pid->getFirstChild->getData;
		$data =~ s/\s//g;
		$postprob{$node}{$prob} = $data;
	}
}

# In order to use the human-readable process names, we need the mapping between
# these names and the model state id's.

%nameToState = ();
foreach my $sid (${$root->getElementsByTagName('model')}[0]->getElementsByTagName('probability')) {
	$nameToState{$sid->getAttribute('name')} = $sid->getAttribute('id');
}

# Let's define a group of nodes that end either with 'H.sapiens', 'P.paniscus' or 
# 'P.troglodytes' and then convert this to an array of sequence id's.

my @ingroup = grep /H.sapiens$/,keys %nameToLeaf;
push @ingroup, grep /P.paniscus$/,keys %nameToLeaf;
push @ingroup, grep /P.troglodytes$/,keys %nameToLeaf;

my @ingroupids = ();
foreach my $name(@ingroup) {
	push @ingroupids,$nameToLeaf{$name};
}

# Other functions for the manipulation of the tree.

#print join " ","rootNode->nodeNames",$rootNode->nodeNames,"\n";
#print join " ","rootNode->leafNames",$rootNode->leafNames,"\n";

#print "rootNode->treeDepth ",$rootNode->treeDepth,"\n"; 
#print "rootNode->leafNumber ",$rootNode->leafNumber,"\n";

# 'maxIngroup' gives the internal node that maximises the number of child
# nodes without including any nodes not mentioned on the list (it may not contain
# all the names in the list!). Here, we find the ancestor node for human and chimp
# sequences. The return values are the maximum number of leaf nodes included and 
# the name of the ancestor node maximising that.

(my $leafs,my $node) = $rootNode->maxIngroup(0,0,@ingroupids);
# print "rootNode->maxIngroup $leafs $node \n";

# One can do the same with names NOT included in the selection.

#($leafs,$node) = $rootNode->maxNotOutgroup(0,0,@ingroupids);


# As an example, we can find the sites for which the support at the nodes in the 
# human-chimp subtree is below 90%. To do that, we select 'postprob' for each 
# ancestral node and split the string into sitewise support values. First we call
# another function that return a list of nodes below the given node.

my @nodes = $rootNode->nodeNamesBelow($node,0);

my @sites = ();
my $threshold = 50;
foreach my $node(@nodes) {
	my @score = split /,/,$postprob{$node}{$nameToState{"postprob"}};
	for(my $i=0;$i<=$#score;$i++) {
		if($score[$i]<$threshold && $score[$i]>=0) {
			unless(grep /^$i$/,@sites){
				push @sites,$i;
			}
		}
	}
}

# Print the selected sites nicely (add 1 to correct for real numbering)...

print join " ","Sites in human-chimp subtree with support < $threshold%:\n";

@sites = sort numerically @sites;
for(my $i=0;$i<=$#sites;$i++) {
	if($i==0 || $sites[$i-1]+1<$sites[$i]){
		print " ",$sites[$i]+1;
		if(($i==0 || $sites[$i-1]+1<$sites[$i]) && $sites[$i]+1==$sites[$i+1]){
			print "-";
		}
		next;
	}
	if($sites[$i]+1<$sites[$i+1]){
		print $sites[$i]+1;
	}
}
print $sites[$#sites]+1,"\n";

# ..and print the sequence sites corresponding to the coordinates.

foreach my $name(@ingroup) {
	my $seq = $seqsByName{$name};
	my $p = -1;
	for(my $i=0;$i<=$#sites;$i++) {
		print " " if($p+1<$sites[$i]);
		$p = $sites[$i];
		print substr($seq,$sites[$i],1);
	}
	print "\n";
}


sub numerically { $a <=> $b; }