#!/bin/bash

## ANNOTATION_DIR is the folder holding your .bed files. It is only used to build $BED_FILES.
ANNOTATION_DIR="/lustre/scratch113/projects/uk10k/users/vi1/segmentations/data/NIH_25Marks"

## BED_FILES is the glob pattern selecting the .bed files to use. The pattern is
## stored unexpanded; it is expanded wherever $BED_FILES is used unquoted.
BED_FILES="${ANNOTATION_DIR}/*.bed.gz"

## OUTPUT_DIR receives all output. NOTE: use a different folder for every run,
## because the output file names are the same from run to run!!!

OUTPUT_DIR="/lustre/scratch113/projects/uk10k/users/vi1/segmentations/output_annotations"

## VARIANTS_DIR is the directory with the variants to be annotated. You do not need to change this.
VARIANTS_DIR="/lustre/scratch114/teams/soranzo/users/vi1/annotation_uk10k_variants"

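## NOTE: if any of your .bed files has a header line, the annotation loop below
## filters it out (only lines whose first column is "chr<chromosome>" are kept).
## The column count passed to the annotation tool is, however, read from the
## first line of each file, so a header must have as many columns as the data.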

## loop over chromosomes
## Builds $OUTPUT_DIR/chr<chr> for every chromosome: a copy of the variant info
## file with one extra column pasted on per (bed file, state) pair. Columns are
## added file by file and, within each file, for states 1..25 in order, which is
## the same order in which link_file.txt numbers its Index column.
for chr in {1..22} 'X'
do

	INFO=$VARIANTS_DIR/chr${chr}_CSQ_2cols.txt

	## Seed the per-chromosome output ONCE. Re-copying it for every state would
	## throw away the columns already pasted for the earlier states and leave
	## only the state-25 annotations in the final file.
	cp "$INFO" "$OUTPUT_DIR/chr${chr}"

	## loop over annotation files ($BED_FILES is left unquoted on purpose so that
	## the glob pattern it holds is expanded here)
	for f in $BED_FILES
	do

		## number of columns in the .bed file, read from its first line; it does
		## not depend on the state, so it is computed once per file
		NCOL=$(zcat "$f" | head -1 | awk '{print NF}')

		## loop over states
		for state in {1..25}
		do

			## make a local copy holding only this chromosome and this state
			## (which also removes possible headers), sorted numerically on column 2
			zcat "$f" | awk -v chr="$chr" -v state="$state" '$1=="chr"chr && $4==state {print}' | sort -k2n > "$OUTPUT_DIR/tmp.bed"

			echo "Processing $f file"
			## running annotation part
			/nfs/users/nfs_v/vi1/annotation_code/annotation_code_v4 --ncol "$NCOL" --o "$OUTPUT_DIR/tmp_chr${chr}" --peaks "$OUTPUT_DIR/tmp.bed" --norsid --chunk 1000 --info "$INFO"
			## merge: append column 4 of the tool output as a new column
			cp "$OUTPUT_DIR/chr${chr}" "$OUTPUT_DIR/tmp0_chr${chr}"
			paste "$OUTPUT_DIR/tmp0_chr${chr}" <(awk '{print $4}' "$OUTPUT_DIR/tmp_chr${chr}") > "$OUTPUT_DIR/chr${chr}"

			## clean up
			rm "$OUTPUT_DIR/tmp0_chr${chr}"
			rm "$OUTPUT_DIR/tmp_chr${chr}"
			rm "$OUTPUT_DIR/tmp.bed"
		done
	done
done





## create link_file.txt (needed for running GARFIELD)
## One header line, then one row per (bed file, state) pair: the bed file path
## goes in the Annotation column, the state number in the Category column, and
## Index counts up from 0 with the bed file as the outer loop.
LINK_FILE=$OUTPUT_DIR/link_file.txt
echo "Index Annotation Celltype Tissue Type Category" > $LINK_FILE
## earlier approach, kept for reference:
#head -1 $OUTPUT_DIR/chr22 | awk '{$1=$2=""; print $0}' | sed 's/ /\n/g' | sed 1d | sed 1d | awk '{print NR-1, $1, "NA","NA","NA", "NA"}' >> $LINK_FILE
i=-1
for bed_file in $BED_FILES
do
	for state in {1..25}
	do
		i=$((i + 1))
		echo $i $bed_file "NA" "NA" "NA" $state >> $LINK_FILE
	done
done

## reformat data for GARFIELD usage
## Every $OUTPUT_DIR/chr* file is rewritten in place: the first line is dropped,
## and each remaining line becomes "<column 2> <columns 3..N joined with no
## separator>".
for chrom_file in $OUTPUT_DIR/chr*
do
	scratch=$chrom_file.tmp
	cat $chrom_file > $scratch
	paste -d" " \
		<(awk 'NR > 1 {print $2}' $scratch) \
		<(awk 'NR > 1 {$1 = $2 = ""; rest = $0; gsub(/[ \t]/, "", rest); print rest}' $scratch) \
		> $chrom_file
	rm $scratch
done

#bsub -M 1000 -R'rusage[mem=1000] select[mem>1000]' -o /lustre/scratch113/projects/uk10k/users/vi1/segmentations/scripts/log -- /lustre/scratch113/projects/uk10k/users/vi1/segmentations/scripts/annotate_script.sh

