#!/bin/bash
#
# Annotates the variants of one chromosome with a set of peak files and
# reformats the result for GARFIELD. The chromosome is taken from the LSF
# job-array index ($LSB_JOBINDEX), so the script is meant to be submitted as
# a job array (see the bsub line at the end of this file).

## ANNOTATION_DIR should point to the folder that your .bed files are in. It is only used to build $BED_FILES.
ANNOTATION_DIR=/lustre/scratch119/humgen/projects/uk10k/users/vi1/histone_modifications/newdata/H3K4me3

## BED_FILES is the glob pattern matching the .bed files you want to use.
## It is stored unexpanded on purpose: the loops further down expand it by using $BED_FILES unquoted.
BED_FILES=$ANNOTATION_DIR/*.narrowPeak.gz

## OUTPUT_DIR is where the output will appear. NOTE: use a different folder for every run, because the names of the output files are always the same!!!

OUTPUT_DIR=/lustre/scratch119/humgen/projects/uk10k/users/vi1/histone_modifications/newdata/H3K4me3_garfield_narrowPeak
## stop right away if the output folder cannot be created; every later step writes into it
mkdir -p "$OUTPUT_DIR" || { echo "ERROR: cannot create output directory $OUTPUT_DIR" >&2; exit 1; }
## VARIANTS_DIR points to the directory where the variants to be annotated are. You do not need to change this.
##### UK10K ## the trailing comment holds the alternative path for uk10k+1kg phase3 EUR - however, if that set is used, all annotations should be re-done
VARIANTS_DIR=/lustre/scratch114/teams/soranzo/users/vi1/uk10k/annotation_uk10k_variants #/lustre/scratch114/teams/soranzo/users/vi1/shared/uk10k+1kg_phase3/variants/snp

## Translate the LSF job-array index into a chromosome label:
## 1-22 are used as they are, 23 stands for chromosome X ("X" itself is also accepted).
## Anything else (including an unset index) aborts the job instead of
## carrying on with a chromosome name that matches no input file.
case "${LSB_JOBINDEX:-}" in
	[1-9]|1[0-9]|2[0-2])
		chr=${LSB_JOBINDEX}
		;;
	23|X)
		chr="X"
		;;
	*)
		echo "ERROR: LSB_JOBINDEX='${LSB_JOBINDEX:-}' is not a chromosome index (expected 1-23)" >&2
		exit 1
		;;
esac
## Annotate the variants of chromosome $chr with every peak file matched by $BED_FILES.
## $OUTPUT_DIR/chr$chr starts as a copy of the variant INFO file; every peak file
## then appends one column to it (column 4 of the annotation tool's output).
#edited info format.
INFO=$VARIANTS_DIR/chr${chr}_CSQ_2cols.txt
ANNOTATE=/nfs/users/nfs_v/vi1/annotation_code/annotation_code_v4
merged=$OUTPUT_DIR/chr${chr}           # growing result table
peaks=$OUTPUT_DIR/tmp.chr$chr.bed      # peaks of this chromosome from the current file
hits=$OUTPUT_DIR/tmp_chr${chr}         # output of the annotation tool
previous=$OUTPUT_DIR/tmp0_chr${chr}    # result table before the current column is added

cp "$INFO" "$merged" || { echo "ERROR: cannot copy $INFO to $merged" >&2; exit 1; }

## loop over annotation files
for f in $BED_FILES   # unquoted on purpose: BED_FILES holds a glob pattern
do
	## an unmatched glob is handed over as the literal pattern; there is nothing to annotate then
	if [[ ! -e "$f" ]]; then
		echo "ERROR: no peak file found for $f" >&2
		exit 1
	fi

	## number of columns in the .bed file, taken from its first line
	## NOTE(review): if a file starts with a header line, NCOL describes the header, not the data - confirm
	NCOL=$(zcat "$f" | head -1 | awk '{print NF}')
	## make a local copy holding only this chromosome's peaks, sorted numerically by column 2;
	## the exact match on column 1 also removes possible headers
	zcat "$f" | awk -v chr="$chr" '$1=="chr"chr {print}' | sort -k2,2n > "$peaks"

	echo "Processing $f file"
	## running annotation part (remove a stale output first so the check below cannot be fooled)
	rm -f "$hits"
	"$ANNOTATE" --ncol "$NCOL" --o "$hits" --peaks "$peaks" --norsid --chunk 1000 --info "$INFO"
	## without tool output the merge would add an empty column and shift every later annotation
	if [[ ! -s "$hits" ]]; then
		echo "ERROR: annotation produced no output for $f; aborting so that $merged is not corrupted" >&2
		exit 1
	fi

	## merge: append column 4 of the tool output as a new column of the result table
	mv "$merged" "$previous"
	paste "$previous" <(awk '{print $4}' "$hits") > "$merged"

	## clean up
	rm "$previous" "$hits" "$peaks"
done

## create link_file.txt (needed for running GARFIELD)
## Only the job whose chromosome is 22 writes it (presumably so that a job array produces it just once).
## Each peak file gets one line: a 0-based index, the file path, the two name
## parts returned by _link_fields, and the fixed Type/Category values.

## Print "<celltype> <tissue>" for a peak file path.
##   $1 - path to a peak file
## Works on the file name (last path component), whatever the directory depth:
## the text before the first '.', cut at its first '-', is the cell type;
## the text between the first and the second '.' is the tissue.
_link_fields() {
	local name=${1##*/}
	local celltype=${name%%.*}
	celltype=${celltype%%-*}
	local tissue=${name#*.}
	tissue=${tissue%%.*}
	printf '%s %s\n' "$celltype" "$tissue"
}

## string comparison: chr may be "X", which a numeric test rejects with an error
if [[ "$chr" == "22" ]]
then
	LINK_FILE=$OUTPUT_DIR/link_file.txt
	echo "Index Annotation Celltype Tissue Type Category" > "$LINK_FILE"
	## earlier variants, kept for reference:
	#head -1 $OUTPUT_DIR/chr22 | awk '{$1=$2=""; print $0}' | sed 's/ /\n/g' | sed 1d | sed 1d | awk '{print NR-1, $1, "NA","NA","NA", "NA"}' >> $LINK_FILE
	#for f in $BED_FILES;do i=$[$i+1]; echo $i $f "NA" "NA" "NA" "NA"  >> $LINK_FILE ; done
	#for f in $BED_FILES;do i=$[$i+1]; f2=$(echo $f| cut -d'/' -f 10) ; f3=$(echo $f2| cut -d'.' -f 2,4) ; echo $i $f $f3 "NA" "H3K4me3" "Histone_modifications"  >> $LINK_FILE ; done
	i=-1
	for f in $BED_FILES   # unquoted on purpose: BED_FILES holds a glob pattern
	do
		## skip the literal pattern that is left over when the glob matches nothing
		[[ -e "$f" ]] || continue
		i=$((i+1))
		echo "$i $f $(_link_fields "$f") H3K4me3 Histone_modifications" >> "$LINK_FILE"
	done

fi

## reformat data for GARFIELD usage
## Only the table of this job's chromosome is rewritten; the loop over all
## chr* files is disabled because every job handles a single chromosome.

## Convert a merged annotation table into the layout written for GARFIELD.
##   $@ - input file(s); standard input is read when none is given
## Drops the first line (presumably the header). For every other line prints
## column 2, one space, and then columns 3..NF glued together without any separator.
_garfield_reformat() {
	awk 'NR > 1 { pos = $2; $1 = $2 = ""; gsub(/[ \t]/, ""); print pos " " $0 }' "$@"
}

#for f in $OUTPUT_DIR/chr*
#do
f=$OUTPUT_DIR/chr${chr}

if [[ -f "$f" ]]; then
	## the table is rewritten in place, so work from a copy; the copy is left behind as $f.tmp
	cp "$f" "$f.tmp" && _garfield_reformat "$f.tmp" > "$f"
	#rm $f.tmp
else
	## do not create an empty result when there is nothing to reformat
	echo "ERROR: $f does not exist; nothing to reformat" >&2
fi
#done
#fi



## reformat data for GARFIELD usage
#for i in {1..22} 'X'
#do
#INFO=$VARIANTS_DIR/chr${i}_CSQ_2cols.txt
#f=$OUTPUT_DIR/$i/all
#	#cat $f > $f.tmp#
#	paste -d" " <(awk '{print $2}' $INFO) $f | awk '{ gsub("\t",""); print;}' | sed 1d > $OUTPUT_DIR/chr$i
#	
#echo $i
#done




#bsub -q normal -M 1000 -R'rusage[mem=1000] select[mem>1000]' -J'annotate[1-23]' -o logs/ann2.fix.%I --  /lustre/scratch119/humgen/projects/uk10k/users/vi1/histone_modifications/newdata/scripts/annotate_h3k4me3_bugfix.sh
