#!/bin/bash

## The LSF job-array index selects the chromosome; index 23 stands for chromosome X.
chr=${LSB_JOBINDEX:-}
## Compared as a string: with an unquoted integer test, an unset or empty
## index makes the test itself fail with a syntax error.
if [[ "$chr" == "23" ]]; then
	chr="X"
fi
## Folder containing your .bed files. It is only used to build $BED_FILES below.
ANNOTATION_DIR="/lustre/scratch119/humgen/projects/uk10k/users/vi1/FDR/bedfiles/"

## Glob pattern selecting the .bed files to use. It is stored unexpanded and
## only expands where $BED_FILES is later used unquoted.
BED_FILES="${ANNOTATION_DIR}/*.bed.gz"

## Folder holding the per-chromosome chr<N>_CSQ_2cols.txt input files.
VARIANTS_DIR="/lustre/scratch114/teams/soranzo/users/vi1/uk10k/annotation_uk10k_variants/"

## Folder receiving the generated files; created here if it does not exist yet.
OUTPUT_DIR="/lustre/scratch119/humgen/projects/uk10k/users/vi1/FDR/bedfiles/annotations/"
mkdir -p "${OUTPUT_DIR}"

	## Build $OUTPUT_DIR/chr<chr>: start from a copy of the per-chromosome
	## variant file and append one column per annotation (.bed.gz) file.
	INFO=$VARIANTS_DIR/chr${chr}_CSQ_2cols.txt
	merged=$OUTPUT_DIR/chr${chr}
	merged_prev=$OUTPUT_DIR/tmp0_chr${chr}
	tool_out=$OUTPUT_DIR/tmp_chr${chr}
	peaks_tmp=$OUTPUT_DIR/tmp.$chr.bed

	if ! cp "$INFO" "$merged"; then
		echo "ERROR: cannot copy $INFO to $merged" >&2
		exit 1
	fi

	## loop over annotation files
	## ($BED_FILES holds a glob pattern and is left unquoted so that it expands)
	# shellcheck disable=SC2086
	for f in $BED_FILES
	do
		## a glob that matches nothing is handed over as the literal pattern
		if [[ ! -e "$f" ]]; then
			echo "ERROR: no annotation file matches $f" >&2
			exit 1
		fi

		echo "Processing $f file"

		## number of columns in the .bed file (taken from its first line)
		NCOL=$(zcat "$f" | awk 'NR == 1 {print NF; exit}')
		## make a local copy removing possible headers: keep only the rows of
		## this chromosome, ordered numerically by the second column
		zcat "$f" | awk -v chr="$chr" '$1 == "chr"chr' | sort -k2,2n > "$peaks_tmp"

		## running annotation part
		## (any stale output is removed first so the check below only sees this run)
		rm -f "$tool_out"
		/nfs/users/nfs_v/vi1/annotation_code/annotation_code_v4 --ncol "$NCOL" --o "$tool_out" --peaks "$peaks_tmp" --norsid --chunk 1000 --info "$INFO"
		## NOTE(review): the exit-status convention of annotation_code_v4 is not
		## visible here, so success is judged by the presence of its output file.
		if [[ ! -s "$tool_out" ]]; then
			echo "ERROR: annotating chr${chr} with $f produced no output" >&2
			exit 1
		fi

		## merge: column 4 of the tool output becomes a new rightmost column.
		## The previous version is renamed rather than copied to avoid
		## duplicating the whole file on every iteration.
		mv "$merged" "$merged_prev"
		if ! paste "$merged_prev" <(awk '{print $4}' "$tool_out") > "$merged"; then
			echo "ERROR: cannot append the annotation from $f to $merged" >&2
			exit 1
		fi

		## clean up
		rm "$merged_prev" "$tool_out" "$peaks_tmp"
	done

### create link_file.txt (needed for running GARFIELD)

# write_link_file PATH
# Writes to PATH a header line followed by one row per file matched by
# $BED_FILES: a 0-based index, the file path, and four NA placeholders.
# Rows follow the glob expansion order of $BED_FILES.
write_link_file() {
	local dest=$1
	local idx=0
	local bed
	{
		echo "Index Annotation Celltype Tissue Type Category"
		# $BED_FILES holds a glob pattern; it is left unquoted so that it expands.
		# shellcheck disable=SC2086
		for bed in $BED_FILES; do
			printf '%s %s NA NA NA NA\n' "$idx" "$bed"
			idx=$((idx + 1))
		done
	} > "$dest"
}

# Only the chromosome 22 job writes the file. The comparison is done on
# strings because chr is "X" for job index 23, which an integer test
# rejects with "integer expression expected".
if [[ "$chr" == "22" ]]; then
	LINK_FILE=$OUTPUT_DIR/link_file.txt
	write_link_file "$LINK_FILE"
fi


## reformat data for GARFIELD usage

# reformat_for_garfield FILE
# Rewrites FILE in place: the first line is dropped and every other line
# becomes "<field 2> <fields 3..NF joined without any separator>".
# The result is written to FILE.tmp and renamed over FILE only on success,
# so a failure leaves FILE untouched. Returns non-zero on failure.
reformat_for_garfield() {
	local target=$1
	local scratch=$target.tmp

	if [[ ! -f "$target" ]]; then
		echo "ERROR: cannot reformat $target: no such file" >&2
		return 1
	fi

	# One awk pass replaces the separate extract / strip-separators / paste steps.
	if ! awk 'NR > 1 {
			packed = ""
			for (i = 3; i <= NF; i++) packed = packed $i
			print $2 " " packed
		}' "$target" > "$scratch"; then
		echo "ERROR: reformatting $target failed" >&2
		rm -f "$scratch"
		return 1
	fi

	mv "$scratch" "$target"
}

# No explicit exit on failure: when this is the last command to run, its
# status is what the script returns.
reformat_for_garfield "$OUTPUT_DIR/chr$chr"

## Example LSF submission: a 23-element job array, one job per chromosome (index 23 is mapped to X)
#bsub -J"ann[1-23]" -P uk10k -q long -M 2000 -R'rusage[mem=2000] select[mem>2000]' -o /lustre/scratch119/humgen/projects/uk10k/users/vi1/FDR/bedfiles/scripts/logs/log.fix.%I -- ./garfield_format_annotations_null.sh

