#!/bin/bash

### GWAS build conversion pipeline 


# --- Inputs and output locations ---------------------------------------------
# $1: file name of the GWAS summary table, resolved relative to $INDIR
#     (e.g. LDL_ONE_Europeans.tbl).
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <gwas_table_filename>\n' "${0##*/}" >&2
  exit 2
fi

infile=$1  #LDL_ONE_Europeans.tbl
INDIR=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/Sept2016
IN=$INDIR/$infile
if [[ ! -r "$IN" ]]; then
  printf 'Error: cannot read input file: %s\n' "$IN" >&2
  exit 1
fi

# 1-based index of the column in $IN that holds the rsID.
rsidcol=1

# Every product of this script is written to <input>.dir/<input>.<suffix>.
OUTDIR=$IN.dir
mkdir -p "$OUTDIR" || exit 1

OUT=$OUTDIR/$infile

# Save the header of the input as a single tab-separated line. awk re-joins
# the whitespace-separated fields itself, so header tokens are never exposed
# to shell word splitting or glob expansion.
head -n 1 "$IN" | awk -v OFS='\t' '{ $1 = $1; print }' > "$OUT.header"

#DB=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/snp130.txt.gz # UCSC build 36
#awk -v OFS="\t" -v col=$rsidcol 'NR==FNR {a[$col]=$col;b[$col]=$0;next;} $5 in a {print $0, b[$5]}' $IN <(zcat $DB) > $OUT.ucsc.hg18

#DB2=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/snp147.txt.gz
#awk -v OFS="\t" -v col=$rsidcol 'NR==FNR {a[$col]=$col;b[$col]=$0;next;} $5 in a {print $0, b[$5]}' $IN <(zcat $DB2) > $OUT.ucsc.hg19

# Direct lookup: for each autosome (1..22), scan the per-chromosome dbSNP VCF
# and emit every VCF line whose third field (the ID column in VCF) equals an
# rsID of the input table, followed by the matching input line.
# Result rows: <VCF line> TAB <input line>.
: > "$OUT.dbsnp.hg19"
for chr in {1..22}
do
  DB3=/lustre/scratch115/projects/hematopoiesis/Neu_Fxn/data/dbSNP/chroms/chr${chr}_20160601.vcf.gz
  # FILENAME == ARGV[1] identifies the first file reliably; the NR==FNR idiom
  # would treat the VCF stream as the lookup table if the first file were empty.
  awk -v OFS="\t" -v col="$rsidcol" '
    FILENAME == ARGV[1] { line[$col] = $0; next }
    $3 in line          { print $0, line[$3] }
  ' "$IN" <(zcat "$DB3") >> "$OUT.dbsnp.hg19"
  echo "$chr"   # progress indicator
done

echo "Point 1"
#DB4=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/Homo_sapiens.vcf.gz 
#awk -v OFS="\t" -v col=$rsidcol 'NR==FNR {a[$col]=$col;b[$col]=$0;next;} $3 in a {print $0,b[$3]}' $IN <(zcat $DB4) > $OUT.ensembl.hg19

# All rsIDs of the input table (header line skipped).
awk -v col="$rsidcol" 'FNR > 1 { print $col }' "$IN" > "$OUT.rsids"

# rsIDs of the input that did not get a direct dbSNP hit.
# The header is skipped explicitly (FNR > 1) rather than by dropping the first
# printed line, and FILENAME == ARGV[1] keeps the join correct even when
# $OUT.dbsnp.hg19 is empty (NR==FNR would then swallow the whole input and
# report no missing rsIDs at all).
awk -v col="$rsidcol" '
  FILENAME == ARGV[1]        { found[$3] = 1; next }
  FNR > 1 && !($col in found) { print $col }
' "$OUT.dbsnp.hg19" "$IN" > "$OUT.rsids.notindbsnp"

###
# Lookup tables; their first two fields are numeric IDs that get an "rs"
# prefix here. NOTE(review): presumably dbSNP's RsMergeArch / SNPHistory
# dumps -- confirm the column meaning against the dbSNP schema.
merge_arch=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/RsMergeArch.bcp.gz
snp_history=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/SNPHistory.bcp.gz

# Missing rsID matches field 1 of the merge table:
# output "<missing rsID> <rs + field 2> <field 3>".
awk '
  FILENAME == ARGV[1]  { missing[$1] = 1; next }
  ("rs" $1) in missing { print "rs" $1, "rs" $2, $3 }
' "$OUT.rsids.notindbsnp" <(zcat "$merge_arch") > "$OUT.rsids.notindbsnp.rsidmerged"

# Opposite direction: missing rsID matches field 2 of the merge table:
# output "<missing rsID> <rs + field 1> <field 3>".
awk '
  FILENAME == ARGV[1]  { missing[$1] = 1; next }
  ("rs" $2) in missing { print "rs" $2, "rs" $1, $3 }
' "$OUT.rsids.notindbsnp" <(zcat "$merge_arch") > "$OUT.rsids.notindbsnp.rsidmerged.opposite"

# Missing rsIDs that appear in field 1 of the SNP history table.
awk '
  FILENAME == ARGV[1]  { missing[$1] = 1; next }
  ("rs" $1) in missing { print "rs" $1 }
' "$OUT.rsids.notindbsnp" <(zcat "$snp_history") > "$OUT.rsids.notindbsnp.removed"

# Second lookup: search the per-chromosome dbSNP VCFs for the replacement IDs
# (field 2 of the .rsidmerged file). Result rows: <VCF line> TAB <merge row>.
: > "$OUT.rsids.notindbsnp.rsidmerged.hg19"
for chr in {1..22}
do
  DB3=/lustre/scratch115/projects/hematopoiesis/Neu_Fxn/data/dbSNP/chroms/chr${chr}_20160601.vcf.gz
  # FILENAME == ARGV[1] stays correct when the merge file is empty, where
  # NR==FNR would load the whole VCF into memory as if it were the lookup table.
  awk -v OFS="\t" '
    FILENAME == ARGV[1] { merged[$2] = $0; next }
    $3 in merged        { print $0, merged[$3] }
  ' "$OUT.rsids.notindbsnp.rsidmerged" <(zcat "$DB3") >> "$OUT.rsids.notindbsnp.rsidmerged.hg19"
  echo "$chr"   # progress indicator
done

# Re-attach the original input line: field 9 of each row above is the ID as it
# appears in the input (first field of the merge row). Output rows consist of
# fields 1-8 of the hit followed by the full input line.
awk -v OFS="\t" -v col="$rsidcol" '
  FILENAME == ARGV[1] { line[$col] = $0; next }
  $9 in line          { print $1, $2, $3, $4, $5, $6, $7, $8, line[$9] }
' "$IN" <(sed 's/ /\t/g' "$OUT.rsids.notindbsnp.rsidmerged.hg19") > "$OUT.rsids.notindbsnp.rsidmerged.hg19.formatted"

echo "Point 2"

# Same VCF lookup as for .rsidmerged, but for the "opposite direction" merge
# pairs (keyed on field 2 of the .opposite file).
: > "$OUT.rsids.notindbsnp.rsidmerged.hg19.opposite"
for chr in {1..22}
do
  DB3=/lustre/scratch115/projects/hematopoiesis/Neu_Fxn/data/dbSNP/chroms/chr${chr}_20160601.vcf.gz
  awk -v OFS="\t" '
    FILENAME == ARGV[1] { merged[$2] = $0; next }
    $3 in merged        { print $0, merged[$3] }
  ' "$OUT.rsids.notindbsnp.rsidmerged.opposite" <(zcat "$DB3") >> "$OUT.rsids.notindbsnp.rsidmerged.hg19.opposite"
  echo "$chr"   # progress indicator
done

# Second merge round ("v2"): take the replacement IDs (field 2 of .rsidmerged)
# that were NOT found in the VCFs and look them up in the merge table again.
# The inner anti-join uses FILENAME == ARGV[1] so that an empty
# .rsidmerged.hg19 (no replacement ID found at all) yields every replacement
# ID as a candidate; with NR==FNR it would yield none.
merge_arch=/lustre/scratch113/projects/uk10k/users/vi1/GWASdatasets/SNPs/RsMergeArch.bcp.gz
awk '
  FILENAME == ARGV[1]    { unresolved[$1] = 1; next }
  ("rs" $1) in unresolved { print "rs" $1, "rs" $2, $3 }
' <(
  awk '
    FILENAME == ARGV[1] { located[$3] = 1; next }
    !($2 in located)    { print $2 }
  ' "$OUT.rsids.notindbsnp.rsidmerged.hg19" "$OUT.rsids.notindbsnp.rsidmerged"
) <(zcat "$merge_arch") > "$OUT.rsids.notindbsnp.rsidmerged.v2"

# VCF lookup for the second-round replacement IDs (field 2 of the .v2 file).
: > "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2"
for chr in {1..22}
do
  DB3=/lustre/scratch115/projects/hematopoiesis/Neu_Fxn/data/dbSNP/chroms/chr${chr}_20160601.vcf.gz
  awk -v OFS="\t" '
    FILENAME == ARGV[1] { merged[$2] = $0; next }
    $3 in merged        { print $0, merged[$3] }
  ' "$OUT.rsids.notindbsnp.rsidmerged.v2" <(zcat "$DB3") >> "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2"
  echo "$chr"   # progress indicator
done

echo "Point 3"

# Re-attach the original input line to the second-round ("v2") hits.
# Three nested joins, innermost first:
#   1. .v2 x .rsidmerged   -> "<ID as in input> <second-round ID>" pairs
#                             (links field 1 of .v2 to field 2 of .rsidmerged).
#   2. pairs x .hg19.v2    -> fields 1-8 of each hit + the pair, matched on
#                             field 10 of the hit (the second-round ID).
#   3. $IN x result of 2   -> fields 1-8 of each hit + the full input line,
#                             matched on field 9 (the ID as in the input).
awk -v OFS="\t" -v col="$rsidcol" '
  FILENAME == ARGV[1] { line[$col] = $0; next }
  $9 in line          { print $1, $2, $3, $4, $5, $6, $7, $8, line[$9] }
' "$IN" <(
  awk -v OFS="\t" '
    FILENAME == ARGV[1] { pair[$2] = $0; next }
    $10 in pair         { print $1, $2, $3, $4, $5, $6, $7, $8, pair[$10] }
  ' <(
    awk '
      FILENAME == ARGV[1] { second[$1] = $2; next }
      $2 in second        { print $1, second[$2] }
    ' "$OUT.rsids.notindbsnp.rsidmerged.v2" "$OUT.rsids.notindbsnp.rsidmerged"
  ) <(sed 's/ /\t/g' "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2")
) > "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2.formatted"


echo "Point 4"

# Header: five hg19 column names followed by the input header, tab-separated.
# The input header is split by awk, so its tokens are never subject to shell
# globbing or word splitting.
# NOTE(review): the data rows appended below keep every field of the matched
# VCF line (the commented-out variant selected $1-$5 and $9-$16 only), so the
# header may have fewer columns than the data -- confirm with consumers before
# changing either side.
{
  printf 'chr.hg19\tpos.hg19\trsid.hg19\tref.hg19\talt.hg19'
  head -n 1 "$IN" | awk '{ for (i = 1; i <= NF; i++) printf "\t%s", $i }'
  printf '\n'
} > "$OUT.hg19"
#cat $OUT.dbsnp.hg19 $OUT.rsids.notindbsnp.rsidmerged.hg19.formatted $OUT.rsids.notindbsnp.rsidmerged.hg19.v2.formatted|awk '{print $1,$2,$3,$4,$5,$9,$10,$11,$12,$13,$14,$15,$16}'| sed 's/ /\t/g' | sort| uniq >> $OUT.hg19

# Union of the three mapping results, spaces normalised to tabs, exact
# duplicate rows removed.
cat "$OUT.dbsnp.hg19" \
    "$OUT.rsids.notindbsnp.rsidmerged.hg19.formatted" \
    "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2.formatted" \
  | sed 's/ /\t/g' | sort | uniq >> "$OUT.hg19"

# Field-3 values that occur on more than one data row, and the rows themselves.
awk 'FNR > 1 { print $3 }' "$OUT.hg19" | sort | uniq -d > "$OUT.duplicateIDs"
awk '
  FILENAME == ARGV[1] { dup[$1] = 1; next }
  $3 in dup           { print $0 }
' "$OUT.duplicateIDs" "$OUT.hg19" > "$OUT.duplicateIDs.hg19"

echo "Point 5"

# Build $OUT.hg19.clean with a priority order: direct dbSNP hits first, then
# first-round merged hits whose field 3 is not among the direct hits, then
# second-round hits whose field 3 is not yet in the file.
# NOTE(review): as in $OUT.hg19, the header names five hg19 columns while the
# data rows keep every field of the VCF line -- confirm alignment.
{
  printf 'chr.hg19\tpos.hg19\trsid.hg19\tref.hg19\talt.hg19'
  head -n 1 "$IN" | awk '{ for (i = 1; i <= NF; i++) printf "\t%s", $i }'
  printf '\n'
} > "$OUT.hg19.clean"
cat "$OUT.dbsnp.hg19" >> "$OUT.hg19.clean"

# FILENAME == ARGV[1] keeps this anti-join correct when there are no direct
# hits; with NR==FNR an empty first file would drop every merged row.
awk '
  FILENAME == ARGV[1] { seen[$3] = 1; next }
  !($3 in seen)       { print $0 }
' "$OUT.dbsnp.hg19" "$OUT.rsids.notindbsnp.rsidmerged.hg19.formatted" >> "$OUT.hg19.clean"

# Written to a scratch file first because the file being extended is also the
# one being read.
awk '
  FILENAME == ARGV[1] { seen[$3] = 1; next }
  !($3 in seen)       { print $0 }
' "$OUT.hg19.clean" "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2.formatted" > "$OUT.hg19.clean.0"
cat "$OUT.hg19.clean.0" >> "$OUT.hg19.clean"
#awk '{print $1,$2,$3,$4,$5,$9,$10,$11,$12,$13,$14,$15,$16}' $OUT.hg19.clean| sed 's/ /\t/g' > $OUT.hg19.clean.0

# Normalise remaining spaces to tabs, then replace the file.
sed 's/ /\t/g' "$OUT.hg19.clean" > "$OUT.hg19.clean.0"
mv "$OUT.hg19.clean.0" "$OUT.hg19.clean"

echo "Point 6"
### Finally, count how many variants each mapping step produced.
# The anti-join counters (wc10, wc11, wc12, wc15) use FILENAME == ARGV[1]
# instead of NR==FNR so that an empty first file gives the real count rather
# than 0.

wc1=$(wc -l < "$OUT.rsids")
#wc2=$(awk '{print $5}' $OUT.ucsc.hg18 |sort|uniq|wc -l)
#wc3=$(awk '{print $5}' $OUT.ucsc.hg19 |sort|uniq|wc -l)
wc4=$(awk '{print $3}' "$OUT.dbsnp.hg19" | sort | uniq | wc -l)
#wc5=$(awk '{print $3}' $OUT.ensembl.hg19 |sort|uniq|wc -l)
wc6=$(wc -l < "$OUT.rsids.notindbsnp")
wc7from=$(awk '{print $1}' "$OUT.rsids.notindbsnp.rsidmerged" | sort | uniq | wc -l)
wc7to=$(awk '{print $2}' "$OUT.rsids.notindbsnp.rsidmerged" | sort | uniq | wc -l)
wc8=$(awk '{print $1}' "$OUT.rsids.notindbsnp.removed" | sort | uniq | wc -l)
wc9=$(awk '{print $3}' "$OUT.rsids.notindbsnp.rsidmerged.hg19" | sort | uniq | wc -l)
# Missing rsIDs that have no entry in the merge table.
wc10=$(awk '
  FILENAME == ARGV[1] { a[$1] = 1; next }
  !($1 in a)          { print $1 }
' "$OUT.rsids.notindbsnp.rsidmerged" "$OUT.rsids.notindbsnp" | wc -l)
# Removed rsIDs that have no entry in the merge table.
wc11=$(awk '
  FILENAME == ARGV[1] { a[$1] = 1; next }
  !($1 in a)          { print $1 }
' "$OUT.rsids.notindbsnp.rsidmerged" "$OUT.rsids.notindbsnp.removed" | sort | uniq | wc -l)
# First-round replacement IDs that were not found in the VCFs.
wc12=$(awk '
  FILENAME == ARGV[1] { a[$3] = 1; next }
  !($2 in a)          { print $2 }
' "$OUT.rsids.notindbsnp.rsidmerged.hg19" "$OUT.rsids.notindbsnp.rsidmerged" | wc -l)
wc13=$(awk '{print $2}' "$OUT.rsids.notindbsnp.rsidmerged.v2" | sort | uniq | wc -l)
wc14=$(awk '{print $3}' "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2" | sort | uniq | wc -l)
# Second-round replacement IDs that were not found in the VCFs.
wc15=$(awk '
  FILENAME == ARGV[1] { a[$3] = 1; next }
  !($2 in a)          { print $2 }
' "$OUT.rsids.notindbsnp.rsidmerged.hg19.v2" "$OUT.rsids.notindbsnp.rsidmerged.v2" | wc -l)
wc16=$(awk 'FNR > 1 { print $3 }' "$OUT.hg19.clean" | sort | uniq | wc -l)
wc17=$(awk 'FNR > 1 { print $3 }' "$OUT.hg19" | sort | uniq | wc -l)
wc18=$(sed 1d "$OUT.hg19.clean" | sort | uniq | wc -l)
wc19=$(wc -l < "$OUT.duplicateIDs")

# NOTE(review): wc16, wc17 and wc18 already exclude the header line, yet the
# last three counters below subtract one more. Kept exactly as in the original;
# confirm whether the extra -1 is intentional.
{
  echo "Variants $wc1"
  #echo "UCSC.hg18 $wc2" >> $OUT.summary
  #echo "UCSC.hg19 $wc3" >> $OUT.summary
  #echo "Ensembl.hg19 $wc5" >> $OUT.summary
  echo "dbSNP.hg19 $wc4"
  echo "Not.in.dbSNP $wc6"
  echo "Not.in.dbSNP.without.mergedIDs $wc10"
  echo "Not.in.dbSNP.mergedIDs.from $wc7from"
  echo "Not.in.dbSNP.mergedIDs.to $wc7to"
  echo "Not.in.dbSNP.mergedIDs.hg19 $wc9"
  echo "Not.in.dbSNP.with.mergedIDs.without.coordinates $wc12"
  echo "Not.in.dbSNP.with.mergedIDs.without.coordinates.mergedIDs $wc13"
  echo "Not.in.dbSNP.with.mergedIDs.without.coordinates.mergedIDs.hg19 $wc14"
  echo "Not.in.dbSNP.with.mergedIDs.without.coordinates.mergedIDs.without.coordinates $wc15"
  echo "Not.in.dbSNP.removed $wc8"
  echo "Not.in.dbSNP.removed.and.without.mergedIDs $wc11"
  echo "rsids $(( wc17 - 1 ))"
  echo "Unique.rsids $(( wc16 - 1 ))"
  echo "Unique.rsids.file $(( wc18 - 1 ))"
  echo "Rsids.with.multiple.pvals $wc19"
} > "$OUT.summary"

echo "Point 7 done"