From dd4a3e4dc4f2496cffedfff02dcd23dd7a81f391 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@ed.ac.uk>
Date: Wed, 17 Nov 2021 13:50:15 +0000
Subject: [PATCH] Initial investigation of strand bias metrics

---
 docs/run-notes/strand_bias.md | 126 ++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 docs/run-notes/strand_bias.md

diff --git a/docs/run-notes/strand_bias.md b/docs/run-notes/strand_bias.md
new file mode 100644
index 0000000..0f98f87
--- /dev/null
+++ b/docs/run-notes/strand_bias.md
@@ -0,0 +1,126 @@
+# Strand bias metrics in samples sequenced to 2021-11-01
+
+## Parsing out data
+
+Working in `/home/u035/u035/ameynert`.
+
+`extract_strand_bias_metrics.sh`
+
+```
+#!/usr/bin/bash
+
+FOLDER=$1
+OUTPUT=$2
+
+cd $FOLDER/DECIPHER
+NAME=`basename $FOLDER`
+
+grep -v 'Internal' *.csv | cut -f 1-3 -d ',' | sed -e 's/_DEC_FLT.csv:/,/' | cut -f 1,3,4 -d ',' > $OUTPUT/$NAME.sites.csv
+cd $OUTPUT
+
+cut -f 1 -d ',' $NAME.sites.csv | sort -u > $NAME.indivs.txt
+
+cd $FOLDER/VCF
+
+for indiv in `cat $OUTPUT/$NAME.indivs.txt`
+do
+    if [ ! -e *.ready.$indiv.vcf.gz.tbi ]
+    then
+	tabix *.ready.$indiv.vcf.gz
+    fi
+done
+
+count=`wc -l $OUTPUT/$NAME.sites.csv | awk '{ print $1 }'`
+
+for ((i = 1; i <= $count; i = i + 1))
+do
+    indiv=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 1`
+    chr=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 2`
+    pos=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 3`
+
+    res=`bcftools view *.ready.$indiv.vcf.gz chr$chr:$pos | bcftools query -f "%CHROM,%POS,%REF,%ALT,%INFO/FS,%INFO/SOR\n"`
+    echo $indiv,$res >> $OUTPUT/$NAME.strandbias.csv
+done
+```
+
+```
+ls /home/u035/u035/shared/results/*/prioritization/* | grep ':' | sed -e 's/\://' > folders.txt
+for folder in `cat folders.txt`
+do
+  ./extract_strand_bias_metrics.sh $folder /home/u035/u035/ameynert
+done
+
+cat *.strandbias.csv | sort -u | awk '{ print $1 }' > strandbias.csv
+```
+
+Clean up `strandbias.csv` manually with emacs.
+
+```
+cut -f 1 -d ',' strandbias.csv | sort -u > indivs.txt
+
+cd ../shared/results
+cat 1*/params/*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/novaseq_families.txt
+cat 200*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/nextseq_families.txt
+cat 201[02]*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt 
+cat 21*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt 
+
+cd
+grep -f nextseq_families.txt strandbias.csv > strandbias_nextseq.csv
+grep -f novaseq_families.txt strandbias.csv > strandbias_novaseq.csv
+```
+
+## Plots 
+
+```
+library(lattice)
+
+x = read.table("strandbias_nextseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
+x$sequencer = "NextSeq 2000"
+
+y = read.table("strandbias_novaseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
+y$sequencer = "NovaSeq 6000"
+
+x = rbind(x, y)
+x$sequencer = as.factor(x$sequencer)
+
+png("plots/runs_by_sequencer_strand_odds_ratio_histogram.png", width=600, height=800)
+histogram(~log10(sor) | sequencer, x, breaks=seq(-2.5,1.1,0.02), xlab="log10(Strand odds ratio)",  main="Trio/family whole exome candidate variants", layout=c(1,2))
+dev.off()
+
+png("plots/runs_by_sequencer_fisher_strand_test_histogram.png", width=600, height=800)
+histogram(~log10(fs)  | sequencer, x, breaks=seq(-1,3,0.02),     xlab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", layout=c(1,2))
+dev.off()
+
+my.key=list(points=list(pch=c(1,2), col=c("blue", "orange")), text=list(labels=levels(x$sequencer)))
+my.panel = function(x, y, ...) { panel.xyplot(x, y, ...); panel.abline(v=log10(3)); panel.abline(h = log10(60)) }
+
+png("plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png", width=600, height=600)
+xyplot(log10(fs) ~ log10(sor), x, groups=c(sequencer), xlab="log10(Strand odds ratio)", ylab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", key=my.key, pch=c(1,2), col=c("blue", "orange"), panel=my.panel)
+dev.off()
+
+y = subset(x, x$sor >= 3 | x$fs >= 60)
+length(x$sor)
+[1] 2313
+length(y$sor)
+[1] 888
+
+length(y$sor) / length(x$sor)
+[1] 0.383917
+
+ x$count = 1
+aggregate(x$count, by = list(x$sequencer), sum)
+       Group.1    x
+1 NovaSeq 6000 1868
+2 NextSeq 2000  445
+
+aggregate(y$count, by = list(y$sequencer), sum)
+       Group.1   x
+1 NovaSeq 6000 524
+2 NextSeq 2000 364
+```
+
+![Strand odds ratio histogram by sequencer](plots/runs_by_sequencer_strand_odds_ratio_histogram.png)
+
+![Fisher strand test histogram by sequencer](plots/runs_by_sequencer_fisher_strand_test_ratio_histogram.png)
+
+![Strand odds ratio vs Fisher strand test by sequencer](plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png)
-- 
GitLab