diff --git a/docs/run-notes/strand_bias.md b/docs/run-notes/strand_bias.md new file mode 100644 index 0000000000000000000000000000000000000000..0f98f875f66e4bfe9f8b9a9dc052476f12fe8e6e --- /dev/null +++ b/docs/run-notes/strand_bias.md @@ -0,0 +1,126 @@ +# Strand bias metrics in samples sequenced to 2021-11-01 + +## Parsing out data + +Working in `/home/u035/u035/ameynert`. + +`extract_strand_bias_metrics.sh` + +``` +#!/usr/bin/bash + +FOLDER=$1 +OUTPUT=$2 + +cd $FOLDER/DECIPHER +NAME=`basename $FOLDER` + +grep -v 'Internal' *.csv | cut -f 1-3 -d ',' | sed -e 's/_DEC_FLT.csv:/,/' | cut -f 1,3,4 -d ',' > $OUTPUT/$NAME.sites.csv +cd $OUTPUT + +cut -f 1 -d ',' $NAME.sites.csv | sort -u > $NAME.indivs.txt + +cd $FOLDER/VCF + +for indiv in `cat $OUTPUT/$NAME.indivs.txt` +do + if [ ! -e *.ready.$indiv.vcf.gz.tbi ] + then + tabix *.ready.$indiv.vcf.gz + fi +done + +count=`wc -l $OUTPUT/$NAME.sites.csv | awk '{ print $1 }'` + +for ((i = 1; i <= $count; i = i + 1)) +do + indiv=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 1` + chr=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 2` + pos=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 3` + + res=`bcftools view *.ready.$indiv.vcf.gz chr$chr:$pos | bcftools query -f "%CHROM,%POS,%REF,%ALT,%INFO/FS,%INFO/SOR\n"` + echo $indiv,$res >> $OUTPUT/$NAME.strandbias.csv +done +``` + +``` +ls /home/u035/u035/shared/results/*/prioritization/* | grep ':' | sed -e 's/\://' > folders.txt +for folder in `cat folders.txt` +do + ./extract_strand_bias_metrics.sh $folder /home/u035/u035/ameynert +done + +cat *.strandbias.csv | sort -u | awk '{ print $1 }' > strandbias.csv +``` + +Clean up `strandbias.csv` manually with emacs. + +``` +cut -f 1 -d ',' strandbias.csv | sort -u > indivs.txt + +cd ../shared/results +cat 1*/params/*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/novaseq_families.txt +cat 200*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/nextseq_families.txt +cat 201[02]*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt +cat 21*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt + +cd +grep -f nextseq_families.txt strandbias.csv > strandbias_nextseq.csv +grep -f novaseq_families.txt strandbias.csv > strandbias_novaseq.csv +``` + +## Plots + +``` +library(lattice) + +x = read.table("strandbias_nextseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",") +x$sequencer = "NextSeq 2000" + +y = read.table("strandbias_novaseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",") +y$sequencer = "NovaSeq 6000" + +x = rbind(x, y) +x$sequencer = as.factor(x$sequencer) + +png("plots/runs_by_sequencer_strand_odds_ratio_histogram.png", width=600, height=800) +histogram(~log10(sor) | sequencer, x, breaks=seq(-2.5,1.1,0.02), xlab="log10(Strand odds ratio)", main="Trio/family whole exome candidate variants", layout=c(1,2)) +dev.off() + +png("plots/runs_by_sequencer_fisher_strand_test_histogram.png", width=600, height=800) +histogram(~log10(fs) | sequencer, x, breaks=seq(-1,3,0.02), xlab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", layout=c(1,2)) +dev.off() + +my.key=list(points=list(pch=c(1,2), col=c("blue", "orange")), text=list(labels=levels(x$sequencer))) +my.panel = function(x, y, ...) { panel.xyplot(x, y, ...); panel.abline(v=log10(3)); panel.abline(h = log10(60)) } + +png("plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png", width=600, height=600) +xyplot(log10(fs) ~ log10(sor), x, groups=c(sequencer), xlab="log10(Strand odds ratio)", ylab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", key=my.key, pch=c(1,2), col=c("blue", "orange"), panel=my.panel) +dev.off() + +y = subset(x, x$sor >= 3 | x$fs >= 60) +length(x$sor) +[1] 2313 +length(y$sor) +[1] 888 + +length(y$sor) / length(x$sor) +[1] 0.383917 + + x$count = 1 +aggregate(x$count, by = list(x$sequencer), sum) + Group.1 x +1 NovaSeq 6000 1868 +2 NextSeq 2000 445 + +aggregate(y$count, by = list(y$sequencer), sum) + Group.1 x +1 NovaSeq 6000 524 +2 NextSeq 2000 364 +``` + + + + + +