Skip to content
Snippets Groups Projects
Commit dd4a3e4d authored by ameyner2's avatar ameyner2
Browse files

Initial investigation of strand bias metrics

parent cf90a32a
No related branches found
No related tags found
1 merge request!3Ultra 2 SOP/doc updates
# Strand bias metrics in samples sequenced to 2021-11-01
## Parsing out data
Working in `/home/u035/u035/ameynert`.
`extract_strand_bias_metrics.sh`
```
#!/usr/bin/bash
FOLDER=$1
OUTPUT=$2
cd $FOLDER/DECIPHER
NAME=`basename $FOLDER`
grep -v 'Internal' *.csv | cut -f 1-3 -d ',' | sed -e 's/_DEC_FLT.csv:/,/' | cut -f 1,3,4 -d ',' > $OUTPUT/$NAME.sites.csv
cd $OUTPUT
cut -f 1 -d ',' $NAME.sites.csv | sort -u > $NAME.indivs.txt
cd $FOLDER/VCF
for indiv in `cat $OUTPUT/$NAME.indivs.txt`
do
if [ ! -e *.ready.$indiv.vcf.gz.tbi ]
then
tabix *.ready.$indiv.vcf.gz
fi
done
count=`wc -l $OUTPUT/$NAME.sites.csv | awk '{ print $1 }'`
for ((i = 1; i <= $count; i = i + 1))
do
indiv=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 1`
chr=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 2`
pos=`head -n $i $OUTPUT/$NAME.sites.csv | tail -n 1 | cut -d ',' -f 3`
res=`bcftools view *.ready.$indiv.vcf.gz chr$chr:$pos | bcftools query -f "%CHROM,%POS,%REF,%ALT,%INFO/FS,%INFO/SOR\n"`
echo $indiv,$res >> $OUTPUT/$NAME.strandbias.csv
done
```
```
ls /home/u035/u035/shared/results/*/prioritization/* | grep ':' | sed -e 's/\://' > folders.txt
for folder in `cat folders.txt`
do
./extract_strand_bias_metrics.sh $folder /home/u035/u035/ameynert
done
cat *.strandbias.csv | sort -u | awk '{ print $1 }' > strandbias.csv
```
Clean up `strandbias.csv` manually with emacs.
```
cut -f 1 -d ',' strandbias.csv | sort -u > indivs.txt
cd ../shared/results
cat 1*/params/*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/novaseq_families.txt
cat 200*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 > ~/nextseq_families.txt
cat 201[02]*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt
cat 21*/params/*Ansari_Morad*family_ids.txt | sort -u | cut -d '_' -f 2 >> ~/nextseq_families.txt
cd
grep -f nextseq_families.txt strandbias.csv > strandbias_nextseq.csv
grep -f novaseq_families.txt strandbias.csv > strandbias_novaseq.csv
```
## Plots
```
library(lattice)
x = read.table("strandbias_nextseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
x$sequencer = "NextSeq 2000"
y = read.table("strandbias_novaseq.csv", col.names=c("indiv", "chr", "pos", "ref", "alt", "fs", "sor"), stringsAsFactors=F, sep=",")
y$sequencer = "NovaSeq 6000"
x = rbind(x, y)
x$sequencer = as.factor(x$sequencer)
png("plots/runs_by_sequencer_strand_odds_ratio_histogram.png", width=600, height=800)
histogram(~log10(sor) | sequencer, x, breaks=seq(-2.5,1.1,0.02), xlab="log10(Strand odds ratio)", main="Trio/family whole exome candidate variants", layout=c(1,2))
dev.off()
png("plots/runs_by_sequencer_fisher_strand_test_histogram.png", width=600, height=800)
histogram(~log10(fs) | sequencer, x, breaks=seq(-1,3,0.02), xlab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", layout=c(1,2))
dev.off()
my.key=list(points=list(pch=c(1,2), col=c("blue", "orange")), text=list(labels=levels(x$sequencer)))
my.panel = function(x, y, ...) { panel.xyplot(x, y, ...); panel.abline(v=log10(3)); panel.abline(h = log10(60)) }
png("plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png", width=600, height=600)
xyplot(log10(fs) ~ log10(sor), x, groups=c(sequencer), xlab="log10(Strand odds ratio)", ylab="log10(Fisher strand test)", main="Trio/family whole exome candidate variants", key=my.key, pch=c(1,2), col=c("blue", "orange"), panel=my.panel)
dev.off()
y = subset(x, x$sor >= 3 | x$fs >= 60)
length(x$sor)
[1] 2313
length(y$sor)
[1] 888
length(y$sor) / length(x$sor)
[1] 0.383917
x$count = 1
aggregate(x$count, by = list(x$sequencer), sum)
Group.1 x
1 NovaSeq 6000 1868
2 NextSeq 2000 445
aggregate(y$count, by = list(y$sequencer), sum)
Group.1 x
1 NovaSeq 6000 524
2 NextSeq 2000 364
```
![Strand odds ratio histogram by sequencer](plots/runs_by_sequencer_strand_odds_ratio_histogram.png)
![Fisher strand test histogram by sequencer](plots/runs_by_sequencer_fisher_strand_test_ratio_histogram.png)
![Strand odds ratio vs Fisher strand test by sequencer](plots/runs_by_sequencer_fisher_strand_test_vs_strand_odds_ratio_xyplot.png)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment