From 6103ee3257de5284c6a2855c6331ff5954632ed7 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@ed.ac.uk>
Date: Mon, 3 May 2021 11:26:36 +0100
Subject: [PATCH] Add new file

---
 docs/17200_ITPR1_mosaic_investigation.md | 131 +++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 docs/17200_ITPR1_mosaic_investigation.md

diff --git a/docs/17200_ITPR1_mosaic_investigation.md b/docs/17200_ITPR1_mosaic_investigation.md
new file mode 100644
index 0000000..b2181bb
--- /dev/null
+++ b/docs/17200_ITPR1_mosaic_investigation.md
@@ -0,0 +1,131 @@
+# First look at the one sample
+
+```
+bcftools view -m2 -M2 -v snps 18735_426481-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[1] = "ref"' | bcftools filter -i 'GT[0] = "het"' | bcftools filter -i 'GT[2] != "ref" & GT[2] != "mis"' | bgzip -c > paternally_inherited_het.vcf.gz
+tabix paternally_inherited_het.vcf.gz
+
+/home/u035/project/software/bcbio/anaconda/bin/vase -i paternally_inherited_het.vcf.gz -o paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
+tabix paternally_inherited_het_rare.vcf.gz
+
+header.txt 
+chr	pos	ref	alt	child_gt	child_ref_dp	child_alt_dp	mom_gt	mom_ref_dp	mom_alt_dp	dad_gt	dad_ref_dp	dad_alt_dp
+
+cp header.txt paternally_inherited_het_rare.txt
+bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> paternally_inherited_het_rare.txt
+```
+
+```
+R
+y = read.table("paternally_inherited_het_rare.txt", header=T, stringsAsFactors=F, sep="\t", na.strings=".")
+y$mom_alt_af = y$mom_alt_dp / (y$mom_alt_dp + y$mom_ref_dp)
+y$dad_alt_af = y$dad_alt_dp / (y$dad_alt_dp + y$dad_ref_dp)
+y$child_alt_af = y$child_alt_dp / (y$child_alt_dp + y$child_ref_dp)
+length(y$chr)
+382
+
+q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
+length(q$chr)
+225
+
+q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2 & y$mom_alt_dp > 0)
+length(q$chr)
+19
+
+z = data.frame(id = c(rep("dad", length(q$chr)), rep("mom", length(q$chr)), rep("child", length(q$chr))), af = c(q$dad_alt_af, q$mom_alt_af, q$child_alt_af))
+library(lattice)
+pdf("paternally_inherited_het_rare_aaf_histogram.pdf", width=6, height=6)
+histogram(~af | id, z, layout=c(1,3), breaks=seq(0,1,0.01), type="c")
+dev.off()
+
+19 of 225 sites have some alternate allele reads in the mother
+
+q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
+pdf("dad_vs_child_af.pdf", width=6, height=6)
+xyplot(child_alt_af ~ dad_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
+dev.off()
+          1 
+pdf("mom_vs_child_af.pdf", width=6, height=6)
+xyplot(child_alt_af ~ mom_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
+dev.off()
+```
+
+# All families in batch
+
+```
+for ((i = 1; i <= 32; i = i + 1))
+do
+  head -n $i affected_child.ped | tail -n 1 | cut -f 2 > child_id.txt
+  head -n $i affected_child.ped | tail -n 1 | cut -f 3 > father_id.txt
+  head -n $i affected_child.ped | tail -n 1 | cut -f 4 > mother_id.txt
+  family_id=`head -n $i affected_child.ped | tail -n 1 | cut -f 1`
+
+  bcftools view -m2 -M2 -v snps ${family_id}-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[@mother_id.txt] = "ref" && GT[@child_id.txt] = "het" && GT[@father_id.txt] != "ref" && GT[@father_id.txt] != "mis"' | bgzip -c > ${family_id}_paternally_inherited_het.vcf.gz
+  tabix ${family_id}_paternally_inherited_het.vcf.gz
+
+  /home/u035/project/software/bcbio/anaconda/bin/vase -i ${family_id}_paternally_inherited_het.vcf.gz -o ${family_id}_paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
+  tabix ${family_id}_paternally_inherited_het_rare.vcf.gz
+
+  cp header.txt ${family_id}_paternally_inherited_het_rare.txt
+  bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" ${family_id}_paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> ${family_id}_paternally_inherited_het_rare.txt
+done
+```
+
+Excluded the one family without VEP annotations
+
+```
+R
+ids = read.table("affected_child.ped", header=F, sep="\t", stringsAsFactors=F)
+x = data.frame()
+for (family_id in ids$V1) {
+  y = read.table(sprintf("%s_paternally_inherited_het_rare.txt", family_id), header=T, stringsAsFactors=F, sep="\t", na.strings=".")
+  y$family_id = family_id
+  x = rbind(x, y)
+}
+
+x$mom_alt_af = x$mom_alt_dp / (x$mom_alt_dp + x$mom_ref_dp)
+x$dad_alt_af = x$dad_alt_dp / (x$dad_alt_dp + x$dad_ref_dp)
+x$child_alt_af = x$child_alt_dp / (x$child_alt_dp + x$child_ref_dp)
+x$count = 1
+
+q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2)
+total = aggregate(q$count, by = list(q$family_id), sum)
+
+q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2 & x$mom_alt_dp > 0)
+mom_has_reads = aggregate(q$count, by = list(q$family_id), sum)
+
+output = data.frame(family_id = total$Group.1, total = total$x, mreads = mom_has_reads$x)
+output$mprop = output$mreads / output$total
+outsub = subset(output, output$total > 2)
+
+22 of 28 families have a decent number of sites - not sure what's up with the other 6, which only have 1 or 2 each, but these are excluded from analysis.
+
+summary(outsub)
+        family_id      total           mreads          mprop        
+ 18735_412876: 1   Min.   :136.0   Min.   : 6.00   Min.   :0.03109  
+ 18735_414260: 1   1st Qu.:178.0   1st Qu.:12.25   1st Qu.:0.06782  
+ 18735_415251: 1   Median :198.0   Median :16.00   Median :0.07705  
+ 18735_421930: 1   Mean   :192.6   Mean   :15.59   Mean   :0.08072  
+ 18735_424464: 1   3rd Qu.:204.8   3rd Qu.:18.00   3rd Qu.:0.09609  
+ 18735_424521: 1   Max.   :230.0   Max.   :26.00   Max.   :0.13065  
+ (Other)     :16                                                    
+
+quantile(q$mom_alt_dp + q$mom_ref_dp, seq(0,1,0.05))
+    0%     5%    10%    15%    20%    25%    30%    35%    40%    45%    50% 
+   7.0   19.5   29.0   45.5   57.0   69.5   79.0   88.0  101.0  110.0  124.0 
+   55%    60%    65%    70%    75%    80%    85%    90%    95%   100% 
+ 143.5  168.0  181.0  200.0  229.0  264.0  296.5  360.0  425.0 1023.0 
+
+quantile(q$dad_alt_dp + q$dad_ref_dp, seq(0,1,0.05))
+   0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
+  2.0  13.0  27.0  47.0  55.0  65.5  78.0  87.0  95.0 111.0 126.0 138.0 149.0 
+  65%   70%   75%   80%   85%   90%   95%  100% 
+175.5 201.0 223.0 245.0 281.5 350.0 426.5 829.0 
+
+quantile(q$child_alt_dp + q$child_ref_dp, seq(0,1,0.05))
+   0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
+  3.0  21.5  30.0  43.0  51.0  61.0  72.0  79.0  92.0 101.5 122.0 141.5 157.0 
+  65%   70%   75%   80%   85%   90%   95%  100% 
+174.0 190.0 215.0 238.0 261.0 315.0 402.5 846.0
+```
+
+
-- 
GitLab