diff --git a/docs/17200_ITPR1_mosaic_investigation.md b/docs/17200_ITPR1_mosaic_investigation.md new file mode 100644 index 0000000000000000000000000000000000000000..b2181bb4c416810a3545945c4f0823c9ef00482b --- /dev/null +++ b/docs/17200_ITPR1_mosaic_investigation.md @@ -0,0 +1,131 @@ +# First look at the one sample + +``` +bcftools view -m2 -M2 -v snps 18735_426481-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[1] = "ref"' | bcftools filter -i 'GT[0] = "het"' | bcftools filter -i 'GT[2] != "ref" & GT[2] != "mis"' | bgzip -c > paternally_inherited_het.vcf.gz +tabix paternally_inherited_het.vcf.gz + +/home/u035/project/software/bcbio/anaconda/bin/vase -i paternally_inherited_het.vcf.gz -o paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all +tabix paternally_inherited_het_rare.vcf.gz + +header.txt +chr pos ref alt child_gt child_ref_dp child_alt_dp mom_gt mom_ref_dp mom_alt_dp dad_gt dad_ref_dp dad_alt_dp + +cp header.txt paternally_inherited_het_rare.txt +bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> paternally_inherited_het_rare.txt +``` + +``` +R +y = read.table("paternally_inherited_het_rare.txt", header=T, stringsAsFactors=F, sep="\t", na.strings=".") +y$mom_alt_af = y$mom_alt_dp / (y$mom_alt_dp + y$mom_ref_dp) +y$dad_alt_af = y$dad_alt_dp / (y$dad_alt_dp + y$dad_ref_dp) +y$child_alt_af = y$child_alt_dp / (y$child_alt_dp + y$child_ref_dp) +length(y$chr) +382 + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2) +length(q$chr) +225 + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2 & y$mom_alt_dp > 0) +length(q$chr) +19 + +z = data.frame(id = c(rep("dad", length(q$chr)), rep("mom", length(q$chr)), rep("child", length(q$chr))), af = c(q$dad_alt_af, q$mom_alt_af, q$child_alt_af)) +library(lattice) +pdf("paternally_inherited_het_rare_aaf_histogram.pdf", width=6, height=6) +histogram(~af | id, z, layout=c(1,3), breaks=seq(0,1,0.01), type="c") +dev.off() + +19 of 225 sites have some alternate allele reads in the mother + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2) +pdf("dad_vs_child_af.pdf", width=6, height=6) +xyplot(child_alt_af ~ dad_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05)) +dev.off() + 1 +pdf("mom_vs_child_af.pdf", width=6, height=6) +xyplot(child_alt_af ~ mom_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05)) +dev.off() +``` + +# All families in batch + +``` +for ((i = 1; i <= 32; i = i + 1)) +do + head -n $i affected_child.ped | tail -n 1 | cut -f 2 > child_id.txt + head -n $i affected_child.ped | tail -n 1 | cut -f 3 > father_id.txt + head -n $i affected_child.ped | tail -n 1 | cut -f 4 > mother_id.txt + family_id=`head -n $i affected_child.ped | tail -n 1 | cut -f 1` + + bcftools view -m2 -M2 -v snps ${family_id}-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[@mother_id.txt] = "ref" && GT[@child_id.txt] = "het" && GT[@father_id.txt] != "ref" && GT[@father_id.txt] != "mis"' | bgzip -c > ${family_id}_paternally_inherited_het.vcf.gz + tabix ${family_id}_paternally_inherited_het.vcf.gz + + /home/u035/project/software/bcbio/anaconda/bin/vase -i ${family_id}_paternally_inherited_het.vcf.gz -o ${family_id}_paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all + tabix ${family_id}_paternally_inherited_het_rare.vcf.gz + + cp header.txt ${family_id}_paternally_inherited_het_rare.txt + bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" ${family_id}_paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> ${family_id}_paternally_inherited_het_rare.txt +done +``` + +Excluded the one family without VEP annotations + +``` +R +ids = read.table("affected_child.ped", header=F, sep="\t", stringsAsFactors=F) +x = data.frame() +for (family_id in ids$V1) { + y = read.table(sprintf("%s_paternally_inherited_het_rare.txt", family_id), header=T, stringsAsFactors=F, sep="\t", na.strings=".") + y$family_id = family_id + x = rbind(x, y) +} + +x$mom_alt_af = x$mom_alt_dp / (x$mom_alt_dp + x$mom_ref_dp) +x$dad_alt_af = x$dad_alt_dp / (x$dad_alt_dp + x$dad_ref_dp) +x$child_alt_af = x$child_alt_dp / (x$child_alt_dp + x$child_ref_dp) +x$count = 1 + +q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2) +total = aggregate(q$count, by = list(q$family_id), sum) + +q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2 & x$mom_alt_dp > 0) +mom_has_reads = aggregate(q$count, by = list(q$family_id), sum) + +output = data.frame(family_id = total$Group.1, total = total$x, mreads = mom_has_reads$x) +output$mprop = output$mreads / output$total +outsub = subset(output, output$total > 2) + +22 of 28 families have a decent number of sites - not sure what's up with the other 6, which only have 1 or 2 each, but these are excluded from analysis. + +summary(outsub) + family_id total mreads mprop + 18735_412876: 1 Min. :136.0 Min. : 6.00 Min. :0.03109 + 18735_414260: 1 1st Qu.:178.0 1st Qu.:12.25 1st Qu.:0.06782 + 18735_415251: 1 Median :198.0 Median :16.00 Median :0.07705 + 18735_421930: 1 Mean :192.6 Mean :15.59 Mean :0.08072 + 18735_424464: 1 3rd Qu.:204.8 3rd Qu.:18.00 3rd Qu.:0.09609 + 18735_424521: 1 Max. :230.0 Max. :26.00 Max. :0.13065 + (Other) :16 + +quantile(q$mom_alt_dp + q$mom_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% + 7.0 19.5 29.0 45.5 57.0 69.5 79.0 88.0 101.0 110.0 124.0 + 55% 60% 65% 70% 75% 80% 85% 90% 95% 100% + 143.5 168.0 181.0 200.0 229.0 264.0 296.5 360.0 425.0 1023.0 + +quantile(q$dad_alt_dp + q$dad_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% + 2.0 13.0 27.0 47.0 55.0 65.5 78.0 87.0 95.0 111.0 126.0 138.0 149.0 + 65% 70% 75% 80% 85% 90% 95% 100% +175.5 201.0 223.0 245.0 281.5 350.0 426.5 829.0 + +quantile(q$child_alt_dp + q$child_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% + 3.0 21.5 30.0 43.0 51.0 61.0 72.0 79.0 92.0 101.5 122.0 141.5 157.0 + 65% 70% 75% 80% 85% 90% 95% 100% +174.0 190.0 215.0 238.0 261.0 315.0 402.5 846.0 +``` + +