diff --git a/docs/17200_ITPR1_mosaic_investigation.md b/docs/17200_ITPR1_mosaic_investigation.md new file mode 100644 index 0000000000000000000000000000000000000000..b2181bb4c416810a3545945c4f0823c9ef00482b --- /dev/null +++ b/docs/17200_ITPR1_mosaic_investigation.md @@ -0,0 +1,131 @@ +# First look at the one sample + +``` +bcftools view -m2 -M2 -v snps 18735_426481-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[1] = "ref"' | bcftools filter -i 'GT[0] = "het"' | bcftools filter -i 'GT[2] != "ref" & GT[2] != "mis"' | bgzip -c > paternally_inherited_het.vcf.gz +tabix paternally_inherited_het.vcf.gz + +/home/u035/project/software/bcbio/anaconda/bin/vase -i paternally_inherited_het.vcf.gz -o paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all +tabix paternally_inherited_het_rare.vcf.gz + +header.txt +chr pos ref alt child_gt child_ref_dp child_alt_dp mom_gt mom_ref_dp mom_alt_dp dad_gt dad_ref_dp dad_alt_dp + +cp header.txt paternally_inherited_het_rare.txt +bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> paternally_inherited_het_rare.txt +``` + +``` +R +y = read.table("paternally_inherited_het_rare.txt", header=T, stringsAsFactors=F, sep="\t", na.strings=".") +y$mom_alt_af = y$mom_alt_dp / (y$mom_alt_dp + y$mom_ref_dp) +y$dad_alt_af = y$dad_alt_dp / (y$dad_alt_dp + y$dad_ref_dp) +y$child_alt_af = y$child_alt_dp / (y$child_alt_dp + y$child_ref_dp) +length(y$chr) +382 + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2) +length(q$chr) +225 + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2 & y$mom_alt_dp > 0) +length(q$chr) +19 + +z = data.frame(id = c(rep("dad", length(q$chr)), rep("mom", length(q$chr)), rep("child", length(q$chr))), af = c(q$dad_alt_af, q$mom_alt_af, q$child_alt_af)) +library(lattice) +pdf("paternally_inherited_het_rare_aaf_histogram.pdf", width=6, height=6) +histogram(~af | id, z, layout=c(1,3), breaks=seq(0,1,0.01), type="c") +dev.off() + +19 of 225 sites have some alternate allele reads in the mother + +q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2) +pdf("dad_vs_child_af.pdf", width=6, height=6) +xyplot(child_alt_af ~ dad_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05)) +dev.off() + 1 +pdf("mom_vs_child_af.pdf", width=6, height=6) +xyplot(child_alt_af ~ mom_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05)) +dev.off() +``` + +# All families in batch + +``` +for ((i = 1; i <= 32; i = i + 1)) +do + head -n $i affected_child.ped | tail -n 1 | cut -f 2 > child_id.txt + head -n $i affected_child.ped | tail -n 1 | cut -f 3 > father_id.txt + head -n $i affected_child.ped | tail -n 1 | cut -f 4 > mother_id.txt + family_id=`head -n $i affected_child.ped | tail -n 1 | cut -f 1` + + bcftools view -m2 -M2 -v snps ${family_id}-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[@mother_id.txt] = "ref" && GT[@child_id.txt] = "het" && GT[@father_id.txt] != "ref" && GT[@father_id.txt] != "mis"' | bgzip -c > ${family_id}_paternally_inherited_het.vcf.gz + tabix ${family_id}_paternally_inherited_het.vcf.gz + + /home/u035/project/software/bcbio/anaconda/bin/vase -i ${family_id}_paternally_inherited_het.vcf.gz -o ${family_id}_paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all + tabix ${family_id}_paternally_inherited_het_rare.vcf.gz + + cp header.txt ${family_id}_paternally_inherited_het_rare.txt + bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" ${family_id}_paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> ${family_id}_paternally_inherited_het_rare.txt +done +``` + +Excluded the one family without VEP annotations + +``` +R +ids = read.table("affected_child.ped", header=F, sep="\t", stringsAsFactors=F) +x = data.frame() +for (family_id in ids$V1) { + y = read.table(sprintf("%s_paternally_inherited_het_rare.txt", family_id), header=T, stringsAsFactors=F, sep="\t", na.strings=".") + y$family_id = family_id + x = rbind(x, y) +} + +x$mom_alt_af = x$mom_alt_dp / (x$mom_alt_dp + x$mom_ref_dp) +x$dad_alt_af = x$dad_alt_dp / (x$dad_alt_dp + x$dad_ref_dp) +x$child_alt_af = x$child_alt_dp / (x$child_alt_dp + x$child_ref_dp) +x$count = 1 + +q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2) +total = aggregate(q$count, by = list(q$family_id), sum) + +q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2 & x$mom_alt_dp > 0) +mom_has_reads = aggregate(q$count, by = list(q$family_id), sum) + +output = data.frame(family_id = total$Group.1, total = total$x, mreads = mom_has_reads$x) +output$mprop = output$mreads / output$total +outsub = subset(output, output$total > 2) + +22 of 28 families have a decent number of sites - not sure what's up with the other 6, which only have 1 or 2 each, but these are excluded from analysis. + +summary(outsub) + family_id total mreads mprop + 18735_412876: 1 Min. :136.0 Min. : 6.00 Min. :0.03109 + 18735_414260: 1 1st Qu.:178.0 1st Qu.:12.25 1st Qu.:0.06782 + 18735_415251: 1 Median :198.0 Median :16.00 Median :0.07705 + 18735_421930: 1 Mean :192.6 Mean :15.59 Mean :0.08072 + 18735_424464: 1 3rd Qu.:204.8 3rd Qu.:18.00 3rd Qu.:0.09609 + 18735_424521: 1 Max. :230.0 Max. :26.00 Max. :0.13065 + (Other) :16 + +quantile(q$mom_alt_dp + q$mom_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% + 7.0 19.5 29.0 45.5 57.0 69.5 79.0 88.0 101.0 110.0 124.0 + 55% 60% 65% 70% 75% 80% 85% 90% 95% 100% + 143.5 168.0 181.0 200.0 229.0 264.0 296.5 360.0 425.0 1023.0 + +quantile(q$dad_alt_dp + q$dad_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% + 2.0 13.0 27.0 47.0 55.0 65.5 78.0 87.0 95.0 111.0 126.0 138.0 149.0 + 65% 70% 75% 80% 85% 90% 95% 100% +175.5 201.0 223.0 245.0 281.5 350.0 426.5 829.0 + +quantile(q$child_alt_dp + q$child_ref_dp, seq(0,1,0.05)) + 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% + 3.0 21.5 30.0 43.0 51.0 61.0 72.0 79.0 92.0 101.5 122.0 141.5 157.0 + 65% 70% 75% 80% 85% 90% 95% 100% +174.0 190.0 215.0 238.0 261.0 315.0 402.5 846.0 +``` + + diff --git a/docs/18170_run_notes.md b/docs/18170_run_notes.md new file mode 100644 index 0000000000000000000000000000000000000000..dbbb8243664ac20ebf8978b3c8667e298ae3949d --- /dev/null +++ b/docs/18170_run_notes.md @@ -0,0 +1,99 @@ +# Run 18170 + +There are 5 samples on this run that failed on run 18840, all are from different families. We’ve put the whole family into the PED file. The rest of the family’s data are on run 18840. The repeated samples on this run are: + +* 125778 (family 428004) +* 125467 (family 427734) +* 101846 (family 429213) +* 125303 (family 427554) +* 125783 (family 428055) + +Other analysis notes: +* 430321 – Quad, 2 affected children. Please perform trio-based analysis per child, the 2 are not similarly affected. +* 429107 – Quad, 2 similarly affected children. Please perform shared variant analysis between 88303 and 80901, and trio-based analysis per child. +* 429453 – Trio, mother & child similarly affected. Please perform trio-based analysis and shared variant analysis between 126602 and 126605. +* 429187 – Duo, for Congenica analysis only. + +## Read set up + +``` +cd /scratch/u035/project/trio_whole_exome/analysis/reads +mkdir 18170_Ansari_Morad +cd 18170_Ansari_Morad + +cp ../17710_Ansari_Morad/*428004* ./ +cp ../17710_Ansari_Morad/*427734* ./ +cp ../17710_Ansari_Morad/*429213* ./ +cp ../17710_Ansari_Morad/*427554* ./ +cp ../17710_Ansari_Morad/*428055* ./ + +ls -lh 125778_428004* +ls -lh 125467_427734* +ls -lh 101846_429213* +ls -lh 125303_427554* +ls -lh 125783_428055* + +rm 125778_428004* +rm 125467_427734* +rm 101846_429213* +rm 125303_427554* +rm 125783_428055* +``` + +## Parameter file generation + +``` +cd /scratch/u035/project/trio_whole_exome/analysis/params + +version=v1 +project_id=18170_Ansari_Morad +sample_suffix=_WESTwist_IDT-A +ped_file=18957_WES_PED_110621.txt + +ln -s $ped_file $project_id.ped +``` + +Edit the parameter generation script to exit after running the Perl script for producing the PED files. + +``` +/home/u035/project/scripts/prepare_bcbio_config.sh \ + /home/u035/project/scripts/trio_whole_exome_config.sh \ + $project_id $version $sample_suffix &> ${version}_${project_id}.log +``` + +Copy the PED files for the above families from the previous run and edit the batch id. + +``` +for family in 428004 427734 429213 427554 428055 +do + cp 17710_Ansari_Morad_18840_${family}.ped 18170_Ansari_Morad_18957_${family}.ped + perl -pi -e 's/18840/18957/' 18170_Ansari_Morad_18957_${family}.ped +done +``` + +Re-generate the family ids list. + +``` +cut -f 1 /scratch/u035/project/trio_whole_exome/analysis/params/18170_Ansari_Morad.ped | sort -u | awk '{ print "18957_" $0 }' > 18170_Ansari_Morad.family_ids.txt +``` + +Edit the parameter generation script to skip running the Perl script for producing the PED files. + +``` +/home/u035/project/scripts/prepare_bcbio_config.sh \ + /home/u035/project/scripts/trio_whole_exome_config.sh \ + $project_id $version $sample_suffix &> ${version}_${project_id}.log +``` + +Uncomment out the skipped section of the parameter generation script. + +Fix the config files for these families. + +``` +for family in 428004 427734 429213 427554 428055 +do + cp v1_17710_Ansari_Morad_18840_${family}.yaml v1_18170_Ansari_Morad_18957_${family}.yaml + perl -pi -e 's/18840/18957/' v1_18170_Ansari_Morad_18957_${family}.yaml + perl -pi -e 's/17710/18170/' v1_18170_Ansari_Morad_18957_${family}.yaml +done +``` diff --git a/docs/SOP_archiving.md b/docs/SOP_archiving.md index 5c17c13aaa15c38b08d300da3ff1684858e44c9d..a0061e395d509927db8ccb2dd58e19c505a83eba 100644 --- a/docs/SOP_archiving.md +++ b/docs/SOP_archiving.md @@ -70,7 +70,15 @@ qsub -v \ /home/u035/project/scripts/submit_trio_wes_priority_and_qc_checksums.sh ``` -5. Archive the output to /archive/u035/trio_whole_exome, excluding the BAM files. Confirm md5 checksums on the archived files. There should be no lines ending in ‘FAIL’. If there are any, investigate which file(s) and manually copy these, then re-check the md5 of the archived copy. +5. Clean up the output directory. + +``` +cd /home/u035/project/trio_whole_exome/ +mkdir ${version}_${project_id} +mv *${version}_${project_id}* ${version}_${project_id}/ +``` + +6. Archive the output to /archive/u035/trio_whole_exome, excluding the BAM files. Confirm md5 checksums on the archived files. There should be no lines ending in ‘FAIL’. If there are any, investigate which file(s) and manually copy these, then re-check the md5 of the archived copy. ``` qsub -v \ @@ -80,21 +88,13 @@ qsub -v \ grep -c FAIL trio_whole_exome_archive_project.$project_id* ``` -6. Clean up the logs directory. +7. Clean up the logs directory. ``` cd /home/u035/project/trio_whole_exome/logs mv *.project_id.* complete/ ``` -7. Clean up the output directory. - -``` -cd /home/u035/project/trio_whole_exome/ -mkdir ${version}_${project_id} -mv *${version}_${project_id}* ${version}_${project_id}/ -``` - 8. Confirmation of copy to tape. EPCC runs a cron job on Monday mornings at 8am to check the DMF file state for all files in /archive/u035/trio_whole_exome. This produces a text report at /archive/u035/confirmation using the format yyyy-mm-dd. Check for ‘DUL’ entries for files in the confirmation report. The grep command below excludes directories, which are generally marked as ‘REG’, blank lines, and total file size lines. If not all entries are DUL, check the non-DUL ones for error states as below. The expectation is that some files may be in the process of migration; in which case, check again in the following week’s report.