# First look at one sample
```
# biallelic SNVs where the child (sample 0) is het, the mother (sample 1) is hom ref, and the father (sample 2) carries the alt allele
bcftools view -m2 -M2 -v snps 18735_426481-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[1] = "ref"' | bcftools filter -i 'GT[0] = "het"' | bcftools filter -i 'GT[2] != "ref" & GT[2] != "mis"' | bgzip -c > paternally_inherited_het.vcf.gz
tabix paternally_inherited_het.vcf.gz
# keep rare variants (population allele frequency < 0.001) with VASE
/home/u035/project/software/bcbio/anaconda/bin/vase -i paternally_inherited_het.vcf.gz -o paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
tabix paternally_inherited_het_rare.vcf.gz
# header.txt holds the tab-delimited column names for the flattened table
printf 'chr\tpos\tref\talt\tchild_gt\tchild_ref_dp\tchild_alt_dp\tmom_gt\tmom_ref_dp\tmom_alt_dp\tdad_gt\tdad_ref_dp\tdad_alt_dp\n' > header.txt
cp header.txt paternally_inherited_het_rare.txt
# flatten per-sample GT and AD (comma-separated ref,alt depths) into tab-separated columns
bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> paternally_inherited_het_rare.txt
```
```
R
y = read.table("paternally_inherited_het_rare.txt", header=T, stringsAsFactors=F, sep="\t", na.strings=".")
y$mom_alt_af = y$mom_alt_dp / (y$mom_alt_dp + y$mom_ref_dp)
y$dad_alt_af = y$dad_alt_dp / (y$dad_alt_dp + y$dad_ref_dp)
y$child_alt_af = y$child_alt_dp / (y$child_alt_dp + y$child_ref_dp)
length(y$chr)
382
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
length(q$chr)
225
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2 & y$mom_alt_dp > 0)
length(q$chr)
19
z = data.frame(id = c(rep("dad", length(q$chr)), rep("mom", length(q$chr)), rep("child", length(q$chr))), af = c(q$dad_alt_af, q$mom_alt_af, q$child_alt_af))
library(lattice)
pdf("paternally_inherited_het_rare_aaf_histogram.pdf", width=6, height=6)
histogram(~af | id, z, layout=c(1,3), breaks=seq(0,1,0.01), type="c")
dev.off()
# 19 of 225 sites have some alternate allele reads in the mother
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
pdf("dad_vs_child_af.pdf", width=6, height=6)
xyplot(child_alt_af ~ dad_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
dev.off()
1
pdf("mom_vs_child_af.pdf", width=6, height=6)
xyplot(child_alt_af ~ mom_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
dev.off()
```
# All families in the batch
```
for ((i = 1; i <= 32; i = i + 1))
do
    # sample IDs for the child, father and mother of family i, taken from affected_child.ped
    head -n $i affected_child.ped | tail -n 1 | cut -f 2 > child_id.txt
    head -n $i affected_child.ped | tail -n 1 | cut -f 3 > father_id.txt
    head -n $i affected_child.ped | tail -n 1 | cut -f 4 > mother_id.txt
    family_id=$(head -n $i affected_child.ped | tail -n 1 | cut -f 1)
    # biallelic SNVs where the child is het, the mother is hom ref and the father carries the alt allele
    bcftools view -m2 -M2 -v snps ${family_id}-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[@mother_id.txt] = "ref" && GT[@child_id.txt] = "het" && GT[@father_id.txt] != "ref" && GT[@father_id.txt] != "mis"' | bgzip -c > ${family_id}_paternally_inherited_het.vcf.gz
    tabix ${family_id}_paternally_inherited_het.vcf.gz
    # rare-variant filter with VASE, then flatten GT/AD to a tab-delimited table
    /home/u035/project/software/bcbio/anaconda/bin/vase -i ${family_id}_paternally_inherited_het.vcf.gz -o ${family_id}_paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
    tabix ${family_id}_paternally_inherited_het_rare.vcf.gz
    cp header.txt ${family_id}_paternally_inherited_het_rare.txt
    bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" ${family_id}_paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> ${family_id}_paternally_inherited_het_rare.txt
done
```
Excluded the one family without VEP annotations from the aggregation below.
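How the exclusion was done isn't recorded here; a minimal sketch, assuming the excluded family's ID is known (EXCLUDED_FAMILY below is a placeholder), would be to drop that row from affected_child.ped before the aggregation:
```
# hypothetical: remove the family without VEP annotations from the PED-style list
grep -v -w "EXCLUDED_FAMILY" affected_child.ped > affected_child.with_vep.ped
mv affected_child.with_vep.ped affected_child.ped
```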
```
R
ids = read.table("affected_child.ped", header=F, sep="\t", stringsAsFactors=F)
x = data.frame()
for (family_id in ids$V1) {
    y = read.table(sprintf("%s_paternally_inherited_het_rare.txt", family_id), header=T, stringsAsFactors=F, sep="\t", na.strings=".")
    y$family_id = family_id
    x = rbind(x, y)
}
x$mom_alt_af = x$mom_alt_dp / (x$mom_alt_dp + x$mom_ref_dp)
x$dad_alt_af = x$dad_alt_dp / (x$dad_alt_dp + x$dad_ref_dp)
x$child_alt_af = x$child_alt_dp / (x$child_alt_dp + x$child_ref_dp)
x$count = 1
q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2)
total = aggregate(q$count, by = list(q$family_id), sum)
q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2 & x$mom_alt_dp > 0)
mom_has_reads = aggregate(q$count, by = list(q$family_id), sum)
output = data.frame(family_id = total$Group.1, total = total$x, mreads = mom_has_reads$x)
output$mprop = output$mreads / output$total
outsub = subset(output, output$total > 2)
# 22 of 28 families have a decent number of sites; not sure what's up with the other 6,
# which have only 1 or 2 sites each, so these are excluded from the analysis (total > 2)
summary(outsub)
family_id total mreads mprop
18735_412876: 1 Min. :136.0 Min. : 6.00 Min. :0.03109
18735_414260: 1 1st Qu.:178.0 1st Qu.:12.25 1st Qu.:0.06782
18735_415251: 1 Median :198.0 Median :16.00 Median :0.07705
18735_421930: 1 Mean :192.6 Mean :15.59 Mean :0.08072
18735_424464: 1 3rd Qu.:204.8 3rd Qu.:18.00 3rd Qu.:0.09609
18735_424521: 1 Max. :230.0 Max. :26.00 Max. :0.13065
(Other) :16
quantile(q$mom_alt_dp + q$mom_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
7.0 19.5 29.0 45.5 57.0 69.5 79.0 88.0 101.0 110.0 124.0
55% 60% 65% 70% 75% 80% 85% 90% 95% 100%
143.5 168.0 181.0 200.0 229.0 264.0 296.5 360.0 425.0 1023.0
quantile(q$dad_alt_dp + q$dad_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
2.0 13.0 27.0 47.0 55.0 65.5 78.0 87.0 95.0 111.0 126.0 138.0 149.0
65% 70% 75% 80% 85% 90% 95% 100%
175.5 201.0 223.0 245.0 281.5 350.0 426.5 829.0
quantile(q$child_alt_dp + q$child_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
3.0 21.5 30.0 43.0 51.0 61.0 72.0 79.0 92.0 101.5 122.0 141.5 157.0
65% 70% 75% 80% 85% 90% 95% 100%
174.0 190.0 215.0 238.0 261.0 315.0 402.5 846.0
```
# Run 18170
There are 5 samples on this run that failed on run 18840, each from a different family. We've put each whole family into the PED file; the rest of each family's data are on run 18840. The repeated samples on this run are listed below (a quick PED sanity check is sketched after the list):
* 125778 (family 428004)
* 125467 (family 427734)
* 101846 (family 429213)
* 125303 (family 427554)
* 125783 (family 428055)
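Not part of the original checklist, just a quick sanity check: each repeated sample's whole family should appear in the new PED file. The PED filename is taken from the parameter file generation step below; the exact column layout is assumed.
```
# sanity check (sketch): confirm each repeated sample and its family are in the batch PED
for sample in 125778 125467 101846 125303 125783
do
    echo "== $sample =="
    grep -w "$sample" 18957_WES_PED_110621.txt
done
```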
Other analysis notes:
* 430321 – Quad, 2 affected children. Please perform trio-based analysis per child; the 2 are not similarly affected.
* 429107 – Quad, 2 similarly affected children. Please perform shared variant analysis between 88303 and 80901, and trio-based analysis per child (a rough illustration of the shared-variant filtering follows this list).
* 429453 – Trio, mother & child similarly affected. Please perform trio-based analysis and shared variant analysis between 126602 and 126605.
* 429187 – Duo, for Congenica analysis only.
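As a rough illustration only, not the actual shared variant analysis workflow: a bcftools filter in the same style as the sections above, keeping variants carried by both affected children of family 429107, might look like this. The VCF filename follows the naming convention used earlier and is an assumption.
```
# hypothetical sketch: biallelic variants where both affected siblings carry an alt allele
echo 88303 > affected1.txt
echo 80901 > affected2.txt
bcftools view -m2 -M2 18957_429107-gatk-haplotype-annotated.vcf.gz \
  | bcftools filter -i 'GT[@affected1.txt] != "ref" && GT[@affected1.txt] != "mis" && GT[@affected2.txt] != "ref" && GT[@affected2.txt] != "mis"' \
  | bgzip -c > 18957_429107_shared_affected.vcf.gz
```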
## Read setup
```
cd /scratch/u035/project/trio_whole_exome/analysis/reads
mkdir 18170_Ansari_Morad
cd 18170_Ansari_Morad
# copy the previous run's reads for the 5 affected families
cp ../17710_Ansari_Morad/*428004* ./
cp ../17710_Ansari_Morad/*427734* ./
cp ../17710_Ansari_Morad/*429213* ./
cp ../17710_Ansari_Morad/*427554* ./
cp ../17710_Ansari_Morad/*428055* ./
# check what was copied for the repeated samples before deleting
ls -lh 125778_428004*
ls -lh 125467_427734*
ls -lh 101846_429213*
ls -lh 125303_427554*
ls -lh 125783_428055*
# remove the old reads for the repeated (failed) samples, which were re-sequenced on this run
rm 125778_428004*
rm 125467_427734*
rm 101846_429213*
rm 125303_427554*
rm 125783_428055*
```
## Parameter file generation
```
cd /scratch/u035/project/trio_whole_exome/analysis/params
version=v1
project_id=18170_Ansari_Morad
sample_suffix=_WESTwist_IDT-A
ped_file=18957_WES_PED_110621.txt
ln -s $ped_file $project_id.ped
```
Edit the parameter generation script to exit after running the Perl script for producing the PED files.
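The exact change isn't recorded here; the idea is just a temporary early exit in prepare_bcbio_config.sh, placed immediately after the step that writes the per-family PED files, roughly like this (illustrative only, not the script's real contents):
```
# illustrative only - inside prepare_bcbio_config.sh, directly after the
# PED-generation step for this batch:
exit 0   # temporary: stop here so the per-family PED files can be adjusted by hand
```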
```
/home/u035/project/scripts/prepare_bcbio_config.sh \
/home/u035/project/scripts/trio_whole_exome_config.sh \
$project_id $version $sample_suffix &> ${version}_${project_id}.log
```
Copy the PED files for the above families from the previous run and edit the batch id.
```
for family in 428004 427734 429213 427554 428055
do
cp 17710_Ansari_Morad_18840_${family}.ped 18170_Ansari_Morad_18957_${family}.ped
perl -pi -e 's/18840/18957/' 18170_Ansari_Morad_18957_${family}.ped
done
```
Re-generate the family ids list.
```
cut -f 1 /scratch/u035/project/trio_whole_exome/analysis/params/18170_Ansari_Morad.ped | sort -u | awk '{ print "18957_" $0 }' > 18170_Ansari_Morad.family_ids.txt
```
Edit the parameter generation script to skip running the Perl script for producing the PED files.
```
/home/u035/project/scripts/prepare_bcbio_config.sh \
/home/u035/project/scripts/trio_whole_exome_config.sh \
$project_id $version $sample_suffix &> ${version}_${project_id}.log
```
Uncomment the skipped section of the parameter generation script.
Fix the config files for these families.
```
for family in 428004 427734 429213 427554 428055
do
cp v1_17710_Ansari_Morad_18840_${family}.yaml v1_18170_Ansari_Morad_18957_${family}.yaml
perl -pi -e 's/18840/18957/' v1_18170_Ansari_Morad_18957_${family}.yaml
perl -pi -e 's/17710/18170/' v1_18170_Ansari_Morad_18957_${family}.yaml
done
```
...
```
qsub -v \
...
/home/u035/project/scripts/submit_trio_wes_priority_and_qc_checksums.sh
```
5. Clean up the output directory.
```
cd /home/u035/project/trio_whole_exome/
mkdir ${version}_${project_id}
mv *${version}_${project_id}* ${version}_${project_id}/
```
6. Archive the output to /archive/u035/trio_whole_exome, excluding the BAM files. Confirm md5 checksums on the archived files. There should be no lines ending in ‘FAIL’. If there are any, investigate which file(s) are affected, manually copy them, and then re-check the md5 of the archived copies.
```
qsub -v \
...
grep -c FAIL trio_whole_exome_archive_project.$project_id*
```
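The note above says to investigate any FAIL lines by hand; a rough sketch of that follow-up, with the archive layout assumed to mirror the output directory (the failing path is a placeholder, not taken from the actual scripts):
```
# hypothetical follow-up if any FAIL lines are reported
grep FAIL trio_whole_exome_archive_project.$project_id*      # see which files failed
failed_file=relative/path/from/the/FAIL/line                 # placeholder
cp /home/u035/project/trio_whole_exome/$failed_file /archive/u035/trio_whole_exome/$failed_file
md5sum /home/u035/project/trio_whole_exome/$failed_file /archive/u035/trio_whole_exome/$failed_file
```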
7. Clean up the logs directory.
```
cd /home/u035/project/trio_whole_exome/logs
mv *.${project_id}.* complete/
```
8. Confirmation of copy to tape. EPCC runs a cron job at 8am on Monday mornings to check the DMF file state of all files in /archive/u035/trio_whole_exome. This produces a text report in /archive/u035/confirmation, named by date in yyyy-mm-dd format.
Check for ‘DUL’ entries for files in the confirmation report. The grep command below excludes directories (generally marked as ‘REG’), blank lines, and total file size lines. If not all entries are DUL, check the non-DUL ones for error states as described below. Some files may still be in the process of migration; if so, check again in the following week's report.
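The grep command referred to above is not included in this excerpt; as a sketch only, a filter along these lines would list entries that are not yet DUL while skipping directory (REG), blank, and total-size lines (the report path and naming are assumed from the description above):
```
# hypothetical sketch: show non-DUL entries in a given week's confirmation report
report=/archive/u035/confirmation/$(date +%F)   # adjust to the relevant Monday's date; exact naming assumed
grep -v -e 'DUL' -e 'REG' -e '^$' -e 'total' "$report"
```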