# First look at one sample
```
# biallelic SNVs where the child (sample 0) is het, the mother (sample 1) is hom ref, and the father (sample 2) carries the alt allele
bcftools view -m2 -M2 -v snps 18735_426481-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[1] = "ref"' | bcftools filter -i 'GT[0] = "het"' | bcftools filter -i 'GT[2] != "ref" & GT[2] != "mis"' | bgzip -c > paternally_inherited_het.vcf.gz
tabix paternally_inherited_het.vcf.gz
# keep rare variants (population allele frequency < 0.001) with VASE
/home/u035/project/software/bcbio/anaconda/bin/vase -i paternally_inherited_het.vcf.gz -o paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
tabix paternally_inherited_het_rare.vcf.gz
# header.txt holds the tab-delimited column names for the flattened table
printf 'chr\tpos\tref\talt\tchild_gt\tchild_ref_dp\tchild_alt_dp\tmom_gt\tmom_ref_dp\tmom_alt_dp\tdad_gt\tdad_ref_dp\tdad_alt_dp\n' > header.txt
cp header.txt paternally_inherited_het_rare.txt
# flatten per-sample GT and AD (comma-separated ref,alt depths) into tab-separated columns
bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> paternally_inherited_het_rare.txt
```
```
R
y = read.table("paternally_inherited_het_rare.txt", header=T, stringsAsFactors=F, sep="\t", na.strings=".")
y$mom_alt_af = y$mom_alt_dp / (y$mom_alt_dp + y$mom_ref_dp)
y$dad_alt_af = y$dad_alt_dp / (y$dad_alt_dp + y$dad_ref_dp)
y$child_alt_af = y$child_alt_dp / (y$child_alt_dp + y$child_ref_dp)
length(y$chr)
382
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
length(q$chr)
225
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2 & y$mom_alt_dp > 0)
length(q$chr)
19
z = data.frame(id = c(rep("dad", length(q$chr)), rep("mom", length(q$chr)), rep("child", length(q$chr))), af = c(q$dad_alt_af, q$mom_alt_af, q$child_alt_af))
library(lattice)
pdf("paternally_inherited_het_rare_aaf_histogram.pdf", width=6, height=6)
histogram(~af | id, z, layout=c(1,3), breaks=seq(0,1,0.01), type="c")
dev.off()
# 19 of 225 sites have some alternate allele reads in the mother
q = subset(y, y$child_alt_af >= 0.2 & y$dad_alt_af >= 0.2)
pdf("dad_vs_child_af.pdf", width=6, height=6)
xyplot(child_alt_af ~ dad_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
dev.off()
1
pdf("mom_vs_child_af.pdf", width=6, height=6)
xyplot(child_alt_af ~ mom_alt_af, q, xlim=c(-0.05,1.05), ylim=c(-0.05,1.05))
dev.off()
```
# All families in the batch
```
for ((i = 1; i <= 32; i = i + 1))
do
    # sample IDs for the child, father and mother of family i, taken from affected_child.ped
    head -n $i affected_child.ped | tail -n 1 | cut -f 2 > child_id.txt
    head -n $i affected_child.ped | tail -n 1 | cut -f 3 > father_id.txt
    head -n $i affected_child.ped | tail -n 1 | cut -f 4 > mother_id.txt
    family_id=$(head -n $i affected_child.ped | tail -n 1 | cut -f 1)
    # biallelic SNVs where the child is het, the mother is hom ref and the father carries the alt allele
    bcftools view -m2 -M2 -v snps ${family_id}-gatk-haplotype-annotated.vcf.gz | bcftools filter -i 'GT[@mother_id.txt] = "ref" && GT[@child_id.txt] = "het" && GT[@father_id.txt] != "ref" && GT[@father_id.txt] != "mis"' | bgzip -c > ${family_id}_paternally_inherited_het.vcf.gz
    tabix ${family_id}_paternally_inherited_het.vcf.gz
    # rare-variant filter with VASE, then flatten GT/AD to a tab-delimited table
    /home/u035/project/software/bcbio/anaconda/bin/vase -i ${family_id}_paternally_inherited_het.vcf.gz -o ${family_id}_paternally_inherited_het_rare.vcf.gz --freq 0.001 --csq all
    tabix ${family_id}_paternally_inherited_het_rare.vcf.gz
    cp header.txt ${family_id}_paternally_inherited_het_rare.txt
    bcftools query -f "%CHROM\t%POS\t%REF\t%ALT[\t%GT\t%AD]\n" ${family_id}_paternally_inherited_het_rare.vcf.gz | sed -e 's/,/\t/g' >> ${family_id}_paternally_inherited_het_rare.txt
done
```
Excluded the one family without VEP annotations from the aggregation below.
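How the exclusion was done isn't recorded here; a minimal sketch, assuming the excluded family's ID is known (EXCLUDED_FAMILY below is a placeholder), would be to drop that row from affected_child.ped before the aggregation:
```
# hypothetical: remove the family without VEP annotations from the PED-style list
grep -v -w "EXCLUDED_FAMILY" affected_child.ped > affected_child.with_vep.ped
mv affected_child.with_vep.ped affected_child.ped
```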
```
R
ids = read.table("affected_child.ped", header=F, sep="\t", stringsAsFactors=F)
x = data.frame()
for (family_id in ids$V1) {
    y = read.table(sprintf("%s_paternally_inherited_het_rare.txt", family_id), header=T, stringsAsFactors=F, sep="\t", na.strings=".")
    y$family_id = family_id
    x = rbind(x, y)
}
x$mom_alt_af = x$mom_alt_dp / (x$mom_alt_dp + x$mom_ref_dp)
x$dad_alt_af = x$dad_alt_dp / (x$dad_alt_dp + x$dad_ref_dp)
x$child_alt_af = x$child_alt_dp / (x$child_alt_dp + x$child_ref_dp)
x$count = 1
q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2)
total = aggregate(q$count, by = list(q$family_id), sum)
q = subset(x, x$child_alt_af >= 0.2 & x$dad_alt_af >= 0.2 & x$mom_alt_dp > 0)
mom_has_reads = aggregate(q$count, by = list(q$family_id), sum)
output = data.frame(family_id = total$Group.1, total = total$x, mreads = mom_has_reads$x)
output$mprop = output$mreads / output$total
outsub = subset(output, output$total > 2)
# 22 of 28 families have a decent number of sites; not sure what's up with the other 6,
# which have only 1 or 2 sites each, so these are excluded from the analysis (total > 2)
summary(outsub)
family_id total mreads mprop
18735_412876: 1 Min. :136.0 Min. : 6.00 Min. :0.03109
18735_414260: 1 1st Qu.:178.0 1st Qu.:12.25 1st Qu.:0.06782
18735_415251: 1 Median :198.0 Median :16.00 Median :0.07705
18735_421930: 1 Mean :192.6 Mean :15.59 Mean :0.08072
18735_424464: 1 3rd Qu.:204.8 3rd Qu.:18.00 3rd Qu.:0.09609
18735_424521: 1 Max. :230.0 Max. :26.00 Max. :0.13065
(Other) :16
quantile(q$mom_alt_dp + q$mom_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50%
7.0 19.5 29.0 45.5 57.0 69.5 79.0 88.0 101.0 110.0 124.0
55% 60% 65% 70% 75% 80% 85% 90% 95% 100%
143.5 168.0 181.0 200.0 229.0 264.0 296.5 360.0 425.0 1023.0
quantile(q$dad_alt_dp + q$dad_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
2.0 13.0 27.0 47.0 55.0 65.5 78.0 87.0 95.0 111.0 126.0 138.0 149.0
65% 70% 75% 80% 85% 90% 95% 100%
175.5 201.0 223.0 245.0 281.5 350.0 426.5 829.0
quantile(q$child_alt_dp + q$child_ref_dp, seq(0,1,0.05))
0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60%
3.0 21.5 30.0 43.0 51.0 61.0 72.0 79.0 92.0 101.5 122.0 141.5 157.0
65% 70% 75% 80% 85% 90% 95% 100%
174.0 190.0 215.0 238.0 261.0 315.0 402.5 846.0
```
# Run 18170
There are 5 samples on this run that failed on run 18840, each from a different family. We've put each whole family into the PED file; the rest of each family's data are on run 18840. The repeated samples on this run are listed below (a quick PED sanity check is sketched after the list):
* 125778 (family 428004)
* 125467 (family 427734)
* 101846 (family 429213)
* 125303 (family 427554)
* 125783 (family 428055)
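Not part of the original checklist, just a quick sanity check: each repeated sample's whole family should appear in the new PED file. The PED filename is taken from the parameter file generation step below; the exact column layout is assumed.
```
# sanity check (sketch): confirm each repeated sample and its family are in the batch PED
for sample in 125778 125467 101846 125303 125783
do
    echo "== $sample =="
    grep -w "$sample" 18957_WES_PED_110621.txt
done
```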
Other analysis notes:
* 430321 – Quad, 2 affected children. Please perform trio-based analysis per child; the 2 are not similarly affected.
* 429107 – Quad, 2 similarly affected children. Please perform shared variant analysis between 88303 and 80901, and trio-based analysis per child (a rough illustration of the shared-variant filtering follows this list).
* 429453 – Trio, mother & child similarly affected. Please perform trio-based analysis and shared variant analysis between 126602 and 126605.
* 429187 – Duo, for Congenica analysis only.
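As a rough illustration only, not the actual shared variant analysis workflow: a bcftools filter in the same style as the sections above, keeping variants carried by both affected children of family 429107, might look like this. The VCF filename follows the naming convention used earlier and is an assumption.
```
# hypothetical sketch: biallelic variants where both affected siblings carry an alt allele
echo 88303 > affected1.txt
echo 80901 > affected2.txt
bcftools view -m2 -M2 18957_429107-gatk-haplotype-annotated.vcf.gz \
  | bcftools filter -i 'GT[@affected1.txt] != "ref" && GT[@affected1.txt] != "mis" && GT[@affected2.txt] != "ref" && GT[@affected2.txt] != "mis"' \
  | bgzip -c > 18957_429107_shared_affected.vcf.gz
```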
## Read setup
```
cd /scratch/u035/project/trio_whole_exome/analysis/reads
mkdir 18170_Ansari_Morad
cd 18170_Ansari_Morad
# copy the previous run's reads for the 5 affected families
cp ../17710_Ansari_Morad/*428004* ./
cp ../17710_Ansari_Morad/*427734* ./
cp ../17710_Ansari_Morad/*429213* ./
cp ../17710_Ansari_Morad/*427554* ./
cp ../17710_Ansari_Morad/*428055* ./
# check what was copied for the repeated samples before deleting
ls -lh 125778_428004*
ls -lh 125467_427734*
ls -lh 101846_429213*
ls -lh 125303_427554*
ls -lh 125783_428055*
# remove the old reads for the repeated (failed) samples, which were re-sequenced on this run
rm 125778_428004*
rm 125467_427734*
rm 101846_429213*
rm 125303_427554*
rm 125783_428055*
```
## Parameter file generation
```
cd /scratch/u035/project/trio_whole_exome/analysis/params
version=v1
project_id=18170_Ansari_Morad
sample_suffix=_WESTwist_IDT-A
ped_file=18957_WES_PED_110621.txt
ln -s $ped_file $project_id.ped
```
Edit the parameter generation script to exit after running the Perl script for producing the PED files.
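The exact change isn't recorded here; the idea is just a temporary early exit in prepare_bcbio_config.sh, placed immediately after the step that writes the per-family PED files, roughly like this (illustrative only, not the script's real contents):
```
# illustrative only - inside prepare_bcbio_config.sh, directly after the
# PED-generation step for this batch:
exit 0   # temporary: stop here so the per-family PED files can be adjusted by hand
```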
```
/home/u035/project/scripts/prepare_bcbio_config.sh \
/home/u035/project/scripts/trio_whole_exome_config.sh \
$project_id $version $sample_suffix &> ${version}_${project_id}.log
```
Copy the PED files for the above families from the previous run and edit the batch id.
```
for family in 428004 427734 429213 427554 428055
do
cp 17710_Ansari_Morad_18840_${family}.ped 18170_Ansari_Morad_18957_${family}.ped
perl -pi -e 's/18840/18957/' 18170_Ansari_Morad_18957_${family}.ped
done
```
Re-generate the family ids list.
```
cut -f 1 /scratch/u035/project/trio_whole_exome/analysis/params/18170_Ansari_Morad.ped | sort -u | awk '{ print "18957_" $0 }' > 18170_Ansari_Morad.family_ids.txt
```
Edit the parameter generation script to skip running the Perl script for producing the PED files.
```
/home/u035/project/scripts/prepare_bcbio_config.sh \
/home/u035/project/scripts/trio_whole_exome_config.sh \
$project_id $version $sample_suffix &> ${version}_${project_id}.log
```
Uncomment the skipped section of the parameter generation script.
Fix the config files for these families.
```
for family in 428004 427734 429213 427554 428055
do
cp v1_17710_Ansari_Morad_18840_${family}.yaml v1_18170_Ansari_Morad_18957_${family}.yaml
perl -pi -e 's/18840/18957/' v1_18170_Ansari_Morad_18957_${family}.yaml
perl -pi -e 's/17710/18170/' v1_18170_Ansari_Morad_18957_${family}.yaml
done
```
...
```
qsub -v \
...
/home/u035/project/scripts/submit_trio_wes_priority_and_qc_checksums.sh
```
5. Clean up the output directory.
```
cd /home/u035/project/trio_whole_exome/
mkdir ${version}_${project_id}
mv *${version}_${project_id}* ${version}_${project_id}/
```
6. Archive the output to /archive/u035/trio_whole_exome, excluding the BAM files. Confirm md5 checksums on the archived files. There should be no lines ending in ‘FAIL’. If there are any, investigate which file(s) are affected, manually copy them, and then re-check the md5 of the archived copies.
```
qsub -v \
...
grep -c FAIL trio_whole_exome_archive_project.$project_id*
```
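The note above says to investigate any FAIL lines by hand; a rough sketch of that follow-up, with the archive layout assumed to mirror the output directory (the failing path is a placeholder, not taken from the actual scripts):
```
# hypothetical follow-up if any FAIL lines are reported
grep FAIL trio_whole_exome_archive_project.$project_id*      # see which files failed
failed_file=relative/path/from/the/FAIL/line                 # placeholder
cp /home/u035/project/trio_whole_exome/$failed_file /archive/u035/trio_whole_exome/$failed_file
md5sum /home/u035/project/trio_whole_exome/$failed_file /archive/u035/trio_whole_exome/$failed_file
```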
7. Clean up the logs directory.
```
cd /home/u035/project/trio_whole_exome/logs
mv *.${project_id}.* complete/
```
8. Confirmation of copy to tape. EPCC runs a cron job at 8am on Monday mornings to check the DMF file state of all files in /archive/u035/trio_whole_exome. This produces a text report in /archive/u035/confirmation, named by date in yyyy-mm-dd format.
Check for ‘DUL’ entries for files in the confirmation report. The grep command below excludes directories (generally marked as ‘REG’), blank lines, and total file size lines. If not all entries are DUL, check the non-DUL ones for error states as described below. Some files may still be in the process of migration; if so, check again in the following week's report.
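The grep command referred to above is not included in this excerpt; as a sketch only, a filter along these lines would list entries that are not yet DUL while skipping directory (REG), blank, and total-size lines (the report path and naming are assumed from the description above):
```
# hypothetical sketch: show non-DUL entries in a given week's confirmation report
report=/archive/u035/confirmation/$(date +%F)   # adjust to the relevant Monday's date; exact naming assumed
grep -v -e 'DUL' -e 'REG' -e '^$' -e 'total' "$report"
```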