Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • igmmbioinformatics/trio-whole-exome
1 result
Show changes
Showing
with 242 additions and 44 deletions
File moved
File moved
File moved
# Run 19973
## Pre-run info
### Other family members on previous runs
* 453234, parents (134865 and 134864) are on this run, proband was on run 19690 (132524)
* 414996, proband (135612) and father (117093) are on this run, mother was on run 18203 (121256)
* 457300, proband (134113) and mother (134114) are on this run, father was on run 19875 (134116)
### Singletons
* 457456 (FFPE tissue, might fail)
* 480173
### Duos (for singleton analysis)
* 457600 (proband: 134484)
* 458099 (proband: 129313)
### Priority
* 414996, 480290, 457300, 481401, 481293
## QC summary
* 457546
* 135245 - proband - 0.3M reads, contamination 14%, C>T/G>A deamination artefacts (probably 30% of variants) - fail, do not proceed with prioritization
* 457300
* 134113 - proband - 27X - low but usable
* 134116 - father - 33X - low but usable, failed per-tile sequence quality in FastQC (unusual)
* 481089
* 105222 - mother - 42X - borderline low, fine to analyze
* 473144
* 135054 - father - 45X - borderline low, fine to analyze
* 414996
* 117093 - father - contamination 10% - proceed with prioritization with caution
...@@ -41,4 +41,3 @@ workflow { ...@@ -41,4 +41,3 @@ workflow {
exit 1, 'params.workflow required - variant-calling or variant-prioritisation' exit 1, 'params.workflow required - variant-calling or variant-prioritisation'
} }
} }
params {
max_cpus = 16
max_mem = 32.GB
max_time = 48.h
min_cpus = 1
min_mem = 1.GB
min_time = 2.h
bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml"
}
profiles {
standard {
process.executor = 'local'
}
debug {
process.echo = true
}
stubs {
process.executor = 'local'
params.max_cpus = 1
params.max_mem = 500.MB
params.max_time = 1.h
params.bcbio = "$projectDir/tests/scripts/bcbio_nextgen.py"
params.target_bed = "$projectDir/tests/assets/input_data/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed"
params.reference_genome = "$projectDir/tests/assets/ref.fa"
params.output_dir = "$projectDir/tests/outputs"
}
slurm {
process.executor = 'slurm'
}
sge {
process.executor = 'sge'
}
}
process { process {
executor = 'slurm' cpus = get_cpus(4)
cpus = 4 memory = get_mem(8.GB)
memory = 8.GB time = get_time(6.h)
time = '6h'
withLabel: small { withLabel: local {
executor = 'local' executor = 'local'
cpus = 2 }
memory = 2.GB
withLabel: small {
cpus = get_cpus(2)
memory = get_mem(2.GB)
} }
withLabel: medium { withLabel: medium {
cpus = 4 cpus = get_cpus(4)
memory = 8.GB memory = get_mem(8.GB)
} }
withLabel: large { withLabel: large {
cpus = 16 cpus = get_cpus(16)
memory = 32.GB memory = get_mem(32.GB)
} }
}
profiles { withLabel: long {
debug { time = get_time(48.h)
process.echo = true
} }
} }
def get_cpus(cpus) {
return Math.min(
params.max_cpus,
Math.max(
params.min_cpus,
cpus
)
)
}
def get_mem(mem) {
return Math.min(
params.max_mem.size,
Math.max(
params.min_mem.size,
mem.size
)
) as nextflow.util.MemoryUnit
}
def get_time(time) {
return Math.min(
params.max_time.toMillis(),
Math.max(
params.min_time.toMillis(),
time.toMillis()
)
) as nextflow.util.Duration
}
...@@ -37,13 +37,13 @@ process write_bcbio_csv { ...@@ -37,13 +37,13 @@ process write_bcbio_csv {
script: script:
""" """
#!/usr/bin/env python #!/usr/bin/env python3
import os import os
target_bed = os.path.realpath('${target_bed}') target_bed = os.path.realpath('${target_bed}')
individual_info = '$individual_info' individual_info = '$individual_info'
lines = individual_info.lstrip('[').rstrip(']').split('], [') lines = individual_info.lstrip('[').rstrip(']').split('], [')
with open('${family_id}.csv', 'w') as f: with open('${family_id}.csv', 'w') as f:
f.write('samplename,description,batch,sex,phenotype,variant_regions\\n') f.write('samplename,description,batch,sex,phenotype,variant_regions\\n')
for l in lines: for l in lines:
...@@ -71,6 +71,38 @@ process bcbio_family_processing { ...@@ -71,6 +71,38 @@ process bcbio_family_processing {
cd ${family_id}-merged && cd ${family_id}-merged &&
../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local ../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
""" """
stub:
"""
output_dir=${family_id}-merged/results
family_dir="${family_id}-merged/results/\$(date '+%Y-%m-%d')_${family_id}-merged"
mkdir -p \$family_dir
mkdir ${family_id}-merged/config
touch ${family_id}-merged/config/${family_id}-merged{.csv,.yaml,-template.yaml}
cd \$family_dir
touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz{,.tbi}" bcbio-nextgen{,-commands}.log data_versions.csv
touch project-summary.yaml metadata.csv programs.txt
mkdir multiqc
touch list_files_final.txt multiqc_config.yaml multiqc_report.html
mkdir multiqc_data report
cd ..
for i in ${individuals.collect().join(' ')}
do
mkdir -p \$i/qc
cd \$i
touch \$i-{callable.bed,ready.bam,ready.bam.bai}
cd qc
mkdir contamination coverage fastqc peddy samtools variants
touch contamination/\$i-verifybamid.{out,selfSM}
touch coverage/cleaned-Twist_Exome_RefSeq_targets_hg38.plus15bp-merged-padded.bed
touch fastqc/{\$i.zip,fastqc_data.txt,fastqc_report.html}
touch peddy/{\$i.ped_check.csv,\$i.peddy.ped,\$i.sex_check.csv}
touch samtools/{\$i-idxstats.txt,\$i.txt}
touch variants/\${i}_bcftools_stats.txt
cd ../..
done
"""
} }
...@@ -96,6 +128,7 @@ process format_bcbio_individual_outputs { ...@@ -96,6 +128,7 @@ process format_bcbio_individual_outputs {
ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed && ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
ln -s \$indv_input/qc \$indv_output/qc && ln -s \$indv_input/qc \$indv_output/qc &&
# todo: make cram compression its own process
bam=\$indv_input/\$i-ready.bam bam=\$indv_input/\$i-ready.bam
cram="\$indv_output/\$i-ready.cram" && cram="\$indv_output/\$i-ready.cram" &&
\$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam && \$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam &&
...@@ -113,6 +146,29 @@ process format_bcbio_individual_outputs { ...@@ -113,6 +146,29 @@ process format_bcbio_individual_outputs {
fi fi
done done
""" """
stub:
"""
mkdir individual_outputs
for i in ${individuals.join(' ')}
do
indv_input=\$PWD/${bcbio_output_dir}/results/\$i
indv_output=individual_outputs/\$i &&
mkdir -p \$indv_output &&
ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
ln -s \$indv_input/qc \$indv_output/qc &&
bam=\$indv_input/\$i-ready.bam
cram="\$indv_output/\$i-ready.cram" &&
cp \$bam \$cram &&
touch \$cram.crai &&
bam_flagstat=./\$i-ready.bam.flagstat.txt &&
cram_flagstat=\$cram.flagstat.txt &&
touch \$bam_flagstat &&
touch \$cram_flagstat
done
"""
} }
...@@ -185,7 +241,7 @@ process collate_pipeline_outputs { ...@@ -185,7 +241,7 @@ process collate_pipeline_outputs {
outputs=${params.pipeline_project_id}_${params.pipeline_project_version} outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
mkdir \$outputs && mkdir \$outputs &&
mkdir \$outputs/{config,families,params,prioritization,qc} && mkdir \$outputs/{config,families,params,prioritization,qc} &&
for d in ${bcbio_family_output_dirs.join(' ')} for d in ${bcbio_family_output_dirs.join(' ')}
do do
cp -rL \$d \$outputs/families/\$(basename \$d) cp -rL \$d \$outputs/families/\$(basename \$d)
...@@ -197,12 +253,14 @@ process collate_pipeline_outputs { ...@@ -197,12 +253,14 @@ process collate_pipeline_outputs {
done && done &&
cd \$outputs/families && cd \$outputs/families &&
# todo: make multiqc its own process
../../${bcbio}/anaconda/bin/multiqc \ ../../${bcbio}/anaconda/bin/multiqc \
--title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \ --title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \
--outdir ../qc \ --outdir ../qc \
--filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \ --filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \
. && . &&
peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt && peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
peddy_validation.pl \ peddy_validation.pl \
--output \$peddy_validation_output \ --output \$peddy_validation_output \
...@@ -232,6 +290,49 @@ process collate_pipeline_outputs { ...@@ -232,6 +290,49 @@ process collate_pipeline_outputs {
cp -L ${samplesheet} \$outputs/params/ cp -L ${samplesheet} \$outputs/params/
done done
""" """
stub:
"""
outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
mkdir \$outputs &&
mkdir \$outputs/{config,families,params,prioritization,qc} &&
for d in ${bcbio_family_output_dirs.join(' ')}
do
cp -rL \$d \$outputs/families/\$(basename \$d)
done &&
for f in ${family_ids.join(' ')}
do
grep \$f ${ped_file} > \$outputs/params/\$f.ped
done &&
cd \$outputs/families &&
# todo: make multiqc its own process
echo "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" > ../qc/${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html &&
peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
peddy_validation.pl \
--output \$peddy_validation_output \
--project ${params.pipeline_project_id} \
--version ${params.pipeline_project_version} \
--ped ../../${ped_file} \
--families . &&
cd ../.. &&
for d in ${raw_bcbio_output_dirs.join(' ')}
do
family_id_merged=\$(basename \$d)
family_id=\$(echo \$family_id_merged | sed 's/-merged//') &&
dest_basename=${params.pipeline_project_id}_${params.pipeline_project_version}_\$family_id &&
cp -L \$d/config/\$family_id_merged.csv \$outputs/params/\$dest_basename.csv &&
cp -L \$d/config/\$family_id_merged.yaml \$outputs/config/\$dest_basename.yaml &&
cp -L ${ped_file} \$outputs/params/ &&
cp -L ${samplesheet} \$outputs/params/
done
"""
} }
...@@ -250,7 +351,7 @@ workflow process_families { ...@@ -250,7 +351,7 @@ workflow process_families {
ch_individuals ch_individuals
ch_ped_file ch_ped_file
ch_samplesheet ch_samplesheet
main: main:
ch_bcbio = file(params.bcbio, checkIfExists: true) ch_bcbio = file(params.bcbio, checkIfExists: true)
ch_bcbio_template = file(params.bcbio_template, checkIfExists: true) ch_bcbio_template = file(params.bcbio_template, checkIfExists: true)
...@@ -301,7 +402,7 @@ workflow process_families { ...@@ -301,7 +402,7 @@ workflow process_families {
ch_bcbio_family_outputs, ch_bcbio_family_outputs,
ch_bcbio, ch_bcbio,
ch_reference_genome ch_reference_genome
) )
ch_formatted_bcbio_outputs = format_bcbio_family_outputs( ch_formatted_bcbio_outputs = format_bcbio_family_outputs(
ch_bcbio_family_outputs.join(ch_individual_folders) ch_bcbio_family_outputs.join(ch_individual_folders)
......
# Docker image for continuous integration testing in GitLab CI with
# Docker executor. The image derives from Alma Linux 8.5, and adds
# NextFlow and basic dependencies (Java, Python, Perl, etc.) but not
# bioinformatics tools - these should be mocked up in CI with stubs
FROM almalinux:8.6
WORKDIR /opt
RUN dnf install -y java-11-openjdk python3 perl > dnf_install.log 2>&1
RUN curl -L -o ./nextflow https://github.com/nextflow-io/nextflow/releases/download/v22.04.3/nextflow-22.04.3-all && chmod u+x nextflow
ENV PATH /opt:$PATH
ENTRYPOINT /bin/bash
details:
- algorithm:
platform: illumina
quality_format: standard
aligner: bwa
align_split_size: false
trim_reads: fastp
adapters: [nextera2, polyg]
mark_duplicates: true
realign: false
recalibrate: true
effects: vep
effects_transcripts: all
variantcaller: gatk-haplotype
indelcaller: false
remove_lcr: true
tools_on:
- vep_splicesite_annotations
analysis: variant2
genome_build: hg38
upload:
dir: outputs/bcbio/results