Compare revisions

eddf83bc · eddf83bc · eddf83bc · eddf83bc · eddf83bc · eddf83bc
--- a/test_process_NHS_WES_trio.sh
+++ b/test_process_NHS_WES_trio.sh
--- a/test_run_processing.sh
+++ b/test_run_processing.sh
--- a/trio_cram_setup.sh
+++ b/trio_cram_setup.sh
--- a/trio_setup.sh
+++ b/trio_setup.sh
--- a/trio_wes_prepare_bcbio_config.sh
+++ b/trio_wes_prepare_bcbio_config.sh
--- a/trio_wes_prepare_bcbio_config_crf.sh
+++ b/trio_wes_prepare_bcbio_config_crf.sh
--- a/trio_wes_prepare_bcbio_config_singleton_from_duo.sh
+++ b/trio_wes_prepare_bcbio_config_singleton_from_duo.sh
--- a/trio_whole_exome_config.sh
+++ b/trio_whole_exome_config.sh
--- a/trio_whole_exome_create_parameter_files.pl
+++ b/trio_whole_exome_create_parameter_files.pl
--- a/docs/run-notes/19973_run_notes.md
+++ b/docs/run-notes/19973_run_notes.md
+# Run 19973
+## Pre-run info
+### Other family members on previous runs
+* 453234, parents (134865 and 134864) are on this run, proband was on run 19690 (132524)
+* 414996, proband (135612) and father (117093) are on this run, mother was on run 18203 (121256)
+* 457300, proband (134113) and mother (134114) are on this run, father was on run 19875 (134116)
+### Singletons
+* 457456 (FFPE tissue, might fail)
+* 480173
+### Duos (for singleton analysis)
+* 457600 (proband: 134484)
+* 458099 (proband: 129313)
+### Priority
+* 414996, 480290, 457300, 481401, 481293
+## QC summary
+* 457546
+  * 135245 - proband - 0.3M reads, contamination 14%, C>T/G>A deamination artefacts (probably 30% of variants) - fail, do not proceed with prioritization
+* 457300
+  * 134113 - proband - 27X - low but usable
+  * 134116 - father - 33X - low but usable, failed per-tile sequence quality in FastQC (unusual)
+* 481089
+  * 105222 - mother - 42X - borderline low, fine to analyze
+* 473144
+  * 135054 - father - 45X - borderline low, fine to analyze
+* 414996
+  * 117093 - father - contamination 10% - proceed with prioritization with caution
--- a/main.nf
+++ b/main.nf
@@ -41,4 +41,3 @@ workflow {
        exit 1, 'params.workflow required - variant-calling or variant-prioritisation'
    }
 }
--- a/nextflow.config
+++ b/nextflow.config
+params {
+    max_cpus = 16
+    max_mem = 32.GB
+    max_time = 48.h
+    min_cpus = 1
+    min_mem = 1.GB
+    min_time = 2.h
+    bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml"
+}
+profiles {
+    standard {
+        process.executor = 'local'
+    }
+    debug {
+        process.echo = true
+    }
+    stubs {
+        process.executor = 'local'
+        params.max_cpus = 1
+        params.max_mem = 500.MB
+        params.max_time = 1.h
+        params.bcbio = "$projectDir/tests/scripts/bcbio_nextgen.py"
+        params.target_bed = "$projectDir/tests/assets/input_data/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed"
+        params.reference_genome = "$projectDir/tests/assets/ref.fa"
+        params.output_dir = "$projectDir/tests/outputs"
+    }
+    slurm {
+        process.executor = 'slurm'
+    }
+    sge {
+        process.executor = 'sge'
+    }
+}
 process {
-    executor = 'slurm'
+    cpus = get_cpus(4)
-    cpus = 4
+    memory = get_mem(8.GB)
-    memory = 8.GB
+    time = get_time(6.h)
-    time = '6h'
-    withLabel: small {
+    withLabel: local {
        executor = 'local'
-        cpus = 2
+    }
-        memory = 2.GB
+    withLabel: small {
+        cpus = get_cpus(2)
+        memory = get_mem(2.GB)
    }
    withLabel: medium {
-        cpus = 4
+        cpus = get_cpus(4)
-        memory = 8.GB
+        memory = get_mem(8.GB)
    }
    withLabel: large {
-        cpus = 16
+        cpus = get_cpus(16)
-        memory = 32.GB
+        memory = get_mem(32.GB)
    }
-}
-profiles {
+    withLabel: long {
-    debug {
+        time = get_time(48.h)
-        process.echo = true
    }
 }
+def get_cpus(cpus) {
+    return Math.min(
+        params.max_cpus,
+        Math.max(
+            params.min_cpus,
+            cpus
+        )
+    )
+}
+def get_mem(mem) {
+    return Math.min(
+        params.max_mem.size,
+        Math.max(
+            params.min_mem.size,
+            mem.size
+        )
+    ) as nextflow.util.MemoryUnit
+}
+def get_time(time) {
+    return Math.min(
+        params.max_time.toMillis(),
+        Math.max(
+            params.min_time.toMillis(),
+            time.toMillis()
+        )
+    ) as nextflow.util.Duration
+}
--- a/pipeline/var_calling.nf
+++ b/pipeline/var_calling.nf
@@ -37,13 +37,13 @@ process write_bcbio_csv {
    script:
    """
-    #!/usr/bin/env python
+    #!/usr/bin/env python3
    import os
    target_bed = os.path.realpath('${target_bed}')
    individual_info = '$individual_info'
    lines = individual_info.lstrip('[').rstrip(']').split('], [')
    with open('${family_id}.csv', 'w') as f:
        f.write('samplename,description,batch,sex,phenotype,variant_regions\\n')
        for l in lines:
@@ -71,6 +71,38 @@ process bcbio_family_processing {
    cd ${family_id}-merged &&
    ../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
    """
+    stub:
+    """
+    output_dir=${family_id}-merged/results
+    family_dir="${family_id}-merged/results/\$(date '+%Y-%m-%d')_${family_id}-merged"
+    mkdir -p \$family_dir
+    mkdir ${family_id}-merged/config
+    touch ${family_id}-merged/config/${family_id}-merged{.csv,.yaml,-template.yaml}
+    cd \$family_dir
+    touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz{,.tbi}" bcbio-nextgen{,-commands}.log data_versions.csv
+    touch project-summary.yaml metadata.csv programs.txt
+    mkdir multiqc
+    touch list_files_final.txt multiqc_config.yaml multiqc_report.html
+    mkdir multiqc_data report
+    cd ..
+    for i in ${individuals.collect().join(' ')}
+    do
+        mkdir -p \$i/qc
+        cd \$i
+        touch \$i-{callable.bed,ready.bam,ready.bam.bai}
+        cd qc
+        mkdir contamination coverage fastqc peddy samtools variants
+        touch contamination/\$i-verifybamid.{out,selfSM}
+        touch coverage/cleaned-Twist_Exome_RefSeq_targets_hg38.plus15bp-merged-padded.bed
+        touch fastqc/{\$i.zip,fastqc_data.txt,fastqc_report.html}
+        touch peddy/{\$i.ped_check.csv,\$i.peddy.ped,\$i.sex_check.csv}
+        touch samtools/{\$i-idxstats.txt,\$i.txt}
+        touch variants/\${i}_bcftools_stats.txt
+        cd ../..
+    done
+    """
 }
@@ -96,6 +128,7 @@ process format_bcbio_individual_outputs {
        ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
        ln -s \$indv_input/qc \$indv_output/qc &&
+        # todo: make cram compression its own process
        bam=\$indv_input/\$i-ready.bam
        cram="\$indv_output/\$i-ready.cram" &&
        \$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam &&
@@ -113,6 +146,29 @@ process format_bcbio_individual_outputs {
        fi
    done
    """
+    stub:
+    """
+    mkdir individual_outputs
+    for i in ${individuals.join(' ')}
+    do
+        indv_input=\$PWD/${bcbio_output_dir}/results/\$i
+        indv_output=individual_outputs/\$i &&
+        mkdir -p \$indv_output &&
+        ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
+        ln -s \$indv_input/qc \$indv_output/qc &&
+        bam=\$indv_input/\$i-ready.bam
+        cram="\$indv_output/\$i-ready.cram" &&
+        cp \$bam \$cram &&
+        touch \$cram.crai &&
+        bam_flagstat=./\$i-ready.bam.flagstat.txt &&
+        cram_flagstat=\$cram.flagstat.txt &&
+        touch \$bam_flagstat &&
+        touch \$cram_flagstat
+    done
+    """
 }
@@ -185,7 +241,7 @@ process collate_pipeline_outputs {
    outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
    mkdir \$outputs &&
    mkdir \$outputs/{config,families,params,prioritization,qc} &&
    for d in ${bcbio_family_output_dirs.join(' ')}
    do
        cp -rL \$d \$outputs/families/\$(basename \$d)
@@ -197,12 +253,14 @@ process collate_pipeline_outputs {
    done &&
    cd \$outputs/families &&
+    # todo: make multiqc its own process
    ../../${bcbio}/anaconda/bin/multiqc \
        --title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \
        --outdir ../qc \
        --filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \
        . &&
    peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
    peddy_validation.pl \
        --output \$peddy_validation_output \
@@ -232,6 +290,49 @@ process collate_pipeline_outputs {
        cp -L ${samplesheet} \$outputs/params/
    done
    """
+    stub:
+    """
+    outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
+    mkdir \$outputs &&
+    mkdir \$outputs/{config,families,params,prioritization,qc} &&
+    for d in ${bcbio_family_output_dirs.join(' ')}
+    do
+        cp -rL \$d \$outputs/families/\$(basename \$d)
+    done &&
+    for f in ${family_ids.join(' ')}
+    do
+        grep \$f ${ped_file} > \$outputs/params/\$f.ped
+    done &&
+    cd \$outputs/families &&
+    # todo: make multiqc its own process
+    echo "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" > ../qc/${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html &&
+    peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
+    peddy_validation.pl \
+        --output \$peddy_validation_output \
+        --project ${params.pipeline_project_id} \
+        --version ${params.pipeline_project_version} \
+        --ped ../../${ped_file} \
+        --families . &&
+    cd ../.. &&
+    for d in ${raw_bcbio_output_dirs.join(' ')}
+    do
+        family_id_merged=\$(basename \$d)
+        family_id=\$(echo \$family_id_merged | sed 's/-merged//') &&
+        dest_basename=${params.pipeline_project_id}_${params.pipeline_project_version}_\$family_id &&
+        cp -L \$d/config/\$family_id_merged.csv \$outputs/params/\$dest_basename.csv &&
+        cp -L \$d/config/\$family_id_merged.yaml \$outputs/config/\$dest_basename.yaml &&
+        cp -L ${ped_file} \$outputs/params/ &&
+        cp -L ${samplesheet} \$outputs/params/
+    done
+    """
 }
@@ -250,7 +351,7 @@ workflow process_families {
        ch_individuals
        ch_ped_file
        ch_samplesheet
    main:
        ch_bcbio = file(params.bcbio, checkIfExists: true)
        ch_bcbio_template = file(params.bcbio_template, checkIfExists: true)
@@ -301,7 +402,7 @@ workflow process_families {
            ch_bcbio_family_outputs,
            ch_bcbio,
            ch_reference_genome
        )
        ch_formatted_bcbio_outputs = format_bcbio_family_outputs(
            ch_bcbio_family_outputs.join(ch_individual_folders)

--- a/tests/Dockerfile
+++ b/tests/Dockerfile
+# Docker image for continuous integration testing in GitLab CI with
+# Docker executor. The image derives from Alma Linux 8.5, and adds
+# NextFlow and basic dependencies (Java, Python, Perl, etc.) but not
+# bioinformatics tools - these should be mocked up in CI with stubs
+FROM almalinux:8.6
+WORKDIR /opt
+RUN dnf install -y java-11-openjdk python3 perl > dnf_install.log 2>&1
+RUN curl -L -o ./nextflow https://github.com/nextflow-io/nextflow/releases/download/v22.04.3/nextflow-22.04.3-all && chmod u+x nextflow
+ENV PATH /opt:$PATH
+ENTRYPOINT /bin/bash
--- a/tests/assets/bcbio/bcbio_template.yaml
+++ b/tests/assets/bcbio/bcbio_template.yaml
-details:
- algorithm:
-    platform: illumina
-    quality_format: standard
-    aligner: bwa
-    align_split_size: false
-    trim_reads: fastp
-    adapters: [nextera2, polyg]
-    mark_duplicates: true
-    realign: false
-    recalibrate: true
-    effects: vep
-    effects_transcripts: all
-    variantcaller: gatk-haplotype
-    indelcaller: false
-    remove_lcr: true
-    tools_on:
-    - vep_splicesite_annotations
-  analysis: variant2
-  genome_build: hg38
-upload:
-  dir: outputs/bcbio/results
--- a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz
--- a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
--- a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz
--- a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
--- a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz
No results found