From 09a5543a7fd4796823871b99e45e0722bfebeaa4 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Fri, 25 Feb 2022 12:52:18 +0000 Subject: [PATCH 1/5] Adding process labels for resource managers. Fixing fastq merging, BCBio. Adding recognition for other input run formats. --- pipeline/main.nf | 73 +++++++++++++++++++++++------------------- pipeline/validation.nf | 39 +++++++++++++++++++--- 2 files changed, 74 insertions(+), 38 deletions(-) diff --git a/pipeline/main.nf b/pipeline/main.nf index 2fb6521..cae26d8 100644 --- a/pipeline/main.nf +++ b/pipeline/main.nf @@ -5,9 +5,13 @@ include {validation} from './validation.nf' params.bcbio = null params.bcbio_template = null +params.output_dir = null +params.target_bed = null process merge_fastqs { - publishDir "outputs/individuals/$indv_id/merged_fastqs", mode: 'copy' + label 'medium' + + publishDir "${params.output_dir}/individuals/$indv_id/merged_fastqs", mode: 'copy' input: tuple(val(indv_id), path(r1), path(r2)) @@ -22,13 +26,15 @@ process merge_fastqs { script: // todo: pigz if gzip becomes a bottleneck """ - cat ${r1.join(' ')} | gzip -c > ${indv_id}_merged_r1.fastq.gz & - cat ${r2.join(' ')} | gzip -c > ${indv_id}_merged_r2.fastq.gz + zcat ${r1.join(' ')} | gzip -c > ${indv_id}_merged_r1.fastq.gz & + zcat ${r2.join(' ')} | gzip -c > ${indv_id}_merged_r2.fastq.gz """ } process write_bcbio_csv { - publishDir "outputs/families/$family_id", mode: 'copy' + label 'small' + + publishDir "${params.output_dir}/families/$family_id", mode: 'copy' input: tuple(val(family_id), val(individual_info)) @@ -46,38 +52,29 @@ process write_bcbio_csv { with open('${family_id}.csv', 'w') as f: f.write('samplename,description,batch,sex,phenotype,variant_regions\\n') for l in lines: - l = l.lstrip('[').rstrip(']').split(', ') - f.write(','.join(l)) - f.write('\\n') + f.write(l.replace(', ', ',') + '\\n') """ } -process prepare_bcbio_yaml { - publishDir "outputs/families/$family_id", mode: 'copy' +process bcbio { + label 'large' - input: - tuple(val(family_id), val(family_fastqs)) - val(family_csv) + publishDir "${params.output_dir}/families/$family_id", mode: 'copy' - output: - path("${family_id}.yaml") + input: + tuple(val(family_id), val(individuals)) + path(family_csv) script: """ - ${params.bcbio_prepare_samples} --out . --csv $family_csv - - ${params.bcbio} -w template ${params.bcbio_template} ${family_id}-merged.csv ${family_fastqs.join(' ')} - """ -} + ${params.bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . 
--csv $family_csv && -process run_bcbio { - input: - path(bcbio_config) + ${params.bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${params.bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} && - script: - """ - ${params.bcbio} $bcbio_config -n 16 -t local + cd ${family_id}-merged && + export PATH=$PATH:${params.bcbio}/tools/bin && + ${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local """ } @@ -100,20 +97,30 @@ workflow prepare_bcbio_config { ch_joined_family_info = ch_individuals_by_family.map({ k, v -> v }) .join(ch_merged_fastqs) - ch_metadata = ch_joined_family_info.map( + ch_joined_family_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> - [family_id, [sample_id, father, mother, sex, phenotype, merged_r1, merged_r2]] - }).groupTuple() + [family_id, sample_id, father, mother, sex, phenotype, merged_r1] + }) + .tap {ch_read1_meta} // I hate this syntax so much - ch_family_fastqs = ch_joined_family_info.map( + ch_joined_family_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> - [family_id, merged_r1, merged_r2] + [family_id, sample_id, father, mother, sex, phenotype, merged_r2] + }) + .tap {ch_read2_meta} + + ch_metadata = ch_read1_meta.mix(ch_read2_meta).map( + { family_id, sample_id, father, mother, sex, phenotype, merged_fastq -> + [family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]] } ).groupTuple() - ch_bcbio_csv = write_bcbio_csv(ch_metadata) - ch_bcbio_yaml = prepare_bcbio_yaml(ch_family_fastqs, ch_bcbio_csv) - run_bcbio(ch_bcbio_yaml) + + ch_individuals = ch_joined_family_info.map( + { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> + [family_id, sample_id] + }).groupTuple() + bcbio(ch_individuals, ch_bcbio_csv) } diff --git a/pipeline/validation.nf b/pipeline/validation.nf index ba6c229..c02547a 100644 --- a/pipeline/validation.nf +++ b/pipeline/validation.nf @@ -3,14 +3,43 @@ nextflow.enable.dsl = 2 include {read_inputs} from './inputs.nf' process check_md5s { + label 'small' + input: - val(md5sum_file) + val(fastq_file) + // getParent() gives null object errors if this is a path script: """ - # I don't understand why getParent works since md5sum_file is not a path, but it works - cd ${md5sum_file.getParent()} - md5sum -c ${md5sum_file.getName()} + edgen_dir=${fastq_file.getParent().getParent()} + edgen_md5_file=\$edgen_dir/md5sums.txt + edgen_fastq=${fastq_file.getParent().getName()}/${fastq_file.getName()} + + crf_dir=${fastq_file.getParent()} + crf_md5_file="\$(ls \$crf_dir/*md5.txt | head -n 1)" + crf_fastq=${fastq_file.getName()} + + local_md5_file=${fastq_file}.md5 + + if [ -f \$edgen_md5_file ] + then + cd \$edgen_dir + md5sum -c <(cat md5sums.txt | grep \$edgen_fastq) + + elif [ -f \$crf_md5_file ] + then + cd \$crf_dir + md5sum -c <(cat \$crf_md5_file | grep \$crf_fastq) + + elif [ -f \$local_md5_file ] + then + cd ${fastq_file.getParent()} + md5sum -c \$local_md5_file + + else + echo "Could not find md5 file for $fastq_file" + exit 1 + fi """ } @@ -37,5 +66,5 @@ workflow validation { { fastq -> fastq.getParent().getParent() + '/md5sums.txt' } ).unique() - check_md5s(ch_md5_files) + check_md5s(ch_fastqs) } -- GitLab From 8f754358e536de8c2565814ea42dd2f931d06deb Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Fri, 1 Apr 2022 13:09:35 +0100 Subject: [PATCH 2/5] Moving BCBio outputs 
into the right places --- pipeline/inputs.nf | 32 +++++--- pipeline/main.nf | 173 +++++++++++++++++++++++++++++++++-------- pipeline/validation.nf | 19 ++--- 3 files changed, 172 insertions(+), 52 deletions(-) diff --git a/pipeline/inputs.nf b/pipeline/inputs.nf index e5f6b31..d9e1a63 100644 --- a/pipeline/inputs.nf +++ b/pipeline/inputs.nf @@ -64,8 +64,25 @@ workflow read_inputs { .groupTuple() /* - ch_individuals_by_family - merge of samplesheet and ped file information, - grouped by family ID: + ch_individuals - merge of samplesheet and ped file information: + [ + [ + indv_id, + family_id, + father_id, + mother_id, + sex, + affected, + [r1_fastqs], + [r2_fastqs] + ], + ... + ] + */ + ch_individuals = ch_ped_file_info.join(ch_samplesheet_info) + + /* + ch_individuals_by_family - as ch_individuals, but grouped by family: [ [ indv1, @@ -76,19 +93,16 @@ workflow read_inputs { mother_id, sex, affected, - r1_fastqs, - r2_fastqs + [r1_fastqs], + [r2_fastqs] ] ], ... ] */ - ch_individuals_by_family = ch_ped_file_info - .join(ch_samplesheet_info) - .map({[it[1], it]}) + ch_individuals_by_family = ch_individuals.map({[it[1], it]}) emit: - ch_samplesheet_info - ch_ped_file_info + ch_individuals ch_individuals_by_family } diff --git a/pipeline/main.nf b/pipeline/main.nf index cae26d8..c7df05e 100644 --- a/pipeline/main.nf +++ b/pipeline/main.nf @@ -1,20 +1,20 @@ nextflow.enable.dsl = 2 -include {read_inputs} from './inputs.nf' -include {validation} from './validation.nf' +include {read_inputs} from './pipeline/inputs.nf' +include {validation} from './pipeline/validation.nf' params.bcbio = null params.bcbio_template = null params.output_dir = null +params.pipeline_project_id = null +params.pipeline_project_version = null params.target_bed = null process merge_fastqs { label 'medium' - publishDir "${params.output_dir}/individuals/$indv_id/merged_fastqs", mode: 'copy' - input: - tuple(val(indv_id), path(r1), path(r2)) + tuple(val(indv_id), val(family_id), val(father), val(mother), val(sex), val(affected), path(r1), path(r2)) output: tuple( @@ -34,13 +34,11 @@ process merge_fastqs { process write_bcbio_csv { label 'small' - publishDir "${params.output_dir}/families/$family_id", mode: 'copy' - input: tuple(val(family_id), val(individual_info)) output: - path("${family_id}.csv") + tuple(val(family_id), path("${family_id}.csv")) script: """ @@ -60,26 +58,135 @@ process write_bcbio_csv { process bcbio { label 'large' - publishDir "${params.output_dir}/families/$family_id", mode: 'copy' - input: - tuple(val(family_id), val(individuals)) - path(family_csv) + tuple(val(family_id), val(individuals), path(family_csv)) + + output: + tuple(val(family_id), val(individuals), path("${family_id}-merged")) script: """ ${params.bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . 
--csv $family_csv && - ${params.bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${params.bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} && cd ${family_id}-merged && - export PATH=$PATH:${params.bcbio}/tools/bin && ${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local + + """ +} + + +process bcbio_family_outputs { + label 'small' + + input: + tuple(val(family_id), val(individuals), path(bcbio_output_dir)) + + script: + """ + merged_bcbio_family_output="\$(ls -d $bcbio_output_dir/results/????-??-??_* | head -n 1)" && + bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" && + run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" && + broken_family_id="\$(echo ${family_id} | sed 's/_//g')" && + + if [ \$merged_bcbio_family_output != \$bcbio_family_output ] && [ -d \$merged_bcbio_family_output ] + then + mv \$merged_bcbio_family_output \$bcbio_family_output + fi + + for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi} + do + src=\$bcbio_family_output/\${broken_family_id}-\${suffix} + if [ -f \$src ] + then + mv \$src \$bcbio_family_output/${family_id}-\${suffix} + fi + done && + + for sample in ${individuals.join(' ')} + do + if [ -d "$bcbio_output_dir/results/\$sample" ] + then + mv "$bcbio_output_dir/results/\$sample" "\$bcbio_family_output" + fi + done && + + cd \$bcbio_family_output && + + samtools=${params.bcbio}/anaconda/bin/samtools && + for bam in */*.bam + do + cram=\${bam%.bam}.cram && + \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam && + rm \$bam && + \$samtools index \$cram && + + \$samtools flagstat \$bam > \$bam.flagstat.txt && + \$samtools flagstat \$cram > \$cram.flagstat.txt && + diff \$bam.flagstat.txt \$cram.flagstat.txt + + if [ \$? -ne 0 ] + then + echo "Unexpected flagstat results in \$cram.flagstat.txt" + exit 1 + fi + done && + + for f in \$(find . -type f | grep -v '\\.bam') + do + md5sum \$f >> md5sum.txt + done """ +} + + +process collate_pipeline_outputs { + label 'small' + + publishDir "${params.output_dir}/${params.pipeline_project_id}/families/$family_id", mode: 'copy', pattern: "${bcbio_output_dir}" + input: + val(family_ids) + val(bcbio_family_output_dirs) + + script: + """ + mkdir -p outputs/{config,families,params,prioritization,qc} && + + for d in ${bcbio_family_output_dirs.join(' ')} + do + ln -s \$d outputs/families/\$(basename \$d) + done && + + cd outputs/families && + multiqc \ + --title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \ + --outdir ../qc \ + --filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \ + . && + + peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt + # trio_whole_exome_parse_peddy_ped_csv.pl \ + # --output \$peddy_output \ + # --project ${params.pipeline_project_id} \ + # --batch ${family_ids[0].split('_')[0]} \ + # --version ${params.pipeline_project_version} \ + # --ped ${params.ped_file} \ + # --families . && + + # no && here - exit status checked below + # grep -v 'False\$' \$peddy_output + # if [ \$? 
-ne 0 ] + # then + # echo "Found Peddy mismatches in \$peddy_output" + # exit 1 + # fi + + """ } -workflow prepare_bcbio_config { + +workflow run_bcbio { /* - For each individual, merge all R1 and R2 fastqs - For each family: @@ -89,21 +196,19 @@ workflow prepare_bcbio_config { */ take: - ch_samplesheet_info - ch_individuals_by_family + ch_individuals main: - ch_merged_fastqs = merge_fastqs(ch_samplesheet_info) - ch_joined_family_info = ch_individuals_by_family.map({ k, v -> v }) - .join(ch_merged_fastqs) + ch_merged_fastqs = merge_fastqs(ch_individuals) + ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs) - ch_joined_family_info.map( + ch_joined_indv_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> [family_id, sample_id, father, mother, sex, phenotype, merged_r1] }) .tap {ch_read1_meta} // I hate this syntax so much - ch_joined_family_info.map( + ch_joined_indv_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> [family_id, sample_id, father, mother, sex, phenotype, merged_r2] }) @@ -114,22 +219,28 @@ workflow prepare_bcbio_config { [family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]] } ).groupTuple() - ch_bcbio_csv = write_bcbio_csv(ch_metadata) + ch_bcbio_csvs = write_bcbio_csv(ch_metadata) - ch_individuals = ch_joined_family_info.map( + ch_bcbio_inputs = ch_joined_indv_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> [family_id, sample_id] - }).groupTuple() - bcbio(ch_individuals, ch_bcbio_csv) + }).groupTuple().join(ch_bcbio_csvs) + + ch_bcbio_outputs = bcbio(ch_bcbio_inputs) + bcbio_family_outputs(ch_bcbio_outputs) + + collate_pipeline_outputs( + ch_bcbio_outputs.map({it[0]}).collect(), + ch_bcbio_outputs.map({it[2]}).collect() + ) } workflow { read_inputs() - ch_samplesheet_info = read_inputs.out[0] - ch_ped_file_info = read_inputs.out[1] - ch_individuals_by_family = read_inputs.out[2] + ch_individual_info = read_inputs.out[0] - validation(ch_samplesheet_info) - prepare_bcbio_config(ch_samplesheet_info, ch_individuals_by_family) + validation(ch_individual_info) + run_bcbio(ch_individual_info) } + diff --git a/pipeline/validation.nf b/pipeline/validation.nf index c02547a..18058f8 100644 --- a/pipeline/validation.nf +++ b/pipeline/validation.nf @@ -1,7 +1,5 @@ nextflow.enable.dsl = 2 -include {read_inputs} from './inputs.nf' - process check_md5s { label 'small' @@ -51,20 +49,17 @@ workflow validation { */ take: - ch_samplesheet_info - + ch_indv_info + main: - ch_fastqs = ch_samplesheet_info - .map( - { indv, r1, r2 -> + ch_fastqs = ch_indv_info.map( + { indv, family, father, mother, sex, affected, r1, r2 -> [r1, r2] } - ).flatten() + ) + .flatten() .map({file(it)}) - - ch_md5_files = ch_fastqs.map( - { fastq -> fastq.getParent().getParent() + '/md5sums.txt' } - ).unique() check_md5s(ch_fastqs) } + -- GitLab From 9271ba1876771213a226f75925457e1c27ed3cc5 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Tue, 12 Apr 2022 17:44:35 +0100 Subject: [PATCH 3/5] Adding parse_peddy_output check, simplifying process inputs/outputs. Outputting BCBio outputs properly, fixing outputs at end - cp-RL and mode: move. 
Pulling changes to trio_whole_exome_parse_peddy_ped_csv.pl from ultra2 branch --- pipeline/inputs.nf | 2 +- pipeline/main.nf | 189 ++++++++++++++---------- trio_whole_exome_parse_peddy_ped_csv.pl | 34 +++-- 3 files changed, 132 insertions(+), 93 deletions(-) diff --git a/pipeline/inputs.nf b/pipeline/inputs.nf index d9e1a63..526df57 100644 --- a/pipeline/inputs.nf +++ b/pipeline/inputs.nf @@ -85,7 +85,7 @@ workflow read_inputs { ch_individuals_by_family - as ch_individuals, but grouped by family: [ [ - indv1, + family_id, [ indv_id, family_id, diff --git a/pipeline/main.nf b/pipeline/main.nf index c7df05e..546681d 100644 --- a/pipeline/main.nf +++ b/pipeline/main.nf @@ -1,7 +1,7 @@ nextflow.enable.dsl = 2 -include {read_inputs} from './pipeline/inputs.nf' -include {validation} from './pipeline/validation.nf' +include {read_inputs} from './inputs.nf' +include {validation} from './validation.nf' params.bcbio = null params.bcbio_template = null @@ -9,12 +9,14 @@ params.output_dir = null params.pipeline_project_id = null params.pipeline_project_version = null params.target_bed = null +params.parse_peddy_output = null + process merge_fastqs { label 'medium' input: - tuple(val(indv_id), val(family_id), val(father), val(mother), val(sex), val(affected), path(r1), path(r2)) + tuple(val(indv_id), path(r1), path(r2)) output: tuple( @@ -55,7 +57,7 @@ process write_bcbio_csv { } -process bcbio { +process bcbio_family_processing { label 'large' input: @@ -71,67 +73,89 @@ process bcbio { cd ${family_id}-merged && ${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local + """ +} + + +process format_bcbio_individual_outputs { + input: + tuple(val(family_id), val(individuals), path(bcbio_output_dir)) + + output: + tuple(val(family_id), path('individual_outputs')) + + script: + """ + samtools=${params.bcbio}/anaconda/bin/samtools && + mkdir individual_outputs + for i in ${individuals.join(' ')} + do + indv_input=\$PWD/${bcbio_output_dir}/results/\$i + indv_output=individual_outputs/\$i && + mkdir -p \$indv_output && + + ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed && + ln -s \$indv_input/qc \$indv_output/qc && + bam=\$indv_input/\$i-ready.bam + cram="\$indv_output/\$i-ready.cram" && + \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam && + \$samtools index \$cram && + bam_flagstat=./\$i-ready.bam.flagstat.txt && + cram_flagstat=\$cram.flagstat.txt && + \$samtools flagstat \$bam > \$bam_flagstat && + \$samtools flagstat \$cram > \$cram_flagstat && + diff \$bam_flagstat \$cram_flagstat + + if [ \$? 
-ne 0 ] + then + echo "Unexpected flagstat results in \$cram_flagstat" + exit 1 + fi + done """ } -process bcbio_family_outputs { +process format_bcbio_family_outputs { label 'small' input: - tuple(val(family_id), val(individuals), path(bcbio_output_dir)) + tuple(val(family_id), val(individuals), path(bcbio_output_dir), path(formatted_individual_outputs)) + + output: + tuple(val(family_id), path("*_${family_id}")) script: """ - merged_bcbio_family_output="\$(ls -d $bcbio_output_dir/results/????-??-??_* | head -n 1)" && + merged_bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" && bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" && run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" && broken_family_id="\$(echo ${family_id} | sed 's/_//g')" && - if [ \$merged_bcbio_family_output != \$bcbio_family_output ] && [ -d \$merged_bcbio_family_output ] - then - mv \$merged_bcbio_family_output \$bcbio_family_output - fi + bcbio_family_output="\$(basename \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" && + mkdir \$bcbio_family_output && for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi} do - src=\$bcbio_family_output/\${broken_family_id}-\${suffix} + src=\$merged_bcbio_family_output/\${broken_family_id}-\${suffix} if [ -f \$src ] then - mv \$src \$bcbio_family_output/${family_id}-\${suffix} + ln -s \$src \$bcbio_family_output/${family_id}-\${suffix} fi done && - for sample in ${individuals.join(' ')} + for f in \$merged_bcbio_family_output/*{.log,.csv,.yaml,multiqc} do - if [ -d "$bcbio_output_dir/results/\$sample" ] - then - mv "$bcbio_output_dir/results/\$sample" "\$bcbio_family_output" - fi + ln -s \$f \$bcbio_family_output/ done && - cd \$bcbio_family_output && - - samtools=${params.bcbio}/anaconda/bin/samtools && - for bam in */*.bam + for i in ${individuals.join(' ')} do - cram=\${bam%.bam}.cram && - \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam && - rm \$bam && - \$samtools index \$cram && - - \$samtools flagstat \$bam > \$bam.flagstat.txt && - \$samtools flagstat \$cram > \$cram.flagstat.txt && - diff \$bam.flagstat.txt \$cram.flagstat.txt - - if [ \$? -ne 0 ] - then - echo "Unexpected flagstat results in \$cram.flagstat.txt" - exit 1 - fi + ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$bcbio_family_output/\$i done && + cd \$bcbio_family_output && for f in \$(find . 
-type f | grep -v '\\.bam') do md5sum \$f >> md5sum.txt @@ -140,66 +164,77 @@ process bcbio_family_outputs { } + process collate_pipeline_outputs { label 'small' - publishDir "${params.output_dir}/${params.pipeline_project_id}/families/$family_id", mode: 'copy', pattern: "${bcbio_output_dir}" + publishDir "${params.output_dir}", mode: 'move', pattern: "${params.pipeline_project_id}_${params.pipeline_project_version}" input: - val(family_ids) val(bcbio_family_output_dirs) + output: + path("${params.pipeline_project_id}_${params.pipeline_project_version}") + script: """ - mkdir -p outputs/{config,families,params,prioritization,qc} && + outputs=${params.pipeline_project_id}_${params.pipeline_project_version} + mkdir \$outputs && + mkdir \$outputs/{config,families,params,prioritization,qc} && for d in ${bcbio_family_output_dirs.join(' ')} do - ln -s \$d outputs/families/\$(basename \$d) + cp -rL \$d \$outputs/families/\$(basename \$d) done && - cd outputs/families && - multiqc \ + cd \$outputs/families && + ${params.bcbio}/anaconda/bin/multiqc \ --title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \ --outdir ../qc \ --filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \ . && - peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt - # trio_whole_exome_parse_peddy_ped_csv.pl \ - # --output \$peddy_output \ - # --project ${params.pipeline_project_id} \ - # --batch ${family_ids[0].split('_')[0]} \ - # --version ${params.pipeline_project_version} \ - # --ped ${params.ped_file} \ - # --families . && + peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt && + perl ${params.parse_peddy_output} \ + --output \$peddy_output \ + --project ${params.pipeline_project_id} \ + --batch ${bcbio_family_output_dirs[0].getName().split('_')[1]} \ + --version ${params.pipeline_project_version} \ + --ped ${params.ped_file} \ + --families . && # no && here - exit status checked below - # grep -v 'False\$' \$peddy_output - # if [ \$? -ne 0 ] - # then - # echo "Found Peddy mismatches in \$peddy_output" - # exit 1 - # fi - + grep -v 'False\$' \$peddy_output + if [ \$? 
-ne 0 ] + then + echo "Found Peddy mismatches in \$peddy_output" + exit 1 + fi """ } -workflow run_bcbio { +workflow process_families { /* - - For each individual, merge all R1 and R2 fastqs - - For each family: + - For each individual in the family, merge all R1 and R2 fastqs + - For the family: - Read the relevant information from the samplesheet, ped files and merged fastqs - Write a BCBio config file - Run BCBio on it + - Move files around into the right places + - Run CRAM compression, MultiQC and MD5 checks */ take: ch_individuals main: - ch_merged_fastqs = merge_fastqs(ch_individuals) + ch_merged_fastqs = merge_fastqs( + ch_individuals.map( + { indv, family, father, mother, sex, affected, r1, r2 -> + [ indv, r1, r2 ] + }) + ) ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs) ch_joined_indv_info.map( @@ -214,33 +249,35 @@ workflow run_bcbio { }) .tap {ch_read2_meta} - ch_metadata = ch_read1_meta.mix(ch_read2_meta).map( - { family_id, sample_id, father, mother, sex, phenotype, merged_fastq -> - [family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]] - } - ).groupTuple() - ch_bcbio_csvs = write_bcbio_csv(ch_metadata) + ch_bcbio_csvs = write_bcbio_csv( + ch_read1_meta.mix(ch_read2_meta).map( + { family_id, sample_id, father, mother, sex, phenotype, merged_fastq -> + [family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]] + } + ).groupTuple() + ) ch_bcbio_inputs = ch_joined_indv_info.map( { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 -> [family_id, sample_id] }).groupTuple().join(ch_bcbio_csvs) - ch_bcbio_outputs = bcbio(ch_bcbio_inputs) - bcbio_family_outputs(ch_bcbio_outputs) - - collate_pipeline_outputs( - ch_bcbio_outputs.map({it[0]}).collect(), - ch_bcbio_outputs.map({it[2]}).collect() + ch_bcbio_family_outputs = bcbio_family_processing(ch_bcbio_inputs) + ch_individual_folders = format_bcbio_individual_outputs(ch_bcbio_family_outputs) + ch_formatted_bcbio_outputs = format_bcbio_family_outputs( + ch_bcbio_family_outputs.join(ch_individual_folders) ) + + collate_pipeline_outputs(ch_formatted_bcbio_outputs.map({it[1]}).collect()) } workflow { read_inputs() - ch_individual_info = read_inputs.out[0] + ch_individuals = read_inputs.out[0] + ch_individuals_by_family = read_inputs.out[1] - validation(ch_individual_info) - run_bcbio(ch_individual_info) + validation(ch_individuals) + process_families(ch_individuals) } diff --git a/trio_whole_exome_parse_peddy_ped_csv.pl b/trio_whole_exome_parse_peddy_ped_csv.pl index e72bda8..05e4d02 100644 --- a/trio_whole_exome_parse_peddy_ped_csv.pl +++ b/trio_whole_exome_parse_peddy_ped_csv.pl @@ -21,30 +21,33 @@ use IO::File; my $usage = qq{USAGE: $0 [--help] - --output Output directory - --ped Pedigree file for project - --project Project id - --batch Batch id - --version Analysis run version (v1, v2, etc) + --output Output file + --families Family directory + --ped Pedigree file for project + --project Project id + --batch Batch id + --version Analysis run version (v1, v2, etc) }; my $help = 0; my $ped_file; +my $fam_dir; my $project_id; my $version; -my $out_dir; +my $out_file; my $batch_id; GetOptions( - 'help' => \$help, - 'project=s' => \$project_id, - 'ped=s' => \$ped_file, - 'output=s' => \$out_dir, - 'version=s' => \$version, - 'batch=s' => \$batch_id + 'help' => \$help, + 'project=s' => \$project_id, + 'ped=s' => \$ped_file, + 'output=s' => \$out_file, + 'families=s' => \$fam_dir, + 'version=s' => \$version, + 'batch=s' => \$batch_id ) or 
die $usage; -if ($help || !$project_id || !$ped_file || !$out_dir || !$batch_id || !$version) +if ($help || !$project_id || !$ped_file || !$out_file || !$batch_id || !$version || !$fam_dir) { print $usage; exit(0); @@ -70,7 +73,6 @@ while (my $line = <$in_fh>) $in_fh->close(); -my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id); my $out_fh = new IO::File; $out_fh->open($out_file, "w") or die "Could not open $out_file\n$!"; @@ -78,8 +80,8 @@ printf $out_fh "project_id\tbatch_id\tsample_a\tsample_b\tpedigree_parents\tpred foreach my $family_id (sort keys %ped) { - my @peddy_glob = glob(sprintf("$out_dir/*/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv", - $version, $project_id, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id)); + my @peddy_glob = glob(sprintf("$fam_dir/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv", + $project_id, $version, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id)); next if (scalar(@peddy_glob) == 0); my $peddy_fh = new IO::File; -- GitLab From 95f1c2d0abfa8ad5eada82d153ab10ac6f94ea65 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Wed, 13 Apr 2022 12:14:19 +0100 Subject: [PATCH 4/5] Adding config/params metadata --- pipeline/main.nf | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pipeline/main.nf b/pipeline/main.nf index 546681d..7c55fec 100644 --- a/pipeline/main.nf +++ b/pipeline/main.nf @@ -124,7 +124,8 @@ process format_bcbio_family_outputs { tuple(val(family_id), val(individuals), path(bcbio_output_dir), path(formatted_individual_outputs)) output: - tuple(val(family_id), path("*_${family_id}")) + // we don't use bcbio_output_dir here, but we need it later + tuple(val(family_id), path("*_${family_id}"), path(bcbio_output_dir)) script: """ @@ -172,6 +173,7 @@ process collate_pipeline_outputs { input: val(bcbio_family_output_dirs) + val(raw_bcbio_output_dirs) output: path("${params.pipeline_project_id}_${params.pipeline_project_version}") @@ -209,7 +211,20 @@ process collate_pipeline_outputs { then echo "Found Peddy mismatches in \$peddy_output" exit 1 - fi + fi && + + cd ../.. && + + for d in ${raw_bcbio_output_dirs.join(' ')} + do + family_id_merged=\$(basename \$d) + family_id=\$(echo \$family_id_merged | sed 's/-merged//') && + dest_basename=${params.pipeline_project_id}_${params.pipeline_project_version}_\$family_id && + cp -L \$d/config/\$family_id_merged.csv \$outputs/params/\$dest_basename.csv && + cp -L \$d/config/\$family_id_merged.yaml \$outputs/config/\$dest_basename.yaml && + cp -L ${params.ped_file} \$outputs/params/ && + cp -L ${params.sample_sheet} \$outputs/params/ + done """ } @@ -268,7 +283,10 @@ workflow process_families { ch_bcbio_family_outputs.join(ch_individual_folders) ) - collate_pipeline_outputs(ch_formatted_bcbio_outputs.map({it[1]}).collect()) + collate_pipeline_outputs( + ch_formatted_bcbio_outputs.map({it[1]}).collect(), + ch_formatted_bcbio_outputs.map({it[2]}).collect() + ) } -- GitLab From 61c7c113543a5ec34f12dd6e91cbd11303e029d3 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Mon, 2 May 2022 11:42:30 +0100 Subject: [PATCH 5/5] Doc updates, updating format of sample sheets. Adding nextflow.config. Adding main.nf entrypoint, moving param declarations there. Making most input parameters their own Channels. 
--- README.md | 95 +++++++++++++------ main.nf | 47 +++++++++ nextflow.config | 30 ++++++ pipeline/inputs.nf | 20 ++-- pipeline/{main.nf => var_calling.nf} | 94 ++++++++++++------ .../ped_files/giab_test_non_trios.ped | 7 ++ .../input_data/ped_files/giab_test_trios.ped | 6 ++ .../input_data/sample_sheets/batch_1.tsv | 14 +-- .../sample_sheets/batch_2_md5_errors.tsv | 4 +- .../sample_sheets/giab_test_non_trios.tsv | 8 ++ .../sample_sheets/giab_test_trios.tsv | 7 ++ tests/run_giab_tests.sh | 31 ++++++ tests/run_tests.sh | 3 +- .../{test_config.sh => nextflow_detached.sh} | 0 14 files changed, 288 insertions(+), 78 deletions(-) create mode 100644 main.nf create mode 100644 nextflow.config rename pipeline/{main.nf => var_calling.nf} (77%) create mode 100644 tests/assets/input_data/ped_files/giab_test_non_trios.ped create mode 100644 tests/assets/input_data/ped_files/giab_test_trios.ped create mode 100644 tests/assets/input_data/sample_sheets/giab_test_non_trios.tsv create mode 100644 tests/assets/input_data/sample_sheets/giab_test_trios.tsv create mode 100755 tests/run_giab_tests.sh rename tests/scripts/{test_config.sh => nextflow_detached.sh} (100%) diff --git a/README.md b/README.md index dcd54f8..cc37a5c 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,56 @@ -# Trio-Whole-Exome pipeline +# Trio-Whole-Exome Pipeline This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios -project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/) +project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io). ## Setup -A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yaml`. Once you have Conda -installed, you can create an environment by `cd`-ing into this project and running the command: +This pipeline requires: - $ conda env create -n <environment_name> +- NextFlow +- An install of BCBio v1.2.8 +A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yml`. This can be created +with the command: -## Running + $ conda env create -n <environment_name> -f environment.yml + + +## Running the pipeline The pipeline requires two main input files: +### Configuration + +This pipeline uses a config at trio-whole-exome/nextflow.config, containing profiles for different sizes of process. +NextFlow picks this up automatically. + +A second config is necessary for providing executor and param information. This can be supplied via the `-c` argument. +Parameters: + +- `bcbio` - path to a BCBio install, containing 'anaconda', 'galaxy', 'genomes', etc +- `bcbio_template` - path to a template config for BCBio variant calling. Should set `upload.dir: ./results` so that + BCBio will output results to the working dir. +- `output_dir` - where the results get written to on the system. The variant calling creates initial results here, + and variant prioritisation adds to them +- `target_bed` - bed file of Twist exome targets +- `reference_genome` - hg38 reference genome in fasta format +- `parse_peddy_output` - path to the parse_peddy_output Perl script. Todo: remove once scripts are in bin/ + + ### Samplesheet -This is a tab-separated file mapping fastq pairs to metadata. The columns are individual ID, family ID, fastq sample ID, -r1 fastq and r2 fastq. 
If a sample has been sequenced over multiple lanes, then include a line for each fastq pair:
 
-    individual_id	family_id	sample_id	read_1	read_2
-    000001	000001	12345_000001_000001_WESTwist_IDT-B	path/to/lane_1_r1.fastq.gz	path/to/lane_1_r2.fastq.gz
-    000001	000001	12345_000001_000001_WESTwist_IDT-B	path/to/lane_2_r1.fastq.gz	path/to/lane_2_r2.fastq.gz
+This is a tab-separated file mapping individuals to fastq pairs. The columns are individual_id, read_1 and read_2. If a
+sample has been sequenced over multiple lanes, then include a line for each fastq pair:
+
+    individual_id	read_1	read_2
+    000001	path/to/lane_1_r1.fastq.gz	path/to/lane_1_r2.fastq.gz
+    000001	path/to/lane_2_r1.fastq.gz	path/to/lane_2_r2.fastq.gz
 
 ### Ped file
 
-Tab-separated Ped file mapping individuals to each other and affected status. Per the
+Tab-separated Ped file mapping individuals to each other, with family IDs and affected status. Per the
 [specification](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format), the columns are
 family ID, individual ID, father ID, mother ID, sex (1=male, 2=female, other=unknown), affected status (-9 or 0=missing,
 1=unaffected, 2=affected):
 
@@ -36,27 +59,45 @@ family ID, individual ID, father ID, mother ID, sex (1=male, 2=female, other=unk
     000001	000002	0	0	1	1
     000001	000003	0	0	2	1
 
-The pipeline can now be run. First, check for errors:
+The pipeline does support non-trios, e.g. singletons, duos, quads.
 
-    $ nextflow run pipeline/validation.nf --ped_file path/to/batch.ped --sample_sheet path/to/batch_samplesheet.tsv
+### Usage
+
+The pipeline can now be run. First, run the initial variant calling:
+
+    $ nextflow path/to/trio-whole-exome/main.nf \
+        -c path/to/nextflow.config \
+        --workflow 'variant-calling' \
+        --pipeline_project_id projname --pipeline_project_version v1 \
+        --ped_file path/to/batch.ped \
+        --sample_sheet path/to/samplesheet.tsv
+
+Todo: variant prioritisation workflow
 
-Todo: run the main processing
 
 ## Tests
 
 This pipeline has automated tests contained in the folder `tests/`. To run the tests locally, `cd` to this folder
-with your Conda environment active and run `./run_tests.sh`.
+with your Conda environment active and run the test scripts:
+
+- run_tests.sh
+- run_giab_tests.sh
+
+These tests use the environment variable `NEXTFLOW_CONFIG`, pointing to a platform-specific config file.
+
+## Terminology FAQ
 
-## Terminology
+- 'Batch'
+  - Slightly ambiguous term - can be a pipeline batch, a sequencing batch or a BCBio batch. To this end, a single
+    run of this pipeline is known as a project.
+- 'Pipeline project'
+  - A single run of this pipeline, potentially mixing samples and families from multiple sequencing batches. There's
+    always one Ped file and sample sheet per pipeline project.
+- 'Sequencing batch'
+  - A group of samples that were prepared and sequenced together.
+- 'BCBio batch'
+  - Used internally by BCBio to identify a family.
+- 'Sample ID' + - Specific to a sequencing batch, family ID, individual ID and extraction kit type -- Batch: slightly ambiguous term - could be a pipeline batch, a sequencing batch or a BCBio batch - - Pipeline batch: a single run of this pipeline, potentially mixing samples and families from multiple - sequencing batches - - Sequencing batch: a group of samples that were prepared and sequenced together - - BCBio batch: used internally by BCBio to identify a family -- Sample ID: specific to a sequencing batch, family ID, individual ID and extraction kit type -- file_list.tsv: there's one of these files per sequencing batch, summarising all fastqs in the batch. A - pipeline batch may need to refer to multiple different individuals across different file lists. -- Ped file: defines family relationships between individuals. There's always one Ped file per pipeline batch. -- Sample sheet: links the Ped file and file list(s) by defining what raw fastqs belong to each individual. diff --git a/main.nf b/main.nf new file mode 100644 index 0000000..58404a7 --- /dev/null +++ b/main.nf @@ -0,0 +1,47 @@ +nextflow.enable.dsl = 2 +include {var_calling} from './pipeline/var_calling.nf' + +// which part of the pipeline to run - either 'variant-calling' or 'variant-prioritisation' +params.workflow = null + +// path to a bcbio install, containing 'anaconda', 'galaxy', 'genomes', etc +params.bcbio = null + +// path to a template config for bcbio variant calling +params.bcbio_template = null + +// where the results get written to on the system. The variant calling creates initial +// results here, and variant prioritisation adds to them +params.output_dir = null + +// name of the pipeline batch, e.g. '21900', '20220427' +params.pipeline_project_id = null + +// version of the pipeline batch, e.g. 'v1' +params.pipeline_project_version = null + +// bed file of Twist exome targets +params.target_bed = null + +// hg38 reference genome in fasta format +params.reference_genome = null + +// path to the parse_peddy_output Perl script. Todo: remove once scripts are in bin/ +params.parse_peddy_output = null + +// path to a Ped file describing all the families in the pipeline batch +params.ped_file = null + +// path to a samplesheet mapping individual IDs to fastq pairs +params.sample_sheet = null + +workflow { + if (params.workflow == 'variant-calling') { + var_calling() + } else if (params.workflow == 'variant-prioritisation') { + println "Variant prioritisation coming soon" + } else { + exit 1, 'params.workflow required - variant-calling or variant-prioritisation' + } +} + diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..d861ce3 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,30 @@ + +process { + executor = 'slurm' + cpus = 4 + memory = 8.GB + time = '6h' + + withLabel: small { + executor = 'local' + cpus = 2 + memory = 2.GB + } + + withLabel: medium { + cpus = 4 + memory = 8.GB + } + + withLabel: large { + cpus = 16 + memory = 32.GB + } +} + +profiles { + debug { + process.echo = true + } +} + diff --git a/pipeline/inputs.nf b/pipeline/inputs.nf index 526df57..73d3a16 100644 --- a/pipeline/inputs.nf +++ b/pipeline/inputs.nf @@ -1,7 +1,5 @@ nextflow.enable.dsl = 2 -params.ped_file = null -params.sample_sheet = null workflow read_inputs { /* @@ -24,9 +22,8 @@ workflow read_inputs { ... 
] */ - ped_file = file(params.ped_file, checkIfExists: true) - ch_ped_file_info = Channel.fromPath(ped_file) - .splitCsv(sep: '\t') + ch_ped_file = Channel.fromPath(params.ped_file, checkIfExists: true) + ch_ped_file_info = ch_ped_file.splitCsv(sep: '\t') .map( { line -> [ @@ -55,9 +52,8 @@ workflow read_inputs { ] ] */ - samplesheet = file(params.sample_sheet, checkIfExists: true) - ch_samplesheet_info = Channel.fromPath(samplesheet) - .splitCsv(sep:'\t', header: true) + ch_samplesheet = Channel.fromPath(params.sample_sheet, checkIfExists: true) + ch_samplesheet_info = ch_samplesheet.splitCsv(sep:'\t', header: true) .map( { line -> [line.individual_id, file(line.read_1), file(line.read_2)] } ) @@ -103,6 +99,10 @@ workflow read_inputs { ch_individuals_by_family = ch_individuals.map({[it[1], it]}) emit: - ch_individuals - ch_individuals_by_family + ch_ped_file = ch_ped_file + ch_ped_file_info = ch_ped_file_info + ch_samplesheet = ch_samplesheet + ch_samplesheet_info = ch_samplesheet_info + ch_individuals = ch_individuals + ch_individuals_by_family = ch_individuals_by_family } diff --git a/pipeline/main.nf b/pipeline/var_calling.nf similarity index 77% rename from pipeline/main.nf rename to pipeline/var_calling.nf index 7c55fec..4ef7a4a 100644 --- a/pipeline/main.nf +++ b/pipeline/var_calling.nf @@ -3,14 +3,6 @@ nextflow.enable.dsl = 2 include {read_inputs} from './inputs.nf' include {validation} from './validation.nf' -params.bcbio = null -params.bcbio_template = null -params.output_dir = null -params.pipeline_project_id = null -params.pipeline_project_version = null -params.target_bed = null -params.parse_peddy_output = null - process merge_fastqs { label 'medium' @@ -38,6 +30,7 @@ process write_bcbio_csv { input: tuple(val(family_id), val(individual_info)) + path(target_bed) output: tuple(val(family_id), path("${family_id}.csv")) @@ -45,14 +38,16 @@ process write_bcbio_csv { script: """ #!/usr/bin/env python + import os + target_bed = os.path.realpath('${target_bed}') individual_info = '$individual_info' lines = individual_info.lstrip('[').rstrip(']').split('], [') with open('${family_id}.csv', 'w') as f: f.write('samplename,description,batch,sex,phenotype,variant_regions\\n') for l in lines: - f.write(l.replace(', ', ',') + '\\n') + f.write(l.replace(', ', ',') + ',' + target_bed + '\\n') """ } @@ -62,17 +57,19 @@ process bcbio_family_processing { input: tuple(val(family_id), val(individuals), path(family_csv)) + path(bcbio) + path(bcbio_template) output: tuple(val(family_id), val(individuals), path("${family_id}-merged")) script: """ - ${params.bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . --csv $family_csv && - ${params.bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${params.bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} && + ${bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . 
--csv $family_csv && + ${bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} && cd ${family_id}-merged && - ${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local + ../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local """ } @@ -80,13 +77,15 @@ process bcbio_family_processing { process format_bcbio_individual_outputs { input: tuple(val(family_id), val(individuals), path(bcbio_output_dir)) + path(bcbio) + path(reference_genome) output: tuple(val(family_id), path('individual_outputs')) script: """ - samtools=${params.bcbio}/anaconda/bin/samtools && + samtools=${bcbio}/anaconda/bin/samtools && mkdir individual_outputs for i in ${individuals.join(' ')} do @@ -99,7 +98,7 @@ process format_bcbio_individual_outputs { bam=\$indv_input/\$i-ready.bam cram="\$indv_output/\$i-ready.cram" && - \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam && + \$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam && \$samtools index \$cram && bam_flagstat=./\$i-ready.bam.flagstat.txt && cram_flagstat=\$cram.flagstat.txt && @@ -165,15 +164,19 @@ process format_bcbio_family_outputs { } - process collate_pipeline_outputs { label 'small' publishDir "${params.output_dir}", mode: 'move', pattern: "${params.pipeline_project_id}_${params.pipeline_project_version}" input: + val(family_ids) val(bcbio_family_output_dirs) val(raw_bcbio_output_dirs) + path(ped_file) + path(samplesheet) + path(bcbio) + path(parse_peddy_output) output: path("${params.pipeline_project_id}_${params.pipeline_project_version}") @@ -189,20 +192,25 @@ process collate_pipeline_outputs { cp -rL \$d \$outputs/families/\$(basename \$d) done && + for f in ${family_ids.join(' ')} + do + grep \$f ${ped_file} > \$outputs/params/\$f.ped + done && + cd \$outputs/families && - ${params.bcbio}/anaconda/bin/multiqc \ + ../../${bcbio}/anaconda/bin/multiqc \ --title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \ --outdir ../qc \ --filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \ . && peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt && - perl ${params.parse_peddy_output} \ + perl ../../${parse_peddy_output} \ --output \$peddy_output \ --project ${params.pipeline_project_id} \ --batch ${bcbio_family_output_dirs[0].getName().split('_')[1]} \ --version ${params.pipeline_project_version} \ - --ped ${params.ped_file} \ + --ped ../../${ped_file} \ --families . 
&& # no && here - exit status checked below @@ -222,8 +230,8 @@ process collate_pipeline_outputs { dest_basename=${params.pipeline_project_id}_${params.pipeline_project_version}_\$family_id && cp -L \$d/config/\$family_id_merged.csv \$outputs/params/\$dest_basename.csv && cp -L \$d/config/\$family_id_merged.yaml \$outputs/config/\$dest_basename.yaml && - cp -L ${params.ped_file} \$outputs/params/ && - cp -L ${params.sample_sheet} \$outputs/params/ + cp -L ${ped_file} \$outputs/params/ && + cp -L ${samplesheet} \$outputs/params/ done """ } @@ -242,8 +250,16 @@ workflow process_families { take: ch_individuals + ch_ped_file + ch_samplesheet main: + ch_bcbio = file(params.bcbio, checkIfExists: true) + ch_bcbio_template = file(params.bcbio_template, checkIfExists: true) + ch_target_bed = file(params.target_bed, checkIfExists: true) + ch_parse_peddy_output = file(params.parse_peddy_output, checkIfExists: true) + ch_reference_genome = file(params.reference_genome, checkIfExists: true) + ch_merged_fastqs = merge_fastqs( ch_individuals.map( { indv, family, father, mother, sex, affected, r1, r2 -> @@ -267,9 +283,10 @@ workflow process_families { ch_bcbio_csvs = write_bcbio_csv( ch_read1_meta.mix(ch_read2_meta).map( { family_id, sample_id, father, mother, sex, phenotype, merged_fastq -> - [family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]] + [family_id, [merged_fastq, sample_id, family_id, sex, phenotype]] } - ).groupTuple() + ).groupTuple(), + ch_target_bed ) ch_bcbio_inputs = ch_joined_indv_info.map( @@ -277,25 +294,42 @@ workflow process_families { [family_id, sample_id] }).groupTuple().join(ch_bcbio_csvs) - ch_bcbio_family_outputs = bcbio_family_processing(ch_bcbio_inputs) - ch_individual_folders = format_bcbio_individual_outputs(ch_bcbio_family_outputs) + ch_bcbio_family_outputs = bcbio_family_processing( + ch_bcbio_inputs, + ch_bcbio, + ch_bcbio_template + ) + + ch_individual_folders = format_bcbio_individual_outputs( + ch_bcbio_family_outputs, + ch_bcbio, + ch_reference_genome + + ) ch_formatted_bcbio_outputs = format_bcbio_family_outputs( ch_bcbio_family_outputs.join(ch_individual_folders) ) collate_pipeline_outputs( + ch_formatted_bcbio_outputs.map({it[0]}).collect(), ch_formatted_bcbio_outputs.map({it[1]}).collect(), - ch_formatted_bcbio_outputs.map({it[2]}).collect() + ch_formatted_bcbio_outputs.map({it[2]}).collect(), + ch_ped_file, + ch_samplesheet, + ch_bcbio, + ch_parse_peddy_output ) } -workflow { +workflow var_calling { read_inputs() - ch_individuals = read_inputs.out[0] - ch_individuals_by_family = read_inputs.out[1] - validation(ch_individuals) - process_families(ch_individuals) + validation(read_inputs.out.ch_individuals) + process_families( + read_inputs.out.ch_individuals, + read_inputs.out.ch_ped_file, + read_inputs.out.ch_samplesheet + ) } diff --git a/tests/assets/input_data/ped_files/giab_test_non_trios.ped b/tests/assets/input_data/ped_files/giab_test_non_trios.ped new file mode 100644 index 0000000..415e097 --- /dev/null +++ b/tests/assets/input_data/ped_files/giab_test_non_trios.ped @@ -0,0 +1,7 @@ +00001_000001 000001_000001 0 000003_000002 1 2 +00001_000001 000003_000001 0 0 2 1 +00001_000002 000004_000002 0 0 1 2 +00001_000003 000005_000003 000007_000003 000008_000003 1 2 +00001_000003 000006_000003 000007_000003 000008_000003 2 2 +00001_000003 000007_000003 0 0 1 1 +00001_000003 000008_000003 0 0 2 1 diff --git a/tests/assets/input_data/ped_files/giab_test_trios.ped b/tests/assets/input_data/ped_files/giab_test_trios.ped new file mode 
100644 index 0000000..566cce4 --- /dev/null +++ b/tests/assets/input_data/ped_files/giab_test_trios.ped @@ -0,0 +1,6 @@ +00001_000001 000001_000001 000002_000002 000003_000002 1 2 +00001_000001 000002_000001 0 0 1 1 +00001_000001 000003_000001 0 0 2 1 +00001_000002 000004_000002 000005_000002 000006_000002 1 2 +00001_000002 000005_000002 0 0 1 1 +00001_000002 000006_000002 0 0 2 1 diff --git a/tests/assets/input_data/sample_sheets/batch_1.tsv b/tests/assets/input_data/sample_sheets/batch_1.tsv index 7b93d54..f0a24cc 100644 --- a/tests/assets/input_data/sample_sheets/batch_1.tsv +++ b/tests/assets/input_data/sample_sheets/batch_1.tsv @@ -1,7 +1,7 @@ -individual_id family_id sample_id read_1 read_2 -000001 000001 12345_000001_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz -000001 000001 12345_000001_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz -000002 000001 12345_000002_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz -000002 000001 12345_000002_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz -000003 000001 12345_000003_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz -000003 000001 12345_000003_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz +individual_id read_1 read_2 +000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz +000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz 
assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz +000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz +000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz +000003 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz +000003 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz diff --git a/tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv index fca9441..88f1476 100644 --- a/tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv +++ b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv @@ -1,2 +1,2 @@ -individual_id family_id sample_id read_1 read_2 -000006 000003 12346_000006_000003_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz +individual_id read_1 read_2 +000006 assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz diff --git a/tests/assets/input_data/sample_sheets/giab_test_non_trios.tsv b/tests/assets/input_data/sample_sheets/giab_test_non_trios.tsv new file mode 100644 index 0000000..cf341e5 --- /dev/null +++ b/tests/assets/input_data/sample_sheets/giab_test_non_trios.tsv @@ -0,0 +1,8 @@ +individual_id read_1 read_2 +000001_000001 assets/input_data/giab/AshkenazimTrio/HG002_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG002_R2.fastq.gz +000003_000001 assets/input_data/giab/AshkenazimTrio/HG004_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG004_R2.fastq.gz +000004_000002 assets/input_data/giab/ChineseTrio/HG005_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG005_R2.fastq.gz +000005_000003 assets/input_data/giab/AshkenazimTrio/HG002_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG002_R2.fastq.gz +000006_000003 assets/input_data/giab/ChineseTrio/HG005_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG005_R2.fastq.gz +000007_000003 assets/input_data/giab/AshkenazimTrio/HG003_R1.fastq.gz 
assets/input_data/giab/AshkenazimTrio/HG003_R2.fastq.gz +000008_000003 assets/input_data/giab/AshkenazimTrio/HG004_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG004_R2.fastq.gz diff --git a/tests/assets/input_data/sample_sheets/giab_test_trios.tsv b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv new file mode 100644 index 0000000..35429e7 --- /dev/null +++ b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv @@ -0,0 +1,7 @@ +individual_id read_1 read_2 +000001_000001 assets/input_data/giab/AshkenazimTrio/HG002_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG002_R2.fastq.gz +000002_000001 assets/input_data/giab/AshkenazimTrio/HG003_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG003_R2.fastq.gz +000003_000001 assets/input_data/giab/AshkenazimTrio/HG004_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG004_R2.fastq.gz +000004_000002 assets/input_data/giab/ChineseTrio/HG005_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG005_R2.fastq.gz +000005_000002 assets/input_data/giab/ChineseTrio/HG006_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG006_R2.fastq.gz +000006_000002 assets/input_data/giab/ChineseTrio/HG007_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG007_R2.fastq.gz diff --git a/tests/run_giab_tests.sh b/tests/run_giab_tests.sh new file mode 100755 index 0000000..23ce6e9 --- /dev/null +++ b/tests/run_giab_tests.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +source scripts/nextflow_detached.sh + +test_exit_status=0 +nextflow -c "$NEXTFLOW_CONFIG" clean -f + +echo "Reduced GiaB data - trios" +run_nextflow ../main.nf \ + -c "$NEXTFLOW_CONFIG" \ + --workflow 'variant-calling' \ + --pipeline_project_id giab_test_trios \ + --pipeline_project_version v1 \ + --ped_file $PWD/assets/input_data/ped_files/giab_test_trios.ped \ + --sample_sheet $PWD/assets/input_data/sample_sheets/giab_test_trios.tsv + +test_exit_status=$(( $test_exit_status + $? )) + +echo "Reduced GiaB data - non-trios" +run_nextflow ../main.nf \ + -c "$NEXTFLOW_CONFIG" \ + --workflow 'variant-calling' \ + --pipeline_project_id giab_test_non_trios \ + --pipeline_project_version v1 \ + --ped_file $PWD/assets/input_data/ped_files/giab_test_non_trios.ped \ + --sample_sheet $PWD/assets/input_data/sample_sheets/giab_test_non_trios.tsv + +test_exit_status=$(( $test_exit_status + $? )) + +echo "Tests finished with exit status $test_exit_status" + diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 2ae8c81..18d0898 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -source scripts/test_config.sh +source scripts/nextflow_detached.sh bcbio=$PWD/scripts/bcbio_nextgen.py bcbio_prepare_samples=$PWD/scripts/bcbio_prepare_samples.py @@ -9,7 +9,6 @@ common_args="--bcbio $bcbio --bcbio_prepare_samples $bcbio_prepare_samples --bcb test_exit_status=0 nextflow clean -f -rm -r ./outputs/* ./work/* echo "Test case 1: simple trio" run_nextflow ../pipeline/main.nf --ped_file assets/input_data/ped_files/batch_1.ped --sample_sheet assets/input_data/sample_sheets/batch_1.tsv $common_args diff --git a/tests/scripts/test_config.sh b/tests/scripts/nextflow_detached.sh similarity index 100% rename from tests/scripts/test_config.sh rename to tests/scripts/nextflow_detached.sh -- GitLab
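
For reference, a sketch of the site-specific config that the README updated in patch 5 and the `NEXTFLOW_CONFIG` variable used by tests/run_tests.sh and tests/run_giab_tests.sh assume, passed to Nextflow via `-c`. The parameter names come from main.nf in patch 5; the file name and every path below are placeholder assumptions, not part of the patch series.

    // site.config (hypothetical name) -- passed as: nextflow main.nf -c site.config ...
    params {
        // BCBio v1.2.8 install containing 'anaconda', 'galaxy', 'genomes', etc (placeholder path)
        bcbio              = '/opt/bcbio-1.2.8'
        // BCBio template; should set 'upload.dir: ./results' per the README
        bcbio_template     = '/opt/configs/bcbio_template.yaml'
        output_dir         = '/scratch/trio_whole_exome/outputs'
        target_bed         = '/opt/references/twist_exome_targets.bed'
        reference_genome   = '/opt/references/hg38.fa'
        parse_peddy_output = '/opt/scripts/trio_whole_exome_parse_peddy_ped_csv.pl'
    }

    // executor choice for this site; the per-label cpus/memory profiles
    // (small/medium/large) come from the repo's own nextflow.config
    process.executor = 'slurm'

With a file like this, the test suites run as `NEXTFLOW_CONFIG=$PWD/site.config ./run_tests.sh`, while per-run options (--workflow, --pipeline_project_id, --pipeline_project_version, --ped_file, --sample_sheet) stay on the command line as in the README usage example.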