diff --git a/.gitignore b/.gitignore index fb2f9b9917e6068037312ed87b9176132662c6f1..f8dcefca4cce8f31cc95bb82f5c0d0d0c4b043c9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ work *# *~ +tests/outputs \ No newline at end of file diff --git a/assets/trio_whole_exome_bcbio_template.yaml b/assets/trio_whole_exome_bcbio_template.yaml index 75d49bd28396c820e3c2e32470ca13f6d00c4914..6505c0b55e47c86133988690f7f99f528bab6a10 100644 --- a/assets/trio_whole_exome_bcbio_template.yaml +++ b/assets/trio_whole_exome_bcbio_template.yaml @@ -20,7 +20,7 @@ details: genome_build: hg38 upload: # relative path will output locally to the bcbio run folder - dir: ./results + dir: ./results resources: gatk-haplotype: options: '["--active-probability-threshold", "0", "--interval-padding", 100]' diff --git a/bin/convert_DEC_to_v10.py b/bin/convert_DEC_to_v10.py old mode 100644 new mode 100755 index 67f26a78bb6cd0b061a1c298a7704d8b268c6217..f4b23df27aaaaff3fc01bb4d1a83d224d4ff01ab --- a/bin/convert_DEC_to_v10.py +++ b/bin/convert_DEC_to_v10.py @@ -1,3 +1,4 @@ +#!/bin/env python # given a DECIPHER bulk upload file v9 # convert it to v10 # @@ -14,10 +15,7 @@ import xlsxwriter -def go(inout_dir,id): # the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id> - - in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id) - out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id) +def go(id, in_file, out_file): # the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id> # create the workbook workbook = xlsxwriter.Workbook(out_file) @@ -72,9 +70,9 @@ def go(inout_dir,id): # the folder where the bulk upload files are strored; id if __name__ == '__main__': - if len(sys.argv) == 3: - go(sys.argv[1],sys.argv[2]) + if len(sys.argv) == 4: + go(*sys.argv[1:]) else: - print ("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>") + print("Suggested use: convert_DEC_to_v10.py <indiv_id>_<family_id> <in_file> <out_file>") raise SystemExit diff --git a/bin/extract_solo_FAM_PRO_ID.py b/bin/extract_solo_FAM_PRO_ID.py index 3dcfa453973cf6c680b46b34751f426bbe4f4c28..6306fa987fa369962c8b8fd67ecd50a03e2f613b 100755 --- a/bin/extract_solo_FAM_PRO_ID.py +++ b/bin/extract_solo_FAM_PRO_ID.py @@ -1,22 +1,17 @@ +#!/usr/bin/env python3 # input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for singletons # solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt # # Author: MH -# last modified: APR 25, 2022 +# last modified: JUL 6, 2022 +import argparse - -import sys -import os - - - -def go(work_dir): - - out_fam_file = work_dir + '/solo_FAM_IDs.txt' - out_pro_file = work_dir + '/solo_PRO_IDs.txt' - out_f_p_file = work_dir + '/solo_FAM_PRO.txt' +def go(args): + out_fam_file = 'solo_FAM_IDs.txt' + out_pro_file = 'solo_PRO_IDs.txt' + out_f_p_file = 'solo_FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') @@ -26,21 +21,18 @@ def go(work_dir): cntr_pro = 0 cntr_f_p = 0 - # go over the PED folder in the working dir and process each PED file - ped_dir = work_dir + '/PED' - print "" - print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir) + print("") + print("Processing %i PED files to extract singleton FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files))) - for file in os.listdir(ped_dir): # per each PED file - if file.endswith(".ped"): + for ped_file in args.ped_files: # per each PED file + if ped_file.endswith(".ped"): - print " %s" % (file) - in_file = os.path.join(ped_dir, file) + print(" %s" % (ped_file)) # check how many lines in the PED file - if more than one cannot be a singleton, ignore num_lines = 0 - in_han = open(in_file,'r') + in_han = open(ped_file,'r') for line in in_han: num_lines += 1 in_han.close() @@ -48,7 +40,7 @@ def go(work_dir): if num_lines > 1: continue if num_lines == 0: - print "ERROR: empty PED file %s" % (file) + print("ERROR: empty PED file %s" % (ped_file)) raise SystemExit # if here, the PED file contains exactly one line @@ -56,7 +48,7 @@ def go(work_dir): CHILD_ID = 0 FAM_ID = 0 - in_han = open(in_file,'r') + in_han = open(ped_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] @@ -64,8 +56,8 @@ def go(work_dir): y_indi,y_fam = data[1].split('_') # in the internal PED file, indi_id is indiID_familyID, split if x_fam != y_fam: - print "ERROR: FAMILY_ID mismatch in %s" % (file) - print line + print("ERROR: FAMILY_ID mismatch in %s" % (ped_file)) + print(line) raise SystemExit FAM_ID = x_fam @@ -73,8 +65,8 @@ def go(work_dir): # check both parent IDs == 0 if (data[2] != '0') or (data[3] != '0'): - print "ERROR: found parent ID for a singleton child in %s" % (file) - print line + print("ERROR: found parent ID for a singleton child in %s" % (ped_file)) + print(line) raise SystemExit # make sure the sex of the child is known @@ -82,23 +74,23 @@ def go(work_dir): if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: - print "ERROR: proband sex unknown in %s" % (file) - print line + print("ERROR: proband sex unknown in %s" % (ped_file)) + print(line) raise SystemExit # check kid is affected if int(data[5]) != 2: - print "ERROR: singleton child not affected" - print line + print("ERROR: singleton child not affected") + print(line) raise SystemExit if FAM_ID == 0: - print "ERROR: Cannot find the FAMILY_ID in %s" % (file) + print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file)) raise SystemExit if CHILD_ID == 0: - print "ERROR: Cannot find CHILD_ID in %s" % (file) + print("ERROR: Cannot find CHILD_ID in %s" % (ped_file)) raise SystemExit - else: + elif FAM_ID not in args.exclude_families: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) @@ -110,23 +102,23 @@ def go(work_dir): out_pro_han.close() out_f_p_han.close() - - - print "" - print "Singleton Families:" - print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) - print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) - print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) - print "" - - - + print("") + print("Singleton Families:") + print(" %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)) + print(" %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)) + print(" %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)) + print("") if __name__ == '__main__': - if len(sys.argv) == 2: - go(sys.argv[1]) - else: - print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" - raise SystemExit - + a = argparse.ArgumentParser( + description=''' + Extract singleton FAM_ID, PRO_ID and FAM_PRO files. Suggested use: + time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py + /home/u035/u035/shared/analysis/work/<PROJECT_ID> + ''' + ) + a.add_argument('ped_files', nargs='+') + args = a.parse_args() + a.add_argument('--exclude-families', nargs='*') + go(args) diff --git a/bin/extract_trio_FAM_PRO_ID.py b/bin/extract_trio_FAM_PRO_ID.py index 4dac3b03ba55b6003b79dab6ff4e99582b408035..3bec2e218b0ac6ea5ce18fa9ee2c4d49580eaddc 100755 --- a/bin/extract_trio_FAM_PRO_ID.py +++ b/bin/extract_trio_FAM_PRO_ID.py @@ -1,22 +1,18 @@ +#!/usr/bin/env python3 # input: the work folder which contains a PED subfolder where all family PED files were copied # output: only for trio families # FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt # # Author: MH -# last modified: APR 25, 2022 +# last modified: JUL 6, 2022 +import argparse -import sys -import os - - - -def go(work_dir): - - out_fam_file = work_dir + '/FAM_IDs.txt' - out_pro_file = work_dir + '/PRO_IDs.txt' - out_f_p_file = work_dir + '/FAM_PRO.txt' +def go(args): + out_fam_file = 'FAM_IDs.txt' + out_pro_file = 'PRO_IDs.txt' + out_f_p_file = 'FAM_PRO.txt' out_fam_han = open(out_fam_file,'w') out_pro_han = open(out_pro_file,'w') @@ -26,21 +22,18 @@ def go(work_dir): cntr_pro = 0 cntr_f_p = 0 - # go over the PED folder in the working dir and process each PED file - ped_dir = work_dir + '/PED' - print "" - print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir) + print("") + print("Processing %i PED files to extract FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files))) - for file in os.listdir(ped_dir): # per each PED file - if file.endswith(".ped"): - print " %s" % (file) - in_file = os.path.join(ped_dir, file) + for ped_file in args.ped_files: # per each PED file + if ped_file.endswith(".ped"): + print(" %s" % (ped_file)) CHILD_ID = 0 FAM_ID = 0 - in_han = open(in_file,'r') + in_han = open(ped_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] @@ -50,14 +43,14 @@ def go(work_dir): if FAM_ID == 0: FAM_ID = x_fam elif FAM_ID != x_fam: - print "ERROR: more than one FAMILY_ID in %s" % (file) + print("ERROR: more than one FAMILY_ID in %s" % (ped_file)) raise SystemExit if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = y_indi else: # seen another child - print "WARNING: already have seen a child (possibly a quad) in %s" % (file) + print("WARNING: already have seen a child (possibly a quad) in %s" % (ped_file)) CHILD_ID = 0 break @@ -65,21 +58,21 @@ def go(work_dir): if (CHILD_SEX == 1) or (CHILD_SEX == 2): pass else: - print "ERROR: proband sex unknown" - print line + print("ERROR: proband sex unknown") + print(line) raise SystemExit if int(data[5]) != 2: - print "ERROR: child in a trio not affected" - print line + print("ERROR: child in a trio not affected") + print(line) raise SystemExit if FAM_ID == 0: - print "ERROR: Cannot find the FAMILY_ID in %s" % (file) + print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file)) raise SystemExit if CHILD_ID == 0: - print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file) - else: + print("WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (ped_file)) + elif FAM_ID not in args.exclude_families: out_fam_han.write('%s\n' % (FAM_ID)) cntr_fam += 1 out_pro_han.write('%s\n' % (CHILD_ID)) @@ -91,23 +84,23 @@ def go(work_dir): out_pro_han.close() out_f_p_han.close() - - - print "" - print "Trio Families:" - print " %s FAM_IDs --> %s" % (cntr_fam, out_fam_file) - print " %s PRO_IDs --> %s" % (cntr_pro, out_pro_file) - print " %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file) - print "" - - - + print("") + print("Trio Families:") + print(" %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)) + print(" %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)) + print(" %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)) + print("") if __name__ == '__main__': - if len(sys.argv) == 2: - go(sys.argv[1]) - else: - print "Suggested use: time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>" - raise SystemExit - + a = argparse.ArgumentParser( + description=''' + Extract FAM_ID, PRO_ID and FAM_PRO files from Ped files. Suggested use: + time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py + /home/u035/u035/shared/analysis/work/<PROJECT_ID> + ''' + ) + a.add_argument('ped_files', nargs='+') + a.add_argument('--exclude-families', nargs='*') + args = a.parse_args() + go(args) diff --git a/bin/filter_LQ_GT.py b/bin/filter_LQ_GT.py old mode 100644 new mode 100755 index fff3d37e2a61d9f71be84d10513a9f1350bf959f..2e612f867280cfd82140c4599bc98c506c4fe9b5 --- a/bin/filter_LQ_GT.py +++ b/bin/filter_LQ_GT.py @@ -1,3 +1,4 @@ +#!/bin/env python2.7 # given a multi-sample VCF # reset the GT to ./. (no-call) for indi variant with num_ALT < num_ALT_THERSH or VAF < VAF_THRESH # diff --git a/bin/generate_DEC_IGV_scripts.py b/bin/generate_DEC_IGV_scripts.py index 48f9e1400ada94cb7716489dbe3236bccaf3eddd..0c347caa630e76cd198b8bef01c66110035f5545 100755 --- a/bin/generate_DEC_IGV_scripts.py +++ b/bin/generate_DEC_IGV_scripts.py @@ -1,3 +1,4 @@ +#!/bin/env python2.7 # input: # the family PED file [${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped] # individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz] @@ -71,12 +72,10 @@ SOR_THRESH = float(3) -def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir): - +def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam, out_dec_file, out_igv_file): # read the decipher to internal ID mapping file read_map_file(dec_map_file) - # read the transcript mapping file read_trans_map(trans_map_file) @@ -106,11 +105,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # now read the individual VCFs and record all the variants - child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) - mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID) - dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID) - - read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() @@ -150,13 +144,11 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir # setup the DECIPHER output file - out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID) ################################ out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file - out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID) ################################# out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') @@ -164,9 +156,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir out_igv_han.write('maxPanelHeight 10000\n') out_igv_han.write('new\n') - child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID) - mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID) - dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('load %s\n' % (mom_bam)) out_igv_han.write('load %s\n' % (dad_bam)) @@ -182,7 +171,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir in_cntr = 0 out_cntr = 0 - child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) in_han = gzip.open(child_vcf_file,'r') @@ -1343,10 +1331,10 @@ def read_trans_map(in_file): if __name__ == '__main__': - if len(sys.argv) == 12: - go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11]) + if len(sys.argv) == 17: + go(*sys.argv[1:]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \ - dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir" + dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam,out_dec_file.txt,out_igv_file.txt" raise SystemExit diff --git a/bin/get_cov_output.py b/bin/get_cov_output.py old mode 100644 new mode 100755 index 49add7fa11c8c8979d2485da701c639b05e380e9..6e02f3ee62ff9c23e982c2141c15f3ab6fd25946 --- a/bin/get_cov_output.py +++ b/bin/get_cov_output.py @@ -1,3 +1,4 @@ +#!/bin/env python2.7 # given # <gene_set>.<ID>.sample_interval_summary - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt # <gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt diff --git a/pipeline/lib/PipelineException.groovy b/lib/PipelineException.groovy similarity index 100% rename from pipeline/lib/PipelineException.groovy rename to lib/PipelineException.groovy diff --git a/main.nf b/main.nf index f2b8c278b715a6658514e8eb80a7df741f264916..d5ec72f182225113914eb6c5d40b63e42e144157 100644 --- a/main.nf +++ b/main.nf @@ -1,43 +1,31 @@ nextflow.enable.dsl = 2 include {var_calling} from './pipeline/var_calling.nf' +include {prioritisation_setup; prioritisation} from './pipeline/variant_prioritisation.nf' +include {compress; decompress} from './pipeline/cram_compression.nf' -// which part of the pipeline to run - either 'variant-calling' or 'variant-prioritisation' -params.workflow = null - -// path to a bcbio install, containing 'anaconda', 'galaxy', 'genomes', etc -params.bcbio = null - -// path to a template config for bcbio variant calling -params.bcbio_template = null - -// where the results get written to on the system. The variant calling creates initial -// results here, and variant prioritisation adds to them -params.output_dir = null - -// name of the pipeline batch, e.g. '21900', '20220427' -params.pipeline_project_id = null - -// version of the pipeline batch, e.g. 'v1' -params.pipeline_project_version = null +workflow { + switch (params.workflow) { + case 'variant_calling': + var_calling() + break -// bed file of Twist exome targets -params.target_bed = null + case 'variant_prioritisation_setup': + prioritisation_setup() + break -// hg38 reference genome in fasta format -params.reference_genome = null + case 'variant_prioritisation': + prioritisation() + break -// path to a Ped file describing all the families in the pipeline batch -params.ped_file = null + case 'cram_compression': + compress() + break -// path to a samplesheet mapping individual IDs to fastq pairs -params.sample_sheet = null + case 'cram_decompression': + decompress() + break -workflow { - if (params.workflow == 'variant-calling') { - var_calling() - } else if (params.workflow == 'variant-prioritisation') { - println "Variant prioritisation coming soon" - } else { - exit 1, 'params.workflow required - variant-calling or variant-prioritisation' + default: + exit 1, "Invalid params.workflow: ${params.workflow}" } } diff --git a/nextflow.config b/nextflow.config index 746f52de544fe570c2b8de52500049175e6bf6a3..2bb7e975d61d665e3206199b3790211e55737c76 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,19 +1,98 @@ executor = 'slurm' params { + // General options + + // which part of the pipeline to run - 'variant-calling', 'variant-prioritisation-setup', etc.' + workflow = null + + // where the results get written to on the system. The variant calling creates initial + // results here, and variant prioritisation adds to them + output_dir = null + + // name of the pipeline batch, e.g. '21900', '20220427' + pipeline_project_id = null + + // version of the pipeline batch, e.g. 'v1' + pipeline_project_version = 'v1' + + // path to a Ped file describing all the families in the pipeline batch + ped_file = null + + // bed file of Twist exome targets + target_bed = null + + // hg38 reference genome in fasta format + reference_genome = null + reference_dict = null + + // Ensembl VEP + vep_cache = null // the 'genomes/Hsapiens/hg38/vep' folder in BCBio containing per-chromosome caches + vep_plugins = null // the 'anaconda/share/ensembl-vep-100.4-0' folder containing Perl modules + + // 'variant-calling' options + + // path to a bcbio install, containing 'anaconda', 'galaxy', 'genomes', etc + bcbio = null + + + // path to a template config for bcbio variant calling + bcbio_template = null + + // path to a samplesheet mapping individual IDs to fastq pairs + sample_sheet = null + + // BCBio config + bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml" + + // 'variant-prioritisation'/'variant-prioritisation-setup' options + + // todo: remove this when we segment var prioritisation + scripts_dir = null + + // version number for a variant prioritisation run + prioritisation_version = 'v1' + + // families not to run variant prioritisation on + exclude_families = '' + + // path to GATK 3.8 - see #23 + gatk3 = null + + // use --fetal_trios to process trios against PanelApp's Fetal panel + fetal_trios = false + + g2p_targets = null + g2p_genes = null + clinvar = null + blacklist = null + trans_map = null + recurrent_snps = null // https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1 + list_24_chr = null + + // 'cram_compression'/'cram_decompression' options + // folder to search through recursively and perform compression/decompression on + compress_dir = null + + + // cluster options + + // max and min resource parameters. Use get_cpus, get_mem and get_time in the + // `process` scope to ensure that jobs don't request more resources than can be + // provided max_cpus = 16 max_mem = 32.GB max_time = 48.h - min_cpus = 1 min_mem = 1.GB min_time = 2.h - bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml" } profiles { + conda.enabled = true + standard { process.executor = 'local' } @@ -22,7 +101,13 @@ profiles { process.echo = true } + testing { + params.output_dir = "$projectDir/tests/outputs" + } + stubs { + conda.enabled = false + process.conda = null process.executor = 'local' params.max_cpus = 1 params.max_mem = 500.MB @@ -31,7 +116,18 @@ profiles { params.bcbio = "$projectDir/tests/scripts/bcbio_nextgen.py" params.target_bed = "$projectDir/tests/assets/input_data/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed" params.reference_genome = "$projectDir/tests/assets/ref.fa" - params.output_dir = "$projectDir/tests/outputs" + params.reference_dict = "$projectDir/tests/assets/ref.dict" + params.blacklist = "$projectDir/tests/assets/blacklist.txt" + params.list_24_chr = "$projectDir/tests/assets/list_24_chr.txt" + params.g2p_target_bed = "$projectDir/tests/assets/g2p_targets.bed" + params.g2p_target_csv = "$projectDir/tests/assets/g2p_targets.csv" + params.g2p_genes = "$projectDir/tests/assets/g2p_genes.txt" + params.gatk3 = "$projectDir/tests/scripts/bcbio_nextgen.py" + params.clinvar = "$projectDir/tests/assets/clinvar.txt" + params.recurrent_snps = "$projectDir/tests/assets/recurrent_snps.txt" + params.trans_map = "$projectDir/tests/assets/trans_map.txt" + params.vep_cache = "$projectDir/tests/assets/vep/cache" + params.vep_plugins = "$projectDir/tests/assets/vep/plugins" } slurm { @@ -58,11 +154,16 @@ process { memory = get_mem(2.GB) } - withLabel: medium { + withLabel: small_medium { cpus = get_cpus(4) memory = get_mem(8.GB) } + withLabel: medium { + cpus = get_cpus(8) + memory = get_mem(16.GB) + } + withLabel: large { cpus = get_cpus(16) memory = get_mem(32.GB) diff --git a/pipeline/cram_compression.nf b/pipeline/cram_compression.nf new file mode 100644 index 0000000000000000000000000000000000000000..3b937089f51b2f4f60c14d8538b297af8f71ff17 --- /dev/null +++ b/pipeline/cram_compression.nf @@ -0,0 +1,106 @@ +nextflow.enable.dsl = 2 + + +process bam_to_cram { + input: + path(bam) + + output: + path("${bam.getBaseName()}.cram") + + script: + """ + samtools=${params.bcbio}/anaconda/bin/samtools && + bam_flagstat=${bam}.flagstat.txt && + cram="${bam.getBaseName()}.cram" && + cram_flagstat=\$cram.flagstat.txt && + + if [ ! -f \$bam_flagstat ] + then + \$samtools flagstat $bam > \$bam_flagstat + fi + + \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram ${bam} && + \$samtools index \$cram && + \$samtools flagstat \$cram > \$cram_flagstat + + if ! diff \$bam_flagstat \$cram_flagstat > /dev/null 2>&1 + then + echo "Unexpected flagstat results in \$bam_flagstat and \$cram_flagstat" + exit 1 + fi + + cp \$cram \$(dirname \$(readlink -f $bam))&& + rm \$(readlink -f $bam) + """ + + stub: + """ + cram="${bam.getBaseName()}.cram" + cp ${bam} \$cram && + touch \$cram.crai && + touch ${bam}.flagstat.txt \$cram.flagstat.txt && + cp \$cram \$(dirname \$(readlink -f $bam)) && + rm \$(readlink -f $bam) + """ +} + +process cram_to_bam { + input: + path(cram) + + output: + path("${cram.getBaseName()}.bam") + + script: + """ + samtools=${params.bcbio}/anaconda/bin/samtools && + bam="${cram.getBaseName()}.bam" && + bam_flagstat=\${bam}.flagstat.txt && + cram_flagstat=${cram}.flagstat.txt && + + if [ ! -f \$cram_flagstat ] + then + \$samtools flagstat $cram > \$cram_flagstat + fi + + \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -b -o \$bam ${cram} && + \$samtools index \$bam && + \$samtools flagstat \$bam > \$bam_flagstat + + if ! diff \$bam_flagstat \$cram_flagstat > /dev/null 2>&1 + then + echo "Unexpected flagstat results in \$bam_flagstat and \$cram_flagstat" + exit 1 + fi + + cp \$bam \$(dirname \$(readlink -f $cram)) && + rm \$(readlink -f $cram) + """ + + stub: + """ + bam="${cram.getBaseName()}.bam" + cp ${cram} \$bam && + touch \$bam.crai && + touch ${cram}.flagstat.txt \$bam.flagstat.txt && + cp \$bam \$(dirname \$(readlink -f $cram)) && + rm \$(readlink -f $cram) + """ +} + + +workflow compress { + ch_bams = Channel.fromPath( + "${params.compress_dir}/**/*.bam" + ) + bam_to_cram(ch_bams) +} + + +workflow decompress { + ch_crams = Channel.fromPath( + "${params.compress_dir}/**/*.cram" + ) + cram_to_bam(ch_crams) +} diff --git a/pipeline/validation.nf b/pipeline/validation.nf index 18058f800fe028fa4788ad26c558b12a9b1cee17..7c2c35ff426951be054123edfd3f561d34bfc955 100644 --- a/pipeline/validation.nf +++ b/pipeline/validation.nf @@ -1,11 +1,18 @@ nextflow.enable.dsl = 2 process check_md5s { + /* + For a fastq file, find its md5 file and run md5sum -c. Output the fastq name if + invalid, otherwise empty string `''`. + */ label 'small' + errorStrategy 'ignore' input: - val(fastq_file) - // getParent() gives null object errors if this is a path + val(fastq_file) // getParent() gives null object errors if this is a path + + output: + stdout script: """ @@ -14,7 +21,7 @@ process check_md5s { edgen_fastq=${fastq_file.getParent().getName()}/${fastq_file.getName()} crf_dir=${fastq_file.getParent()} - crf_md5_file="\$(ls \$crf_dir/*md5.txt | head -n 1)" + crf_md5_file="\$(find \$crf_dir -name *md5.txt | head -n 1)" crf_fastq=${fastq_file.getName()} local_md5_file=${fastq_file}.md5 @@ -22,17 +29,17 @@ process check_md5s { if [ -f \$edgen_md5_file ] then cd \$edgen_dir - md5sum -c <(cat md5sums.txt | grep \$edgen_fastq) + md5sum --quiet -c <(cat md5sums.txt | grep \$edgen_fastq) | cut -d ':' -f 1 elif [ -f \$crf_md5_file ] then cd \$crf_dir - md5sum -c <(cat \$crf_md5_file | grep \$crf_fastq) + md5sum --quiet -c <(cat \$crf_md5_file | grep \$crf_fastq) | cut -d ':' -f 1 elif [ -f \$local_md5_file ] then cd ${fastq_file.getParent()} - md5sum -c \$local_md5_file + md5sum --quiet -c \$local_md5_file | cut -d ':' -f 1 else echo "Could not find md5 file for $fastq_file" @@ -41,6 +48,7 @@ process check_md5s { """ } + workflow validation { /* Take a parsed samplesheet, flatten it and parse into a channel of observed vs. @@ -60,6 +68,14 @@ workflow validation { .flatten() .map({file(it)}) - check_md5s(ch_fastqs) + invalid_md5s = check_md5s(ch_fastqs) + .filter({it != ''}) // filter out the '' results from valid md5s + .collect() + .map( + { + if (it != null) { // invalid md5s found + throw new PipelineException("Invalid md5s detected:\n${it.join('')}") + } + } + ) } - diff --git a/pipeline/var_calling.nf b/pipeline/var_calling.nf index f2c9846ae382ca324411150270dfc1b2aef7fda2..b0b954c8f5a07021cef2d03264fa1ccd057d964d 100644 --- a/pipeline/var_calling.nf +++ b/pipeline/var_calling.nf @@ -5,7 +5,7 @@ include {validation} from './validation.nf' process merge_fastqs { - label 'medium' + label 'small_medium' input: tuple(val(indv_id), val(family_id), path(r1), path(r2)) @@ -54,6 +54,7 @@ process write_bcbio_csv { process bcbio_family_processing { label 'large' + label 'long' input: tuple(val(family_id), val(individuals), path(family_csv), path(merged_fastq1), path(merged_fastq2)) @@ -93,11 +94,12 @@ process bcbio_family_processing { output_dir=${family_id}/results family_dir="${family_id}/results/\$(date '+%Y-%m-%d')_${family_id}" mkdir -p \$family_dir - mkdir ${family_id}/config + mkdir ${family_id}/{config,provenance} touch ${family_id}/config/${family_id}{.csv,.yaml,-template.yaml} + touch ${family_id}/provenance/programs.txt cd \$family_dir - touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz{,.tbi}" bcbio-nextgen{,-commands}.log data_versions.csv - touch project-summary.yaml metadata.csv programs.txt + touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz"{,.tbi} bcbio-nextgen{,-commands}.log data_versions.csv + touch project-summary.yaml metadata.csv mkdir multiqc touch list_files_final.txt multiqc_config.yaml multiqc_report.html mkdir multiqc_data report @@ -141,48 +143,10 @@ process format_bcbio_individual_outputs { indv_output=individual_outputs/\$i && mkdir -p \$indv_output && - ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed && - ln -s \$indv_input/qc \$indv_output/qc && - - # todo: make cram compression its own process - bam=\$indv_input/\$i-ready.bam - cram="\$indv_output/\$i-ready.cram" && - \$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam && - \$samtools index \$cram && - bam_flagstat=./\$i-ready.bam.flagstat.txt && - cram_flagstat=\$cram.flagstat.txt && - \$samtools flagstat \$bam > \$bam_flagstat && - \$samtools flagstat \$cram > \$cram_flagstat && - diff \$bam_flagstat \$cram_flagstat - - if [ \$? -ne 0 ] - then - echo "Unexpected flagstat results in \$cram_flagstat" - exit 1 - fi - done - """ - - stub: - """ - mkdir individual_outputs - for i in ${individuals.join(' ')} - do - indv_input=\$PWD/${bcbio_output_dir}/results/\$i - indv_output=individual_outputs/\$i && - mkdir -p \$indv_output && - - ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed && - ln -s \$indv_input/qc \$indv_output/qc && - - bam=\$indv_input/\$i-ready.bam - cram="\$indv_output/\$i-ready.cram" && - cp \$bam \$cram && - touch \$cram.crai && - bam_flagstat=./\$i-ready.bam.flagstat.txt && - cram_flagstat=\$cram.flagstat.txt && - touch \$bam_flagstat && - touch \$cram_flagstat + for f in \$i-ready.bam \$i-ready.bam.bai \${i}-callable.bed qc + do + ln -s \$indv_input/\$f \$indv_output/\$f + done done """ } @@ -200,34 +164,34 @@ process format_bcbio_family_outputs { script: """ - merged_bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" && - bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" && + bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" && run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" && - broken_family_id="\$(echo ${family_id} | sed 's/_//g')" && - bcbio_family_output="\$(basename \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" && - mkdir \$bcbio_family_output && + outdir="\$(basename \$bcbio_family_output | sed 's/-merged\\/*\$//g')" && + mkdir \$outdir && for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi} do - src=\$merged_bcbio_family_output/\${broken_family_id}-\${suffix} + src=\$(ls \$bcbio_family_output/*-\${suffix} | head -n 1) if [ -f \$src ] then - ln -s \$src \$bcbio_family_output/${family_id}-\${suffix} + ln -s \$src \$outdir/${family_id}-\${suffix} fi done && - for f in \$merged_bcbio_family_output/*{.log,.csv,.yaml,multiqc} + ln -s \$(readlink -f $bcbio_output_dir)/provenance/programs.txt \$outdir/ && + + for f in \$bcbio_family_output/*{.log,.csv,.yaml,multiqc} do - ln -s \$f \$bcbio_family_output/ + ln -s \$f \$outdir/ done && for i in ${individuals.join(' ')} do - ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$bcbio_family_output/\$i + ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$outdir/\$i done && - cd \$bcbio_family_output && + cd \$outdir && for f in \$(find . -type f | grep -v '\\.bam') do md5sum \$f >> md5sum.txt @@ -360,16 +324,40 @@ workflow process_families { */ take: + /* + ch_individuals: [ + [ + indv_id, + family_id, + father_id, + mother_id, + sex, + affected, + [r1_fastqs], + [r2_fastqs] + ], + ... + ] + */ ch_individuals + + // value channels ch_ped_file ch_samplesheet main: - ch_bcbio = file(params.bcbio, checkIfExists: true) - ch_bcbio_template = file(params.bcbio_template, checkIfExists: true) - ch_target_bed = file(params.target_bed, checkIfExists: true) - ch_reference_genome = file(params.reference_genome, checkIfExists: true) - + // collect() can make a queue channel of paths into a value channel + ch_bcbio = Channel.fromPath(params.bcbio, checkIfExists: true).collect() + ch_bcbio_template = Channel.fromPath(params.bcbio_template, checkIfExists: true).collect() + ch_target_bed = Channel.fromPath(params.target_bed, checkIfExists: true).collect() + ch_reference_genome = Channel.fromPath(params.reference_genome, checkIfExists: true).collect() + + /* + ch_merged_fastqs: [ + [indv_id, indv_id_merged_r1.fastq.gz, indv_id_merged_r2.fastq.gz], + ... + ] + */ ch_merged_fastqs = merge_fastqs( ch_individuals.map( { indv, family, father, mother, sex, affected, r1, r2 -> @@ -377,35 +365,63 @@ workflow process_families { }) ).groupTuple() - ch_families = ch_individuals.map( + ch_families = ch_individuals.map( { sample_id, family_id, father, mother, sex, phenotype, r1, r2 -> [family_id, [sample_id, sample_id, family_id, sex, phenotype, father, mother]] - }).groupTuple() + } + ).groupTuple() ch_indv_by_family = ch_individuals.map( - { sample_id, family_id, father, mother, sex, phenotype, r1, r2 -> + { sample_id, family_id, father, mother, sex, phenotype, r1, r2 -> [family_id, sample_id] - }).groupTuple() + } + ).groupTuple() + /* + ch_bcbio_csvs: [ + [family_id, family_id.csv], + ... + ] + */ ch_bcbio_csvs = write_bcbio_csv( - ch_families, + ch_families, ch_target_bed ) ch_bcbio_inputs = ch_indv_by_family.join(ch_bcbio_csvs.join(ch_merged_fastqs)) + /* + ch_bcbio_family_outputs: [ + [family_id, [indv1_id, indv2_id, ...], family_id-merged/], + ... + ] + */ ch_bcbio_family_outputs = bcbio_family_processing( ch_bcbio_inputs, ch_bcbio, ch_bcbio_template ) + /* + ch_individual_folder: [ + [family_id, individual_outputs/], + ... + ] + */ ch_individual_folders = format_bcbio_individual_outputs( ch_bcbio_family_outputs, ch_bcbio, ch_reference_genome ) + + /* + ch_formatted_bcbio_outputs: [ + [family_id, 2022-07-04_family_id, family_id-merged/], + ... + ] + + */ ch_formatted_bcbio_outputs = format_bcbio_family_outputs( ch_bcbio_family_outputs.join(ch_individual_folders) ) diff --git a/pipeline/variant_prioritisation.nf b/pipeline/variant_prioritisation.nf new file mode 100644 index 0000000000000000000000000000000000000000..66cb69e47178f3a0bde510202c5a349922d5429b --- /dev/null +++ b/pipeline/variant_prioritisation.nf @@ -0,0 +1,928 @@ +nextflow.enable.dsl = 2 + +prioritisation_out = "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}" +conda_python2 = 'python=2.7' +conda_xlsxwriter = 'xlsxwriter=1.1.5' +conda_vt = 'bioconda::vt=0.57721' +conda_bcftools ='bioconda::bcftools=1.9' +conda_tabix = 'bioconda::tabix=1.11' // tabix 1.9 isn't available on Conda +conda_vep = 'bioconda::ensembl-vep=100.4' +conda_vase = 'bioconda::vase=0.5.1' // 0.4.2 not available +conda_gatk4 = 'bioconda::gatk4=4.2.1.0' +conda_gatk3 = 'bioconda::gatk=3.8' +conda_samtools = 'bioconda::samtools=1.9' + + + +def conda_env(env) { + // if running stub tests, don't use Conda + if (workflow.profile.contains("stubs")) { + return null + } else { + return env + } +} + + + +process trio_setup { + label 'small' + + publishDir "${prioritisation_out}", mode: 'copy' + + input: + val(family_peds) + + output: + path('FAM_IDs.txt') + path('PRO_IDs.txt') + path('FAM_PRO.txt') + + script: + """ + extract_trio_FAM_PRO_ID.py ${family_peds.join(' ')} --exclude-families ${params.exclude_families.join(' ')} + """ +} + + +// Todo: fix affected parents + + +process process_trio { + label 'medium' + + conda conda_env("${params.bcbio}/anaconda/envs/java") + + publishDir "${prioritisation_out}", mode: 'copy' + + input: + tuple( + val(family_id), // from FAM_IDs.txt + path(family_ped), + path(family_vcf), + val(plate_id), + val(proband_id), + val(proband_bam), + val(par_1_id), + val(par1_bam), + val(par_2_id), + val(par2_bam) + ) + path(decipher_file) + + output: + path("VCF/*", emit: vcf) + //path("PED/*", emit: ped) + path("BAMOUT/*", emit: bamout) + path("COV/*", emit: cov) + //path("CNV/*", emit: cnv) + path("DECIPHER/*.*", emit: decipher) + path("DECIPHER/IGV/*", emit: decipher_igv) + path("G2P/*", emit: g2p) + //path("LOG/*", emit: log) + path("VASE/*", emit: vase) + + script: + """ + mkdir VCF + mkdir PED + mkdir LOG + mkdir G2P + mkdir VASE + mkdir COV + mkdir -p DECIPHER/IGV + mkdir CNV + mkdir BAMOUT + + export FAMILY_ID=$family_id + export FAMILY_PED=$family_ped + export FAMILY_VCF=$family_vcf + export PLATE_ID=$plate_id + export PROBAND_ID=$proband_id + export PROBAND_BAM=$proband_bam + export PAR_1_ID=$par_1_id + export PAR_1_BAM=$par1_bam + export PAR_2_ID=$par_2_id + export PAR_2_BAM=$par2_bam + + export BCBIO=${params.bcbio} + export REFERENCE_GENOME=${params.reference_genome} + export BLACKLIST=${params.blacklist} + export TRANS_MAP=${params.trans_map} + export REC_SNP=${params.recurrent_snps} + export G2P_TARGETS=${params.g2p_targets} + export LIST_24_CHR=${params.list_24_chr} + export CLINVAR=${params.clinvar} + export GATK3=${params.gatk3} + export SCRIPTS_DIR=${params.scripts_dir} + export DEC_MAP=${decipher_file} + + process_trio.sh $family_id ${params.bcbio} + """ + + stub: + """ + vcf_base=${family_vcf.getName().replace('-gatk-haplotype-annotated.vcf.gz', '')} + mkdir VCF G2P VASE COV + mkdir -p DECIPHER/IGV BAMOUT/${proband_id} + touch VCF/\$vcf_base.{decomp,norm,DNU,ready}.vcf.gz + touch VCF/\$vcf_base.{AC0,clean}.vcf + touch G2P/${plate_id}_${family_id}{_inter_out.txt{,_summary.html},.report.{html,txt}} + touch G2P/3631209.txt + touch VASE/\$vcf_base.ready.denovo.vcf + touch COV/${proband_id}_${family_id}.DD15.COV.txt + touch COV/${proband_id}_${family_id}.DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary} + touch COV/${proband_id}_${family_id}.REC_SNP_COV.txt + for i in \$(cat $family_ped | cut -f 2 | tr '\n' ' ') + do + touch "VCF/\$vcf_base.ready.\$i.vcf.gz" + done + + touch DECIPHER/${proband_id}_${family_id}_{DEC_FLT.csv,DECIPHER_v10.xlsx} + touch DECIPHER/IGV/${proband_id}_${family_id}.snapshot.FLT.csv + touch DECIPHER/IGV/bamout_${proband_id}_${family_id}.snapshot.txt + + touch BAMOUT/${family_id}/${family_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf,vcf.idx} + """ +} + + +process dnu_clean { + conda conda_env("$conda_python2 $conda_vt $conda_bcftools $conda_tabix") + + input: + val(trio_input) + path(reference_fa) + path(reference_fai) + path(g2p_targets) + path(blacklist) + + output: + val(trio_input, emit: trio_data) + path("${base}.ready.vcf.gz", emit: ready_vcf) + path("${base}.ready.vcf.gz.tbi", emit: ready_vcf_tbi) + path("${base}.clean.vcf", emit: clean_vcf) + + publishDir "${prioritisation_out}/VCF", mode: 'copy', pattern: '*.ready.vcf.gz*' + + // DNU and clean the family VCF, e.g. ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz, + // plus deal with strand bias variants + script: + base = "${trio_input.plate_id}_${trio_input.family_id}" + dnu_vcf = "${base}.DNU.vcf.gz" + sb_exome_vcf = "${base}.SB_exome.vcf.gz" + sb_ddg2p_vcf = "${base}.SB_DDG2P.vcf.gz" + sb_vcf = "${base}.SB.vcf.gz" + + """ + vt decompose -s ${trio_input.family_vcf} -o ${base}.decomp.vcf.gz && + vt normalize ${base}.decomp.vcf.gz -r ${reference_fa} -o ${base}.norm.vcf.gz && + vt uniq ${base}.norm.vcf.gz -o ${dnu_vcf} && + + # identify variants (exome wide) which fail the strand bias filter: FS >= 60 and SOR >= 3 + bcftools filter --include "INFO/FS>=60 & INFO/SOR>=3" ${dnu_vcf} | bgzip > ${sb_exome_vcf} && + tabix -p vcf ${sb_exome_vcf} && + + # select only those that are within/overlap the DDG2P panel + bcftools view --regions-file ${g2p_targets} ${sb_exome_vcf} | bgzip > ${sb_ddg2p_vcf} && + tabix -p vcf ${sb_ddg2p_vcf} && + + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" && + echo " FAMILY_ID = ${trio_input.family_id}: the VCF file containing excluded DDG2P variants with strand bias (FS >= 60 and SOR >= 3): " && + echo " ${sb_ddg2p_vcf}" && + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" && + + # exclude the strand bias variants + bcftools filter --exclude "INFO/FS>=60 & INFO/SOR>=3" ${dnu_vcf} | bgzip > ${sb_vcf} && + tabix -p vcf ${sb_vcf} && + + # remove sites with AC=0 + bcftools view --min-ac=1 --no-update ${dnu_vcf} > ${base}.AC0.vcf && + + # reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0 + # exclude variants from the blacklist (matching on chr,pos,ref,alt) + filter_LQ_GT.py ${blacklist} ${base}.AC0.vcf ${base}.clean.vcf && + + # bgzip and tabix it + cat ${base}.clean.vcf | bgzip > ${base}.ready.vcf.gz && + tabix -p vcf ${base}.ready.vcf.gz + """ + + stub: + base = "${trio_input.plate_id}_${trio_input.family_id}" + """ + touch ${base}.ready.vcf.gz{,.tbi} + touch ${base}.clean.vcf + """ +} + + +process g2p { + // run G2P for each family VCF (DD genes), e.g. ${PLATE_ID}_${FAMILY_ID}.clean.vcf + + conda conda_env(conda_vep) + + input: + val(trio_data) + path(clean_vcf) + path(reference_fa) + path(reference_fai) + path(g2p_target_csv) + path(g2p_genes) + path(vep_cache) + path(vep_plugins) + + publishDir "${prioritisation_out}/G2P", mode: 'copy' + + output: + tuple( + val(trio_data), + path("${base}_LOG_DIR/${base}_inter_out.txt"), + path("${base}_LOG_DIR/${base}_inter_out.txt_summary.html"), + path("${base}_LOG_DIR/${base}.report.html"), + path("${base}_LOG_DIR/${base}.report.txt"), + path("${base}_LOG_DIR/*.txt") + ) + + script: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + G2P_LOG_DIR=${base}_LOG_DIR + TXT_OUT=\${G2P_LOG_DIR}/${base}.report.txt + HTML_OUT=\${G2P_LOG_DIR}/${base}.report.html + VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38' + + vep \ + -i ${clean_vcf} \ + --output_file \${G2P_LOG_DIR}/${base}_inter_out.txt \ + --force_overwrite \ + --assembly GRCh38 \ + --fasta ${reference_fa} \ + --offline \ + --merged \ + --use_given_ref \ + --cache --cache_version 100 \ + --dir_cache ${vep_cache} \ + --individual all \ + --transcript_filter "gene_symbol in ${g2p_genes}" \ + --dir_plugins ${vep_plugins} \ + --plugin G2P,file='${g2p_target_csv}',af_from_vcf=1,confidence_levels='definitive&strong&moderate',af_from_vcf_keys='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38',log_dir=\${G2P_LOG_DIR},txt_report=\${TXT_OUT},html_report=\${HTML_OUT} + """ + + stub: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + log_dir=${base}_LOG_DIR + mkdir \$log_dir + touch \$log_dir/${base}{_inter_out.txt{,_summary.html},.report.{html,txt}} + touch \$log_dir/3631209.txt + """ +} + + +process vase { + // run VASE for each family VCF (de novo) + + conda conda_env("$conda_vase $conda_tabix") + + input: + val(trio_data) + path(ready_vcf) + path(ready_vcf_tbi) + + output: + val(trio_data, emit: trio_data) + path("${base}.strict.denovo.vcf", emit: strict_denovo_vcf) + + script: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + OUT_FILE=${base}.strict.denovo.vcf + + vase \ + -i ${ready_vcf} \ + -o \${OUT_FILE} \ + --log_progress \ + --prog_interval 100000 \ + --freq 0.0001 \ + --gq 30 --dp 10 \ + --het_ab 0.3 \ + --max_alt_alleles 1 \ + --csq all \ + --biotypes all \ + --control_gq 15 --control_dp 5 \ + --control_het_ab 0.01 \ + --control_max_ref_ab 0.05 \ + --de_novo \ + --ped ${trio_data.family_ped} + + + # for the cases where one of the parents is also affected, VASE de novo will crash not being able to find parents for the affected parent + # not producing any output file, which trips the pipeline downstream + # to handle: check if VASE denovo produced an output and if not (as it will be for such cases) + # create an empty VCF headered output == no denovos found) + + if [ -f "\${OUT_FILE}" ]; then + echo "VASE denovo completed successfully" + else + echo "WARNING: VASE denovo has not produced an output (e.g. affected parent), generate an empty VCF one (with headers)" + tabix -H ${ready_vcf} > \${OUT_FILE} + fi + """ + + stub: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + touch ${base}.strict.denovo.vcf + """ +} + + +process filter_denovo_vcfs { + // run VASE for each family VCF (de novo) + + conda conda_env("$conda_gatk4 $conda_bcftools") + + input: + val(trio_data) + path(strict_denovo_vcf) + path(reference_fa) + path(reference_fai) + path(reference_dict) + path(list_24_chr) + + publishDir "${prioritisation_out}/VASE", mode: 'copy' + + output: + tuple( + val(trio_data), + path("${base}.ready.denovo.vcf") + ) + + script: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + strict_24chr_denovo_vcf=${base}.strict.24chr.denovo.vcf + strict_24chr_sort_denovo_vcf=${base}.strict.24chr.sort.denovo.vcf + + # do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions + ### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with “Unknown†status may be classified as “denovo†if enough support + # index the denovo VCF + gatk IndexFeatureFile -I ${strict_denovo_vcf} + + # select only variants on the 24 chromosomes + gatk SelectVariants -R ${reference_fa} -V ${strict_denovo_vcf} -O \$strict_24chr_denovo_vcf -L $list_24_chr --exclude-non-variants + + # sort the VCF (maybe not needed?, but just in case, and it is quick) + grep '^#' \$strict_24chr_denovo_vcf > \$strict_24chr_sort_denovo_vcf \ + && grep -v '^#' \$strict_24chr_denovo_vcf | LC_ALL=C sort -t \$'\t' -k1,1V -k2,2n >> \$strict_24chr_sort_denovo_vcf + # index the sorted VCF + gatk IndexFeatureFile -I \$strict_24chr_sort_denovo_vcf + + # split multi-allelic sites [by -m -any] + # left-alignment and normalization [by adding the -f] + bcftools norm -f ${reference_fa} -m -any -Ov -o ${base}.ready.denovo.vcf \$strict_24chr_sort_denovo_vcf + """ + + stub: + base = "${trio_data.plate_id}_${trio_data.family_id}" + """ + touch ${base}.ready.denovo.vcf + """ +} + + +process coverage { + // run coverage for each proband (DD genes) + // format: ${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}-ready.bam + + conda conda_env("$conda_python2 $conda_gatk3") + + input: + val(trio_data) + path(reference_fa) + path(reference_fai) + path(reference_dict) + path(g2p_targets) + path(gatk3) + path(clinvar) + + publishDir "${prioritisation_out}/COV", mode: 'copy' + + output: + path("${full_proband}*DD15*") + + script: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + # make sure we are reading the data from the exact batch & plate ID + OUT_FILE=${full_proband}.DD15 + + java -Xmx8g -jar ${gatk3} -T DepthOfCoverage -R ${reference_fa} -o \${OUT_FILE} -I ${trio_data.proband_bam} -L ${g2p_targets} \ + --omitDepthOutputAtEachBase \ + --minBaseQuality 20 \ + --minMappingQuality 20 \ + -ct 20 \ + -jdk_deflater \ + -jdk_inflater \ + --allow_potentially_misencoded_quality_scores && + + echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${full_proband} ..." + cat \${OUT_FILE}.sample_summary | awk '{print \$7}' && + + # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval + get_cov_output.py \${OUT_FILE}.sample_interval_summary ${clinvar} \${OUT_FILE}.COV.txt + """ + + stub: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + touch ${full_proband}.DD15.COV.txt + touch ${full_proband}.DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary} + """ +} + +process recurrent_denovo_snp_cov { + // check the coverage per each of the recurent de novo SNPs (padded with 15bp both directions) + + conda conda_env(conda_samtools) + input: + val(trio_data) + path(rec_snp) + + publishDir "${prioritisation_out}/COV", mode: 'copy' + + output: + path("${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt") + + script: + """ + # we have identified the name of the proband's BAM file above (PROBAND_BAM), reuse it + # set the name of the file containing info about the coverage of the recurrent SNPs + REC_OUT_FILE=${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt + + while IFS=\$'\t' read -ra var; do + gene="\${var[0]}" + chr="\${var[1]}" + pos="\${var[2]}" + lo=\$(expr \$pos - 15) + hi=\$(expr \$pos + 15) + reg="\$lo-\$hi" + echo "=============================================" + echo "\$gene : recurrent variant at \$chr:\$pos" + echo "exploring coverage at \$chr:\$reg" + + echo "---------------------------------------------" + echo "precisely at the position" + samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam} | grep "\$pos" + + echo "---------------------------------------------" + echo "average in the +/- 15bp region" + samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam} | awk '{sum+=\$3} END { print "Average = ",sum/NR}' + + echo "---------------------------------------------" + echo "detailed in the +/- 15bp region" + samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam} + done < ${rec_snp} > ${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt + """ + + stub: + """ + touch ${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt + """ +} + +process split_family_vcf { + conda conda_env(conda_bcftools) + + input: + val(trio_data) + path(ready_vcf) + path(ready_vcf_tbi) + path(reference_fa) + path(reference_fai) + + publishDir "${prioritisation_out}/VCF", mode: 'copy' + + output: + tuple( + val(trio_data), + path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.proband_id}_${trio_data.family_id}.vcf.gz"), // proband vcf + path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.mother_id}_${trio_data.family_id}.vcf.gz"), // mother_vcf + path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.father_id}_${trio_data.family_id}.vcf.gz"), // father_vcf + ) + + script: + """ + # split the family VCF to individual VCFs + # -c1: minimum allele count (INFO/AC) of sites to be printed + # split multi-allelic sites (by -m -any) + # left-alignment and normalization (by adding the -f) + + file=${ready_vcf} + echo "splitting \$file" + for indi in `bcftools query -l \$file`; do + indv_rough_vcf_gz=\${file/.vcf*/.\$indi.rough.vcf.gz} + bcftools view -c1 -Oz -s \$indi -o \$indv_rough_vcf_gz \$file + bcftools norm -f ${reference_fa} -m -any -Oz -o \${file/.vcf*/.\$indi.vcf.gz} \$indv_rough_vcf_gz + done + """ + + stub: + """ + for i in ${trio_data.proband_id} ${trio_data.mother_id} ${trio_data.father_id} + do + touch ${trio_data.plate_id}_${trio_data.family_id}.ready.\${i}_${trio_data.family_id}.vcf.gz + done + """ +} + + +process generate_decipher_file { + // for each proband generate the DECIPHER file + // ${VCF_DIR}/${VCF_BASE}.ready.vcf.gz - the cleaned family VCF + // ${VASE_DIR}/${VCF_BASE}.ready.denovo.vcf - the VASE file + // ${TRANS_MAP} - the current transcript mapping file + + conda conda_env("$conda_python2 $conda_xlsxwriter") + + input: + val(trio_data) + path(vase_ready_denovo_vcf) + path(decipher_internal_ids) + path(g2p_file) + path(trans_map) + + publishDir "${prioritisation_out}/DECIPHER", mode: 'copy' + + output: + val(trio_data, emit: trio_data) + path("IGV/${full_proband}.snapshot.FLT.txt", emit: snapshot_flt_txt) + path("${full_proband}_DEC_FLT.csv", emit: dec_flt_csv) + path("${full_proband}_DECIPHER_v10.xlsx", emit: dec_v10_xlsx) + + + script: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + # VASE file - already split, left-aligned and normalized + mkdir IGV + + ## call the python script + generate_DEC_IGV_scripts.py \ + ${decipher_internal_ids} \ + ${trans_map} \ + ${trio_data.family_ped} \ + ${g2p_file} \ + ${vase_ready_denovo_vcf} \ + ${trio_data.plate_id}_${trio_data.family_id} \ + ${trio_data.proband_vcf} \ + ${trio_data.mother_vcf} \ + ${trio_data.father_vcf} \ + ${trio_data.plate_id} \ + ${trio_data.family_id} \ + ${trio_data.proband_bam} \ + ${trio_data.mother_bam} \ + ${trio_data.father_bam} \ + ${full_proband}_DEC_FLT.csv \ + IGV/${full_proband}.snapshot.FLT.txt && + + ## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10 + echo "...Generating v10 Decipher bulk upload file for proband = ${trio_data.proband_id}, family_id = ${trio_data.family_id} ..." && + convert_DEC_to_v10.py ${full_proband} ${full_proband}_DEC_FLT.csv ${full_proband}_DECIPHER_v10.xlsx + """ + + stub: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + mkdir IGV + touch IGV/${full_proband}.snapshot.FLT.txt + touch ${full_proband}_{DEC_FLT.csv,DECIPHER_v10.xlsx} + """ +} + + +process igv_snapshot_bamouts { + // for each variant in the DECIPHER upload file + // generate a IGV snapshot based on the realigned BAM used by GATK for calling variants + // first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder) + // then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs + + conda conda_env(conda_gatk4) + + input: + val(trio_data) + path(proband_dec_flt_csv) + path(reference_fa) + path(reference_fai) + path(reference_dict) + + publishDir "${prioritisation_out}/BAMOUT", mode: 'copy', pattern: "${trio_data.family_id}/*.bamout.*" + + output: + val(trio_data, emit: trio_data) + path("${trio_data.family_id}/*.bamout.*", emit: bamout, optional: true) + path(proband_dec_flt_csv, emit: proband_dec_flt_csv) + + script: + """ + echo "...Generating BAMOUT files for the ${trio_data.family_id} family, proband = ${trio_data.proband_id} ..." + + # gather the trio BAM files + echo "...kid_bam = ${trio_data.proband_bam}..." + echo "...par_1_bam = ${trio_data.mother_bam}..." + echo "...par_2_bam = ${trio_data.father_bam}..." + + # gather the variants in the DECIPHER file for which to generate bamouts + # chr is the second column - need to add the 'chr' prefix + # pos is the third column + # the first line is a header line, starting with 'Internal reference number or ID' + # file called: <proband_id>_<fam_id>_DEC_FLT.csv + # and for each run GATK to generate the bamout files + # to be stored in \${BAMOUT_DIR}/\${FAMILY_ID} + + mkdir -p ${trio_data.family_id} && + + echo "... reading ${proband_dec_flt_csv} to generate the bamouts..." && + + grep -v '^Internal' ${proband_dec_flt_csv} | + while IFS= read -r line + do + echo "\$line" + IFS=, read -ra ary <<<"\$line" + chr=\${ary[1]} + pos=\${ary[2]} + ref=\${ary[4]} + alt=\${ary[5]} + echo " --> chr = \$chr, pos = \$pos, ref = \${ref}, alt = \${alt}" + + # generate the bamout file + echo "...doing the bamout" + + gatk HaplotypeCaller \ + --reference ${reference_fa} \ + --input ${trio_data.proband_bam} \ + --input ${trio_data.mother_bam} \ + --input ${trio_data.father_bam} \ + -L chr\${chr}:\${pos} \ + --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 \ + --output ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.vcf \ + -bamout ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.bam + done + """ + + stub: + """ + mkdir ${trio_data.family_id} + touch ${trio_data.family_id}/${trio_data.family_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf{,.idx}} + """ +} + +process generate_igv_snapshots { + // write the IGV batch file for this family based on the bamouts + // to be stored as DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt + + input: + val(trio_data) + path(proband_dec_flt_csv) + + publishDir "${prioritisation_out}/DECIPHER/IGV", mode: 'copy' + + output: + path("bamout_${full_proband}.snapshot.txt") + + script: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + snap_file=bamout_${full_proband}.snapshot.txt && + + # check if previous version exist, if so - delete it + if [ -f "\$snap_file" ]; then + echo "previous version of \$snap_file exist --> deleted" + rm \$snap_file + fi && + + # write the header for the IGV batch file + echo "new" >> \$snap_file + echo "genome hg38" >> \$snap_file + echo "snapshotDirectory \"${trio_data.plate_id}_${trio_data.family_id}\"" >> \$snap_file + echo "" >> \$snap_file + + # now, go again over the variants in the DECIPHER file and generate one snapshot file for all the variants + echo "... reading ${proband_dec_flt_csv} to generate the IGV batch file using the bamouts..." + + grep -v '^Internal' ${proband_dec_flt_csv} | + while IFS= read -r line + do + IFS=, read -ra ary <<<"\$line" + chr=\${ary[1]} + pos=\${ary[2]} + ref=\${ary[4]} + alt=\${ary[5]} + left=\$((\${pos}-25)) + right=\$((\${pos}+25)) + + echo "new" >> \$snap_file + echo "load ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.bam" >> \$snap_file + echo "preference SAM.SHADE_BASE_QUALITY true" >> \$snap_file + + echo "goto chr\${chr}:\${left}-\${right}" >> \$snap_file + echo "group SAMPLE" >> \$snap_file + echo "sort base" >> \$snap_file + echo "squish" >> \$snap_file + echo "snapshot bamout_${full_proband}_chr\${chr}_\${pos}_\${ref}_\${alt}.png" >> \$snap_file + echo "" >> \$snap_file + echo "" >> \$snap_file + + done + + echo "Generating of the IGV batch files based on bamouts - done!" + echo "snap_file = \$snap_file" + """ + + stub: + full_proband = "${trio_data.proband_id}_${trio_data.family_id}" + """ + touch bamout_${full_proband}.snapshot.txt + """ +} + + +workflow read_inputs { + main: + // ch_family_ids: [family1, family2, ...] + ch_family_ids = Channel.fromPath(params.ped_file) + .splitCsv(sep: '\t') + .map({line -> line[0]}) + .flatten() + .unique() + + ch_family_peds = ch_family_ids + .map( + { + it -> [ + it, + "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/params/${it}.ped" + ] + } + ) + + emit: + ch_family_ids = ch_family_ids + ch_family_peds = ch_family_peds +} + +workflow prioritisation_setup { + read_inputs() + + trio_setup( + read_inputs.out.ch_family_peds.map( + {family_id, ped -> ped} + ).collect() + ) + report = """ + Family ID file available at:\n + ${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}/FAM_IDs.txt + + Download this file for translation to DECIPHER IDs, and upload the resulting DECIPHER_INTERNAL_IDS.txt file to ${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}/ + """ + println report.stripIndent() +} + + +workflow prioritisation { + read_inputs() + + pipeline_dir = "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}" + prioritisation_dir = "$pipeline_dir/prioritization/${params.prioritisation_version}" + + ch_probands = Channel.fromPath("$prioritisation_dir/FAM_PRO.txt") + .splitCsv(sep: '\t') + + ch_vcfs = Channel.fromPath("$pipeline_dir/families/*/*-gatk-haplotype-annotated.vcf.gz") + .map({ it -> [it.getName().replace('-gatk-haplotype-annotated.vcf.gz', ''), it] }) + + ch_bams = Channel.fromPath("$pipeline_dir/families/*/*/*-ready.bam") + .map({[it.getParent().getName(), it]}) + + ch_family_ped_data = Channel.fromPath(params.ped_file) + .splitCsv(sep: '\t') + .filter({it[2] != '0' && it[3] != '0'}) + // todo: remove redundancy with FAM_PRO.txt + .map( + { + family_id, proband, father, mother, sex, affected -> + [proband, father, mother, family_id] + } + ).join(ch_bams) + .map( + { + proband, father, mother, family_id, proband_bam -> + [father, proband, proband_bam, mother, family_id] + } + ) + .join(ch_bams) + .map( + { + father, proband, proband_bam, mother, family_id, father_bam -> + [mother, proband, proband_bam, father, father_bam, family_id] + } + ) + .join(ch_bams) + .map( + { + mother, proband, proband_bam, father, father_bam, family_id, mother_bam -> + [family_id, proband, proband_bam, father, father_bam, mother, mother_bam] + } + ) + + ch_trio_inputs = read_inputs.out.ch_family_peds.map( + {full_family_id, ped -> + [full_family_id.split('_')[1], full_family_id, ped] + }) + .join(ch_probands) + .map( + {short_family_id, full_family_id, ped, proband -> + [full_family_id, ped, short_family_id, proband] + } + ) + .join(ch_vcfs) + .join(ch_family_ped_data) + .map( + { + full_family_id, ped, short_family_id, proband, vcf, full_proband, + proband_bam, full_father, father_bam, full_mother, mother_bam -> + [ + family_id: full_family_id.split('_')[1], + family_ped: ped, + family_vcf: vcf, + plate_id: full_family_id.split('_')[0], + proband_id: proband, + proband_bam: proband_bam, + father_id: full_father.split('_')[0], + father_bam: father_bam, + mother_id: full_mother.split('_')[0], + mother_bam: mother_bam + ] + } + ) + + ch_decipher_file = Channel.fromPath("$prioritisation_dir/DECIPHER_INTERNAL_ID*.txt").collect() + ch_reference_fa = Channel.fromPath(params.reference_genome, checkIfExists: true).collect() + ch_reference_fai = Channel.fromPath("${params.reference_genome}.fai", checkIfExists: true).collect() + ch_reference_dict = Channel.fromPath("${params.reference_dict}", checkIfExists: true).collect() + ch_blacklist = Channel.fromPath(params.blacklist, checkIfExists: true).collect() + ch_list_24_chr = Channel.fromPath(params.list_24_chr, checkIfExists: true).collect() + ch_g2p_target_bed = Channel.fromPath(params.g2p_target_bed, checkIfExists: true).collect() + ch_g2p_target_csv = Channel.fromPath(params.g2p_target_csv, checkIfExists: true).collect() + ch_g2p_genes = Channel.fromPath(params.g2p_genes, checkIfExists: true).collect() + ch_gatk3 = Channel.fromPath(params.gatk3, checkIfExists: true).collect() + ch_clinvar = Channel.fromPath(params.clinvar, checkIfExists: true).collect() + ch_rec_snp = Channel.fromPath(params.recurrent_snps, checkIfExists: true).collect() + ch_trans_map = Channel.fromPath(params.trans_map, checkIfExists: true).collect() + ch_vep_cache = Channel.fromPath(params.vep_cache, checkIfExists: true).collect() + ch_vep_plugins = Channel.fromPath(params.vep_plugins, checkIfExists: true).collect() + + dnu_clean(ch_trio_inputs, ch_reference_fa, ch_reference_fai, ch_g2p_target_bed, ch_blacklist) + g2p(dnu_clean.out.trio_data, dnu_clean.out.clean_vcf, ch_reference_fa, ch_reference_fai, ch_g2p_target_csv, ch_g2p_genes, ch_vep_cache, ch_vep_plugins) + vase(dnu_clean.out.trio_data, dnu_clean.out.ready_vcf, dnu_clean.out.ready_vcf_tbi) + filter_denovo_vcfs(vase.out.trio_data, vase.out.strict_denovo_vcf, ch_reference_fa, ch_reference_fai, ch_reference_dict, ch_list_24_chr) + coverage(ch_trio_inputs, ch_reference_fa, ch_reference_fai, ch_reference_dict, ch_g2p_target_bed, ch_gatk3, ch_clinvar) + recurrent_denovo_snp_cov(ch_trio_inputs, ch_rec_snp) + split_family_vcf(dnu_clean.out.trio_data, dnu_clean.out.ready_vcf, dnu_clean.out.ready_vcf_tbi, ch_reference_fa, ch_reference_fai) + + split_family_vcf.out.map({it -> [it[0].family_id, it]}) + .join(filter_denovo_vcfs.out.map({trio_data, ready_denovo_vcf -> [trio_data.family_id, ready_denovo_vcf]})) + .join( + g2p.out.map( + {trio_data, inter_out, inter_out_summary, report_html, report_txt, x_txt -> [trio_data.family_id, report_txt]} + ) + ) + .multiMap( + { family_id, split_data, ready_denovo, g2p_report -> + family_id: family_id + trio_data: [ + family_id: split_data[0].family_id, + family_ped: split_data[0].family_ped, + family_vcf: split_data[0].family_vcf, + plate_id: split_data[0].plate_id, + proband_id: split_data[0].proband_id, + proband_bam: split_data[0].proband_bam, + proband_vcf: split_data[1], + father_id: split_data[0].father_id, + father_bam: split_data[0].father_bam, + father_vcf: split_data[3], + mother_id: split_data[0].mother_id, + mother_bam: split_data[0].mother_bam, + mother_vcf: split_data[2] + ] + ready_denovo_vcf: ready_denovo + proband_g2p_report: g2p_report + } + ) + .set({ch_trios_with_vcfs}) + + generate_decipher_file(ch_trios_with_vcfs.trio_data, ch_trios_with_vcfs.ready_denovo_vcf, ch_decipher_file, ch_trios_with_vcfs.proband_g2p_report, ch_trans_map) + igv_snapshot_bamouts(generate_decipher_file.out.trio_data, generate_decipher_file.out.dec_flt_csv, ch_reference_fa, ch_reference_fai, ch_reference_dict) + generate_igv_snapshots(igv_snapshot_bamouts.out.trio_data, igv_snapshot_bamouts.out.proband_dec_flt_csv) +} diff --git a/tests/assets/input_data/ped_files/batch_1.ped b/tests/assets/input_data/ped_files/batch_1.ped index b494a07f123e08320cabb914cd6e47afd1ba6fc7..e0ecc42bb215a13c8e51bcbf6d4f9af7866470ee 100644 --- a/tests/assets/input_data/ped_files/batch_1.ped +++ b/tests/assets/input_data/ped_files/batch_1.ped @@ -1,6 +1,7 @@ -000001 000001 000002 000003 2 2 -000001 000002 0 0 1 1 -000001 000003 0 0 2 1 -000002 000004 000005 000006 2 2 -000002 000005 0 0 1 1 -000002 000006 0 0 2 1 +12345_000001 000001_000001 000002_000001 000003_000001 2 2 +12345_000001 000002_000001 0 0 1 1 +12345_000001 000003_000001 0 0 2 1 +12345_000002 000004_000002 000005_000002 000006_000002 2 2 +12345_000002 000005_000002 0 0 1 1 +12345_000002 000006_000002 0 0 2 1 +12345_000003 000007_000003 0 0 2 2 \ No newline at end of file diff --git a/tests/assets/input_data/ped_files/giab_test_trios.ped b/tests/assets/input_data/ped_files/giab_test_trios.ped index 566cce43692ccc031da29f5336c4a7130d093c25..63184eb9088d6da1f49335562f69d3bd48ba7176 100644 --- a/tests/assets/input_data/ped_files/giab_test_trios.ped +++ b/tests/assets/input_data/ped_files/giab_test_trios.ped @@ -1,4 +1,4 @@ -00001_000001 000001_000001 000002_000002 000003_000002 1 2 +00001_000001 000001_000001 000002_000001 000003_000001 1 2 00001_000001 000002_000001 0 0 1 1 00001_000001 000003_000001 0 0 2 1 00001_000002 000004_000002 000005_000002 000006_000002 1 2 diff --git a/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt b/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fde77da00f77cbe82af7b94786ed311fd4b863d --- /dev/null +++ b/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt @@ -0,0 +1,2 @@ +000001 decipher_000001 +000002 decipher_000002 diff --git a/tests/assets/input_data/sample_sheets/batch_1.tsv b/tests/assets/input_data/sample_sheets/batch_1.tsv index f0a24cca62135246abe875a9ae1b2021b4fb05e7..64910cfae6534a3f32b6a4a2bfb6c18ee3368cd1 100644 --- a/tests/assets/input_data/sample_sheets/batch_1.tsv +++ b/tests/assets/input_data/sample_sheets/batch_1.tsv @@ -1,7 +1,11 @@ individual_id read_1 read_2 -000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz -000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz -000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz -000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz -000003 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz -000003 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz +000001_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz +000001_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz +000002_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz +000002_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz +000003_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz +000003_000001 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz +000004_000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz +000005_000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz +000006_000002 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz +000007_000003 assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz \ No newline at end of file diff --git a/tests/assets/input_data/sample_sheets/giab_test_trios.tsv b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv index 35429e7d7a353c713b4503c09fdeb69539bbebf1..51b77671045667a4393cfbec0198cd2cf985ba97 100644 --- a/tests/assets/input_data/sample_sheets/giab_test_trios.tsv +++ b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv @@ -1,7 +1,7 @@ individual_id read_1 read_2 -000001_000001 assets/input_data/giab/AshkenazimTrio/HG002_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG002_R2.fastq.gz -000002_000001 assets/input_data/giab/AshkenazimTrio/HG003_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG003_R2.fastq.gz -000003_000001 assets/input_data/giab/AshkenazimTrio/HG004_R1.fastq.gz assets/input_data/giab/AshkenazimTrio/HG004_R2.fastq.gz -000004_000002 assets/input_data/giab/ChineseTrio/HG005_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG005_R2.fastq.gz -000005_000002 assets/input_data/giab/ChineseTrio/HG006_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG006_R2.fastq.gz -000006_000002 assets/input_data/giab/ChineseTrio/HG007_R1.fastq.gz assets/input_data/giab/ChineseTrio/HG007_R2.fastq.gz +000001_000001 assets/input_data/giab/raw_data/AshkenazimTrio/HG002.GRCh38.2x250_R1.fastq.gz assets/input_data/giab/raw_data/AshkenazimTrio/HG002.GRCh38.2x250_R2.fastq.gz +000002_000001 assets/input_data/giab/raw_data/AshkenazimTrio/HG003.GRCh38.2x250_R1.fastq.gz assets/input_data/giab/raw_data/AshkenazimTrio/HG003.GRCh38.2x250_R2.fastq.gz +000003_000001 assets/input_data/giab/raw_data/AshkenazimTrio/HG004.GRCh38.2x250_R1.fastq.gz assets/input_data/giab/raw_data/AshkenazimTrio/HG004.GRCh38.2x250_R2.fastq.gz +000004_000002 assets/input_data/giab/raw_data/ChineseTrio/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x_R1.fastq.gz assets/input_data/giab/raw_data/ChineseTrio/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x_R2.fastq.gz +000005_000002 assets/input_data/giab/raw_data/ChineseTrio/HG006.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R1.fastq.gz assets/input_data/giab/raw_data/ChineseTrio/HG006.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R2.fastq.gz +000006_000002 assets/input_data/giab/raw_data/ChineseTrio/HG007.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R1.fastq.gz assets/input_data/giab/raw_data/ChineseTrio/HG007.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R2.fastq.gz diff --git a/tests/run_giab_tests.sh b/tests/run_giab_tests.sh index 23ce6e9fde33310bc7cdbca3a8a5cd9224b937e9..eefe547bc2402cd1ef370a2fd6dd27f6750c51c8 100755 --- a/tests/run_giab_tests.sh +++ b/tests/run_giab_tests.sh @@ -5,21 +5,48 @@ source scripts/nextflow_detached.sh test_exit_status=0 nextflow -c "$NEXTFLOW_CONFIG" clean -f -echo "Reduced GiaB data - trios" -run_nextflow ../main.nf \ - -c "$NEXTFLOW_CONFIG" \ - --workflow 'variant-calling' \ +common_args="-ansi-log false -c $NEXTFLOW_CONFIG -profile testing -resume" + +echo "--- Reduced GiaB data: trios ---" && +echo Variant calling && +nextflow ../main.nf \ + $common_args \ + --workflow variant_calling \ --pipeline_project_id giab_test_trios \ --pipeline_project_version v1 \ - --ped_file $PWD/assets/input_data/ped_files/giab_test_trios.ped \ - --sample_sheet $PWD/assets/input_data/sample_sheets/giab_test_trios.tsv + --ped_file assets/input_data/ped_files/giab_test_trios.ped \ + --sample_sheet assets/input_data/sample_sheets/giab_test_trios.tsv && + +echo Variant prioritisation setup && +nextflow ../main.nf \ + $common_args \ + --workflow variant_prioritisation_setup \ + --pipeline_project_id giab_test_trios \ + --variant_prioritisation_version v1 \ + --ped_file assets/input_data/ped_files/giab_test_trios.ped && + +cp assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_giab_trios.txt outputs/giab_test_trios_v1/prioritization/v1 && + +echo Variant prioritisation && +nextflow run ../main.nf \ + $common_args \ + --workflow variant_prioritisation \ + --pipeline_project_id giab_test_trios \ + --variant_prioritisation_version v1 \ + --ped_file assets/input_data/ped_files/giab_test_trios.ped && + +echo Cram compression && +nextflow run ../main.nf \ + $common_args \ + --workflow cram_compression \ + --compress_dir outputs/giab_test_trios_v1 test_exit_status=$(( $test_exit_status + $? )) -echo "Reduced GiaB data - non-trios" -run_nextflow ../main.nf \ - -c "$NEXTFLOW_CONFIG" \ - --workflow 'variant-calling' \ +echo "--- Reduced GiaB data: non-trios ---" && +nextflow run ../main.nf \ + $common_args \ + --workflow variant_calling \ --pipeline_project_id giab_test_non_trios \ --pipeline_project_version v1 \ --ped_file $PWD/assets/input_data/ped_files/giab_test_non_trios.ped \ diff --git a/tests/run_stubs.sh b/tests/run_stubs.sh index 3f8f86418e86414da673d550086d53b052a097d3..ad2458ec38ae9d30e251f4ad182c72e9679dbd02 100755 --- a/tests/run_stubs.sh +++ b/tests/run_stubs.sh @@ -3,34 +3,109 @@ source scripts/nextflow_detached.sh bcbio=$PWD/scripts/bcbio_nextgen.py +common_args="-stub-run -profile testing,stubs -ansi-log false" test_exit_status=0 rm -r work/*/* -echo "Test case 1: simple trio" +echo "--- Test case 1: simple trio ---" && +echo "Variant calling" && run_nextflow ../main.nf \ - -stub-run -profile stubs \ - --workflow variant-calling \ + $common_args \ + --workflow variant_calling \ --pipeline_project_id test_stub \ - --pipeline_project_version v1 \ --ped_file assets/input_data/ped_files/batch_1.ped \ - --sample_sheet assets/input_data/sample_sheets/batch_1.tsv + --sample_sheet assets/input_data/sample_sheets/batch_1.tsv && + +echo "Variant prioritisation setup" && +run_nextflow ../main.nf \ + $common_args \ + --workflow variant_prioritisation_setup \ + --pipeline_project_id test_stub \ + --variant_prioritisation_version v1 \ + --ped_file outputs/test_stub_v1/params/batch_1.ped \ + --exclude_families 12345_000003,12345_000004 && + +cp assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt outputs/test_stub_v1/prioritization/v1/DECIPHER_INTERNAL_IDs.txt && + +echo "Variant prioritisation" && +run_nextflow ../main.nf \ + $common_args \ + --workflow variant_prioritisation \ + --pipeline_project_id test_stub \ + --variant_prioritisation_version v1 \ + --ped_file outputs/test_stub_v1/params/batch_1.ped + +output_dir='outputs/test_stub_v1/prioritization/v1' + +for pro_fam in 000001_000001 000004_000002 +do + for ext in DD15.COV.txt REC_SNP_COV.txt DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary} + do + ls $output_dir/COV/$pro_fam.$ext + done + + ls $output_dir/DECIPHER/${pro_fam}_{DEC_FLT.csv,DECIPHER_v10.xlsx} + ls $output_dir/DECIPHER/IGV/$pro_fam.snapshot.FLT.txt + ls $output_dir/DECIPHER/IGV/bamout_$pro_fam.snapshot.txt +done + +for fam_id in 000001 000002 +do + ls $output_dir/BAMOUT/${fam_id}/${fam_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf,vcf.idx} + ls $output_dir/G2P/12345_${fam_id}_LOG_DIR/12345_${fam_id}_inter_out.txt{,_summary.html} + ls $output_dir/G2P/12345_${fam_id}_LOG_DIR/12345_${fam_id}.report.{html,txt} + ls $output_dir/VCF/12345_${fam_id}.ready.vcf.gz + ls $output_dir/VASE/12345_${fam_id}.ready.denovo.vcf +done + +ls $output_dir/{DECIPHER_INTERNAL_IDS,{FAM,PRO}_IDs,FAM_PRO}.txt +ls $output_dir/VCF/12345_000001.ready{,.00000{1,2,3}_000001}.vcf.gz +ls $output_dir/VCF/12345_000002.ready{,.00000{4,5,6}_000002}.vcf.gz + +# should be bams +! ls outputs/test_stub_v1/families/*/*/*.cram && +ls outputs/test_stub_v1/families/*/*/*.bam && + +echo "Cram compression" && +run_nextflow ../main.nf \ + $common_args \ + --workflow cram_compression \ + --compress_dir outputs/test_stub_v1/families && + +# should be crams, not bams +! ls outputs/test_stub_v1/families/*/*/*.bam &> /dev/null && +ls outputs/test_stub_v1/families/*/*/*.cram &> /dev/null && +echo "Cram decompression" && +run_nextflow ../main.nf \ + $common_args \ + --workflow cram_decompression \ + --compress_dir outputs/test_stub_v1/families && + +# should be bams again +! ls outputs/test_stub_v1/families/*/*/*.cram &> /dev/null && +ls outputs/test_stub_v1/families/*/*/*.bam &> /dev/null -test_exit_status=$(( $test_exit_status + $? )) +test_case_exit_status=$? +echo "Test case exited with status $test_case_exit_status" +test_exit_status=$(( $test_exit_status + $test_case_exit_status )) -echo "Test case 2: MD5 errors" +echo "--- Test case 2: MD5 errors ---" && run_nextflow ../main.nf \ - -stub-run -profile stubs \ - --workflow variant-calling \ + $common_args \ + --workflow variant_calling \ --pipeline_project_id test_stub_md5_errors \ --pipeline_project_version v1 \ --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped \ --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.tsv -if [ $? == 0 ] +test_case_exit_status=$? +echo "Test case exited with status $test_case_exit_status" +if [ $test_case_exit_status -eq 0 ] then test_exit_status=$(( $test_exit_status + 1 )) fi +echo "" echo "Tests finished with exit status $test_exit_status" exit $test_exit_status