diff --git a/.gitignore b/.gitignore
index fb2f9b9917e6068037312ed87b9176132662c6f1..f8dcefca4cce8f31cc95bb82f5c0d0d0c4b043c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 work
 *#
 *~
+tests/outputs
\ No newline at end of file
diff --git a/assets/trio_whole_exome_bcbio_template.yaml b/assets/trio_whole_exome_bcbio_template.yaml
index 75d49bd28396c820e3c2e32470ca13f6d00c4914..6505c0b55e47c86133988690f7f99f528bab6a10 100644
--- a/assets/trio_whole_exome_bcbio_template.yaml
+++ b/assets/trio_whole_exome_bcbio_template.yaml
@@ -20,7 +20,7 @@ details:
   genome_build: hg38
 upload:
   # relative path will output locally to the bcbio run folder
-  dir: ./results 
+  dir: ./results
 resources:
   gatk-haplotype:
       options: '["--active-probability-threshold", "0", "--interval-padding", 100]'
diff --git a/bin/convert_DEC_to_v10.py b/bin/convert_DEC_to_v10.py
old mode 100644
new mode 100755
index 67f26a78bb6cd0b061a1c298a7704d8b268c6217..f4b23df27aaaaff3fc01bb4d1a83d224d4ff01ab
--- a/bin/convert_DEC_to_v10.py
+++ b/bin/convert_DEC_to_v10.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 #	given a DECIPHER bulk upload file v9
 #	convert it to v10
 #
@@ -14,10 +15,7 @@ import xlsxwriter
 
 
 
-def go(inout_dir,id):		# the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id>
-
-    in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id)
-    out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id)
+def go(id, in_file, out_file):		# id is in the format <indiv_id>_<family_id>; in_file is the v9 bulk upload CSV, out_file the v10 xlsx
 
     # create the workbook
     workbook = xlsxwriter.Workbook(out_file)
@@ -72,9 +70,9 @@ def go(inout_dir,id):		# the folder where the bulk upload files are strored; id
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 3:
-        go(sys.argv[1],sys.argv[2])
+    if len(sys.argv) == 4:
+        go(*sys.argv[1:])
     else:
-        print ("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>")
+        print("Suggested use: convert_DEC_to_v10.py <indiv_id>_<family_id> <in_file> <out_file>")
         raise SystemExit
 
diff --git a/bin/extract_solo_FAM_PRO_ID.py b/bin/extract_solo_FAM_PRO_ID.py
index 3dcfa453973cf6c680b46b34751f426bbe4f4c28..6306fa987fa369962c8b8fd67ecd50a03e2f613b 100755
--- a/bin/extract_solo_FAM_PRO_ID.py
+++ b/bin/extract_solo_FAM_PRO_ID.py
@@ -1,22 +1,17 @@
+#!/usr/bin/env python3
 #	input:	the work folder which contains a PED subfolder where all family PED files were copied
 #	output:	only for singletons
 #		solo_FAM_IDs.txt, solo_PRO_IDs.txt and solo_FAM_PRO.txt
 #
 #       Author: MH
-#       last modified: APR 25, 2022
+#       last modified: JUL 6, 2022
 
+import argparse
 
-
-import sys
-import os
-
-
-
-def go(work_dir):
-
-    out_fam_file = work_dir + '/solo_FAM_IDs.txt'
-    out_pro_file = work_dir + '/solo_PRO_IDs.txt'
-    out_f_p_file = work_dir + '/solo_FAM_PRO.txt'
+def go(args):
+    out_fam_file = 'solo_FAM_IDs.txt'
+    out_pro_file = 'solo_PRO_IDs.txt'
+    out_f_p_file = 'solo_FAM_PRO.txt'
 
     out_fam_han = open(out_fam_file,'w')
     out_pro_han = open(out_pro_file,'w')
@@ -26,21 +21,18 @@ def go(work_dir):
     cntr_pro = 0
     cntr_f_p = 0
 
-
     # go over the PED folder in the working dir and process each PED file
-    ped_dir = work_dir + '/PED'
-    print ""
-    print "Processing the PED files (in %s) to extract singleton FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)
+    print("")
+    print("Processing %i PED files to extract singleton FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files)))
 
-    for file in os.listdir(ped_dir):					# per each PED file
-        if file.endswith(".ped"):
+    for ped_file in args.ped_files:					# per each PED file
+        if ped_file.endswith(".ped"):
 
-            print "  %s" % (file)
-            in_file = os.path.join(ped_dir, file)
+            print("  %s" % (ped_file))
 
             # check how many lines in the PED file - if more than one cannot be a singleton, ignore
             num_lines = 0
-            in_han = open(in_file,'r')
+            in_han = open(ped_file,'r')
             for line in in_han:
                 num_lines += 1
             in_han.close()
@@ -48,7 +40,7 @@ def go(work_dir):
             if num_lines > 1:
                 continue
             if num_lines == 0:
-                print "ERROR: empty PED file %s" % (file)
+                print("ERROR: empty PED file %s" % (ped_file))
                 raise SystemExit
 
             # if here, the PED file contains exactly one line
@@ -56,7 +48,7 @@ def go(work_dir):
             CHILD_ID = 0
             FAM_ID = 0
 
-            in_han = open(in_file,'r')
+            in_han = open(ped_file,'r')
             for line in in_han:
                 data = [x.strip() for x in line.strip().split('\t')]
 
@@ -64,8 +56,8 @@ def go(work_dir):
                 y_indi,y_fam = data[1].split('_')			# in the internal PED file, indi_id is indiID_familyID, split
 
                 if x_fam != y_fam:
-                    print "ERROR: FAMILY_ID mismatch in %s" % (file)
-                    print line
+                    print("ERROR: FAMILY_ID mismatch in %s" % (ped_file))
+                    print(line)
                     raise SystemExit
 
                 FAM_ID = x_fam
@@ -73,8 +65,8 @@ def go(work_dir):
 
                 # check both parent IDs == 0
                 if (data[2] != '0') or (data[3] != '0'):
-                    print "ERROR: found parent ID for a singleton child in %s" % (file)
-                    print line
+                    print("ERROR: found parent ID for a singleton child in %s" % (ped_file))
+                    print(line)
                     raise SystemExit
 
                 # make sure the sex of the child is known
@@ -82,23 +74,23 @@ def go(work_dir):
                 if (CHILD_SEX == 1) or (CHILD_SEX == 2):
                     pass
                 else:
-                    print "ERROR: proband sex unknown in %s" % (file)
-                    print line
+                    print("ERROR: proband sex unknown in %s" % (ped_file))
+                    print(line)
                     raise SystemExit
 
                 # check kid is affected
                 if int(data[5]) != 2:
-                    print "ERROR: singleton child not affected"
-                    print line
+                    print("ERROR: singleton child not affected")
+                    print(line)
                     raise SystemExit
 
             if FAM_ID == 0:
-                print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
+                print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file))
                 raise SystemExit
             if CHILD_ID == 0:
-                print "ERROR: Cannot find CHILD_ID in %s" % (file)
+                print("ERROR: Cannot find CHILD_ID in %s" % (ped_file))
                 raise SystemExit
-            else:
+            elif FAM_ID not in args.exclude_families:
                 out_fam_han.write('%s\n' % (FAM_ID))
                 cntr_fam += 1
                 out_pro_han.write('%s\n' % (CHILD_ID))
@@ -110,23 +102,23 @@ def go(work_dir):
     out_pro_han.close()
     out_f_p_han.close()
 
-
-
-    print ""
-    print "Singleton Families:"
-    print "   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
-    print "   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
-    print "   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
-    print ""
-
-
-
+    print("")
+    print("Singleton Families:")
+    print("   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file))
+    print("   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file))
+    print("   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file))
+    print("")
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 2:
-        go(sys.argv[1])
-    else:
-        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_solo_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
-        raise SystemExit
-
+    a = argparse.ArgumentParser(
+        description='''
+            Extract singleton FAM_ID, PRO_ID and FAM_PRO files from PED files. Suggested use:
+            extract_solo_FAM_PRO_ID.py <ped_file> [<ped_file> ...]
+            --exclude-families <family_id> [<family_id> ...]
+        '''
+    )
+    a.add_argument('ped_files', nargs='+')
+    a.add_argument('--exclude-families', nargs='*', default=[])
+    args = a.parse_args()
+    go(args)
diff --git a/bin/extract_trio_FAM_PRO_ID.py b/bin/extract_trio_FAM_PRO_ID.py
index 4dac3b03ba55b6003b79dab6ff4e99582b408035..3bec2e218b0ac6ea5ce18fa9ee2c4d49580eaddc 100755
--- a/bin/extract_trio_FAM_PRO_ID.py
+++ b/bin/extract_trio_FAM_PRO_ID.py
@@ -1,22 +1,18 @@
+#!/usr/bin/env python3
 #	input:	the work folder which contains a PED subfolder where all family PED files were copied
 #	output:	only for trio families
 #		FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt
 #
 #       Author: MH
-#       last modified: APR 25, 2022
+#       last modified: JUL 6, 2022
 
+import argparse
 
 
-import sys
-import os
-
-
-
-def go(work_dir):
-
-    out_fam_file = work_dir + '/FAM_IDs.txt'
-    out_pro_file = work_dir + '/PRO_IDs.txt'
-    out_f_p_file = work_dir + '/FAM_PRO.txt'
+def go(args):
+    out_fam_file = 'FAM_IDs.txt'
+    out_pro_file = 'PRO_IDs.txt'
+    out_f_p_file = 'FAM_PRO.txt'
 
     out_fam_han = open(out_fam_file,'w')
     out_pro_han = open(out_pro_file,'w')
@@ -26,21 +22,18 @@ def go(work_dir):
     cntr_pro = 0
     cntr_f_p = 0
 
-
     # go over the PED folder in the working dir and process each PED file
-    ped_dir = work_dir + '/PED'
-    print ""
-    print "Processing the PED files (in %s) to extract FAM_ID, PRO_ID amd FAM_PRO files" % (ped_dir)
+    print("")
+    print("Processing %i PED files to extract FAM_ID, PRO_ID and FAM_PRO files" % (len(args.ped_files)))
 
-    for file in os.listdir(ped_dir):					# per each PED file
-        if file.endswith(".ped"):
-            print "  %s" % (file)
-            in_file = os.path.join(ped_dir, file)
+    for ped_file in args.ped_files:					# per each PED file
+        if ped_file.endswith(".ped"):
+            print("  %s" % (ped_file))
 
             CHILD_ID = 0
             FAM_ID = 0
 
-            in_han = open(in_file,'r')
+            in_han = open(ped_file,'r')
             for line in in_han:
                 data = [x.strip() for x in line.strip().split('\t')]
 
@@ -50,14 +43,14 @@ def go(work_dir):
                 if FAM_ID == 0:
                     FAM_ID = x_fam
                 elif FAM_ID != x_fam:
-                    print "ERROR: more than one FAMILY_ID in %s" % (file)
+                    print("ERROR: more than one FAMILY_ID in %s" % (ped_file))
                     raise SystemExit
 
                 if data[2] != '0' and data[3] != '0':			# this is the child in the trio
                     if CHILD_ID == 0:
                         CHILD_ID = y_indi
                     else:						# seen another child
-                        print "WARNING: already have seen a child (possibly a quad) in %s" % (file)
+                        print("WARNING: already have seen a child (possibly a quad) in %s" % (ped_file))
                         CHILD_ID = 0
                         break
 
@@ -65,21 +58,21 @@ def go(work_dir):
                     if (CHILD_SEX == 1) or (CHILD_SEX == 2):
                         pass
                     else:
-                        print "ERROR: proband sex unknown"
-                        print line
+                        print("ERROR: proband sex unknown")
+                        print(line)
                         raise SystemExit
 
                     if int(data[5]) != 2:
-                        print "ERROR: child in a trio not affected"
-                        print line
+                        print("ERROR: child in a trio not affected")
+                        print(line)
                         raise SystemExit
 
             if FAM_ID == 0:
-                print "ERROR: Cannot find the FAMILY_ID in %s" % (file)
+                print("ERROR: Cannot find the FAMILY_ID in %s" % (ped_file))
                 raise SystemExit
             if CHILD_ID == 0:
-                print "WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (file)
-            else:
+                print("WARNING: Cannot find exactly one CHILD_ID (with 2 available parents) in %s : not a trio --> will not be analyzed" % (ped_file))
+            elif FAM_ID not in args.exclude_families:
                 out_fam_han.write('%s\n' % (FAM_ID))
                 cntr_fam += 1
                 out_pro_han.write('%s\n' % (CHILD_ID))
@@ -91,23 +84,23 @@ def go(work_dir):
     out_pro_han.close()
     out_f_p_han.close()
 
-
-
-    print ""
-    print "Trio Families:"
-    print "   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file)
-    print "   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file)
-    print "   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file)
-    print ""
-
-
-
+    print("")
+    print("Trio Families:")
+    print("   %s FAM_IDs --> %s" % (cntr_fam, out_fam_file))
+    print("   %s PRO_IDs --> %s" % (cntr_pro, out_pro_file))
+    print("   %s FAM_PRO --> %s" % (cntr_f_p, out_f_p_file))
+    print("")
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 2:
-        go(sys.argv[1])
-    else:
-        print "Suggested use: time python /home/u035/u035/shared/scripts/extract_trio_FAM_PRO_ID.py /home/u035/u035/shared/analysis/work/<PROJECT_ID>"
-        raise SystemExit
-
+    a = argparse.ArgumentParser(
+        description='''
+            Extract trio FAM_ID, PRO_ID and FAM_PRO files from PED files. Suggested use:
+            extract_trio_FAM_PRO_ID.py <ped_file> [<ped_file> ...]
+            --exclude-families <family_id> [<family_id> ...]
+        '''
+    )
+    a.add_argument('ped_files', nargs='+')
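+    # families passed via --exclude-families are skipped when writing the FAM_IDs/PRO_IDs/FAM_PRO outputs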
+    a.add_argument('--exclude-families', nargs='*', default=[])
+    args = a.parse_args()
+    go(args)
diff --git a/bin/filter_LQ_GT.py b/bin/filter_LQ_GT.py
old mode 100644
new mode 100755
index fff3d37e2a61d9f71be84d10513a9f1350bf959f..2e612f867280cfd82140c4599bc98c506c4fe9b5
--- a/bin/filter_LQ_GT.py
+++ b/bin/filter_LQ_GT.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python2.7
 #	given a multi-sample VCF
 #	reset the GT to ./. (no-call) for indi variant with num_ALT < num_ALT_THERSH or VAF < VAF_THRESH
 #
diff --git a/bin/generate_DEC_IGV_scripts.py b/bin/generate_DEC_IGV_scripts.py
index 48f9e1400ada94cb7716489dbe3236bccaf3eddd..0c347caa630e76cd198b8bef01c66110035f5545 100755
--- a/bin/generate_DEC_IGV_scripts.py
+++ b/bin/generate_DEC_IGV_scripts.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python2.7
 #	input:
 #		the family PED file				[${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
 #		individual VCF file for the trio proband	[${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
@@ -71,12 +72,10 @@ SOR_THRESH = float(3)
 
 
 
-def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
-
+def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam, out_dec_file, out_igv_file):
     # read the decipher to internal ID mapping file
     read_map_file(dec_map_file)
 
-
     # read the transcript mapping file
     read_trans_map(trans_map_file)
 
@@ -106,11 +105,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir
 
 
     # now read the individual VCFs and record all the variants
-    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
-    mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID)
-    dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID)
-
-
     read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
     print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
     sys.stdout.flush()
@@ -150,13 +144,11 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir
 
 
     # setup the DECIPHER output file
-    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
     out_han = open(out_dec_file,'w')
     out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
 
 
     # setup the IGV snapshot file
-    out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
     out_igv_han = open(out_igv_file,'w')
     out_igv_han.write('new\n')
     out_igv_han.write('genome hg38\n')
@@ -164,9 +156,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir
     out_igv_han.write('maxPanelHeight 10000\n')
     out_igv_han.write('new\n')
 
-    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
-    mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
-    dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
     out_igv_han.write('load %s\n' % (child_bam))
     out_igv_han.write('load %s\n' % (mom_bam))
     out_igv_han.write('load %s\n' % (dad_bam))
@@ -182,7 +171,6 @@ def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir
     in_cntr = 0
     out_cntr = 0
 
-    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
     in_han = gzip.open(child_vcf_file,'r')
 
 
@@ -1343,10 +1331,10 @@ def read_trans_map(in_file):
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 12:
-        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11])
+    if len(sys.argv) == 17:
+        go(*sys.argv[1:])
     else:
         print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
-        dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
+        dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,child_vcf_file,mom_vcf_file,dad_vcf_file,plate_id,fam_id,child_bam,mom_bam,dad_bam,out_dec_file.txt,out_igv_file.txt"
         raise SystemExit
 
diff --git a/bin/get_cov_output.py b/bin/get_cov_output.py
old mode 100644
new mode 100755
index 49add7fa11c8c8979d2485da701c639b05e380e9..6e02f3ee62ff9c23e982c2141c15f3ab6fd25946
--- a/bin/get_cov_output.py
+++ b/bin/get_cov_output.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python2.7
 #	given
 #		<gene_set>.<ID>.sample_interval_summary					- e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
 #		<gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt		- e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
diff --git a/pipeline/lib/PipelineException.groovy b/lib/PipelineException.groovy
similarity index 100%
rename from pipeline/lib/PipelineException.groovy
rename to lib/PipelineException.groovy
diff --git a/main.nf b/main.nf
index f2b8c278b715a6658514e8eb80a7df741f264916..d5ec72f182225113914eb6c5d40b63e42e144157 100644
--- a/main.nf
+++ b/main.nf
@@ -1,43 +1,31 @@
 nextflow.enable.dsl = 2
 include {var_calling} from './pipeline/var_calling.nf'
+include {prioritisation_setup; prioritisation} from './pipeline/variant_prioritisation.nf'
+include {compress; decompress} from './pipeline/cram_compression.nf'
 
-// which part of the pipeline to run - either 'variant-calling' or 'variant-prioritisation'
-params.workflow = null
-
-// path to a bcbio install, containing 'anaconda', 'galaxy', 'genomes', etc
-params.bcbio = null
-
-// path to a template config for bcbio variant calling
-params.bcbio_template = null
-
-// where the results get written to on the system. The variant calling creates initial
-// results here, and variant prioritisation adds to them
-params.output_dir = null
-
-// name of the pipeline batch, e.g. '21900', '20220427'
-params.pipeline_project_id = null
-
-// version of the pipeline batch, e.g. 'v1'
-params.pipeline_project_version = null
+workflow {
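+    // params.workflow selects the sub-workflow to run; the valid values are documented in nextflow.config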
+    switch (params.workflow) {
+        case 'variant_calling':
+            var_calling()
+            break
 
-// bed file of Twist exome targets
-params.target_bed = null
+        case 'variant_prioritisation_setup':
+            prioritisation_setup()
+            break
 
-// hg38 reference genome in fasta format
-params.reference_genome = null
+        case 'variant_prioritisation':
+            prioritisation()
+            break
 
-// path to a Ped file describing all the families in the pipeline batch
-params.ped_file = null
+        case 'cram_compression':
+            compress()
+            break
 
-// path to a samplesheet mapping individual IDs to fastq pairs
-params.sample_sheet = null
+        case 'cram_decompression':
+            decompress()
+            break
 
-workflow {
-    if (params.workflow == 'variant-calling') {
-        var_calling()
-    } else if (params.workflow == 'variant-prioritisation') {
-        println "Variant prioritisation coming soon"
-    } else {
-        exit 1, 'params.workflow required - variant-calling or variant-prioritisation'
+        default:
+            exit 1, "Invalid params.workflow: ${params.workflow}"
     }
 }
diff --git a/nextflow.config b/nextflow.config
index 746f52de544fe570c2b8de52500049175e6bf6a3..2bb7e975d61d665e3206199b3790211e55737c76 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,19 +1,98 @@
 executor = 'slurm'
 
 params {
+    // General options
+
+    // which part of the pipeline to run - 'variant_calling', 'variant_prioritisation_setup', 'variant_prioritisation', 'cram_compression' or 'cram_decompression'
+    workflow = null
+
+    // where the results get written to on the system. The variant calling creates initial
+    // results here, and variant prioritisation adds to them
+    output_dir = null
+
+    // name of the pipeline batch, e.g. '21900', '20220427'
+    pipeline_project_id = null
+
+    // version of the pipeline batch, e.g. 'v1'
+    pipeline_project_version = 'v1'
+
+    // path to a Ped file describing all the families in the pipeline batch
+    ped_file = null
+
+    // bed file of Twist exome targets
+    target_bed = null
+
+    // hg38 reference genome in fasta format
+    reference_genome = null
+    reference_dict = null
+
+    // Ensembl VEP
+    vep_cache = null  // the 'genomes/Hsapiens/hg38/vep' folder in BCBio containing per-chromosome caches
+    vep_plugins = null  // the 'anaconda/share/ensembl-vep-100.4-0' folder containing Perl modules
+
+    // 'variant-calling' options
+
+    // path to a bcbio install, containing 'anaconda', 'galaxy', 'genomes', etc
+    bcbio = null
+
+    // path to a template config for bcbio variant calling
+    bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml"
+
+    // path to a samplesheet mapping individual IDs to fastq pairs
+    sample_sheet = null
+
+    // 'variant_prioritisation'/'variant_prioritisation_setup' options
+
+    // todo: remove this when we segment var prioritisation
+    scripts_dir = null
+
+    // version number for a variant prioritisation run
+    prioritisation_version = 'v1'
+
+    // families not to run variant prioritisation on
+    exclude_families = []
+
+    // path to GATK 3.8 - see #23
+    gatk3 = null
+
+    // use --fetal_trios to process trios against PanelApp's Fetal panel
+    fetal_trios = false
+
+    g2p_targets = null
+    g2p_genes = null
+    clinvar = null
+    blacklist = null
+    trans_map = null
+    recurrent_snps = null  // https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1
+    list_24_chr = null
+
+    // 'cram_compression'/'cram_decompression' options
+    // folder to search through recursively and perform compression/decompression on
+    compress_dir = null
+
+
+    // cluster options
+
+    // max and min resource parameters. Use get_cpus, get_mem and get_time in the
+    // `process` scope to ensure that jobs don't request more resources than can be
+    // provided
     max_cpus = 16
     max_mem = 32.GB
     max_time = 48.h
-
     min_cpus = 1
     min_mem = 1.GB
     min_time = 2.h
 
-    bcbio_template = "$projectDir/assets/trio_whole_exome_bcbio_template.yaml"
 }
 
 
 profiles {
+    conda.enabled = true
+
     standard {
         process.executor = 'local'
     }
@@ -22,7 +101,13 @@ profiles {
         process.echo = true
     }
 
+    testing {
+       params.output_dir = "$projectDir/tests/outputs"
+    }
+
     stubs {
+        conda.enabled = false
+        process.conda = null
         process.executor = 'local'
         params.max_cpus = 1
         params.max_mem = 500.MB
@@ -31,7 +116,18 @@ profiles {
         params.bcbio = "$projectDir/tests/scripts/bcbio_nextgen.py"
         params.target_bed = "$projectDir/tests/assets/input_data/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed"
         params.reference_genome = "$projectDir/tests/assets/ref.fa"
-        params.output_dir = "$projectDir/tests/outputs"
+        params.reference_dict = "$projectDir/tests/assets/ref.dict"
+        params.blacklist = "$projectDir/tests/assets/blacklist.txt"
+        params.list_24_chr = "$projectDir/tests/assets/list_24_chr.txt"
+        params.g2p_target_bed = "$projectDir/tests/assets/g2p_targets.bed"
+        params.g2p_target_csv = "$projectDir/tests/assets/g2p_targets.csv"
+        params.g2p_genes = "$projectDir/tests/assets/g2p_genes.txt"
+        params.gatk3 = "$projectDir/tests/scripts/bcbio_nextgen.py"
+        params.clinvar = "$projectDir/tests/assets/clinvar.txt"
+        params.recurrent_snps = "$projectDir/tests/assets/recurrent_snps.txt"
+        params.trans_map = "$projectDir/tests/assets/trans_map.txt"
+        params.vep_cache = "$projectDir/tests/assets/vep/cache"
+        params.vep_plugins = "$projectDir/tests/assets/vep/plugins"
     }
 
     slurm {
@@ -58,11 +154,16 @@ process {
         memory = get_mem(2.GB)
     }
 
-    withLabel: medium {
+    withLabel: small_medium {
         cpus = get_cpus(4)
         memory = get_mem(8.GB)
     }
 
+    withLabel: medium {
+        cpus = get_cpus(8)
+        memory = get_mem(16.GB)
+    }
+
     withLabel: large {
         cpus = get_cpus(16)
         memory = get_mem(32.GB)
diff --git a/pipeline/cram_compression.nf b/pipeline/cram_compression.nf
new file mode 100644
index 0000000000000000000000000000000000000000..3b937089f51b2f4f60c14d8538b297af8f71ff17
--- /dev/null
+++ b/pipeline/cram_compression.nf
@@ -0,0 +1,106 @@
+nextflow.enable.dsl = 2
+
+
+process bam_to_cram {
+    input:
+    path(bam)
+
+    output:
+    path("${bam.getBaseName()}.cram")
+
+    script:
+    """
+    samtools=${params.bcbio}/anaconda/bin/samtools &&
+    bam_flagstat=${bam}.flagstat.txt &&
+    cram="${bam.getBaseName()}.cram" &&
+    cram_flagstat=\$cram.flagstat.txt &&
+
+    if [ ! -f \$bam_flagstat ]
+    then
+        \$samtools flagstat $bam > \$bam_flagstat
+    fi
+
+    \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram ${bam} &&
+    \$samtools index \$cram &&
+    \$samtools flagstat \$cram > \$cram_flagstat
+
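+    # sanity check: the new CRAM must contain the same reads as the source BAM before the BAM is deleted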
+    if ! diff \$bam_flagstat \$cram_flagstat > /dev/null 2>&1
+    then
+        echo "Unexpected flagstat results in \$bam_flagstat and \$cram_flagstat"
+        exit 1
+    fi
+
+    cp \$cram \$(dirname \$(readlink -f $bam))&&
+    rm \$(readlink -f $bam)
+    """
+
+    stub:
+    """
+    cram="${bam.getBaseName()}.cram"
+    cp ${bam} \$cram &&
+    touch \$cram.crai &&
+    touch ${bam}.flagstat.txt \$cram.flagstat.txt &&
+    cp \$cram \$(dirname \$(readlink -f $bam)) &&
+    rm \$(readlink -f $bam)
+    """
+}
+
+process cram_to_bam {
+    input:
+    path(cram)
+
+    output:
+    path("${cram.getBaseName()}.bam")
+
+    script:
+    """
+    samtools=${params.bcbio}/anaconda/bin/samtools &&
+    bam="${cram.getBaseName()}.bam" &&
+    bam_flagstat=\${bam}.flagstat.txt &&
+    cram_flagstat=${cram}.flagstat.txt &&
+
+    if [ ! -f \$cram_flagstat ]
+    then
+        \$samtools flagstat $cram > \$cram_flagstat
+    fi
+
+    \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -b -o \$bam ${cram} &&
+    \$samtools index \$bam &&
+    \$samtools flagstat \$bam > \$bam_flagstat
+
+    if ! diff \$bam_flagstat \$cram_flagstat > /dev/null 2>&1
+    then
+        echo "Unexpected flagstat results in \$bam_flagstat and \$cram_flagstat"
+        exit 1
+    fi
+
+    cp \$bam \$(dirname \$(readlink -f $cram)) &&
+    rm \$(readlink -f $cram)
+    """
+
+    stub:
+    """
+    bam="${cram.getBaseName()}.bam"
+    cp ${cram} \$bam &&
+    touch \$bam.crai &&
+    touch ${cram}.flagstat.txt \$bam.flagstat.txt &&
+    cp \$bam \$(dirname \$(readlink -f $cram)) &&
+    rm \$(readlink -f $cram)
+    """
+}
+
+
+workflow compress {
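+    // recursively find BAMs under params.compress_dir and convert each to a CRAM written alongside the original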
+    ch_bams = Channel.fromPath(
+        "${params.compress_dir}/**/*.bam"
+    )
+    bam_to_cram(ch_bams)
+}
+
+
+workflow decompress {
+    ch_crams = Channel.fromPath(
+        "${params.compress_dir}/**/*.cram"
+    )
+    cram_to_bam(ch_crams)
+}
diff --git a/pipeline/validation.nf b/pipeline/validation.nf
index 18058f800fe028fa4788ad26c558b12a9b1cee17..7c2c35ff426951be054123edfd3f561d34bfc955 100644
--- a/pipeline/validation.nf
+++ b/pipeline/validation.nf
@@ -1,11 +1,18 @@
 nextflow.enable.dsl = 2
 
 process check_md5s {
+    /*
+    For a fastq file, find its md5 file and run md5sum -c. Output the fastq name if
+    invalid, otherwise empty string `''`.
+    */
     label 'small'
+    errorStrategy 'ignore'
 
     input:
-    val(fastq_file)
-    // getParent() gives null object errors if this is a path
+    val(fastq_file)  // getParent() gives null object errors if this is a path
+
+    output:
+    stdout
 
     script:
     """
@@ -14,7 +21,7 @@ process check_md5s {
     edgen_fastq=${fastq_file.getParent().getName()}/${fastq_file.getName()}
 
     crf_dir=${fastq_file.getParent()}
-    crf_md5_file="\$(ls \$crf_dir/*md5.txt | head -n 1)"
+    crf_md5_file="\$(find \$crf_dir -name '*md5.txt' | head -n 1)"
     crf_fastq=${fastq_file.getName()}
 
     local_md5_file=${fastq_file}.md5
@@ -22,17 +29,17 @@ process check_md5s {
     if [ -f \$edgen_md5_file ]
     then
         cd \$edgen_dir
-        md5sum -c <(cat md5sums.txt | grep \$edgen_fastq)
+        md5sum --quiet -c <(cat md5sums.txt | grep \$edgen_fastq) | cut -d ':' -f 1
 
     elif [ -f \$crf_md5_file ]
     then
         cd \$crf_dir
-        md5sum -c <(cat \$crf_md5_file | grep \$crf_fastq)
+        md5sum --quiet -c <(cat \$crf_md5_file | grep \$crf_fastq) | cut -d ':' -f 1
 
     elif [ -f \$local_md5_file ]
     then
         cd ${fastq_file.getParent()}
-        md5sum -c \$local_md5_file
+        md5sum --quiet -c \$local_md5_file | cut -d ':' -f 1
 
     else
         echo "Could not find md5 file for $fastq_file"
@@ -41,6 +48,7 @@ process check_md5s {
     """
 }
 
+
 workflow validation {
     /*
     Take a parsed samplesheet, flatten it and parse into a channel of observed vs.
@@ -60,6 +68,14 @@ workflow validation {
         .flatten()
         .map({file(it)})
 
-        check_md5s(ch_fastqs)
+        invalid_md5s = check_md5s(ch_fastqs)
+            .filter({it != ''})  // filter out the '' results from valid md5s
+            .collect()
+            .map(
+                {
+                    if (it != null) {  // invalid md5s found
+                        throw new PipelineException("Invalid md5s detected:\n${it.join('')}")
+                    }
+                }
+            )
 }
-
diff --git a/pipeline/var_calling.nf b/pipeline/var_calling.nf
index f2c9846ae382ca324411150270dfc1b2aef7fda2..b0b954c8f5a07021cef2d03264fa1ccd057d964d 100644
--- a/pipeline/var_calling.nf
+++ b/pipeline/var_calling.nf
@@ -5,7 +5,7 @@ include {validation} from './validation.nf'
 
 
 process merge_fastqs {
-    label 'medium'
+    label 'small_medium'
 
     input:
     tuple(val(indv_id), val(family_id), path(r1), path(r2))
@@ -54,6 +54,7 @@ process write_bcbio_csv {
 
 process bcbio_family_processing {
     label 'large'
+    label 'long'
 
     input:
     tuple(val(family_id), val(individuals), path(family_csv), path(merged_fastq1), path(merged_fastq2))
@@ -93,11 +94,12 @@ process bcbio_family_processing {
     output_dir=${family_id}/results
     family_dir="${family_id}/results/\$(date '+%Y-%m-%d')_${family_id}"
     mkdir -p \$family_dir
-    mkdir ${family_id}/config
+    mkdir ${family_id}/{config,provenance}
     touch ${family_id}/config/${family_id}{.csv,.yaml,-template.yaml}
+    touch ${family_id}/provenance/programs.txt
     cd \$family_dir
-    touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz{,.tbi}" bcbio-nextgen{,-commands}.log data_versions.csv
-    touch project-summary.yaml metadata.csv programs.txt
+    touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz"{,.tbi} bcbio-nextgen{,-commands}.log data_versions.csv
+    touch project-summary.yaml metadata.csv
     mkdir multiqc
     touch list_files_final.txt multiqc_config.yaml multiqc_report.html
     mkdir multiqc_data report
@@ -141,48 +143,10 @@ process format_bcbio_individual_outputs {
         indv_output=individual_outputs/\$i &&
         mkdir -p \$indv_output &&
 
-        ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
-        ln -s \$indv_input/qc \$indv_output/qc &&
-
-        # todo: make cram compression its own process
-        bam=\$indv_input/\$i-ready.bam
-        cram="\$indv_output/\$i-ready.cram" &&
-        \$samtools view -@ ${task.cpus} -T ${reference_genome} -C -o \$cram \$bam &&
-        \$samtools index \$cram &&
-        bam_flagstat=./\$i-ready.bam.flagstat.txt &&
-        cram_flagstat=\$cram.flagstat.txt &&
-        \$samtools flagstat \$bam > \$bam_flagstat &&
-        \$samtools flagstat \$cram > \$cram_flagstat &&
-        diff \$bam_flagstat \$cram_flagstat
-
-        if [ \$? -ne 0 ]
-        then
-            echo "Unexpected flagstat results in \$cram_flagstat"
-            exit 1
-        fi
-    done
-    """
-
-    stub:
-    """
-    mkdir individual_outputs
-    for i in ${individuals.join(' ')}
-    do
-        indv_input=\$PWD/${bcbio_output_dir}/results/\$i
-        indv_output=individual_outputs/\$i &&
-        mkdir -p \$indv_output &&
-
-        ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
-        ln -s \$indv_input/qc \$indv_output/qc &&
-
-        bam=\$indv_input/\$i-ready.bam
-        cram="\$indv_output/\$i-ready.cram" &&
-        cp \$bam \$cram &&
-        touch \$cram.crai &&
-        bam_flagstat=./\$i-ready.bam.flagstat.txt &&
-        cram_flagstat=\$cram.flagstat.txt &&
-        touch \$bam_flagstat &&
-        touch \$cram_flagstat
+        for f in \$i-ready.bam \$i-ready.bam.bai \${i}-callable.bed qc
+        do
+            ln -s \$indv_input/\$f \$indv_output/\$f
+        done
     done
     """
 }
@@ -200,34 +164,34 @@ process format_bcbio_family_outputs {
 
     script:
     """
-    merged_bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" &&
-    bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
+    bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" &&
     run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" &&
-    broken_family_id="\$(echo ${family_id} | sed 's/_//g')" &&
 
-    bcbio_family_output="\$(basename \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
-    mkdir \$bcbio_family_output &&
+    outdir="\$(basename \$bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
+    mkdir \$outdir &&
 
     for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi}
     do
-        src=\$merged_bcbio_family_output/\${broken_family_id}-\${suffix}
+        src=\$(ls \$bcbio_family_output/*-\${suffix} | head -n 1)
         if [ -f \$src ]
         then
-            ln -s \$src \$bcbio_family_output/${family_id}-\${suffix}
+            ln -s \$src \$outdir/${family_id}-\${suffix}
         fi
     done &&
 
-    for f in \$merged_bcbio_family_output/*{.log,.csv,.yaml,multiqc}
+    ln -s \$(readlink -f $bcbio_output_dir)/provenance/programs.txt \$outdir/ &&
+
+    for f in \$bcbio_family_output/*{.log,.csv,.yaml,multiqc}
     do
-        ln -s \$f \$bcbio_family_output/
+        ln -s \$f \$outdir/
     done &&
 
     for i in ${individuals.join(' ')}
     do
-        ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$bcbio_family_output/\$i
+        ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$outdir/\$i
     done &&
 
-    cd \$bcbio_family_output &&
+    cd \$outdir &&
     for f in \$(find . -type f | grep -v '\\.bam')
     do
         md5sum \$f >> md5sum.txt
@@ -360,16 +324,40 @@ workflow process_families {
     */
 
     take:
+        /*
+        ch_individuals: [
+            [
+                indv_id,
+                family_id,
+                father_id,
+                mother_id,
+                sex,
+                affected,
+                [r1_fastqs],
+                [r2_fastqs]
+            ],
+            ...
+        ]
+        */
         ch_individuals
+
+        // value channels
         ch_ped_file
         ch_samplesheet
 
     main:
-        ch_bcbio = file(params.bcbio, checkIfExists: true)
-        ch_bcbio_template = file(params.bcbio_template, checkIfExists: true)
-        ch_target_bed = file(params.target_bed, checkIfExists: true)
-        ch_reference_genome = file(params.reference_genome, checkIfExists: true)
-
+        // collect() can make a queue channel of paths into a value channel
+        ch_bcbio = Channel.fromPath(params.bcbio, checkIfExists: true).collect()
+        ch_bcbio_template = Channel.fromPath(params.bcbio_template, checkIfExists: true).collect()
+        ch_target_bed = Channel.fromPath(params.target_bed, checkIfExists: true).collect()
+        ch_reference_genome = Channel.fromPath(params.reference_genome, checkIfExists: true).collect()
+
+        /*
+        ch_merged_fastqs: [
+            [indv_id, indv_id_merged_r1.fastq.gz, indv_id_merged_r2.fastq.gz],
+            ...
+        ]
+        */
         ch_merged_fastqs = merge_fastqs(
             ch_individuals.map(
                 { indv, family, father, mother, sex, affected, r1, r2 ->
@@ -377,35 +365,63 @@ workflow process_families {
             })
         ).groupTuple()
 
-	ch_families = ch_individuals.map(
+        ch_families = ch_individuals.map(
             { sample_id, family_id, father, mother, sex, phenotype, r1, r2 ->
                 [family_id, [sample_id, sample_id, family_id, sex, phenotype, father, mother]]
-      	    }).groupTuple()
+            }
+        ).groupTuple()
 
         ch_indv_by_family = ch_individuals.map(
-	    { sample_id, family_id, father, mother, sex, phenotype, r1, r2 ->
+            { sample_id, family_id, father, mother, sex, phenotype, r1, r2 ->
                 [family_id, sample_id]
-	    }).groupTuple()
+            }
+        ).groupTuple()
 
+        /*
+        ch_bcbio_csvs: [
+            [family_id, family_id.csv],
+            ...
+        ]
+        */
         ch_bcbio_csvs = write_bcbio_csv(
-	    ch_families,
+            ch_families,
             ch_target_bed
         )
 
         ch_bcbio_inputs = ch_indv_by_family.join(ch_bcbio_csvs.join(ch_merged_fastqs))
 
+        /*
+        ch_bcbio_family_outputs: [
+            [family_id, [indv1_id, indv2_id, ...], family_id-merged/],
+            ...
+        ]
+        */
         ch_bcbio_family_outputs = bcbio_family_processing(
             ch_bcbio_inputs,
             ch_bcbio,
             ch_bcbio_template
         )
 
+        /*
+        ch_individual_folder: [
+            [family_id, individual_outputs/],
+            ...
+        ]
+        */
         ch_individual_folders = format_bcbio_individual_outputs(
             ch_bcbio_family_outputs,
             ch_bcbio,
             ch_reference_genome
 
         )
+
+        /*
+        ch_formatted_bcbio_outputs: [
+            [family_id, 2022-07-04_family_id, family_id-merged/],
+            ...
+        ]
+
+        */
         ch_formatted_bcbio_outputs = format_bcbio_family_outputs(
             ch_bcbio_family_outputs.join(ch_individual_folders)
         )
diff --git a/pipeline/variant_prioritisation.nf b/pipeline/variant_prioritisation.nf
new file mode 100644
index 0000000000000000000000000000000000000000..66cb69e47178f3a0bde510202c5a349922d5429b
--- /dev/null
+++ b/pipeline/variant_prioritisation.nf
@@ -0,0 +1,928 @@
+nextflow.enable.dsl = 2
+
+prioritisation_out = "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}"
+conda_python2 = 'python=2.7'
+conda_xlsxwriter = 'xlsxwriter=1.1.5'
+conda_vt = 'bioconda::vt=0.57721'
+conda_bcftools = 'bioconda::bcftools=1.9'
+conda_tabix = 'bioconda::tabix=1.11'  // tabix 1.9 isn't available on Conda
+conda_vep = 'bioconda::ensembl-vep=100.4'
+conda_vase = 'bioconda::vase=0.5.1'  // 0.4.2 not available
+conda_gatk4 = 'bioconda::gatk4=4.2.1.0'
+conda_gatk3 = 'bioconda::gatk=3.8'
+conda_samtools = 'bioconda::samtools=1.9'
+
+
+
+def conda_env(env) {
+    // if running stub tests, don't use Conda
+    if (workflow.profile.contains("stubs")) {
+        return null
+    } else {
+        return env
+    }
+}
+
+
+
+process trio_setup {
+    label 'small'
+
+    publishDir "${prioritisation_out}", mode: 'copy'
+
+    input:
+    val(family_peds)
+
+    output:
+    path('FAM_IDs.txt')
+    path('PRO_IDs.txt')
+    path('FAM_PRO.txt')
+
+    script:
+    """
+    extract_trio_FAM_PRO_ID.py ${family_peds.join(' ')} --exclude-families ${params.exclude_families.join(' ')}
+    """
+}
+
+
+// Todo: fix affected parents
+
+
+process process_trio {
+    label 'medium'
+
+    conda conda_env("${params.bcbio}/anaconda/envs/java")
+
+    publishDir "${prioritisation_out}", mode: 'copy'
+
+    input:
+    tuple(
+        val(family_id),  // from FAM_IDs.txt
+        path(family_ped),
+        path(family_vcf),
+        val(plate_id),
+        val(proband_id),
+        val(proband_bam),
+        val(par_1_id),
+        val(par1_bam),
+        val(par_2_id),
+        val(par2_bam)
+    )
+    path(decipher_file)
+
+    output:
+    path("VCF/*", emit: vcf)
+    //path("PED/*", emit: ped)
+    path("BAMOUT/*", emit: bamout)
+    path("COV/*", emit: cov)
+    //path("CNV/*", emit: cnv)
+    path("DECIPHER/*.*", emit: decipher)
+    path("DECIPHER/IGV/*", emit: decipher_igv)
+    path("G2P/*", emit: g2p)
+    //path("LOG/*", emit: log)
+    path("VASE/*", emit: vase)
+
+    script:
+    """
+    mkdir VCF
+    mkdir PED
+    mkdir LOG
+    mkdir G2P
+    mkdir VASE
+    mkdir COV
+    mkdir -p DECIPHER/IGV
+    mkdir CNV
+    mkdir BAMOUT
+
+    export FAMILY_ID=$family_id
+    export FAMILY_PED=$family_ped
+    export FAMILY_VCF=$family_vcf
+    export PLATE_ID=$plate_id
+    export PROBAND_ID=$proband_id
+    export PROBAND_BAM=$proband_bam
+    export PAR_1_ID=$par_1_id
+    export PAR_1_BAM=$par1_bam
+    export PAR_2_ID=$par_2_id
+    export PAR_2_BAM=$par2_bam
+
+    export BCBIO=${params.bcbio}
+    export REFERENCE_GENOME=${params.reference_genome}
+    export BLACKLIST=${params.blacklist}
+    export TRANS_MAP=${params.trans_map}
+    export REC_SNP=${params.recurrent_snps}
+    export G2P_TARGETS=${params.g2p_targets}
+    export LIST_24_CHR=${params.list_24_chr}
+    export CLINVAR=${params.clinvar}
+    export GATK3=${params.gatk3}
+    export SCRIPTS_DIR=${params.scripts_dir}
+    export DEC_MAP=${decipher_file}
+
+    process_trio.sh $family_id ${params.bcbio}
+    """
+
+    stub:
+    """
+    vcf_base=${family_vcf.getName().replace('-gatk-haplotype-annotated.vcf.gz', '')}
+    mkdir VCF G2P VASE COV
+    mkdir -p DECIPHER/IGV BAMOUT/${family_id}
+    touch VCF/\$vcf_base.{decomp,norm,DNU,ready}.vcf.gz
+    touch VCF/\$vcf_base.{AC0,clean}.vcf
+    touch G2P/${plate_id}_${family_id}{_inter_out.txt{,_summary.html},.report.{html,txt}}
+    touch G2P/3631209.txt
+    touch VASE/\$vcf_base.ready.denovo.vcf
+    touch COV/${proband_id}_${family_id}.DD15.COV.txt
+    touch COV/${proband_id}_${family_id}.DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary}
+    touch COV/${proband_id}_${family_id}.REC_SNP_COV.txt
+    for i in \$(cat $family_ped | cut -f 2 | tr '\n' ' ')
+    do
+        touch "VCF/\$vcf_base.ready.\$i.vcf.gz"
+    done
+
+    touch DECIPHER/${proband_id}_${family_id}_{DEC_FLT.csv,DECIPHER_v10.xlsx}
+    touch DECIPHER/IGV/${proband_id}_${family_id}.snapshot.FLT.txt
+    touch DECIPHER/IGV/bamout_${proband_id}_${family_id}.snapshot.txt
+
+    touch BAMOUT/${family_id}/${family_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf,vcf.idx}
+    """
+}
+
+
+process dnu_clean {
+    conda conda_env("$conda_python2 $conda_vt $conda_bcftools $conda_tabix")
+
+    input:
+    val(trio_input)
+    path(reference_fa)
+    path(reference_fai)
+    path(g2p_targets)
+    path(blacklist)
+
+    output:
+    val(trio_input, emit: trio_data)
+    path("${base}.ready.vcf.gz", emit: ready_vcf)
+    path("${base}.ready.vcf.gz.tbi", emit: ready_vcf_tbi)
+    path("${base}.clean.vcf", emit: clean_vcf)
+
+    publishDir "${prioritisation_out}/VCF", mode: 'copy', pattern: '*.ready.vcf.gz*'
+
+    // DNU and clean the family VCF, e.g. ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz,
+    // plus deal with strand bias variants
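+    // (DNU = decompose, normalize and uniq, run with vt below)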
+    script:
+    base = "${trio_input.plate_id}_${trio_input.family_id}"
+    dnu_vcf = "${base}.DNU.vcf.gz"
+    sb_exome_vcf = "${base}.SB_exome.vcf.gz"
+    sb_ddg2p_vcf = "${base}.SB_DDG2P.vcf.gz"
+    sb_vcf = "${base}.SB.vcf.gz"
+
+    """
+    vt decompose -s ${trio_input.family_vcf} -o ${base}.decomp.vcf.gz &&
+    vt normalize ${base}.decomp.vcf.gz -r ${reference_fa} -o ${base}.norm.vcf.gz &&
+    vt uniq ${base}.norm.vcf.gz -o ${dnu_vcf} &&
+
+    # identify variants (exome wide) which fail the strand bias filter: FS >= 60 and SOR >= 3
+    bcftools filter --include "INFO/FS>=60 & INFO/SOR>=3" ${dnu_vcf} | bgzip > ${sb_exome_vcf} &&
+    tabix -p vcf ${sb_exome_vcf} &&
+
+    # select only those that are within/overlap the DDG2P panel
+    bcftools view --regions-file ${g2p_targets} ${sb_exome_vcf} | bgzip > ${sb_ddg2p_vcf} &&
+    tabix -p vcf ${sb_ddg2p_vcf} &&
+
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" &&
+    echo " FAMILY_ID = ${trio_input.family_id}: the VCF file containing excluded DDG2P variants with strand bias (FS >= 60 and SOR >= 3): " &&
+    echo "    ${sb_ddg2p_vcf}" &&
+    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" &&
+
+    # exclude the strand bias variants
+    bcftools filter --exclude "INFO/FS>=60 & INFO/SOR>=3" ${dnu_vcf} | bgzip > ${sb_vcf} &&
+    tabix -p vcf ${sb_vcf} &&
+
+    # remove sites with AC=0
+    bcftools view --min-ac=1 --no-update ${dnu_vcf} > ${base}.AC0.vcf &&
+
+    # reset GT to no-call if num_ALT < num_ALT_THRESH or VAF < VAF_THRESH and GT != 0/0
+    # exclude variants from the blacklist (matching on chr,pos,ref,alt)
+    filter_LQ_GT.py ${blacklist} ${base}.AC0.vcf ${base}.clean.vcf &&
+
+    # bgzip and tabix it
+    cat ${base}.clean.vcf | bgzip > ${base}.ready.vcf.gz &&
+    tabix -p vcf ${base}.ready.vcf.gz
+    """
+
+    stub:
+    base = "${trio_input.plate_id}_${trio_input.family_id}"
+    """
+    touch ${base}.ready.vcf.gz{,.tbi}
+    touch ${base}.clean.vcf
+    """
+}
+
+
+process g2p {
+    // run G2P for each family VCF (DD genes), e.g. ${PLATE_ID}_${FAMILY_ID}.clean.vcf
+
+    conda conda_env(conda_vep)
+
+    input:
+    val(trio_data)
+    path(clean_vcf)
+    path(reference_fa)
+    path(reference_fai)
+    path(g2p_target_csv)
+    path(g2p_genes)
+    path(vep_cache)
+    path(vep_plugins)
+
+    publishDir "${prioritisation_out}/G2P", mode: 'copy'
+
+    output:
+    tuple(
+        val(trio_data),
+        path("${base}_LOG_DIR/${base}_inter_out.txt"),
+        path("${base}_LOG_DIR/${base}_inter_out.txt_summary.html"),
+        path("${base}_LOG_DIR/${base}.report.html"),
+        path("${base}_LOG_DIR/${base}.report.txt"),
+        path("${base}_LOG_DIR/*.txt")
+    )
+
+    script:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    G2P_LOG_DIR=${base}_LOG_DIR
+    TXT_OUT=\${G2P_LOG_DIR}/${base}.report.txt
+    HTML_OUT=\${G2P_LOG_DIR}/${base}.report.html
+    VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'
+
+    vep \
+        -i ${clean_vcf} \
+        --output_file \${G2P_LOG_DIR}/${base}_inter_out.txt \
+        --force_overwrite \
+        --assembly GRCh38 \
+        --fasta ${reference_fa} \
+        --offline \
+        --merged \
+        --use_given_ref \
+        --cache --cache_version 100 \
+        --dir_cache ${vep_cache} \
+        --individual all \
+        --transcript_filter "gene_symbol in ${g2p_genes}" \
+        --dir_plugins ${vep_plugins} \
+        --plugin G2P,file='${g2p_target_csv}',af_from_vcf=1,confidence_levels='definitive&strong&moderate',af_from_vcf_keys='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38',log_dir=\${G2P_LOG_DIR},txt_report=\${TXT_OUT},html_report=\${HTML_OUT}
+    """
+
+    stub:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    log_dir=${base}_LOG_DIR
+    mkdir \$log_dir
+    touch \$log_dir/${base}{_inter_out.txt{,_summary.html},.report.{html,txt}}
+    touch \$log_dir/3631209.txt
+    """
+}
+
+
+process vase {
+    // run VASE for each family VCF (de novo)
+
+    conda conda_env("$conda_vase $conda_tabix")
+
+    input:
+    val(trio_data)
+    path(ready_vcf)
+    path(ready_vcf_tbi)
+
+    output:
+    val(trio_data, emit: trio_data)
+    path("${base}.strict.denovo.vcf", emit: strict_denovo_vcf)
+
+    script:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    OUT_FILE=${base}.strict.denovo.vcf
+
+    vase \
+        -i ${ready_vcf} \
+        -o \${OUT_FILE} \
+        --log_progress \
+        --prog_interval 100000 \
+        --freq 0.0001 \
+        --gq 30 --dp 10 \
+        --het_ab 0.3 \
+        --max_alt_alleles 1 \
+        --csq all \
+        --biotypes all \
+        --control_gq 15 --control_dp 5 \
+        --control_het_ab 0.01 \
+        --control_max_ref_ab 0.05 \
+        --de_novo \
+        --ped ${trio_data.family_ped}
+
+
+    # when one of the parents is also affected, VASE de novo will crash because it cannot find parents for the affected parent,
+    # producing no output file, which trips the pipeline downstream.
+    # to handle this: check whether VASE de novo produced an output and, if not (as in such cases),
+    # create an empty header-only VCF (== no de novos found)
+
+    if [ -f "\${OUT_FILE}" ]; then
+        echo "VASE denovo completed successfully"
+    else
+        echo "WARNING: VASE denovo has not produced an output (e.g. affected parent), generate an empty VCF one (with headers)"
+        tabix -H ${ready_vcf} > \${OUT_FILE}
+    fi
+    """
+
+    stub:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    touch ${base}.strict.denovo.vcf
+    """
+}
+
+
+process filter_denovo_vcfs {
+    // run VASE for each family VCF (de novo)
+
+    conda conda_env("$conda_gatk4 $conda_bcftools")
+
+    input:
+    val(trio_data)
+    path(strict_denovo_vcf)
+    path(reference_fa)
+    path(reference_fai)
+    path(reference_dict)
+    path(list_24_chr)
+
+    publishDir "${prioritisation_out}/VASE", mode: 'copy'
+
+    output:
+    tuple(
+        val(trio_data),
+        path("${base}.ready.denovo.vcf")
+    )
+
+    script:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    strict_24chr_denovo_vcf=${base}.strict.24chr.denovo.vcf
+    strict_24chr_sort_denovo_vcf=${base}.strict.24chr.sort.denovo.vcf
+
+    # do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions
+    ### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with  “Unknown” status may be classified as “denovo” if enough support
+    # index the denovo VCF
+    gatk IndexFeatureFile -I ${strict_denovo_vcf}
+
+    # select only variants on the 24 chromosomes
+    gatk SelectVariants -R ${reference_fa} -V ${strict_denovo_vcf} -O \$strict_24chr_denovo_vcf -L $list_24_chr --exclude-non-variants
+
+    # sort the VCF (maybe not needed?, but just in case, and it is quick)
+    grep '^#' \$strict_24chr_denovo_vcf > \$strict_24chr_sort_denovo_vcf \
+    && grep -v '^#' \$strict_24chr_denovo_vcf | LC_ALL=C sort -t \$'\t' -k1,1V -k2,2n >> \$strict_24chr_sort_denovo_vcf
+    # index the sorted VCF
+    gatk IndexFeatureFile -I \$strict_24chr_sort_denovo_vcf
+
+    # split multi-allelic sites [by -m -any]
+    # left-alignment and normalization [by adding the -f]
+    bcftools norm -f ${reference_fa} -m -any -Ov -o ${base}.ready.denovo.vcf \$strict_24chr_sort_denovo_vcf
+    """
+
+    stub:
+    base = "${trio_data.plate_id}_${trio_data.family_id}"
+    """
+    touch ${base}.ready.denovo.vcf
+    """
+}
+
+
+process coverage {
+    // run coverage for each proband (DD genes)
+    // format: ${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}/${INDI_ID}_${FAMILY_ID}-ready.bam
+
+    conda conda_env("$conda_python2 $conda_gatk3")
+
+    input:
+    val(trio_data)
+    path(reference_fa)
+    path(reference_fai)
+    path(reference_dict)
+    path(g2p_targets)
+    path(gatk3)
+    path(clinvar)
+
+    publishDir "${prioritisation_out}/COV", mode: 'copy'
+
+    output:
+    path("${full_proband}*DD15*")
+
+    script:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    # make sure we are reading the data from the exact batch & plate ID
+    OUT_FILE=${full_proband}.DD15
+
+    java -Xmx8g -jar ${gatk3} -T DepthOfCoverage -R ${reference_fa} -o \${OUT_FILE} -I ${trio_data.proband_bam} -L ${g2p_targets} \
+        --omitDepthOutputAtEachBase \
+        --minBaseQuality 20 \
+        --minMappingQuality 20 \
+        -ct 20 \
+        -jdk_deflater \
+        -jdk_inflater \
+        --allow_potentially_misencoded_quality_scores &&
+
+    echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${full_proband} ..."
+    cat \${OUT_FILE}.sample_summary | awk '{print \$7}' &&
+
+    # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
+    get_cov_output.py \${OUT_FILE}.sample_interval_summary ${clinvar} \${OUT_FILE}.COV.txt
+    """
+
+    stub:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    touch ${full_proband}.DD15.COV.txt
+    touch ${full_proband}.DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary}
+    """
+}
+
+process recurrent_denovo_snp_cov {
+    // check the coverage per each of the recurent de novo SNPs (padded with 15bp both directions)
+
+    conda conda_env(conda_samtools)
+    input:
+    val(trio_data)
+    path(rec_snp)
+
+    publishDir "${prioritisation_out}/COV", mode: 'copy'
+
+    output:
+    path("${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt")
+
+    script:
+    """
+    # the proband's BAM file is provided via trio_data.proband_bam, reuse it
+    # set the name of the file containing info about the coverage of the recurrent SNPs
+    REC_OUT_FILE=${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt
+
+    while IFS=\$'\t' read -ra var; do
+        gene="\${var[0]}"
+        chr="\${var[1]}"
+        pos="\${var[2]}"
+        lo=\$(expr \$pos - 15)
+        hi=\$(expr \$pos + 15)
+        reg="\$lo-\$hi"
+        echo "============================================="
+        echo "\$gene : recurrent variant at \$chr:\$pos"
+        echo "exploring coverage at \$chr:\$reg"
+
+        echo "---------------------------------------------"
+        echo "precisely at the position"
+        samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam} | grep "\$pos"
+
+        echo "---------------------------------------------"
+        echo "average in the +/- 15bp region"
+        samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam} | awk '{sum+=\$3} END { print "Average = ",sum/NR}'
+
+        echo "---------------------------------------------"
+        echo "detailed in the +/- 15bp region"
+        samtools depth -aa -Q 20 -r \$chr:\$reg ${trio_data.proband_bam}
+    done < ${rec_snp} > \${REC_OUT_FILE}
+    """
+
+    stub:
+    """
+    touch ${trio_data.proband_id}_${trio_data.family_id}.REC_SNP_COV.txt
+    """
+}
+
+process split_family_vcf {
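+    // split the cleaned family VCF into one VCF per individual (proband, mother, father),
+    // with multi-allelic sites split and variants left-aligned/normalised against the reference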
+    conda conda_env(conda_bcftools)
+
+    input:
+    val(trio_data)
+    path(ready_vcf)
+    path(ready_vcf_tbi)
+    path(reference_fa)
+    path(reference_fai)
+
+    publishDir "${prioritisation_out}/VCF", mode: 'copy'
+
+    output:
+    tuple(
+        val(trio_data),
+        path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.proband_id}_${trio_data.family_id}.vcf.gz"),  // proband vcf
+        path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.mother_id}_${trio_data.family_id}.vcf.gz"),  // mother_vcf
+        path("${trio_data.plate_id}_${trio_data.family_id}.ready.${trio_data.father_id}_${trio_data.family_id}.vcf.gz"),  // father_vcf
+    )
+
+    script:
+    """
+    # split the family VCF into individual VCFs
+    # -c1: minimum allele count (INFO/AC) of sites to be printed
+    # split multi-allelic sites (-m -any)
+    # left-align and normalise against the reference (-f)
+
+    file=${ready_vcf}
+    echo "splitting \$file"
+    for indi in `bcftools query -l \$file`; do
+        indv_rough_vcf_gz=\${file/.vcf*/.\$indi.rough.vcf.gz}
+        bcftools view -c1 -Oz -s \$indi -o \$indv_rough_vcf_gz \$file
+        bcftools norm -f ${reference_fa} -m -any -Oz -o \${file/.vcf*/.\$indi.vcf.gz} \$indv_rough_vcf_gz
+    done
+    """
+
+    stub:
+    """
+    for i in ${trio_data.proband_id} ${trio_data.mother_id} ${trio_data.father_id}
+    do
+        touch ${trio_data.plate_id}_${trio_data.family_id}.ready.\${i}_${trio_data.family_id}.vcf.gz
+    done
+    """
+}
+
+
+process generate_decipher_file {
+    // for each proband generate the DECIPHER bulk upload file
+    // inputs: the per-individual VCFs split from the cleaned family VCF (via trio_data),
+    // the VASE de novo VCF, the DECIPHER internal ID mapping, the proband G2P report
+    // and the current transcript mapping file
+
+    conda conda_env("$conda_python2 $conda_xlsxwriter")
+
+    input:
+    val(trio_data)
+    path(vase_ready_denovo_vcf)
+    path(decipher_internal_ids)
+    path(g2p_file)
+    path(trans_map)
+
+    publishDir "${prioritisation_out}/DECIPHER", mode: 'copy'
+
+    output:
+    val(trio_data, emit: trio_data)
+    path("IGV/${full_proband}.snapshot.FLT.txt", emit: snapshot_flt_txt)
+    path("${full_proband}_DEC_FLT.csv", emit: dec_flt_csv)
+    path("${full_proband}_DECIPHER_v10.xlsx", emit: dec_v10_xlsx)
+
+    script:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    # VASE file - already split, left-aligned and normalized
+    mkdir IGV
+
+    ## call the python script
+    generate_DEC_IGV_scripts.py \
+        ${decipher_internal_ids} \
+        ${trans_map} \
+        ${trio_data.family_ped} \
+        ${g2p_file} \
+        ${vase_ready_denovo_vcf} \
+        ${trio_data.plate_id}_${trio_data.family_id} \
+        ${trio_data.proband_vcf} \
+        ${trio_data.mother_vcf} \
+        ${trio_data.father_vcf} \
+        ${trio_data.plate_id} \
+        ${trio_data.family_id} \
+        ${trio_data.proband_bam} \
+        ${trio_data.mother_bam} \
+        ${trio_data.father_bam} \
+        ${full_proband}_DEC_FLT.csv \
+        IGV/${full_proband}.snapshot.FLT.txt &&
+
+    ## convert the DECIPHER bulk upload file from v9 to v10
+    echo "...Generating v10 Decipher bulk upload file for proband = ${trio_data.proband_id}, family_id = ${trio_data.family_id} ..." &&
+    convert_DEC_to_v10.py ${full_proband} ${full_proband}_DEC_FLT.csv ${full_proband}_DECIPHER_v10.xlsx
+    """
+
+    stub:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    mkdir IGV
+    touch IGV/${full_proband}.snapshot.FLT.txt
+    touch ${full_proband}_{DEC_FLT.csv,DECIPHER_v10.xlsx}
+    """
+}
+
+
+process igv_snapshot_bamouts {
+    // for each variant in the DECIPHER upload file
+    // generate an IGV snapshot based on the realigned BAMs used by GATK for calling variants
+    // this process generates the BAMOUTs for each variant (stored in the BAMOUT folder);
+    // generate_igv_snapshots then writes the IGV batch script used to produce the snapshots from them
+
+    conda conda_env(conda_gatk4)
+
+    input:
+    val(trio_data)
+    path(proband_dec_flt_csv)
+    path(reference_fa)
+    path(reference_fai)
+    path(reference_dict)
+
+    publishDir "${prioritisation_out}/BAMOUT", mode: 'copy', pattern: "${trio_data.family_id}/*.bamout.*"
+
+    output:
+    val(trio_data, emit: trio_data)
+    path("${trio_data.family_id}/*.bamout.*", emit: bamout, optional: true)
+    path(proband_dec_flt_csv, emit: proband_dec_flt_csv)
+
+    script:
+    """
+    echo "...Generating BAMOUT files for the ${trio_data.family_id} family, proband = ${trio_data.proband_id} ..."
+
+    # gather the trio BAM files
+    echo "...kid_bam = ${trio_data.proband_bam}..."
+    echo "...par_1_bam = ${trio_data.mother_bam}..."
+    echo "...par_2_bam = ${trio_data.father_bam}..."
+
+    # gather the variants in the DECIPHER file for which to generate bamouts
+    # chr is the second column - need to add the 'chr' prefix
+    # pos is the third column
+    # the first line is a header line, starting with 'Internal reference number or ID'
+    # file called: <proband_id>_<fam_id>_DEC_FLT.csv
+    # and for each of them run GATK to generate the bamout files,
+    # stored in a folder named after the family ID and published to the BAMOUT output directory
+
+    mkdir -p ${trio_data.family_id} &&
+
+    echo "... reading ${proband_dec_flt_csv} to generate the bamouts..." &&
+
+    grep -v '^Internal' ${proband_dec_flt_csv} |
+    while IFS= read -r line
+    do
+        echo "\$line"
+        IFS=, read -ra ary <<<"\$line"
+        chr=\${ary[1]}
+        pos=\${ary[2]}
+        ref=\${ary[4]}
+        alt=\${ary[5]}
+        echo " --> chr = \$chr, pos = \$pos, ref = \${ref}, alt = \${alt}"
+
+        # generate the bamout file
+        echo "...doing the bamout"
+
+        gatk HaplotypeCaller \
+            --reference ${reference_fa} \
+            --input ${trio_data.proband_bam} \
+            --input ${trio_data.mother_bam} \
+            --input ${trio_data.father_bam} \
+            -L chr\${chr}:\${pos} \
+            --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 \
+            --output ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.vcf \
+            -bamout ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.bam
+    done
+    """
+
+    stub:
+    """
+    mkdir ${trio_data.family_id}
+    touch ${trio_data.family_id}/${trio_data.family_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf{,.idx}}
+    """
+}
+
+process generate_igv_snapshots {
+    // write the IGV batch file for this family based on the bamouts
+    // to be stored as DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt
+
+    input:
+    val(trio_data)
+    path(proband_dec_flt_csv)
+
+    publishDir "${prioritisation_out}/DECIPHER/IGV", mode: 'copy'
+
+    output:
+    path("bamout_${full_proband}.snapshot.txt")
+
+    script:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    snap_file=bamout_${full_proband}.snapshot.txt &&
+
+    # check if a previous version exists; if so, delete it
+    if [ -f "\$snap_file" ]; then
+        echo "previous version of \$snap_file exists --> deleting"
+        rm \$snap_file
+    fi &&
+
+    # write the header for the IGV batch file
+    echo "new" >> \$snap_file
+    echo "genome hg38" >> \$snap_file
+    echo "snapshotDirectory \"${trio_data.plate_id}_${trio_data.family_id}\"" >> \$snap_file
+    echo "" >> \$snap_file
+
+    # now go over the variants in the DECIPHER file again, adding one snapshot entry per variant to a single batch file
+    echo "... reading ${proband_dec_flt_csv} to generate the IGV batch file using the bamouts..."
+
+    grep -v '^Internal' ${proband_dec_flt_csv} |
+    while IFS= read -r line
+    do
+        IFS=, read -ra ary <<<"\$line"
+        chr=\${ary[1]}
+        pos=\${ary[2]}
+        ref=\${ary[4]}
+        alt=\${ary[5]}
+        left=\$((\${pos}-25))
+        right=\$((\${pos}+25))
+
+        echo "new" >> \$snap_file
+        echo "load ${trio_data.family_id}/${trio_data.family_id}_chr\${chr}_\${pos}.bamout.bam" >> \$snap_file
+        echo "preference SAM.SHADE_BASE_QUALITY true" >> \$snap_file
+
+        echo "goto chr\${chr}:\${left}-\${right}" >> \$snap_file
+        echo "group SAMPLE" >> \$snap_file
+        echo "sort base" >> \$snap_file
+        echo "squish" >> \$snap_file
+        echo "snapshot bamout_${full_proband}_chr\${chr}_\${pos}_\${ref}_\${alt}.png" >> \$snap_file
+        echo "" >> \$snap_file
+        echo "" >> \$snap_file
+
+    done
+
+    echo "Generating of the IGV batch files based on bamouts - done!"
+    echo "snap_file = \$snap_file"
+    """
+
+    stub:
+    full_proband = "${trio_data.proband_id}_${trio_data.family_id}"
+    """
+    touch bamout_${full_proband}.snapshot.txt
+    """
+}
+
+
+workflow read_inputs {
+    main:
+        // ch_family_ids: [family1, family2, ...]
+        ch_family_ids = Channel.fromPath(params.ped_file)
+            .splitCsv(sep: '\t')
+            .map({line -> line[0]})
+            .flatten()
+            .unique()
+
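+        // ch_family_peds: [[family1, family1.ped], [family2, family2.ped], ...]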
+        ch_family_peds = ch_family_ids
+            .map(
+                {
+                    it -> [
+                        it,
+                        "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/params/${it}.ped"
+                    ]
+                }
+            )
+
+    emit:
+        ch_family_ids = ch_family_ids
+        ch_family_peds = ch_family_peds
+}
+
+workflow prioritisation_setup {
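+    // run trio_setup over all family ped files, then tell the user where to find the
+    // resulting family ID file for translation to DECIPHER IDs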
+    read_inputs()
+
+    trio_setup(
+        read_inputs.out.ch_family_peds.map(
+            {family_id, ped -> ped}
+        ).collect()
+    )
+    report = """
+    Family ID file available at:\n
+    ${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}/FAM_IDs.txt
+
+    Download this file for translation to DECIPHER IDs, and upload the resulting DECIPHER_INTERNAL_IDs.txt file to ${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}/prioritization/${params.prioritisation_version}/
+    """
+    println report.stripIndent()
+}
+
+
+workflow prioritisation {
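+    // main prioritisation workflow: gather per-trio inputs from the variant calling outputs,
+    // then run DNU cleaning, G2P, VASE de novo filtering, coverage checks and
+    // DECIPHER / IGV snapshot file generation for each proband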
+    read_inputs()
+
+    pipeline_dir = "${params.output_dir}/${params.pipeline_project_id}_${params.pipeline_project_version}"
+    prioritisation_dir = "$pipeline_dir/prioritization/${params.prioritisation_version}"
+
+    ch_probands = Channel.fromPath("$prioritisation_dir/FAM_PRO.txt")
+    .splitCsv(sep: '\t')
+
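+    // family VCFs from the variant calling run, keyed on <plate_id>_<family_id>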
+    ch_vcfs = Channel.fromPath("$pipeline_dir/families/*/*-gatk-haplotype-annotated.vcf.gz")
+    .map({ it -> [it.getName().replace('-gatk-haplotype-annotated.vcf.gz', ''), it] })
+
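+    // per-individual -ready.bam files, keyed on the parent folder name <individual_id>_<family_id>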
+    ch_bams = Channel.fromPath("$pipeline_dir/families/*/*/*-ready.bam")
+    .map({[it.getParent().getName(), it]})
+
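+    // take proband rows (trios only) from the ped file and attach the -ready.bam for the
+    // proband, father and mother via successive joins on ch_bams, giving:
+    // [family_id, proband, proband_bam, father, father_bam, mother, mother_bam]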
+    ch_family_ped_data = Channel.fromPath(params.ped_file)
+    .splitCsv(sep: '\t')
+    .filter({it[2] != '0' && it[3] != '0'})
+    // todo: remove redundancy with FAM_PRO.txt
+    .map(
+        {
+            family_id, proband, father, mother, sex, affected ->
+                [proband, father, mother, family_id]
+        }
+    ).join(ch_bams)
+    .map(
+        {
+            proband, father, mother, family_id, proband_bam ->
+                [father, proband, proband_bam, mother, family_id]
+        }
+    )
+    .join(ch_bams)
+    .map(
+        {
+            father, proband, proband_bam, mother, family_id, father_bam ->
+            [mother, proband, proband_bam, father, father_bam, family_id]
+        }
+    )
+    .join(ch_bams)
+    .map(
+        {
+            mother, proband, proband_bam, father, father_bam, family_id, mother_bam ->
+            [family_id, proband, proband_bam, father, father_bam, mother, mother_bam]
+        }
+    )
+
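+    // assemble one trio_data map per family: ped, plate/family IDs, proband ID,
+    // family VCF and the per-individual BAMs gathered above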
+    ch_trio_inputs = read_inputs.out.ch_family_peds.map(
+        {full_family_id, ped ->
+            [full_family_id.split('_')[1], full_family_id, ped]
+    })
+    .join(ch_probands)
+    .map(
+        {short_family_id, full_family_id, ped, proband ->
+            [full_family_id, ped, short_family_id, proband]
+        }
+    )
+    .join(ch_vcfs)
+    .join(ch_family_ped_data)
+    .map(
+        {
+            full_family_id, ped, short_family_id, proband, vcf, full_proband,
+            proband_bam, full_father, father_bam, full_mother, mother_bam ->
+            [
+                family_id: full_family_id.split('_')[1],
+                family_ped: ped,
+                family_vcf: vcf,
+                plate_id: full_family_id.split('_')[0],
+                proband_id: proband,
+                proband_bam: proband_bam,
+                father_id: full_father.split('_')[0],
+                father_bam: father_bam,
+                mother_id: full_mother.split('_')[0],
+                mother_bam: mother_bam
+            ]
+        }
+    )
+
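+    // reference genome and static assets, collected into value channels so they can be
+    // reused by every process invocation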
+    ch_decipher_file = Channel.fromPath("$prioritisation_dir/DECIPHER_INTERNAL_ID*.txt").collect()
+    ch_reference_fa = Channel.fromPath(params.reference_genome, checkIfExists: true).collect()
+    ch_reference_fai = Channel.fromPath("${params.reference_genome}.fai", checkIfExists: true).collect()
+    ch_reference_dict = Channel.fromPath("${params.reference_dict}", checkIfExists: true).collect()
+    ch_blacklist = Channel.fromPath(params.blacklist, checkIfExists: true).collect()
+    ch_list_24_chr = Channel.fromPath(params.list_24_chr, checkIfExists: true).collect()
+    ch_g2p_target_bed = Channel.fromPath(params.g2p_target_bed, checkIfExists: true).collect()
+    ch_g2p_target_csv = Channel.fromPath(params.g2p_target_csv, checkIfExists: true).collect()
+    ch_g2p_genes = Channel.fromPath(params.g2p_genes, checkIfExists: true).collect()
+    ch_gatk3 = Channel.fromPath(params.gatk3, checkIfExists: true).collect()
+    ch_clinvar = Channel.fromPath(params.clinvar, checkIfExists: true).collect()
+    ch_rec_snp = Channel.fromPath(params.recurrent_snps, checkIfExists: true).collect()
+    ch_trans_map = Channel.fromPath(params.trans_map, checkIfExists: true).collect()
+    ch_vep_cache = Channel.fromPath(params.vep_cache, checkIfExists: true).collect()
+    ch_vep_plugins = Channel.fromPath(params.vep_plugins, checkIfExists: true).collect()
+
+    dnu_clean(ch_trio_inputs, ch_reference_fa, ch_reference_fai, ch_g2p_target_bed, ch_blacklist)
+    g2p(dnu_clean.out.trio_data, dnu_clean.out.clean_vcf, ch_reference_fa, ch_reference_fai, ch_g2p_target_csv, ch_g2p_genes, ch_vep_cache, ch_vep_plugins)
+    vase(dnu_clean.out.trio_data, dnu_clean.out.ready_vcf, dnu_clean.out.ready_vcf_tbi)
+    filter_denovo_vcfs(vase.out.trio_data, vase.out.strict_denovo_vcf, ch_reference_fa, ch_reference_fai, ch_reference_dict, ch_list_24_chr)
+    coverage(ch_trio_inputs, ch_reference_fa, ch_reference_fai, ch_reference_dict, ch_g2p_target_bed, ch_gatk3, ch_clinvar)
+    recurrent_denovo_snp_cov(ch_trio_inputs, ch_rec_snp)
+    split_family_vcf(dnu_clean.out.trio_data, dnu_clean.out.ready_vcf, dnu_clean.out.ready_vcf_tbi, ch_reference_fa, ch_reference_fai)
+
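+    // join the per-individual VCFs, the filtered de novo VCF and the proband G2P report
+    // by family ID, then fan out into parallel channels for generate_decipher_file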
+    split_family_vcf.out.map({it -> [it[0].family_id, it]})
+        .join(filter_denovo_vcfs.out.map({trio_data, ready_denovo_vcf -> [trio_data.family_id, ready_denovo_vcf]}))
+        .join(
+            g2p.out.map(
+                {trio_data, inter_out, inter_out_summary, report_html, report_txt, x_txt -> [trio_data.family_id, report_txt]}
+            )
+        )
+        .multiMap(
+            { family_id, split_data, ready_denovo, g2p_report ->
+                family_id: family_id
+                trio_data: [
+                    family_id: split_data[0].family_id,
+                    family_ped: split_data[0].family_ped,
+                    family_vcf: split_data[0].family_vcf,
+                    plate_id: split_data[0].plate_id,
+                    proband_id: split_data[0].proband_id,
+                    proband_bam: split_data[0].proband_bam,
+                    proband_vcf: split_data[1],
+                    father_id: split_data[0].father_id,
+                    father_bam: split_data[0].father_bam,
+                    father_vcf: split_data[3],
+                    mother_id: split_data[0].mother_id,
+                    mother_bam: split_data[0].mother_bam,
+                    mother_vcf: split_data[2]
+                ]
+                ready_denovo_vcf: ready_denovo
+                proband_g2p_report: g2p_report
+            }
+        )
+        .set({ch_trios_with_vcfs})
+
+    generate_decipher_file(ch_trios_with_vcfs.trio_data, ch_trios_with_vcfs.ready_denovo_vcf, ch_decipher_file, ch_trios_with_vcfs.proband_g2p_report, ch_trans_map)
+    igv_snapshot_bamouts(generate_decipher_file.out.trio_data, generate_decipher_file.out.dec_flt_csv, ch_reference_fa, ch_reference_fai, ch_reference_dict)
+    generate_igv_snapshots(igv_snapshot_bamouts.out.trio_data, igv_snapshot_bamouts.out.proband_dec_flt_csv)
+}
diff --git a/tests/assets/input_data/ped_files/batch_1.ped b/tests/assets/input_data/ped_files/batch_1.ped
index b494a07f123e08320cabb914cd6e47afd1ba6fc7..e0ecc42bb215a13c8e51bcbf6d4f9af7866470ee 100644
--- a/tests/assets/input_data/ped_files/batch_1.ped
+++ b/tests/assets/input_data/ped_files/batch_1.ped
@@ -1,6 +1,7 @@
-000001	000001	000002	000003	2	2
-000001	000002	0	0	1	1
-000001	000003	0	0	2	1
-000002	000004	000005	000006	2	2
-000002	000005	0	0	1	1
-000002	000006	0	0	2	1
+12345_000001	000001_000001	000002_000001	000003_000001	2	2
+12345_000001	000002_000001	0	0	1	1
+12345_000001	000003_000001	0	0	2	1
+12345_000002	000004_000002	000005_000002	000006_000002	2	2
+12345_000002	000005_000002	0	0	1	1
+12345_000002	000006_000002	0	0	2	1
+12345_000003	000007_000003	0	0	2	2
\ No newline at end of file
diff --git a/tests/assets/input_data/ped_files/giab_test_trios.ped b/tests/assets/input_data/ped_files/giab_test_trios.ped
index 566cce43692ccc031da29f5336c4a7130d093c25..63184eb9088d6da1f49335562f69d3bd48ba7176 100644
--- a/tests/assets/input_data/ped_files/giab_test_trios.ped
+++ b/tests/assets/input_data/ped_files/giab_test_trios.ped
@@ -1,4 +1,4 @@
-00001_000001	000001_000001	000002_000002	000003_000002	1	2
+00001_000001	000001_000001	000002_000001	000003_000001	1	2
 00001_000001	000002_000001	0	0	1	1
 00001_000001	000003_000001	0	0	2	1
 00001_000002	000004_000002	000005_000002	000006_000002	1	2
diff --git a/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt b/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0fde77da00f77cbe82af7b94786ed311fd4b863d
--- /dev/null
+++ b/tests/assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt
@@ -0,0 +1,2 @@
+000001	decipher_000001
+000002	decipher_000002
diff --git a/tests/assets/input_data/sample_sheets/batch_1.tsv b/tests/assets/input_data/sample_sheets/batch_1.tsv
index f0a24cca62135246abe875a9ae1b2021b4fb05e7..64910cfae6534a3f32b6a4a2bfb6c18ee3368cd1 100644
--- a/tests/assets/input_data/sample_sheets/batch_1.tsv
+++ b/tests/assets/input_data/sample_sheets/batch_1.tsv
@@ -1,7 +1,11 @@
 individual_id	read_1	read_2
-000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
-000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
-000002	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
-000002	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
-000003	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
-000003	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
+000001_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
+000001_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
+000002_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
+000002_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
+000003_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
+000003_000001	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
+000004_000002	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz
+000005_000002	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz
+000006_000002	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz
+000007_000003	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz
\ No newline at end of file
diff --git a/tests/assets/input_data/sample_sheets/giab_test_trios.tsv b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv
index 35429e7d7a353c713b4503c09fdeb69539bbebf1..51b77671045667a4393cfbec0198cd2cf985ba97 100644
--- a/tests/assets/input_data/sample_sheets/giab_test_trios.tsv
+++ b/tests/assets/input_data/sample_sheets/giab_test_trios.tsv
@@ -1,7 +1,7 @@
 individual_id	read_1	read_2
-000001_000001	assets/input_data/giab/AshkenazimTrio/HG002_R1.fastq.gz	assets/input_data/giab/AshkenazimTrio/HG002_R2.fastq.gz
-000002_000001	assets/input_data/giab/AshkenazimTrio/HG003_R1.fastq.gz	assets/input_data/giab/AshkenazimTrio/HG003_R2.fastq.gz
-000003_000001	assets/input_data/giab/AshkenazimTrio/HG004_R1.fastq.gz	assets/input_data/giab/AshkenazimTrio/HG004_R2.fastq.gz
-000004_000002	assets/input_data/giab/ChineseTrio/HG005_R1.fastq.gz	assets/input_data/giab/ChineseTrio/HG005_R2.fastq.gz
-000005_000002	assets/input_data/giab/ChineseTrio/HG006_R1.fastq.gz	assets/input_data/giab/ChineseTrio/HG006_R2.fastq.gz
-000006_000002	assets/input_data/giab/ChineseTrio/HG007_R1.fastq.gz	assets/input_data/giab/ChineseTrio/HG007_R2.fastq.gz
+000001_000001	assets/input_data/giab/raw_data/AshkenazimTrio/HG002.GRCh38.2x250_R1.fastq.gz	assets/input_data/giab/raw_data/AshkenazimTrio/HG002.GRCh38.2x250_R2.fastq.gz
+000002_000001	assets/input_data/giab/raw_data/AshkenazimTrio/HG003.GRCh38.2x250_R1.fastq.gz	assets/input_data/giab/raw_data/AshkenazimTrio/HG003.GRCh38.2x250_R2.fastq.gz
+000003_000001	assets/input_data/giab/raw_data/AshkenazimTrio/HG004.GRCh38.2x250_R1.fastq.gz	assets/input_data/giab/raw_data/AshkenazimTrio/HG004.GRCh38.2x250_R2.fastq.gz
+000004_000002	assets/input_data/giab/raw_data/ChineseTrio/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x_R1.fastq.gz	assets/input_data/giab/raw_data/ChineseTrio/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x_R2.fastq.gz
+000005_000002	assets/input_data/giab/raw_data/ChineseTrio/HG006.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R1.fastq.gz	assets/input_data/giab/raw_data/ChineseTrio/HG006.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R2.fastq.gz
+000006_000002	assets/input_data/giab/raw_data/ChineseTrio/HG007.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R1.fastq.gz	assets/input_data/giab/raw_data/ChineseTrio/HG007.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.100x_R2.fastq.gz
diff --git a/tests/run_giab_tests.sh b/tests/run_giab_tests.sh
index 23ce6e9fde33310bc7cdbca3a8a5cd9224b937e9..eefe547bc2402cd1ef370a2fd6dd27f6750c51c8 100755
--- a/tests/run_giab_tests.sh
+++ b/tests/run_giab_tests.sh
@@ -5,21 +5,48 @@ source scripts/nextflow_detached.sh
 test_exit_status=0
 nextflow -c "$NEXTFLOW_CONFIG" clean -f
 
-echo "Reduced GiaB data - trios"
-run_nextflow ../main.nf \
-    -c "$NEXTFLOW_CONFIG" \
-    --workflow 'variant-calling' \
+common_args="-ansi-log false -c $NEXTFLOW_CONFIG -profile testing -resume"
+
+echo "--- Reduced GiaB data: trios ---" &&
+echo Variant calling &&
+nextflow run ../main.nf \
+    $common_args \
+    --workflow variant_calling \
     --pipeline_project_id giab_test_trios \
     --pipeline_project_version v1 \
-    --ped_file $PWD/assets/input_data/ped_files/giab_test_trios.ped \
-    --sample_sheet $PWD/assets/input_data/sample_sheets/giab_test_trios.tsv
+    --ped_file assets/input_data/ped_files/giab_test_trios.ped \
+    --sample_sheet assets/input_data/sample_sheets/giab_test_trios.tsv &&
+
+echo Variant prioritisation setup &&
+nextflow run ../main.nf \
+    $common_args \
+    --workflow variant_prioritisation_setup \
+    --pipeline_project_id giab_test_trios \
+    --variant_prioritisation_version v1 \
+    --ped_file assets/input_data/ped_files/giab_test_trios.ped &&
+
+cp assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_giab_trios.txt outputs/giab_test_trios_v1/prioritization/v1 &&
+
+echo Variant prioritisation &&
+nextflow run ../main.nf \
+    $common_args \
+    --workflow variant_prioritisation \
+    --pipeline_project_id giab_test_trios \
+    --variant_prioritisation_version v1 \
+    --ped_file assets/input_data/ped_files/giab_test_trios.ped &&
+
+echo Cram compression &&
+nextflow run ../main.nf \
+    $common_args \
+    --workflow cram_compression \
+    --compress_dir outputs/giab_test_trios_v1
 
 test_exit_status=$(( $test_exit_status + $? ))
 
-echo "Reduced GiaB data - non-trios"
-run_nextflow ../main.nf \
-    -c "$NEXTFLOW_CONFIG" \
-    --workflow 'variant-calling' \
+echo "--- Reduced GiaB data: non-trios ---" &&
+nextflow run ../main.nf \
+    $common_args \
+    --workflow variant_calling \
     --pipeline_project_id giab_test_non_trios \
     --pipeline_project_version v1 \
     --ped_file $PWD/assets/input_data/ped_files/giab_test_non_trios.ped \
diff --git a/tests/run_stubs.sh b/tests/run_stubs.sh
index 3f8f86418e86414da673d550086d53b052a097d3..ad2458ec38ae9d30e251f4ad182c72e9679dbd02 100755
--- a/tests/run_stubs.sh
+++ b/tests/run_stubs.sh
@@ -3,34 +3,109 @@
 source scripts/nextflow_detached.sh
 bcbio=$PWD/scripts/bcbio_nextgen.py
 
+common_args="-stub-run -profile testing,stubs -ansi-log false"
 test_exit_status=0
 
 rm -r work/*/*
 
-echo "Test case 1: simple trio"
+echo "--- Test case 1: simple trio ---" &&
+echo "Variant calling" &&
 run_nextflow ../main.nf \
-    -stub-run -profile stubs \
-    --workflow variant-calling \
+    $common_args \
+    --workflow variant_calling \
     --pipeline_project_id test_stub \
-    --pipeline_project_version v1 \
     --ped_file assets/input_data/ped_files/batch_1.ped \
-    --sample_sheet assets/input_data/sample_sheets/batch_1.tsv
+    --sample_sheet assets/input_data/sample_sheets/batch_1.tsv &&
+
+echo "Variant prioritisation setup" &&
+run_nextflow ../main.nf \
+    $common_args \
+    --workflow variant_prioritisation_setup \
+    --pipeline_project_id test_stub \
+    --variant_prioritisation_version v1 \
+    --ped_file outputs/test_stub_v1/params/batch_1.ped \
+    --exclude_families 12345_000003,12345_000004 &&
+
+cp assets/input_data/sample_sheets/DECIPHER_INTERNAL_IDs_stubs.txt outputs/test_stub_v1/prioritization/v1/DECIPHER_INTERNAL_IDs.txt &&
+
+echo "Variant prioritisation" &&
+run_nextflow ../main.nf \
+    $common_args \
+    --workflow variant_prioritisation \
+    --pipeline_project_id test_stub \
+    --variant_prioritisation_version v1 \
+    --ped_file outputs/test_stub_v1/params/batch_1.ped
+
+output_dir='outputs/test_stub_v1/prioritization/v1'
+
+for pro_fam in 000001_000001 000004_000002
+do
+    for ext in DD15.COV.txt REC_SNP_COV.txt DD15.sample_{cumulative_coverage_{counts,proportions},interval_{statistics,summary},statistics,summary}
+    do
+        ls $output_dir/COV/$pro_fam.$ext
+    done
+
+    ls $output_dir/DECIPHER/${pro_fam}_{DEC_FLT.csv,DECIPHER_v10.xlsx}
+    ls $output_dir/DECIPHER/IGV/$pro_fam.snapshot.FLT.txt
+    ls $output_dir/DECIPHER/IGV/bamout_$pro_fam.snapshot.txt
+done
+
+for fam_id in 000001 000002
+do
+    ls $output_dir/BAMOUT/${fam_id}/${fam_id}_chr{2,4,7,16,22,X}_12345678.bamout.{bam,bai,vcf,vcf.idx}
+    ls $output_dir/G2P/12345_${fam_id}_LOG_DIR/12345_${fam_id}_inter_out.txt{,_summary.html}
+    ls $output_dir/G2P/12345_${fam_id}_LOG_DIR/12345_${fam_id}.report.{html,txt}
+    ls $output_dir/VCF/12345_${fam_id}.ready.vcf.gz
+    ls $output_dir/VASE/12345_${fam_id}.ready.denovo.vcf
+done
+
+ls $output_dir/{DECIPHER_INTERNAL_IDs,{FAM,PRO}_IDs,FAM_PRO}.txt
+ls $output_dir/VCF/12345_000001.ready{,.00000{1,2,3}_000001}.vcf.gz
+ls $output_dir/VCF/12345_000002.ready{,.00000{4,5,6}_000002}.vcf.gz
+
+# should be bams
+! ls outputs/test_stub_v1/families/*/*/*.cram &&
+ls outputs/test_stub_v1/families/*/*/*.bam &&
+
+echo "Cram compression" &&
+run_nextflow ../main.nf \
+    $common_args \
+    --workflow cram_compression \
+    --compress_dir outputs/test_stub_v1/families &&
+
+# should be crams, not bams
+! ls outputs/test_stub_v1/families/*/*/*.bam &> /dev/null &&
+ls outputs/test_stub_v1/families/*/*/*.cram &> /dev/null &&
+echo "Cram decompression" &&
+run_nextflow ../main.nf \
+    $common_args \
+    --workflow cram_decompression \
+    --compress_dir outputs/test_stub_v1/families &&
+
+# should be bams again
+! ls outputs/test_stub_v1/families/*/*/*.cram &> /dev/null &&
+ls outputs/test_stub_v1/families/*/*/*.bam &> /dev/null
 
-test_exit_status=$(( $test_exit_status + $? ))
+test_case_exit_status=$?
+echo "Test case exited with status $test_case_exit_status"
+test_exit_status=$(( $test_exit_status + $test_case_exit_status ))
 
-echo "Test case 2: MD5 errors"
+echo "--- Test case 2: MD5 errors ---" &&
 run_nextflow ../main.nf \
-    -stub-run -profile stubs \
-    --workflow variant-calling \
+    $common_args \
+    --workflow variant_calling \
     --pipeline_project_id test_stub_md5_errors \
     --pipeline_project_version v1 \
     --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped \
     --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.tsv
 
-if [ $? == 0 ]
+test_case_exit_status=$?
+echo "Test case exited with status $test_case_exit_status"
+if [ $test_case_exit_status -eq 0 ]
 then
     test_exit_status=$(( $test_exit_status + 1 ))
 fi
 
+echo ""
 echo "Tests finished with exit status $test_exit_status"
 exit $test_exit_status