From 2ca0e60c42dea2eb716378f2f227660dc76b8f67 Mon Sep 17 00:00:00 2001
From: Murray Wham <murray.wham@ed.ac.uk>
Date: Tue, 5 Oct 2021 12:20:17 +0100
Subject: [PATCH] Initial commit of NextFlow pipeline - test data and MD5
 checks

---
 .gitignore                                    |   2 +
 README.md                                     |  26 ++++
 environment.yml                               |  11 ++
 pipeline/file_checks.nf                       | 116 ++++++++++++++++++
 pipeline/lib/PipelineException.groovy         |  12 ++
 pipeline/main.nf                              | 105 ++++++++++++++++
 tests/assets/input_data/Readme.md             |  15 +++
 ..._BHNTGMDMXX_1_00001AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_1_00001AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_2_00001AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_2_00001AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_3_00002AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_3_00002AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_4_00002AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_4_00002AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_5_00003AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_5_00003AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_6_00003AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_6_00003AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_7_00004AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_7_00004AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz |   0
 ..._BHNTGMDMXX_8_00004AM0001L01_1.fastq.count |   0
 ...001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz |   0
 ..._BHNTGMDMXX_8_00004AM0001L01_2.fastq.count |   0
 ...001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz |   0
 ..._BRLSHNMKBX_1_00005AM0001L01_1.fastq.count |   0
 ...002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz |   0
 ..._BRLSHNMKBX_1_00005AM0001L01_2.fastq.count |   0
 ...002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz |   0
 ..._BRLSHNMKBX_2_00005AM0001L01_1.fastq.count |   0
 ...002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz |   0
 ..._BRLSHNMKBX_2_00005AM0001L01_2.fastq.count |   0
 ...002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz |   0
 .../20210922/file_list.tsv                    |  21 ++++
 .../X12345_A_Researcher/20210922/md5sums.txt  |  32 +++++
 ...002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz |   0
 ...002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz |   0
 .../X12346_MD5_Errors/20211005/file_list.tsv  |   3 +
 .../X12346_MD5_Errors/20211005/md5sums.txt    |   2 +
 tests/assets/input_data/ped_files/batch_1.ped |   6 +
 .../ped_files/batch_2_md5_errors.ped          |   1 +
 .../input_data/sample_sheets/batch_1.txt      |   7 ++
 .../sample_sheets/batch_2_md5_errors.txt      |   2 +
 tests/run_tests.sh                            |  16 +++
 58 files changed, 377 insertions(+)
 create mode 100644 environment.yml
 create mode 100644 pipeline/file_checks.nf
 create mode 100644 pipeline/lib/PipelineException.groovy
 create mode 100644 pipeline/main.nf
 create mode 100644 tests/assets/input_data/Readme.md
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv
 create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt
 create mode 100644 tests/assets/input_data/ped_files/batch_1.ped
 create mode 100644 tests/assets/input_data/ped_files/batch_2_md5_errors.ped
 create mode 100644 tests/assets/input_data/sample_sheets/batch_1.txt
 create mode 100644 tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt
 create mode 100755 tests/run_tests.sh

diff --git a/.gitignore b/.gitignore
index 0fb699f..bdd964e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 *_transfer_info_file.sh
 *.o[1-9]*
+*.nextflow*
+work
\ No newline at end of file
diff --git a/README.md b/README.md
index e69de29..769be0e 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,26 @@
+# Trio-Whole-Exome pipeline
+
+This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/).
+
+
+## Setup
+
+A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yml`. Once you have Conda installed, you can create an environment by `cd`-ing into this project and running the command:
+
+    $ conda env create -n <environment_name>
+
+## Tests
+
+This pipeline has automated tests contained in the folder `tests/`. To run the tests locally, `cd` to this folder with your Conda environment active and run `./run_tests.sh`.
+
+
+## About the terms used here
+
+- Batch - could be a pipeline batch, a sequencing batch or a BCBio batch
+- Sequencing batch - a group of samples that were prepared and sequenced together
+- Pipeline batch - a single run of this pipeline, potentially mixing samples and families from multiple seq batches
+- BCBio batch - used internally by BCBio to identify a family
+- Sample ID - specific to a: seq batch, family ID, individual ID, extraction kit type
+- File list - one file_list.tsv per seq batch, summarises all the fastqs in the batch. A pipeline batch may need to refer to multiple different individuals within the family across different file lists
+- Ped file - defines family relationships between individuals. One Ped file per pipeline batch.
+- Sample sheet - links the Ped file and file list(s) by defining what raw fastqs belong to each individual
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..d64d2d1
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,11 @@
+---
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - coreutils=8.25  # gives us cross-platform md5sum
+  - nextflow=21.04.0
+  - bcbio-nextgen=1.2.8
\ No newline at end of file
diff --git a/pipeline/file_checks.nf b/pipeline/file_checks.nf
new file mode 100644
index 0000000..51d6014
--- /dev/null
+++ b/pipeline/file_checks.nf
@@ -0,0 +1,116 @@
+nextflow.enable.dsl = 2
+
+
+process observed_md5 {
+    /* Run md5sum on a file; emits [file name, checksum field of md5sum's stdout (not yet trimmed)] */
+
+    input:
+    path(downloaded_file)
+
+    output:
+    tuple(val("${downloaded_file.getName()}"), stdout)
+
+    script:
+    """
+    md5sum $downloaded_file | cut -d ' ' -f 1
+    """
+}
+
+process expected_md5 {
+    /*
+    Look up downloaded_file's expected checksum in md5sum_file. Assumes lines of
+    the form '<checksum> path/to/downloaded.file', separated by a space. The
+    name is matched as a fixed string (grep -F) so '.' is not a regex wildcard.
+    */
+
+    input:
+    path(downloaded_file)
+    path(md5sum_file)
+
+    output:
+    tuple(val("${downloaded_file.getName()}"), stdout)
+
+    script:
+    """
+    grep -F -- $downloaded_file $md5sum_file | cut -d ' ' -f 1
+    """
+}
+
+process check_errors {
+    /*
+    Raise a PipelineException if the given list of observed-vs-expected checksum
+    mismatches is not empty. We have to do this inside a process because
+    race conditions occur when trying to execute this directly in a workflow.
+    */
+    input:
+    val(mismatches)
+    
+    exec:
+    if (!mismatches.isEmpty()) {
+        throw new PipelineException("MD5 mismatches found: ${mismatches.toString()}")
+    }
+}
+
+
+workflow checksums {
+    /*
+    Take a parsed samplesheet, flatten its r1/r2 fastqs into one channel and pair each
+    file's observed MD5 with the expected one from md5sums.txt in its grandparent
+    directory. Calls check_errors above to raise an exception upon any mismatches.
+    */
+
+    take:
+        ch_samplesheet_info
+    
+    main:
+        ch_fastqs = ch_samplesheet_info.flatMap(
+            { indv ->
+                [indv[1].r1, indv[1].r2]
+            }
+        ).flatten()
+        .map({file(it)})
+        
+        ch_md5_files = ch_fastqs.map(
+            { fastq -> fastq.getParent().getParent() + '/md5sums.txt' }
+        )
+        
+        ch_obs = observed_md5(ch_fastqs)
+        ch_exp = expected_md5(ch_fastqs, ch_md5_files)
+
+        ch_mismatches = ch_obs.concat(ch_exp)
+            .map({fastq, md5 -> [fastq, md5.strip()]})
+            .groupTuple()
+            .filter({it[1][0] != it[1][1]})
+            .collect({"${it[0]}: ${it[1][0]} != ${it[1][1]}"})
+        
+        ch_mismatches.view()
+        check_errors(ch_mismatches)
+}
+
+/*
+
+merge_fastqs
+->
+{
+    indv1: {
+        r1: merged_r1.fastq.gz,
+        r2: merged_r2.fastq.gz
+    },
+    indv2: {
+        ...
+    }
+}
+
+
+per family:
+    prepare_bcbio_config
+    ->
+    "samplename,description,batch,sex,phenotype,variant_regions"
+    "$MERGED_FASTQ,$INDIVIDUAL_ID,$FAMILY_ID,$SEX,$PHENOTYPE,$TWIST_BED_FILE"
+    -> 
+    family_{family_id}.csv
+
+
+    bcbio_nextgen.py -w template $BCBIO_TEMPLATE family_{family_id}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
+
+*/
diff --git a/pipeline/lib/PipelineException.groovy b/pipeline/lib/PipelineException.groovy
new file mode 100644
index 0000000..8a3c4fb
--- /dev/null
+++ b/pipeline/lib/PipelineException.groovy
@@ -0,0 +1,12 @@
+import org.codehaus.groovy.GroovyException
+
+class PipelineException extends GroovyException {
+    /*
+    Simple message-only exception that can be thrown at pipeline runtime.
+    TODO: maybe there's a better way of raising errors?
+    */
+    public PipelineException(String message) {
+        super(message)
+    }
+
+}
diff --git a/pipeline/main.nf b/pipeline/main.nf
new file mode 100644
index 0000000..ef30e4c
--- /dev/null
+++ b/pipeline/main.nf
@@ -0,0 +1,105 @@
+nextflow.enable.dsl = 2
+
+include {checksums} from './file_checks.nf'
+
+params.ped_file = null  // path to the ped file; no default, so presumably supplied via --ped_file
+params.sample_sheet = null  // path to the sample sheet; no default, so presumably supplied via --sample_sheet
+
+ch_sample_sheet = Channel.fromPath(params.sample_sheet, checkIfExists: true)
+ch_ped_file = Channel.fromPath(params.ped_file, checkIfExists: true)
+
+
+workflow read_ped_file {
+    /*
+    Read the given ped file (tab-separated, columns read by position, no header row) and output a tuple channel of included families and their members:
+
+    {
+        family1: [
+            {
+                family: family1,
+                indv: indv1,
+                father: indv2,
+                mother: indv3,
+                sex: 2,
+                affected: 2
+            },
+            {
+                ...
+            },
+        ],
+        family2: [
+            ...
+        ]
+    }
+    */
+
+    take:
+        ch_ped_file
+
+    main:
+        ch_family_info = ch_ped_file
+            .splitCsv(sep: '\t')
+            .map(
+                { line ->
+                    [
+                        line[0],  // family ID
+                        [
+                            family: line[0],
+                            indv: line[1],
+                            father: line[2],
+                            mother: line[3],
+                            sex: line[4],
+                            affected: line[5]
+                        ]
+                    ]
+                }
+            )
+            .groupTuple()
+    
+    emit:
+        ch_family_info
+}
+
+workflow read_sample_sheet {
+    /*
+    Read in a tab-separated pipeline sample sheet (header columns used: individual_id, read_1, read_2) and output a tuple of included individuals and their listed fastqs:
+    {
+        indv1: {
+            r1: [lane_1_r1.fastq.gz, lane_2_r1.fastq.gz],
+            r2: [lane_1_r2.fastq.gz, lane_2_r2.fastq.gz]
+        },
+        indv2: {
+            ...
+        }
+    }
+    */
+
+    take:
+        ch_sample_sheet
+
+    main:
+        ch_samplesheet_info = ch_sample_sheet
+            .splitCsv(sep:'\t', header: true)
+            .map(
+                { line -> [line.individual_id, line.read_1, line.read_2] }
+            )
+            .groupTuple()
+            .map(
+                { record ->
+                    [
+                        record[0],
+                        [
+                            indv: record[0],
+                            r1: record[1],
+                            r2: record[2]
+                        ]
+                    ]
+                }
+            )
+    
+    emit: ch_samplesheet_info
+}
+
+workflow {
+    checksums(read_sample_sheet(ch_sample_sheet))  // entry point: parse the sample sheet, then run the MD5 checks on its fastqs
+}
diff --git a/tests/assets/input_data/Readme.md b/tests/assets/input_data/Readme.md
new file mode 100644
index 0000000..84e19df
--- /dev/null
+++ b/tests/assets/input_data/Readme.md
@@ -0,0 +1,15 @@
+# Test input data
+
+The test data here consists of:
+
+## Edinburgh Genomics test data
+
+- Family 000001 - an example of a complete trio. Individuals: 000001, 000002, 000003
+- Family 000002 - an incomplete trio with only one parent. Individuals: 000004, 000005
+- Family 000003 - a singleton test case for an unexpected MD5 checksum. Individuals: 000006
+
+Todo: more EG test data from a different download date - maybe a singleton, complete duo and a quad
+
+## CRF
+
+todo: CRF test data
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv
new file mode 100644
index 0000000..b91c6b6
--- /dev/null
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv
@@ -0,0 +1,21 @@
+File	Run_ID	Lane_or_Cell	Read	Read_length_BP	Total_reads	Barcode_sequence	Sample_name	Library_ID	File_size	Platform	Model
+12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	1	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000001_000001_WESTwist_IDT-B	00001AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	1	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000001_000001_WESTwist_IDT-B	00001AM0001L01	500.2M	Illumina	NovaSeq 6000
+12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	2	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000001_000001_WESTwist_IDT-B	00001AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	2	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000001_000001_WESTwist_IDT-B	00001AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	3	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000002_000001_WESTwist_IDT-B	00002AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	3	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000002_000001_WESTwist_IDT-B	00002AM0001L01	500.2M	Illumina	NovaSeq 6000
+12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	4	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000002_000001_WESTwist_IDT-B	00002AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	4	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000002_000001_WESTwist_IDT-B	00002AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	5	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000003_000001_WESTwist_IDT-B	00003AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	5	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000003_000001_WESTwist_IDT-B	00003AM0001L01	500.2M	Illumina	NovaSeq 6000
+12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	6	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000003_000001_WESTwist_IDT-B	00003AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	6	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000003_000001_WESTwist_IDT-B	00003AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	7	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000004_000002_WESTwist_IDT-B	00004AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	7	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000004_000002_WESTwist_IDT-B	00004AM0001L01	500.2M	Illumina	NovaSeq 6000
+12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz	200922_A00001_0001_BHNTGMDMXX	8	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000004_000002_WESTwist_IDT-B	00004AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz	200922_A00001_0001_BHNTGMDMXX	8	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000004_000002_WESTwist_IDT-B	00004AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz	200923_A00001_0002_BRLSHNMKBX	1	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000005_000002_WESTwist_IDT-B	00005AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz	200923_A00001_0002_BRLSHNMKBX	1	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000005_000002_WESTwist_IDT-B	00005AM0001L01	500.2M	Illumina	NovaSeq 6000
+12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz	200923_A00001_0002_BRLSHNMKBX	2	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000005_000002_WESTwist_IDT-B	00005AM0001L01	500.1M	Illumina	NovaSeq 6000
+12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz	200923_A00001_0002_BRLSHNMKBX	2	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12345_000005_000002_WESTwist_IDT-B	00005AM0001L01	500.1M	Illumina	NovaSeq 6000
diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt
new file mode 100644
index 0000000..25e1b82
--- /dev/null
+++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt
@@ -0,0 +1,32 @@
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count
+d41d8cd98f00b204e9800998ecf8427e  12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz
diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv
new file mode 100644
index 0000000..d347008
--- /dev/null
+++ b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv
@@ -0,0 +1,3 @@
+File	Run_ID	Lane_or_Cell	Read	Read_length_BP	Total_reads	Barcode_sequence	Sample_name	Library_ID	File_size	Platform	Model
+12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz	211005_A00002_0002_AJTHSNRLXX	1	1	100	12345167	GTGACGGAGC+TGGCGGTCCA	12346_000006_000003_WESTwist_IDT-B	00002AM0002L01	500.1M	Illumina	NovaSeq 6000
+12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz	211005_A00002_0002_AJTHSNRLXX	1	2	100	12345167	GTGACGGAGC+TGGCGGTCCA	12346_000006_000003_WESTwist_IDT-B	00002AM0002L01	500.1M	Illumina	NovaSeq 6000
diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt
new file mode 100644
index 0000000..3c82835
--- /dev/null
+++ b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt
@@ -0,0 +1,2 @@
+some_unexpected_md5_checksum 12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz
+some_unexpected_md5_checksum 12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz
diff --git a/tests/assets/input_data/ped_files/batch_1.ped b/tests/assets/input_data/ped_files/batch_1.ped
new file mode 100644
index 0000000..b494a07
--- /dev/null
+++ b/tests/assets/input_data/ped_files/batch_1.ped
@@ -0,0 +1,6 @@
+000001	000001	000002	000003	2	2
+000001	000002	0	0	1	1
+000001	000003	0	0	2	1
+000002	000004	000005	000006	2	2
+000002	000005	0	0	1	1
+000002	000006	0	0	2	1
diff --git a/tests/assets/input_data/ped_files/batch_2_md5_errors.ped b/tests/assets/input_data/ped_files/batch_2_md5_errors.ped
new file mode 100644
index 0000000..2f7610f
--- /dev/null
+++ b/tests/assets/input_data/ped_files/batch_2_md5_errors.ped
@@ -0,0 +1 @@
+000003	000006	0	0	1	1
diff --git a/tests/assets/input_data/sample_sheets/batch_1.txt b/tests/assets/input_data/sample_sheets/batch_1.txt
new file mode 100644
index 0000000..7b93d54
--- /dev/null
+++ b/tests/assets/input_data/sample_sheets/batch_1.txt
@@ -0,0 +1,7 @@
+individual_id	family_id	sample_id	read_1	read_2
+000001	000001	12345_000001_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz
+000001	000001	12345_000001_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz
+000002	000001	12345_000002_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz
+000002	000001	12345_000002_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz
+000003	000001	12345_000003_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz
+000003	000001	12345_000003_000001_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz
diff --git a/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt
new file mode 100644
index 0000000..fca9441
--- /dev/null
+++ b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt
@@ -0,0 +1,2 @@
+individual_id	family_id	sample_id	read_1	read_2
+000006	000003	12346_000006_000003_WESTwist_IDT-B	assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz	assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz
diff --git a/tests/run_tests.sh b/tests/run_tests.sh
new file mode 100755
index 0000000..83763e8
--- /dev/null
+++ b/tests/run_tests.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Runs the pipeline test cases with NextFlow and reports an overall result.
+#
+# Usage: ./run_tests.sh
+#
+# Exit status: 0 if every test case behaved as expected, 1 otherwise.
+
+# The pipeline, library and asset paths below (and the read paths listed in
+# the sample sheets) are relative to this directory, so switch to it first to
+# make the script runnable from any working directory.
+cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || exit 1
+
+test_exit_status=0
+
+echo "Test case 1: simple trio"
+nextflow run -lib ../pipeline/lib ../pipeline/main.nf \
+    --ped_file assets/input_data/ped_files/batch_1.ped \
+    --sample_sheet assets/input_data/sample_sheets/batch_1.txt
+run_status=$?
+# This batch is expected to pass, so any non-zero status is a failure.
+if [[ "$run_status" -ne 0 ]]; then
+    test_exit_status=1
+fi
+
+echo "Test case 2: MD5 errors"
+nextflow run -lib ../pipeline/lib ../pipeline/main.nf \
+    --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped \
+    --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.txt
+run_status=$?
+# The checksums listed for this batch deliberately do not match the files,
+# so the run must fail with exit status 1; anything else fails the test.
+if [[ "$run_status" -ne 1 ]]; then
+    test_exit_status=1
+fi
+
+echo "Tests finished with exit status $test_exit_status"
+# Hand the result to the caller (e.g. CI). A 0/1 flag is used rather than a
+# sum of exit codes because a sum of 256 would wrap around to 0 on exit.
+exit "$test_exit_status"
-- 
GitLab