From 2ca0e60c42dea2eb716378f2f227660dc76b8f67 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Tue, 5 Oct 2021 12:20:17 +0100 Subject: [PATCH] Initial commit of NextFlow pipeline - test data and MD5 checks --- .gitignore | 2 + README.md | 26 ++++ environment.yml | 11 ++ pipeline/file_checks.nf | 116 ++++++++++++++++++ pipeline/lib/PipelineException.groovy | 12 ++ pipeline/main.nf | 105 ++++++++++++++++ tests/assets/input_data/Readme.md | 15 +++ ..._BHNTGMDMXX_1_00001AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_1_00001AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_2_00001AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_2_00001AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_3_00002AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_3_00002AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_4_00002AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_4_00002AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_5_00003AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_5_00003AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_6_00003AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_6_00003AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_7_00004AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz | 0 ..._BHNTGMDMXX_7_00004AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz | 0 ..._BHNTGMDMXX_8_00004AM0001L01_1.fastq.count | 0 ...001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz | 0 
..._BHNTGMDMXX_8_00004AM0001L01_2.fastq.count | 0 ...001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz | 0 ..._BRLSHNMKBX_1_00005AM0001L01_1.fastq.count | 0 ...002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz | 0 ..._BRLSHNMKBX_1_00005AM0001L01_2.fastq.count | 0 ...002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz | 0 ..._BRLSHNMKBX_2_00005AM0001L01_1.fastq.count | 0 ...002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz | 0 ..._BRLSHNMKBX_2_00005AM0001L01_2.fastq.count | 0 ...002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz | 0 .../20210922/file_list.tsv | 21 ++++ .../X12345_A_Researcher/20210922/md5sums.txt | 32 +++++ ...002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz | 0 ...002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz | 0 .../X12346_MD5_Errors/20211005/file_list.tsv | 3 + .../X12346_MD5_Errors/20211005/md5sums.txt | 2 + tests/assets/input_data/ped_files/batch_1.ped | 6 + .../ped_files/batch_2_md5_errors.ped | 1 + .../input_data/sample_sheets/batch_1.txt | 7 ++ .../sample_sheets/batch_2_md5_errors.txt | 2 + tests/run_tests.sh | 16 +++ 58 files changed, 377 insertions(+) create mode 100644 environment.yml create mode 100644 pipeline/file_checks.nf create mode 100644 pipeline/lib/PipelineException.groovy create mode 100644 pipeline/main.nf create mode 100644 tests/assets/input_data/Readme.md create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count create mode 100644 
tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count create mode 100644 
tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count create mode 100644 
tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count create mode 100644 
tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv create mode 100644 tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz create mode 100644 tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv create mode 100644 
tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt create mode 100644 tests/assets/input_data/ped_files/batch_1.ped create mode 100644 tests/assets/input_data/ped_files/batch_2_md5_errors.ped create mode 100644 tests/assets/input_data/sample_sheets/batch_1.txt create mode 100644 tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt create mode 100755 tests/run_tests.sh diff --git a/.gitignore b/.gitignore index 0fb699f..bdd964e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *_transfer_info_file.sh *.o[1-9]* +*.nextflow* +work \ No newline at end of file diff --git a/README.md b/README.md index e69de29..769be0e 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,26 @@ +# Trio-Whole-Exome pipeline + +This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/). + + +## Setup + +A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yml`. Once you have Conda installed, you can create an environment by `cd`-ing into this project and running the command: + + $ conda env create -n <environment_name> + +## Tests + +This pipeline has automated tests contained in the folder `tests/`. To run the tests locally, `cd` to this folder with your Conda environment active and run `./run_tests.sh`. + + +## About the terms used here + +Batch - could be a pipeline batch, a sequencing batch or a BCBio batch +Sequencing batch - a group of samples that were prepared and sequenced together +Pipeline batch - a single run of this pipeline, potentially mixing samples and families from multiple seq batches +BCBio batch - used internally by BCBio to identify a family +Sample ID - specific to a: seq batch, family ID, individual ID, extraction kit type +File list - one file_list.tsv per seq batch, summarises all the fastqs in the batch. 
A pipeline batch may need to refer to multiple different individuals within the family across different file lists +Ped file - defines family relationships between individuals. One Ped file per pipeline batch. +Sample sheet - links the Ped file and file list(s) by defining what raw fastqs belong to each individual diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..d64d2d1 --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +--- + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - coreutils=8.25 # gives us cross-platform md5sum + - nextflow=21.04.0 + - bcbio-nextgen=1.2.8 \ No newline at end of file diff --git a/pipeline/file_checks.nf b/pipeline/file_checks.nf new file mode 100644 index 0000000..51d6014 --- /dev/null +++ b/pipeline/file_checks.nf @@ -0,0 +1,116 @@ +nextflow.enable.dsl = 2 + + +process observed_md5 { + /* Run md5sum on a file and emit [file name, observed checksum] (quoted so odd file names survive the shell) */ + + input: + path(downloaded_file) + + output: + tuple(val("${downloaded_file.getName()}"), stdout) + + script: + """ + md5sum "$downloaded_file" | cut -d ' ' -f 1 + """ +} + +process expected_md5 { + /* + Look up downloaded_file's expected checksum in md5sum_file, matching the file name as a fixed string (grep -F) so '.' is not a regex wildcard. Assumes each + md5sum_file line is in the format '<checksum> path/to/downloaded.file', + separated by a space. Emits [file name, expected checksum]; stdout is empty if the file is not listed + */ + + input: + path(downloaded_file) + path(md5sum_file) + + output: + tuple(val("${downloaded_file.getName()}"), stdout) + + script: + """ + grep -F -- "$downloaded_file" "$md5sum_file" | cut -d ' ' -f 1 + """ +} + +process check_errors { + /* + Compare observed and expected checksums for all given files. If there are + any mismatches, raise an exception. We have to do this inside a process because + race conditions occur when trying to execute this directly in a workflow. 
+ */ + input: + val(mismatches) + + exec: + if (!mismatches.isEmpty()) { + throw new PipelineException("MD5 mismatches found: ${mismatches.toString()}") + } +} + + +workflow checksums { + /* + Take a parsed samplesheet, flatten it and parse into a channel of observed vs. + expected checksums. Calls check_errors above to raise an exception upon any + mismatches. + */ + + take: + ch_samplesheet_info + + main: + ch_fastqs = ch_samplesheet_info.flatMap( + { indv -> + [indv[1].r1, indv[1].r2] + } + ).flatten() + .map({file(it)}) + + ch_md5_files = ch_fastqs.map( + { fastq -> fastq.getParent().getParent() + '/md5sums.txt' } + ) + + ch_obs = observed_md5(ch_fastqs) + ch_exp = expected_md5(ch_fastqs, ch_md5_files) + + ch_mismatches = ch_obs.concat(ch_exp) + .map({fastq, md5 -> [fastq, md5.strip()]}) + .groupTuple() + .filter({it[1][0] != it[1][1]}) + .collect({"${it[0]}: ${it[1][0]} != ${it[1][1]}"}) + + ch_mismatches.view() + check_errors(ch_mismatches) +} + +/* + +merge_fastqs +-> +{ + indv1: { + r1: merged_r1.fastq.gz, + r2: merged_r2.fastq.gz + }, + indv2: { + ... + } +} + + +per family: + prepare_bcbio_config + -> + "samplename,description,batch,sex,phenotype,variant_regions" + "$MERGED_FASTQ,$INDIVIDUAL_ID,$FAMILY_ID,$SEX,$PHENOTYPE,$TWIST_BED_FILE" + -> + family_{family_id}.csv + + + bcbio_nextgen.py -w template $BCBIO_TEMPLATE $family{family_id}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz + +*/ diff --git a/pipeline/lib/PipelineException.groovy b/pipeline/lib/PipelineException.groovy new file mode 100644 index 0000000..8a3c4fb --- /dev/null +++ b/pipeline/lib/PipelineException.groovy @@ -0,0 +1,12 @@ +import org.codehaus.groovy.GroovyException + +class PipelineException extends GroovyException { + /* + Simple exception class that can be thrown at pipeline runtime. + Maybe there's a better way of raising errors? 
+ */ + public PipelineException(String message) { + super(message) + } + +} diff --git a/pipeline/main.nf b/pipeline/main.nf new file mode 100644 index 0000000..ef30e4c --- /dev/null +++ b/pipeline/main.nf @@ -0,0 +1,105 @@ +nextflow.enable.dsl = 2 + +include {checksums} from './file_checks.nf' + +params.ped_file = null +params.sample_sheet = null + +ch_sample_sheet = Channel.fromPath(params.sample_sheet, checkIfExists: true) +ch_ped_file = Channel.fromPath(params.ped_file, checkIfExists: true) + + +workflow read_ped_file { + /* + Read the given ped file and output a tuple channel of included families and their members: + + { + family1: [ + { + family: family1, + indv: indv1, + father: indv2, + mother: indv3, + sex: 2, + affected: 2 + }, + { + ... + }, + + family2: [ + ... + ] + } + */ + + take: + ch_ped_file + + main: + ch_family_info = ch_ped_file + .splitCsv(sep: '\t') + .map( + { line -> + [ + line[0], // family ID + [ + family: line[0], + indv: line[1], + father: line[2], + mother: line[3], + sex: line[4], + affected: line[5] + ] + ] + } + ) + .groupTuple() + + emit: + ch_family_info +} + +workflow read_sample_sheet { + /* + Read in a pipeline sample sheet and output a tuple of included individuals and their listed fastqs: + { + indv1: { + r1: [lane_1_r1.fastq.gz, lane_2_r1.fastq.gz], + r2: [lane_1_r2.fastq.gz, lane_2_r2.fastq.gz] + }, + indv2: { + ... 
+ } + } + */ + + take: + ch_sample_sheet + + main: + ch_samplesheet_info = ch_sample_sheet + .splitCsv(sep:'\t', header: true) + .map( + { line -> [line.individual_id, line.read_1, line.read_2] } + ) + .groupTuple() + .map( + { record -> + [ + record[0], + [ + indv: record[0], + r1: record[1], + r2: record[2] + ] + ] + } + ) + + emit: ch_samplesheet_info +} + +workflow { + checksums(read_sample_sheet(ch_sample_sheet)) +} diff --git a/tests/assets/input_data/Readme.md b/tests/assets/input_data/Readme.md new file mode 100644 index 0000000..84e19df --- /dev/null +++ b/tests/assets/input_data/Readme.md @@ -0,0 +1,15 @@ +# Test input data + +The test data here consists of: + +## Edinburgh Genomics test data + +Family 000001 - an example of a complete trio. Individuals: 000001, 000002, 000003 +Family 000002 - an incomplete trio with only one parent. Individuals: 000004, 000005 +Family 000003 - a singleton test case for an unexpected MD5 checksum. Individuals: 000006 + +Todo: more EG test data from a different download date - maybe a singleton, complete duo and a quad + +## CRF + +todo: CRF test data diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.count new file mode 100644 index 0000000..e69de29 diff --git 
a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.count new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv new file mode 100644 index 0000000..b91c6b6 --- /dev/null +++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/file_list.tsv @@ -0,0 +1,21 @@ +File Run_ID Lane_or_Cell Read Read_length_BP Total_reads Barcode_sequence Sample_name Library_ID File_size Platform Model +12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 1 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000001_000001_WESTwist_IDT-B 00001AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz 
200922_A00001_0001_BHNTGMDMXX 1 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000001_000001_WESTwist_IDT-B 00001AM0001L01 500.2M Illumina NovaSeq 6000 +12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 2 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000001_000001_WESTwist_IDT-B 00001AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 2 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000001_000001_WESTwist_IDT-B 00001AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 3 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000002_000001_WESTwist_IDT-B 00002AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 3 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000002_000001_WESTwist_IDT-B 00002AM0001L01 500.2M Illumina NovaSeq 6000 +12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 4 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000002_000001_WESTwist_IDT-B 00002AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 4 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000002_000001_WESTwist_IDT-B 00002AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 5 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000003_000001_WESTwist_IDT-B 00003AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 5 2 100 12345167 
GTGACGGAGC+TGGCGGTCCA 12345_000003_000001_WESTwist_IDT-B 00003AM0001L01 500.2M Illumina NovaSeq 6000 +12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 6 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000003_000001_WESTwist_IDT-B 00003AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 6 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000003_000001_WESTwist_IDT-B 00003AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 7 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000004_000002_WESTwist_IDT-B 00004AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 7 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000004_000002_WESTwist_IDT-B 00004AM0001L01 500.2M Illumina NovaSeq 6000 +12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz 200922_A00001_0001_BHNTGMDMXX 8 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000004_000002_WESTwist_IDT-B 00004AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz 200922_A00001_0001_BHNTGMDMXX 8 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000004_000002_WESTwist_IDT-B 00004AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_1.fastq.gz 200923_A00001_0002_BRLSHNMKBX 1 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000005_000002_WESTwist_IDT-B 00005AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_1_00005AM0001L01_2.fastq.gz 200923_A00001_0002_BRLSHNMKBX 1 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000005_000002_WESTwist_IDT-B 
00005AM0001L01 500.2M Illumina NovaSeq 6000 +12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_1.fastq.gz 200923_A00001_0002_BRLSHNMKBX 2 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000005_000002_WESTwist_IDT-B 00005AM0001L01 500.1M Illumina NovaSeq 6000 +12345_000005_000002_WESTwist_IDT-B/200923_A00001_0002_BRLSHNMKBX_2_00005AM0001L01_2.fastq.gz 200923_A00001_0002_BRLSHNMKBX 2 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12345_000005_000002_WESTwist_IDT-B 00005AM0001L01 500.1M Illumina NovaSeq 6000 diff --git a/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt new file mode 100644 index 0000000..25e1b82 --- /dev/null +++ b/tests/assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/md5sums.txt @@ -0,0 +1,32 @@ +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 
12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 
12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_7_00004AM0001L01_2.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_1.fastq.gz +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.count +d41d8cd98f00b204e9800998ecf8427e 12345_000004_000002_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_8_00004AM0001L01_2.fastq.gz diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv 
b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv new file mode 100644 index 0000000..d347008 --- /dev/null +++ b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/file_list.tsv @@ -0,0 +1,3 @@ +File Run_ID Lane_or_Cell Read Read_length_BP Total_reads Barcode_sequence Sample_name Library_ID File_size Platform Model +12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz 211005_A00002_0002_AJTHSNRLXX 1 1 100 12345167 GTGACGGAGC+TGGCGGTCCA 12346_000006_000003_WESTwist_IDT-B 00002AM0002L01 500.1M Illumina NovaSeq 6000 +12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz 211005_A00002_0002_AJTHSNRLXX 1 2 100 12345167 GTGACGGAGC+TGGCGGTCCA 12346_000006_000003_WESTwist_IDT-B 00002AM0002L01 500.1M Illumina NovaSeq 6000 diff --git a/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt new file mode 100644 index 0000000..3c82835 --- /dev/null +++ b/tests/assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/md5sums.txt @@ -0,0 +1,2 @@ +some_unexpected_md5_checksum 12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz +some_unexpected_md5_checksum 12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz diff --git a/tests/assets/input_data/ped_files/batch_1.ped b/tests/assets/input_data/ped_files/batch_1.ped new file mode 100644 index 0000000..b494a07 --- /dev/null +++ b/tests/assets/input_data/ped_files/batch_1.ped @@ -0,0 +1,6 @@ +000001 000001 000002 000003 2 2 +000001 000002 0 0 1 1 +000001 000003 0 0 2 1 +000002 000004 000005 000006 2 2 +000002 000005 0 0 1 1 +000002 000006 0 0 2 1 diff --git a/tests/assets/input_data/ped_files/batch_2_md5_errors.ped b/tests/assets/input_data/ped_files/batch_2_md5_errors.ped new file mode 100644 index 
0000000..2f7610f --- /dev/null +++ b/tests/assets/input_data/ped_files/batch_2_md5_errors.ped @@ -0,0 +1 @@ +000003 000006 0 0 1 1 diff --git a/tests/assets/input_data/sample_sheets/batch_1.txt b/tests/assets/input_data/sample_sheets/batch_1.txt new file mode 100644 index 0000000..7b93d54 --- /dev/null +++ b/tests/assets/input_data/sample_sheets/batch_1.txt @@ -0,0 +1,7 @@ +individual_id family_id sample_id read_1 read_2 +000001 000001 12345_000001_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_1_00001AM0001L01_2.fastq.gz +000001 000001 12345_000001_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000001_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_2_00001AM0001L01_2.fastq.gz +000002 000001 12345_000002_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_3_00002AM0001L01_2.fastq.gz +000002 000001 12345_000002_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000002_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_4_00002AM0001L01_2.fastq.gz +000003 000001 12345_000003_000001_WESTwist_IDT-B 
assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_5_00003AM0001L01_2.fastq.gz +000003 000001 12345_000003_000001_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12345_A_Researcher/20210922/12345_000003_000001_WESTwist_IDT-B/200922_A00001_0001_BHNTGMDMXX_6_00003AM0001L01_2.fastq.gz diff --git a/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt new file mode 100644 index 0000000..fca9441 --- /dev/null +++ b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt @@ -0,0 +1,2 @@ +individual_id family_id sample_id read_1 read_2 +000006 000003 12346_000006_000003_WESTwist_IDT-B assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_1.fastq.gz assets/input_data/edinburgh_genomics/X12346_MD5_Errors/20211005/12346_000006_000003_WESTwist_IDT-B/211005_A00002_0002_AJTHSNRLXX_1_00002AM0002L01_2.fastq.gz diff --git a/tests/run_tests.sh b/tests/run_tests.sh new file mode 100755 index 0000000..83763e8 --- /dev/null +++ b/tests/run_tests.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +test_exit_status=0 + +echo "Test case 1: simple trio" +nextflow run -lib ../pipeline/lib ../pipeline/main.nf --ped_file assets/input_data/ped_files/batch_1.ped --sample_sheet assets/input_data/sample_sheets/batch_1.txt +test_exit_status=$(( $test_exit_status + $? 
)) + +echo "Test case 2: MD5 errors" +nextflow run -lib ../pipeline/lib ../pipeline/main.nf --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.txt +if ! [ $? == 1 ] +then + test_exit_status=$(( $test_exit_status + 1 )) +fi + +echo "Tests finished with exit status $test_exit_status" -- GitLab