diff --git a/README.md b/README.md index 01d12564946e33b615bbbfb7362576c4489667f5..dcd54f8645b26b0e7d7dc0e7337102b3c4a28376 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,58 @@ # Trio-Whole-Exome pipeline -This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/) +This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios +project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/) ## Setup -A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yaml`. Once you have Conda installed, you can create an environment by `cd`-ing into this project and running the command: +A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yaml`. Once you have Conda +installed, you can create an environment by `cd`-ing into this project and running the command: $ conda env create -n <environment_name> + +## Running + +The pipeline requires two main input files: + +### Samplesheet + +This is a tab-separated file mapping fastq pairs to metadata. The columns are individual ID, family ID, fastq sample ID, +r1 fastq and r2 fastq. If a sample has been sequenced over multiple lanes, then include a line for each fastq pair: + + individual_id family_id sample_id read_1 read_2 + 000001 000001 12345_000001_000001_WESTwist_IDT-B path/to/lane_1_r1.fastq.gz path/to/lane_1_r2.fastq.gz + 000001 000001 12345_000001_000001_WESTwist_IDT-B path/to/lane_2_r1.fastq.gz path/to/lane_2_r2.fastq.gz + +###Â Ped file + +Tab-separated Ped file mapping individuals to each other and affected status. Per the +[specification](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format), the columns are +family ID, individual ID, father ID, mother ID, sex (1=male, 2=female, other=unknown), affected status (-9 or 0=missing, +1=unaffected, 2=affected): + + 000001 000001 000002 000003 2 2 + 000001 000002 0 0 1 1 + 000001 000003 0 0 2 1 + +The pipeline can now be run. First, check for errors: + + $ nextflow run pipeline/validation.nf --ped_file path/to/batch.ped --sample_sheet path/to/batch_samplesheet.tsv + +Todo: run the main processing + ## Tests -This pipeline has automated tests contained in the folder `tests/`. To the run the tests locally, `cd` to this folder with your Conda environment active and run `./run_tests.sh`. +This pipeline has automated tests contained in the folder `tests/`. To the run the tests locally, `cd` to this folder +with your Conda environment active and run `./run_tests.sh`. ## Terminology -- Batch: slightly ambiguous, could be a pipeline batch, a sequencing batch or a BCBio batch - - Pipeline batch: a single run of this pipeline, potentially mixing samples and fmailies from multiple seq batches +- Batch: slightly ambiguous term - could be a pipeline batch, a sequencing batch or a BCBio batch + - Pipeline batch: a single run of this pipeline, potentially mixing samples and families from multiple + sequencing batches - Sequencing batch: a group of samples that were prepared and sequenced together - BCBio batch: used internally by BCBio to identify a family - Sample ID: specific to a sequencing batch, family ID, individual ID and extraction kit type diff --git a/pipeline/file_checks.nf b/pipeline/file_checks.nf deleted file mode 100644 index 51d6014e24834e70cc0296d2451fc05749e10187..0000000000000000000000000000000000000000 --- a/pipeline/file_checks.nf +++ /dev/null @@ -1,116 +0,0 @@ -nextflow.enable.dsl = 2 - - -process observed_md5 { - /* Run md5sum on a file to get its observed checksum */ - - input: - path(downloaded_file) - - output: - tuple(val("${downloaded_file.getName()}"), stdout) - - script: - """ - md5sum $downloaded_file | cut -d ' ' -f 1 - """ -} - -process expected_md5 { - /* - Grep out downloaded_file's expected checksum from md5sum_file. Assumes the - md5sum_file to be in the format '<checksum> path/to/downloaded.file', - separated by a space - */ - - input: - path(downloaded_file) - path(md5sum_file) - - output: - tuple(val("${downloaded_file.getName()}"), stdout) - - script: - """ - grep $downloaded_file $md5sum_file | cut -d ' ' -f 1 - """ -} - -process check_errors { - /* - Compare obsered and expected checksums for all given files. If there are - any, then raise an exception. We have to do this inside a process because - race conditions occur when trying to execute this directly in a workflow. - */ - input: - val(mismatches) - - exec: - if (!mismatches.isEmpty()) { - throw new PipelineException("MD5 mismatches found: ${mismatches.toString()}") - } -} - - -workflow checksums { - /* - Take a parsed samplesheet, flatten it and parse into a channel of observed vs. - expected checksums. Calls check_errors above to raise an exception upon any - mismatches. - */ - - take: - ch_samplesheet_info - - main: - ch_fastqs = ch_samplesheet_info.flatMap( - { indv -> - [indv[1].r1, indv[1].r2] - } - ).flatten() - .map({file(it)}) - - ch_md5_files = ch_fastqs.map( - { fastq -> fastq.getParent().getParent() + '/md5sums.txt' } - ) - - ch_obs = observed_md5(ch_fastqs) - ch_exp = expected_md5(ch_fastqs, ch_md5_files) - - ch_mismatches = ch_obs.concat(ch_exp) - .map({fastq, md5 -> [fastq, md5.strip()]}) - .groupTuple() - .filter({it[1][0] != it[1][1]}) - .collect({"${it[0]}: ${it[1][0]} != ${it[1][1]}"}) - - ch_mismatches.view() - check_errors(ch_mismatches) -} - -/* - -merge_fastqs --> -{ - indv1: { - r1: merged_r1.fastq.gz, - r2: merged_r2.fastq.gz - }, - indv2: { - ... - } -} - - -per family: - prepare_bcbio_config - -> - "samplename,description,batch,sex,phenotype,variant_regions" - "$MERGED_FASTQ,$INDIVIDUAL_ID,$FAMILY_ID,$SEX,$PHENOTYPE,$TWIST_BED_FILE" - -> - family_{family_id}.csv - - - bcbio_nextgen.py -w template $BCBIO_TEMPLATE $family{family_id}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz - -*/ diff --git a/pipeline/inputs.nf b/pipeline/inputs.nf new file mode 100644 index 0000000000000000000000000000000000000000..cf4094f9a6e3dc0547c8cec7039673db77d6b7c4 --- /dev/null +++ b/pipeline/inputs.nf @@ -0,0 +1,95 @@ +nextflow.enable.dsl = 2 + +params.ped_file = null +params.sample_sheet = null + +workflow read_inputs { + /* + Read the input files into channels. This has to be a workflow so the channels + can be included in other scripts + */ + + main: + /* + ch_family_info - tuple channel of family members: + [ + [ + family: family1, + indv: indv1, + father: indv2, + mother: indv3, + sex: 2, + affected: 2 + ], + ... + ] + */ + ped_file = file(params.ped_file, checkIfExists: true) + ch_ped_file_info = Channel.fromPath(ped_file) + .splitCsv(sep: '\t') + .map( + { line -> + [ + line[1], // indv ID + line[0], // family ID + line[2], // father + line[3], // mother + line[4], // sex + line[5] // affected + ] + } + ) + + + /* + ch_samplesheet_info - Tuple channel of individual ID, r1 fastqs and r2 fastqs: + [ + [ + indv1, + [lane_1_r1.fastq.gz, lane_2_r1.fastq.gz], + [lane_1_r2.fastq.gz, lane_2_r2.fastq.gz] + ], + [ + indv2, + ... + ] + ] + */ + samplesheet = file(params.sample_sheet, checkIfExists: true) + ch_samplesheet_info = Channel.fromPath(samplesheet) + .splitCsv(sep:'\t', header: true) + .map( + { line -> [line.individual_id, line.read_1, line.read_2] } + ) + .groupTuple() + + /* + ch_individuals_by_family - merge of samplesheet and ped file information, + grouped by family ID: + [ + [ + indv1, + [ + indv_id, + family_id, + father_id, + mother_id, + sex, + affected, + r1_fastqs, + r2_fastqs + ] + ], + ... + ] + */ + ch_individuals_by_family = ch_ped_file_info + .join(ch_samplesheet_info) + .map({[it[1], it]}) + .groupTuple() + + emit: + ch_samplesheet_info + ch_ped_file_info + ch_individuals_by_family +} diff --git a/pipeline/main.nf b/pipeline/main.nf deleted file mode 100644 index ef30e4ca5a007c049dd7263002942eaa39211b40..0000000000000000000000000000000000000000 --- a/pipeline/main.nf +++ /dev/null @@ -1,105 +0,0 @@ -nextflow.enable.dsl = 2 - -include {checksums} from './file_checks.nf' - -params.ped_file = null -params.sample_sheet = null - -ch_sample_sheet = Channel.fromPath(params.sample_sheet, checkIfExists: true) -ch_ped_file = Channel.fromPath(params.ped_file, checkIfExists: true) - - -workflow read_ped_file { - /* - Read the given ped file and output a tuple channel of included families and their members: - - { - family1: [ - { - family: family1, - indv: indv1, - father: indv2, - mother: indv3, - sex: 2, - affected: 2 - }, - { - ... - }, - - family2: [ - ... - ] - } - */ - - take: - ch_ped_file - - main: - ch_family_info = ch_ped_file - .splitCsv(sep: '\t') - .map( - { line -> - [ - line[0], // family ID - [ - family: line[0], - indv: line[1], - father: line[2], - mother: line[3], - sex: line[4], - affected: line[5] - ] - ] - } - ) - .groupTuple() - - emit: - ch_family_info -} - -workflow read_sample_sheet { - /* - Read in a pipeline sample sheet and output a tuple of included individuals and their listed fastqs: - { - indv1: { - r1: [lane_1_r1.fastq.gz, lane_2_r1.fastq.gz], - r2: [lane_1_r2.fastq.gz, lane_2_r2.fastq.gz] - }, - indv2: { - ... - } - } - */ - - take: - ch_sample_sheet - - main: - ch_samplesheet_info = ch_sample_sheet - .splitCsv(sep:'\t', header: true) - .map( - { line -> [line.individual_id, line.read_1, line.read_2] } - ) - .groupTuple() - .map( - { record -> - [ - record[0], - [ - indv: record[0], - r1: record[1], - r2: record[2] - ] - ] - } - ) - - emit: ch_samplesheet_info -} - -workflow { - checksums(read_sample_sheet(ch_sample_sheet)) -} diff --git a/pipeline/validation.nf b/pipeline/validation.nf new file mode 100644 index 0000000000000000000000000000000000000000..81b7c2107074585512e15cd4c4e4c075745c4062 --- /dev/null +++ b/pipeline/validation.nf @@ -0,0 +1,83 @@ +nextflow.enable.dsl = 2 + +include {read_inputs} from './inputs.nf' + +process observed_md5 { + // Run md5sum on a file to get its observed checksum + + input: + path(downloaded_file) + + output: + tuple(val("${downloaded_file.getName()}"), stdout) + + script: + """ + md5sum $downloaded_file | cut -d ' ' -f 1 + """ +} + +process expected_md5 { + /* + Grep out downloaded_file's expected checksum from md5sum_file. Assumes the + md5sum_file to be in the format '<checksum> path/to/downloaded.file', + separated by a space. + */ + + input: + path(downloaded_file) + path(md5sum_file) + + output: + tuple(val("${downloaded_file.getName()}"), stdout) + + script: + """ + grep $downloaded_file $md5sum_file | cut -d ' ' -f 1 + """ +} + +process raise_errors { + // Raise any identified checksum mismatches + + input: + val(errors) + + exec: + exit 1, "MD5 mismatches found" +} + +workflow { + /* + Take a parsed samplesheet, flatten it and parse into a channel of observed vs. + expected checksums. Calls check_errors above to raise an exception upon any + mismatches. + */ + + read_inputs() + ch_samplesheet_info = read_inputs.out[0] + + ch_fastqs = ch_samplesheet_info + .map( + { indv, r1, r2 -> + [r1, r2] + } + ).flatten() + .map({file(it)}) + + ch_md5_files = ch_fastqs.map( + { fastq -> fastq.getParent().getParent() + '/md5sums.txt' } + ) + + ch_obs = observed_md5(ch_fastqs) + ch_exp = expected_md5(ch_fastqs, ch_md5_files) + + ch_mismatches = ch_obs.concat(ch_exp) + .map({fastq, md5 -> [fastq, md5.strip()]}) + .groupTuple() + .filter({it[1][0] != it[1][1]}) + .collect({"${it[0]}: ${it[1][0]} != ${it[1][1]}"}) + + ch_mismatches.view({"\nChecksum mismatches:\n${it.join('\n')}"}) + raise_errors(ch_mismatches) +} diff --git a/tests/assets/input_data/sample_sheets/batch_1.txt b/tests/assets/input_data/sample_sheets/batch_1.tsv similarity index 100% rename from tests/assets/input_data/sample_sheets/batch_1.txt rename to tests/assets/input_data/sample_sheets/batch_1.tsv diff --git a/tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt b/tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv similarity index 100% rename from tests/assets/input_data/sample_sheets/batch_2_md5_errors.txt rename to tests/assets/input_data/sample_sheets/batch_2_md5_errors.tsv diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 2958dbfaeb6935ec53a385a069f20c9e2b253346..59a53a4b069d685fed86a35b4edcdbc6d3e2c48d 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -20,13 +20,19 @@ function run_nextflow { test_exit_status=0 +nextflow clean -f +for d in ./outputs/* +do + rm -r $d +done + echo "Test case 1: simple trio" -run_nextflow -lib ../pipeline/lib ../pipeline/main.nf --ped_file assets/input_data/ped_files/batch_1.ped --sample_sheet assets/input_data/sample_sheets/batch_1.txt +run_nextflow ../pipeline/validation.nf --ped_file assets/input_data/ped_files/batch_1.ped --sample_sheet assets/input_data/sample_sheets/batch_1.tsv test_exit_status=$(( $test_exit_status + $? )) echo "Test case 2: MD5 errors" -run_nextflow -lib ../pipeline/lib ../pipeline/main.nf --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.txt -if ! [ $? == 1 ] +run_nextflow ../pipeline/validation.nf --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.tsv +if [ $? == 0 ] then test_exit_status=$(( $test_exit_status + 1 )) fi