Commit 3dc7bc2f authored by mwham

Making file checking its own script

parent d8d87bf9
Merge request: !1 NextFlow
Pipeline #7742 passed
# Trio-Whole-Exome pipeline
This is an automated version of the scripts currently run manually according to SOP as part of the whole exome trios
project with David Fitzpatrick's group. This pipeline is controlled by [NextFlow](https://www.nextflow.io/).
## Setup
A [Conda](https://docs.conda.io) environment containing NextFlow is available in `environment.yaml`. Once you have Conda
installed, you can create an environment by `cd`-ing into this project and running:

    $ conda env create -f environment.yaml -n <environment_name>
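
Then activate it before running the pipeline:

    $ conda activate <environment_name>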
## Running
The pipeline requires two main input files:
### Samplesheet
This is a tab-separated file mapping fastq pairs to metadata. The columns are individual ID, family ID, fastq sample ID,
r1 fastq and r2 fastq. If a sample has been sequenced over multiple lanes, then include a line for each fastq pair:
    individual_id family_id sample_id read_1 read_2
    000001 000001 12345_000001_000001_WESTwist_IDT-B path/to/lane_1_r1.fastq.gz path/to/lane_1_r2.fastq.gz
    000001 000001 12345_000001_000001_WESTwist_IDT-B path/to/lane_2_r1.fastq.gz path/to/lane_2_r2.fastq.gz
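
Multi-lane fastq pairs are grouped per individual when the sample sheet is parsed. Conceptually (a minimal sketch
mirroring `inputs.nf` below; the file name is illustrative):

    // collect all fastq pairs belonging to the same individual
    Channel.fromPath('batch_samplesheet.tsv')
        .splitCsv(sep: '\t', header: true)
        .map({ line -> [line.individual_id, line.read_1, line.read_2] })
        .groupTuple()  // -> [individual_id, [r1 fastqs], [r2 fastqs]]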
### Ped file
Tab-separated Ped file mapping individuals to each other and recording their affected status. Per the
[specification](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format), the columns are
family ID, individual ID, father ID, mother ID, sex (1=male, 2=female, other=unknown), affected status (-9 or 0=missing,
1=unaffected, 2=affected):
    000001 000001 000002 000003 2 2
    000001 000002 0 0 1 1
    000001 000003 0 0 2 1

In this example, individual 000001 is an affected female whose father is 000002 and mother is 000003, both unaffected.
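
The pipeline reads this file positionally, one tuple per line (a sketch of the parse used in `inputs.nf` below):

    // -> [indv ID, family ID, father, mother, sex, affected]
    Channel.fromPath('batch.ped')
        .splitCsv(sep: '\t')
        .map({ line -> [line[1], line[0], line[2], line[3], line[4], line[5]] })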
The pipeline can now be run. First, check the inputs for errors; this validation step verifies each fastq file against
its expected md5 checksum:

    $ nextflow run pipeline/validation.nf --ped_file path/to/batch.ped --sample_sheet path/to/batch_samplesheet.tsv
Todo: run the main processing
## Tests
This pipeline has automated tests contained in the folder `tests/`. To run the tests locally, `cd` to this folder
with your Conda environment active and run `./run_tests.sh`.
## Terminology
- Batch: slightly ambiguous term - could be a pipeline batch, a sequencing batch or a BCBio batch
- Pipeline batch: a single run of this pipeline, potentially mixing samples and families from multiple
  sequencing batches
- Sequencing batch: a group of samples that were prepared and sequenced together
- BCBio batch: used internally by BCBio to identify a family
- Sample ID: specific to a sequencing batch, family ID, individual ID and extraction kit type
......
nextflow.enable.dsl = 2

params.ped_file = null
params.sample_sheet = null
workflow read_inputs {
    /*
    Read the input files into channels. This has to be a workflow so the
    channels can be included in other scripts.
    */

    main:
    /*
    ch_ped_file_info - tuple channel of ped file information per individual:
    [
        [
            indv1,    // individual ID
            family1,  // family ID
            indv2,    // father
            indv3,    // mother
            2,        // sex
            2         // affected status
        ],
        ...
    ]
    */
    ped_file = file(params.ped_file, checkIfExists: true)
    ch_ped_file_info = Channel.fromPath(ped_file)
        .splitCsv(sep: '\t')
        .map(
            { line ->
                [
                    line[1],  // indv ID
                    line[0],  // family ID
                    line[2],  // father
                    line[3],  // mother
                    line[4],  // sex
                    line[5]   // affected
                ]
            }
        )
    /*
    ch_samplesheet_info - tuple channel of individual ID, r1 fastqs and r2 fastqs:
    [
        [
            indv1,
            [lane_1_r1.fastq.gz, lane_2_r1.fastq.gz],
            [lane_1_r2.fastq.gz, lane_2_r2.fastq.gz]
        ],
        [
            indv2,
            ...
        ]
    ]
    */
    samplesheet = file(params.sample_sheet, checkIfExists: true)
    ch_samplesheet_info = Channel.fromPath(samplesheet)
        .splitCsv(sep: '\t', header: true)
        .map(
            { line -> [line.individual_id, line.read_1, line.read_2] }
        )
        .groupTuple()
    /*
    ch_individuals_by_family - merge of samplesheet and ped file information,
    grouped by family ID:
    [
        [
            family1,
            [
                indv_id,
                family_id,
                father_id,
                mother_id,
                sex,
                affected,
                r1_fastqs,
                r2_fastqs
            ]
        ],
        ...
    ]
    */
    ch_individuals_by_family = ch_ped_file_info
        .join(ch_samplesheet_info)
        .map({ [it[1], it] })
        .groupTuple()

    emit:
    ch_samplesheet_info
    ch_ped_file_info
    ch_individuals_by_family
}
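
Because `read_inputs` has named emits, a downstream script can consume the channels by index or by name. A minimal
consumer sketch (standard NextFlow DSL2, not part of this commit):

    include {read_inputs} from './inputs.nf'

    workflow {
        read_inputs()
        // positional access, as validation.nf uses below...
        ch_samplesheet_info = read_inputs.out[0]
        // ...or equivalent named access, which is robust to reordering
        ch_ped_file_info = read_inputs.out.ch_ped_file_info
        ch_samplesheet_info.view()
    }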
nextflow.enable.dsl = 2
include {read_inputs} from './inputs.nf'
process observed_md5 {
    // Run md5sum on a file to get its observed checksum
    input:
    path(downloaded_file)
@@ -20,7 +21,7 @@ process expected_md5 {
    /*
    Grep out downloaded_file's expected checksum from md5sum_file. Assumes the
    md5sum_file to be in the format '<checksum> path/to/downloaded.file',
    separated by a space.
    */
    input:
......@@ -36,81 +37,47 @@ process expected_md5 {
"""
}
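
The script body of `expected_md5` is collapsed in this diff. Given the stated md5sum_file format, the lookup could be
as simple as the following (a hypothetical sketch, not necessarily the actual script):

    # print only the checksum column of the row matching this file
    grep "$downloaded_file" "$md5sum_file" | cut -d ' ' -f 1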
process raise_errors {
    /*
    Raise any identified checksum mismatches. This has to be done inside a
    process because race conditions occur when raising errors directly in a
    workflow.
    */
    input:
    val(errors)

    exec:
    exit 1, "MD5 mismatches found"
}
workflow {
    read_inputs()
    ch_samplesheet_info = read_inputs.out[0]

    ch_fastqs = ch_samplesheet_info
        .map(
            { indv, r1, r2 ->
                [r1, r2]
            }
        ).flatten()
        .map({file(it)})

    // each md5sums.txt is expected two directories above its fastqs
    ch_md5_files = ch_fastqs.map(
        { fastq -> fastq.getParent().getParent() + '/md5sums.txt' }
    )

    ch_obs = observed_md5(ch_fastqs)
    ch_exp = expected_md5(ch_fastqs, ch_md5_files)

    // pair up observed and expected checksums per fastq and keep any mismatches
    ch_mismatches = ch_obs.concat(ch_exp)
        .map({fastq, md5 -> [fastq, md5.strip()]})
        .groupTuple()
        .filter({it[1][0] != it[1][1]})
        .collect({"${it[0]}: ${it[1][0]} != ${it[1][1]}"})

    ch_mismatches.view({"\nChecksum mismatches:\n${it.join('\n')}"})
    raise_errors(ch_mismatches)
}

/*
Todo: the rest of the pipeline.

merge_fastqs
->
{
    indv1: {
        r1: merged_r1.fastq.gz,
        r2: merged_r2.fastq.gz
    },
    indv2: {
        ...
    }
}
per family:
prepare_bcbio_config
->
"samplename,description,batch,sex,phenotype,variant_regions"
"$MERGED_FASTQ,$INDIVIDUAL_ID,$FAMILY_ID,$SEX,$PHENOTYPE,$TWIST_BED_FILE"
->
family_{family_id}.csv
bcbio_nextgen.py -w template $BCBIO_TEMPLATE $family{family_id}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
*/
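
To make the mismatch detection above concrete, here is how a single corrupted fastq flows through the channel
operators (illustrative checksums):

    // observed_md5 and expected_md5 each emit [fastq, checksum]:
    //   [lane_1_r1.fastq.gz, 'abc123']   (observed)
    //   [lane_1_r1.fastq.gz, 'abc124']   (expected)
    // concat + groupTuple -> [lane_1_r1.fastq.gz, ['abc123', 'abc124']]
    // filter              -> kept, since 'abc123' != 'abc124'
    // collect             -> ["lane_1_r1.fastq.gz: abc123 != abc124"]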
@@ -20,13 +20,19 @@ function run_nextflow {
test_exit_status=0
nextflow clean -f
for d in ./outputs/*
do
    rm -r "$d"
done
echo "Test case 1: simple trio"
run_nextflow ../pipeline/validation.nf --ped_file assets/input_data/ped_files/batch_1.ped --sample_sheet assets/input_data/sample_sheets/batch_1.tsv
test_exit_status=$(( $test_exit_status + $? ))
echo "Test case 2: MD5 errors"
run_nextflow ../pipeline/validation.nf --ped_file assets/input_data/ped_files/batch_2_md5_errors.ped --sample_sheet assets/input_data/sample_sheets/batch_2_md5_errors.tsv
if [ $? == 0 ]
then
test_exit_status=$(( $test_exit_status + 1 ))
fi
......