Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
trio-whole-exome
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
igmmbioinformatics
trio-whole-exome
Commits
8f754358
Commit
8f754358
authored
2 years ago
by
mwham
Browse files
Options
Downloads
Patches
Plain Diff
Moving BCBio outputs into the right places
parent
09a5543a
No related branches found
Branches containing commit
No related tags found
1 merge request
!4
NextFlow variant calling
Pipeline
#13081
failed
2 years ago
Stage: test
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
pipeline/inputs.nf
+23
-9
23 additions, 9 deletions
pipeline/inputs.nf
pipeline/main.nf
+142
-31
142 additions, 31 deletions
pipeline/main.nf
pipeline/validation.nf
+7
-12
7 additions, 12 deletions
pipeline/validation.nf
with
172 additions
and
52 deletions
pipeline/inputs.nf
+
23
−
9
View file @
8f754358
...
...
@@ -64,8 +64,25 @@ workflow read_inputs {
.groupTuple()
/*
ch_individuals_by_family - merge of samplesheet and ped file information,
grouped by family ID:
ch_individuals - merge of samplesheet and ped file information:
[
[
indv_id,
family_id,
father_id,
mother_id,
sex,
affected,
[r1_fastqs],
[r2_fastqs]
],
...
]
*/
ch_individuals = ch_ped_file_info.join(ch_samplesheet_info)
/*
ch_individuals_by_family - as ch_individuals, but grouped by family:
[
[
indv1,
...
...
@@ -76,19 +93,16 @@ workflow read_inputs {
mother_id,
sex,
affected,
r1_fastqs,
r2_fastqs
[
r1_fastqs
]
,
[
r2_fastqs
]
]
],
...
]
*/
ch_individuals_by_family = ch_ped_file_info
.join(ch_samplesheet_info)
.map({[it[1], it]})
ch_individuals_by_family = ch_individuals.map({[it[1], it]})
emit:
ch_samplesheet_info
ch_ped_file_info
ch_individuals
ch_individuals_by_family
}
This diff is collapsed.
Click to expand it.
pipeline/main.nf
+
142
−
31
View file @
8f754358
nextflow.enable.dsl = 2
include {read_inputs} from './inputs.nf'
include {validation} from './validation.nf'
include {read_inputs} from './
pipeline/
inputs.nf'
include {validation} from './
pipeline/
validation.nf'
params.bcbio = null
params.bcbio_template = null
params.output_dir = null
params.pipeline_project_id = null
params.pipeline_project_version = null
params.target_bed = null
process merge_fastqs {
label 'medium'
publishDir "${params.output_dir}/individuals/$indv_id/merged_fastqs", mode: 'copy'
input:
tuple(val(indv_id), path(r1), path(r2))
tuple(val(indv_id),
val(family_id), val(father), val(mother), val(sex), val(affected),
path(r1), path(r2))
output:
tuple(
...
...
@@ -34,13 +34,11 @@ process merge_fastqs {
process write_bcbio_csv {
label 'small'
publishDir "${params.output_dir}/families/$family_id", mode: 'copy'
input:
tuple(val(family_id), val(individual_info))
output:
path("${family_id}.csv")
tuple(val(family_id),
path("${family_id}.csv")
)
script:
"""
...
...
@@ -60,26 +58,135 @@ process write_bcbio_csv {
process bcbio {
label 'large'
publishDir "${params.output_dir}/families/$family_id", mode: 'copy'
input:
tuple(val(family_id), val(individuals))
path(family_csv)
tuple(val(family_id), val(individuals), path(family_csv))
output:
tuple(val(family_id), val(individuals), path("${family_id}-merged"))
script:
"""
${params.bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . --csv $family_csv &&
${params.bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${params.bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} &&
cd ${family_id}-merged &&
export PATH=$PATH:${params.bcbio}/tools/bin &&
${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
"""
}
process bcbio_family_outputs {
label 'small'
input:
tuple(val(family_id), val(individuals), path(bcbio_output_dir))
script:
"""
merged_bcbio_family_output="\$(ls -d $bcbio_output_dir/results/????-??-??_* | head -n 1)" &&
bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" &&
broken_family_id="\$(echo ${family_id} | sed 's/_//g')" &&
if [ \$merged_bcbio_family_output != \$bcbio_family_output ] && [ -d \$merged_bcbio_family_output ]
then
mv \$merged_bcbio_family_output \$bcbio_family_output
fi
for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi}
do
src=\$bcbio_family_output/\${broken_family_id}-\${suffix}
if [ -f \$src ]
then
mv \$src \$bcbio_family_output/${family_id}-\${suffix}
fi
done &&
for sample in ${individuals.join(' ')}
do
if [ -d "$bcbio_output_dir/results/\$sample" ]
then
mv "$bcbio_output_dir/results/\$sample" "\$bcbio_family_output"
fi
done &&
cd \$bcbio_family_output &&
samtools=${params.bcbio}/anaconda/bin/samtools &&
for bam in */*.bam
do
cram=\${bam%.bam}.cram &&
\$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam &&
rm \$bam &&
\$samtools index \$cram &&
\$samtools flagstat \$bam > \$bam.flagstat.txt &&
\$samtools flagstat \$cram > \$cram.flagstat.txt &&
diff \$bam.flagstat.txt \$cram.flagstat.txt
if [ \$? -ne 0 ]
then
echo "Unexpected flagstat results in \$cram.flagstat.txt"
exit 1
fi
done &&
for f in \$(find . -type f | grep -v '\\.bam')
do
md5sum \$f >> md5sum.txt
done
"""
}
process collate_pipeline_outputs {
label 'small'
publishDir "${params.output_dir}/${params.pipeline_project_id}/families/$family_id", mode: 'copy', pattern: "${bcbio_output_dir}"
input:
val(family_ids)
val(bcbio_family_output_dirs)
script:
"""
mkdir -p outputs/{config,families,params,prioritization,qc} &&
for d in ${bcbio_family_output_dirs.join(' ')}
do
ln -s \$d outputs/families/\$(basename \$d)
done &&
cd outputs/families &&
multiqc \
--title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \
--outdir ../qc \
--filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \
. &&
peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt
# trio_whole_exome_parse_peddy_ped_csv.pl \
# --output \$peddy_output \
# --project ${params.pipeline_project_id} \
# --batch ${family_ids[0].split('_')[0]} \
# --version ${params.pipeline_project_version} \
# --ped ${params.ped_file} \
# --families . &&
# no && here - exit status checked below
# grep -v 'False\$' \$peddy_output
# if [ \$? -ne 0 ]
# then
# echo "Found Peddy mismatches in \$peddy_output"
# exit 1
# fi
"""
}
workflow prepare_bcbio_config {
workflow run_bcbio {
/*
- For each individual, merge all R1 and R2 fastqs
- For each family:
...
...
@@ -89,21 +196,19 @@ workflow prepare_bcbio_config {
*/
take:
ch_samplesheet_info
ch_individuals_by_family
ch_individuals
main:
ch_merged_fastqs = merge_fastqs(ch_samplesheet_info)
ch_joined_family_info = ch_individuals_by_family.map({ k, v -> v })
.join(ch_merged_fastqs)
ch_merged_fastqs = merge_fastqs(ch_individuals)
ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs)
ch_joined_
family
_info.map(
ch_joined_
indv
_info.map(
{ sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
[family_id, sample_id, father, mother, sex, phenotype, merged_r1]
})
.tap {ch_read1_meta} // I hate this syntax so much
ch_joined_
family
_info.map(
ch_joined_
indv
_info.map(
{ sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
[family_id, sample_id, father, mother, sex, phenotype, merged_r2]
})
...
...
@@ -114,22 +219,28 @@ workflow prepare_bcbio_config {
[family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]]
}
).groupTuple()
ch_bcbio_csv = write_bcbio_csv(ch_metadata)
ch_bcbio_csv
s
= write_bcbio_csv(ch_metadata)
ch_
individual
s = ch_joined_
family
_info.map(
ch_
bcbio_input
s = ch_joined_
indv
_info.map(
{ sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
[family_id, sample_id]
}).groupTuple()
bcbio(ch_individuals, ch_bcbio_csv)
}).groupTuple().join(ch_bcbio_csvs)
ch_bcbio_outputs = bcbio(ch_bcbio_inputs)
bcbio_family_outputs(ch_bcbio_outputs)
collate_pipeline_outputs(
ch_bcbio_outputs.map({it[0]}).collect(),
ch_bcbio_outputs.map({it[2]}).collect()
)
}
workflow {
read_inputs()
ch_samplesheet_info = read_inputs.out[0]
ch_ped_file_info = read_inputs.out[1]
ch_individuals_by_family = read_inputs.out[2]
ch_individual_info = read_inputs.out[0]
validation(ch_
samplesheet
_info)
prepare_bcbio_config(ch_samplesheet_info, ch_individuals_by_family
)
validation(ch_
individual
_info)
run_bcbio(ch_individual_info
)
}
This diff is collapsed.
Click to expand it.
pipeline/validation.nf
+
7
−
12
View file @
8f754358
nextflow.enable.dsl = 2
include {read_inputs} from './inputs.nf'
process check_md5s {
label 'small'
...
...
@@ -51,20 +49,17 @@ workflow validation {
*/
take:
ch_
samplesheet
_info
ch_
indv
_info
main:
ch_fastqs = ch_samplesheet_info
.map(
{ indv, r1, r2 ->
ch_fastqs = ch_indv_info.map(
{ indv, family, father, mother, sex, affected, r1, r2 ->
[r1, r2]
}
).flatten()
)
.flatten()
.map({file(it)})
ch_md5_files = ch_fastqs.map(
{ fastq -> fastq.getParent().getParent() + '/md5sums.txt' }
).unique()
check_md5s(ch_fastqs)
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment