Skip to content
Snippets Groups Projects
Commit 9271ba18 authored by mwham's avatar mwham
Browse files

Adding parse_peddy_output check, simplifying process inputs/outputs....

Adding parse_peddy_output check, simplifying process inputs/outputs. Outputting BCBio outputs properly, fixing outputs at end - cp-RL and mode: move. Pulling changes to trio_whole_exome_parse_peddy_ped_csv.pl from ultra2 branch
parent 8f754358
No related branches found
No related tags found
1 merge request!4NextFlow variant calling
Pipeline #13547 failed
......@@ -85,7 +85,7 @@ workflow read_inputs {
ch_individuals_by_family - as ch_individuals, but grouped by family:
[
[
indv1,
family_id,
[
indv_id,
family_id,
......
nextflow.enable.dsl = 2
include {read_inputs} from './pipeline/inputs.nf'
include {validation} from './pipeline/validation.nf'
include {read_inputs} from './inputs.nf'
include {validation} from './validation.nf'
params.bcbio = null
params.bcbio_template = null
......@@ -9,12 +9,14 @@ params.output_dir = null
params.pipeline_project_id = null
params.pipeline_project_version = null
params.target_bed = null
params.parse_peddy_output = null
process merge_fastqs {
label 'medium'
input:
tuple(val(indv_id), val(family_id), val(father), val(mother), val(sex), val(affected), path(r1), path(r2))
tuple(val(indv_id), path(r1), path(r2))
output:
tuple(
......@@ -55,7 +57,7 @@ process write_bcbio_csv {
}
process bcbio {
process bcbio_family_processing {
label 'large'
input:
......@@ -71,67 +73,89 @@ process bcbio {
cd ${family_id}-merged &&
${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
"""
}
/*
 * Nextflow process: collect per-individual BCBio outputs into a single
 * 'individual_outputs' directory, converting each BAM to CRAM on the way.
 *
 * Input:  tuple of (family_id, list of individual IDs, BCBio output dir).
 * Output: tuple of (family_id, path to 'individual_outputs'), containing for
 *         each individual: symlinks to the callable BED and qc dir, plus a
 *         CRAM (+ index) produced from the BCBio '-ready.bam'.
 *
 * Integrity check: samtools flagstat is run on both the BAM and the CRAM and
 * the two reports are diffed; any difference fails the task (exit 1), since
 * the CRAM is expected to be a lossless re-encoding of the BAM.
 */
process format_bcbio_individual_outputs {
    // NOTE(review): no 'label' here unlike sibling processes ('large'/'small')
    // — presumably runs with default resources; confirm this is intended.
    input:
    tuple(val(family_id), val(individuals), path(bcbio_output_dir))
    output:
    tuple(val(family_id), path('individual_outputs'))
    script:
    // samtools is taken from the BCBio-bundled anaconda env (params.bcbio);
    // CRAM conversion references params.reference_genome and task.cpus.
    """
    samtools=${params.bcbio}/anaconda/bin/samtools &&
    mkdir individual_outputs
    for i in ${individuals.join(' ')}
    do
    indv_input=\$PWD/${bcbio_output_dir}/results/\$i
    indv_output=individual_outputs/\$i &&
    mkdir -p \$indv_output &&
    ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
    ln -s \$indv_input/qc \$indv_output/qc &&
    bam=\$indv_input/\$i-ready.bam
    cram="\$indv_output/\$i-ready.cram" &&
    \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam &&
    \$samtools index \$cram &&
    bam_flagstat=./\$i-ready.bam.flagstat.txt &&
    cram_flagstat=\$cram.flagstat.txt &&
    \$samtools flagstat \$bam > \$bam_flagstat &&
    \$samtools flagstat \$cram > \$cram_flagstat &&
    diff \$bam_flagstat \$cram_flagstat
    if [ \$? -ne 0 ]
    then
    echo "Unexpected flagstat results in \$cram_flagstat"
    exit 1
    fi
    done
    """
}
process bcbio_family_outputs {
process format_bcbio_family_outputs {
label 'small'
input:
tuple(val(family_id), val(individuals), path(bcbio_output_dir))
tuple(val(family_id), val(individuals), path(bcbio_output_dir), path(formatted_individual_outputs))
output:
tuple(val(family_id), path("*_${family_id}"))
script:
"""
merged_bcbio_family_output="\$(ls -d $bcbio_output_dir/results/????-??-??_* | head -n 1)" &&
merged_bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" &&
bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" &&
broken_family_id="\$(echo ${family_id} | sed 's/_//g')" &&
if [ \$merged_bcbio_family_output != \$bcbio_family_output ] && [ -d \$merged_bcbio_family_output ]
then
mv \$merged_bcbio_family_output \$bcbio_family_output
fi
bcbio_family_output="\$(basename \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
mkdir \$bcbio_family_output &&
for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi}
do
src=\$bcbio_family_output/\${broken_family_id}-\${suffix}
src=\$merged_bcbio_family_output/\${broken_family_id}-\${suffix}
if [ -f \$src ]
then
mv \$src \$bcbio_family_output/${family_id}-\${suffix}
ln -s \$src \$bcbio_family_output/${family_id}-\${suffix}
fi
done &&
for sample in ${individuals.join(' ')}
for f in \$merged_bcbio_family_output/*{.log,.csv,.yaml,multiqc}
do
if [ -d "$bcbio_output_dir/results/\$sample" ]
then
mv "$bcbio_output_dir/results/\$sample" "\$bcbio_family_output"
fi
ln -s \$f \$bcbio_family_output/
done &&
cd \$bcbio_family_output &&
samtools=${params.bcbio}/anaconda/bin/samtools &&
for bam in */*.bam
for i in ${individuals.join(' ')}
do
cram=\${bam%.bam}.cram &&
\$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam &&
rm \$bam &&
\$samtools index \$cram &&
\$samtools flagstat \$bam > \$bam.flagstat.txt &&
\$samtools flagstat \$cram > \$cram.flagstat.txt &&
diff \$bam.flagstat.txt \$cram.flagstat.txt
if [ \$? -ne 0 ]
then
echo "Unexpected flagstat results in \$cram.flagstat.txt"
exit 1
fi
ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$bcbio_family_output/\$i
done &&
cd \$bcbio_family_output &&
for f in \$(find . -type f | grep -v '\\.bam')
do
md5sum \$f >> md5sum.txt
......@@ -140,66 +164,77 @@ process bcbio_family_outputs {
}
process collate_pipeline_outputs {
label 'small'
publishDir "${params.output_dir}/${params.pipeline_project_id}/families/$family_id", mode: 'copy', pattern: "${bcbio_output_dir}"
publishDir "${params.output_dir}", mode: 'move', pattern: "${params.pipeline_project_id}_${params.pipeline_project_version}"
input:
val(family_ids)
val(bcbio_family_output_dirs)
output:
path("${params.pipeline_project_id}_${params.pipeline_project_version}")
script:
"""
mkdir -p outputs/{config,families,params,prioritization,qc} &&
outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
mkdir \$outputs &&
mkdir \$outputs/{config,families,params,prioritization,qc} &&
for d in ${bcbio_family_output_dirs.join(' ')}
do
ln -s \$d outputs/families/\$(basename \$d)
cp -rL \$d \$outputs/families/\$(basename \$d)
done &&
cd outputs/families &&
multiqc \
cd \$outputs/families &&
${params.bcbio}/anaconda/bin/multiqc \
--title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \
--outdir ../qc \
--filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \
. &&
peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt
# trio_whole_exome_parse_peddy_ped_csv.pl \
# --output \$peddy_output \
# --project ${params.pipeline_project_id} \
# --batch ${family_ids[0].split('_')[0]} \
# --version ${params.pipeline_project_version} \
# --ped ${params.ped_file} \
# --families . &&
peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
perl ${params.parse_peddy_output} \
--output \$peddy_output \
--project ${params.pipeline_project_id} \
--batch ${bcbio_family_output_dirs[0].getName().split('_')[1]} \
--version ${params.pipeline_project_version} \
--ped ${params.ped_file} \
--families . &&
# no && here - exit status checked below
# grep -v 'False\$' \$peddy_output
# if [ \$? -ne 0 ]
# then
# echo "Found Peddy mismatches in \$peddy_output"
# exit 1
# fi
grep -v 'False\$' \$peddy_output
if [ \$? -ne 0 ]
then
echo "Found Peddy mismatches in \$peddy_output"
exit 1
fi
"""
}
workflow run_bcbio {
workflow process_families {
/*
- For each individual, merge all R1 and R2 fastqs
- For each family:
- For each individual in the family, merge all R1 and R2 fastqs
- For the family:
- Read the relevant information from the samplesheet, ped files and merged fastqs
- Write a BCBio config file
- Run BCBio on it
- Move files around into the right places
- Run CRAM compression, MultiQC and MD5 checks
*/
take:
ch_individuals
main:
ch_merged_fastqs = merge_fastqs(ch_individuals)
ch_merged_fastqs = merge_fastqs(
ch_individuals.map(
{ indv, family, father, mother, sex, affected, r1, r2 ->
[ indv, r1, r2 ]
})
)
ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs)
ch_joined_indv_info.map(
......@@ -214,33 +249,35 @@ workflow run_bcbio {
})
.tap {ch_read2_meta}
ch_metadata = ch_read1_meta.mix(ch_read2_meta).map(
{ family_id, sample_id, father, mother, sex, phenotype, merged_fastq ->
[family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]]
}
).groupTuple()
ch_bcbio_csvs = write_bcbio_csv(ch_metadata)
ch_bcbio_csvs = write_bcbio_csv(
ch_read1_meta.mix(ch_read2_meta).map(
{ family_id, sample_id, father, mother, sex, phenotype, merged_fastq ->
[family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]]
}
).groupTuple()
)
ch_bcbio_inputs = ch_joined_indv_info.map(
{ sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
[family_id, sample_id]
}).groupTuple().join(ch_bcbio_csvs)
ch_bcbio_outputs = bcbio(ch_bcbio_inputs)
bcbio_family_outputs(ch_bcbio_outputs)
collate_pipeline_outputs(
ch_bcbio_outputs.map({it[0]}).collect(),
ch_bcbio_outputs.map({it[2]}).collect()
ch_bcbio_family_outputs = bcbio_family_processing(ch_bcbio_inputs)
ch_individual_folders = format_bcbio_individual_outputs(ch_bcbio_family_outputs)
ch_formatted_bcbio_outputs = format_bcbio_family_outputs(
ch_bcbio_family_outputs.join(ch_individual_folders)
)
collate_pipeline_outputs(ch_formatted_bcbio_outputs.map({it[1]}).collect())
}
workflow {
read_inputs()
ch_individual_info = read_inputs.out[0]
ch_individuals = read_inputs.out[0]
ch_individuals_by_family = read_inputs.out[1]
validation(ch_individual_info)
run_bcbio(ch_individual_info)
validation(ch_individuals)
process_families(ch_individuals)
}
......@@ -21,30 +21,33 @@ use IO::File;
my $usage = qq{USAGE:
$0 [--help]
--output Output directory
--ped Pedigree file for project
--project Project id
--batch Batch id
--version Analysis run version (v1, v2, etc)
--output Output file
--families Family directory
--ped Pedigree file for project
--project Project id
--batch Batch id
--version Analysis run version (v1, v2, etc)
};
my $help = 0;
my $ped_file;
my $fam_dir;
my $project_id;
my $version;
my $out_dir;
my $out_file;
my $batch_id;
GetOptions(
'help' => \$help,
'project=s' => \$project_id,
'ped=s' => \$ped_file,
'output=s' => \$out_dir,
'version=s' => \$version,
'batch=s' => \$batch_id
'help' => \$help,
'project=s' => \$project_id,
'ped=s' => \$ped_file,
'output=s' => \$out_file,
'families=s' => \$fam_dir,
'version=s' => \$version,
'batch=s' => \$batch_id
) or die $usage;
if ($help || !$project_id || !$ped_file || !$out_dir || !$batch_id || !$version)
if ($help || !$project_id || !$ped_file || !$out_file || !$batch_id || !$version || !$fam_dir)
{
print $usage;
exit(0);
......@@ -70,7 +73,6 @@ while (my $line = <$in_fh>)
$in_fh->close();
my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id);
my $out_fh = new IO::File;
$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
......@@ -78,8 +80,8 @@ printf $out_fh "project_id\tbatch_id\tsample_a\tsample_b\tpedigree_parents\tpred
foreach my $family_id (sort keys %ped)
{
my @peddy_glob = glob(sprintf("$out_dir/*/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv",
$version, $project_id, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
my @peddy_glob = glob(sprintf("$fam_dir/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv",
$project_id, $version, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
next if (scalar(@peddy_glob) == 0);
my $peddy_fh = new IO::File;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment