Skip to content
Snippets Groups Projects
Commit 9271ba18 authored by mwham's avatar mwham
Browse files

Adding parse_peddy_output check, simplifying process inputs/outputs....

Adding parse_peddy_output check, simplifying process inputs/outputs. Outputting BCBio outputs properly, fixing outputs at end - cp-RL and mode: move. Pulling changes to trio_whole_exome_parse_peddy_ped_csv.pl from ultra2 branch
parent 8f754358
No related branches found
No related tags found
1 merge request!4NextFlow variant calling
Pipeline #13547 failed
......@@ -85,7 +85,7 @@ workflow read_inputs {
ch_individuals_by_family - as ch_individuals, but grouped by family:
[
[
indv1,
family_id,
[
indv_id,
family_id,
......
nextflow.enable.dsl = 2
include {read_inputs} from './pipeline/inputs.nf'
include {validation} from './pipeline/validation.nf'
include {read_inputs} from './inputs.nf'
include {validation} from './validation.nf'
params.bcbio = null
params.bcbio_template = null
......@@ -9,12 +9,14 @@ params.output_dir = null
params.pipeline_project_id = null
params.pipeline_project_version = null
params.target_bed = null
params.parse_peddy_output = null
process merge_fastqs {
label 'medium'
input:
tuple(val(indv_id), val(family_id), val(father), val(mother), val(sex), val(affected), path(r1), path(r2))
tuple(val(indv_id), path(r1), path(r2))
output:
tuple(
......@@ -55,7 +57,7 @@ process write_bcbio_csv {
}
process bcbio {
process bcbio_family_processing {
label 'large'
input:
......@@ -71,67 +73,89 @@ process bcbio {
cd ${family_id}-merged &&
${params.bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
"""
}
/*
 * Nextflow process: collect per-individual BCBio outputs into a single
 * 'individual_outputs' directory, converting each BAM to CRAM on the way.
 *
 * Input:  tuple of (family_id, list of individual IDs, BCBio output dir).
 * Output: tuple of (family_id, path to 'individual_outputs'), containing for
 *         each individual: symlinks to the callable BED and qc dir, plus a
 *         CRAM (+ index) produced from the BCBio '-ready.bam'.
 *
 * Integrity check: samtools flagstat is run on both the BAM and the CRAM and
 * the two reports are diffed; any difference fails the task (exit 1), since
 * the CRAM is expected to be a lossless re-encoding of the BAM.
 */
process format_bcbio_individual_outputs {
    // NOTE(review): no 'label' here unlike sibling processes ('large'/'small')
    // — presumably runs with default resources; confirm this is intended.
    input:
    tuple(val(family_id), val(individuals), path(bcbio_output_dir))
    output:
    tuple(val(family_id), path('individual_outputs'))
    script:
    // samtools is taken from the BCBio-bundled anaconda env (params.bcbio);
    // CRAM conversion references params.reference_genome and task.cpus.
    """
    samtools=${params.bcbio}/anaconda/bin/samtools &&
    mkdir individual_outputs
    for i in ${individuals.join(' ')}
    do
    indv_input=\$PWD/${bcbio_output_dir}/results/\$i
    indv_output=individual_outputs/\$i &&
    mkdir -p \$indv_output &&
    ln -s \$indv_input/\${i}-callable.bed \$indv_output/\${i}-callable.bed &&
    ln -s \$indv_input/qc \$indv_output/qc &&
    bam=\$indv_input/\$i-ready.bam
    cram="\$indv_output/\$i-ready.cram" &&
    \$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam &&
    \$samtools index \$cram &&
    bam_flagstat=./\$i-ready.bam.flagstat.txt &&
    cram_flagstat=\$cram.flagstat.txt &&
    \$samtools flagstat \$bam > \$bam_flagstat &&
    \$samtools flagstat \$cram > \$cram_flagstat &&
    diff \$bam_flagstat \$cram_flagstat
    if [ \$? -ne 0 ]
    then
    echo "Unexpected flagstat results in \$cram_flagstat"
    exit 1
    fi
    done
    """
}
process bcbio_family_outputs {
process format_bcbio_family_outputs {
label 'small'
input:
tuple(val(family_id), val(individuals), path(bcbio_output_dir))
tuple(val(family_id), val(individuals), path(bcbio_output_dir), path(formatted_individual_outputs))
output:
tuple(val(family_id), path("*_${family_id}"))
script:
"""
merged_bcbio_family_output="\$(ls -d $bcbio_output_dir/results/????-??-??_* | head -n 1)" &&
merged_bcbio_family_output="\$(ls -d \$(readlink -f $bcbio_output_dir)/results/????-??-??_* | head -n 1)" &&
bcbio_family_output="\$(echo \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
run_date="\$(basename \$bcbio_family_output | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2})_.+\$/\\1/g')" &&
broken_family_id="\$(echo ${family_id} | sed 's/_//g')" &&
if [ \$merged_bcbio_family_output != \$bcbio_family_output ] && [ -d \$merged_bcbio_family_output ]
then
mv \$merged_bcbio_family_output \$bcbio_family_output
fi
bcbio_family_output="\$(basename \$merged_bcbio_family_output | sed 's/-merged\\/*\$//g')" &&
mkdir \$bcbio_family_output &&
for suffix in gatk-haplotype-annotated.vcf.gz{,.tbi}
do
src=\$bcbio_family_output/\${broken_family_id}-\${suffix}
src=\$merged_bcbio_family_output/\${broken_family_id}-\${suffix}
if [ -f \$src ]
then
mv \$src \$bcbio_family_output/${family_id}-\${suffix}
ln -s \$src \$bcbio_family_output/${family_id}-\${suffix}
fi
done &&
for sample in ${individuals.join(' ')}
for f in \$merged_bcbio_family_output/*{.log,.csv,.yaml,multiqc}
do
if [ -d "$bcbio_output_dir/results/\$sample" ]
then
mv "$bcbio_output_dir/results/\$sample" "\$bcbio_family_output"
fi
ln -s \$f \$bcbio_family_output/
done &&
cd \$bcbio_family_output &&
samtools=${params.bcbio}/anaconda/bin/samtools &&
for bam in */*.bam
for i in ${individuals.join(' ')}
do
cram=\${bam%.bam}.cram &&
\$samtools view -@ ${task.cpus} -T ${params.reference_genome} -C -o \$cram \$bam &&
rm \$bam &&
\$samtools index \$cram &&
\$samtools flagstat \$bam > \$bam.flagstat.txt &&
\$samtools flagstat \$cram > \$cram.flagstat.txt &&
diff \$bam.flagstat.txt \$cram.flagstat.txt
if [ \$? -ne 0 ]
then
echo "Unexpected flagstat results in \$cram.flagstat.txt"
exit 1
fi
ln -s \$(readlink -f $formatted_individual_outputs)/\$i \$bcbio_family_output/\$i
done &&
cd \$bcbio_family_output &&
for f in \$(find . -type f | grep -v '\\.bam')
do
md5sum \$f >> md5sum.txt
......@@ -140,66 +164,77 @@ process bcbio_family_outputs {
}
process collate_pipeline_outputs {
label 'small'
publishDir "${params.output_dir}/${params.pipeline_project_id}/families/$family_id", mode: 'copy', pattern: "${bcbio_output_dir}"
publishDir "${params.output_dir}", mode: 'move', pattern: "${params.pipeline_project_id}_${params.pipeline_project_version}"
input:
val(family_ids)
val(bcbio_family_output_dirs)
output:
path("${params.pipeline_project_id}_${params.pipeline_project_version}")
script:
"""
mkdir -p outputs/{config,families,params,prioritization,qc} &&
outputs=${params.pipeline_project_id}_${params.pipeline_project_version}
mkdir \$outputs &&
mkdir \$outputs/{config,families,params,prioritization,qc} &&
for d in ${bcbio_family_output_dirs.join(' ')}
do
ln -s \$d outputs/families/\$(basename \$d)
cp -rL \$d \$outputs/families/\$(basename \$d)
done &&
cd outputs/families &&
multiqc \
cd \$outputs/families &&
${params.bcbio}/anaconda/bin/multiqc \
--title "Trio whole exome QC report: ${params.pipeline_project_id}_${params.pipeline_project_version}" \
--outdir ../qc \
--filename ${params.pipeline_project_id}_${params.pipeline_project_version}_qc_report.html \
. &&
peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt
# trio_whole_exome_parse_peddy_ped_csv.pl \
# --output \$peddy_output \
# --project ${params.pipeline_project_id} \
# --batch ${family_ids[0].split('_')[0]} \
# --version ${params.pipeline_project_version} \
# --ped ${params.ped_file} \
# --families . &&
peddy_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
perl ${params.parse_peddy_output} \
--output \$peddy_output \
--project ${params.pipeline_project_id} \
--batch ${bcbio_family_output_dirs[0].getName().split('_')[1]} \
--version ${params.pipeline_project_version} \
--ped ${params.ped_file} \
--families . &&
# no && here - exit status checked below
# grep -v 'False\$' \$peddy_output
# if [ \$? -ne 0 ]
# then
# echo "Found Peddy mismatches in \$peddy_output"
# exit 1
# fi
grep -v 'False\$' \$peddy_output
if [ \$? -ne 0 ]
then
echo "Found Peddy mismatches in \$peddy_output"
exit 1
fi
"""
}
workflow run_bcbio {
workflow process_families {
/*
- For each individual, merge all R1 and R2 fastqs
- For each family:
- For each individual in the family, merge all R1 and R2 fastqs
- For the family:
- Read the relevant information from the samplesheet, ped files and merged fastqs
- Write a BCBio config file
- Run BCBio on it
- Move files around into the right places
- Run CRAM compression, MultiQC and MD5 checks
*/
take:
ch_individuals
main:
ch_merged_fastqs = merge_fastqs(ch_individuals)
ch_merged_fastqs = merge_fastqs(
ch_individuals.map(
{ indv, family, father, mother, sex, affected, r1, r2 ->
[ indv, r1, r2 ]
})
)
ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs)
ch_joined_indv_info.map(
......@@ -214,33 +249,35 @@ workflow run_bcbio {
})
.tap {ch_read2_meta}
ch_metadata = ch_read1_meta.mix(ch_read2_meta).map(
{ family_id, sample_id, father, mother, sex, phenotype, merged_fastq ->
[family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]]
}
).groupTuple()
ch_bcbio_csvs = write_bcbio_csv(ch_metadata)
ch_bcbio_csvs = write_bcbio_csv(
ch_read1_meta.mix(ch_read2_meta).map(
{ family_id, sample_id, father, mother, sex, phenotype, merged_fastq ->
[family_id, [merged_fastq, sample_id, family_id, sex, phenotype, params.target_bed]]
}
).groupTuple()
)
ch_bcbio_inputs = ch_joined_indv_info.map(
{ sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
[family_id, sample_id]
}).groupTuple().join(ch_bcbio_csvs)
ch_bcbio_outputs = bcbio(ch_bcbio_inputs)
bcbio_family_outputs(ch_bcbio_outputs)
collate_pipeline_outputs(
ch_bcbio_outputs.map({it[0]}).collect(),
ch_bcbio_outputs.map({it[2]}).collect()
ch_bcbio_family_outputs = bcbio_family_processing(ch_bcbio_inputs)
ch_individual_folders = format_bcbio_individual_outputs(ch_bcbio_family_outputs)
ch_formatted_bcbio_outputs = format_bcbio_family_outputs(
ch_bcbio_family_outputs.join(ch_individual_folders)
)
collate_pipeline_outputs(ch_formatted_bcbio_outputs.map({it[1]}).collect())
}
workflow {
read_inputs()
ch_individual_info = read_inputs.out[0]
ch_individuals = read_inputs.out[0]
ch_individuals_by_family = read_inputs.out[1]
validation(ch_individual_info)
run_bcbio(ch_individual_info)
validation(ch_individuals)
process_families(ch_individuals)
}
......@@ -21,30 +21,33 @@ use IO::File;
my $usage = qq{USAGE:
$0 [--help]
--output Output directory
--ped Pedigree file for project
--project Project id
--batch Batch id
--version Analysis run version (v1, v2, etc)
--output Output file
--families Family directory
--ped Pedigree file for project
--project Project id
--batch Batch id
--version Analysis run version (v1, v2, etc)
};
my $help = 0;
my $ped_file;
my $fam_dir;
my $project_id;
my $version;
my $out_dir;
my $out_file;
my $batch_id;
GetOptions(
'help' => \$help,
'project=s' => \$project_id,
'ped=s' => \$ped_file,
'output=s' => \$out_dir,
'version=s' => \$version,
'batch=s' => \$batch_id
'help' => \$help,
'project=s' => \$project_id,
'ped=s' => \$ped_file,
'output=s' => \$out_file,
'families=s' => \$fam_dir,
'version=s' => \$version,
'batch=s' => \$batch_id
) or die $usage;
if ($help || !$project_id || !$ped_file || !$out_dir || !$batch_id || !$version)
if ($help || !$project_id || !$ped_file || !$out_file || !$batch_id || !$version || !$fam_dir)
{
print $usage;
exit(0);
......@@ -70,7 +73,6 @@ while (my $line = <$in_fh>)
$in_fh->close();
my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id);
my $out_fh = new IO::File;
$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
......@@ -78,8 +80,8 @@ printf $out_fh "project_id\tbatch_id\tsample_a\tsample_b\tpedigree_parents\tpred
foreach my $family_id (sort keys %ped)
{
my @peddy_glob = glob(sprintf("$out_dir/*/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv",
$version, $project_id, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
my @peddy_glob = glob(sprintf("$fam_dir/*_%s_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv",
$project_id, $version, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id));
next if (scalar(@peddy_glob) == 0);
my $peddy_fh = new IO::File;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment