From 02ad391c4f11eed423d11b61f2c2554d40529509 Mon Sep 17 00:00:00 2001
From: user name <ameynert@sdf-cs1.eidf.epcc.ed.ac.uk>
Date: Wed, 10 Aug 2022 17:06:41 +0100
Subject: [PATCH] Refactored to use tuples for passing linked information
 through channels, parameterized some hard-coded values

---
 pipeline/cnv_calling.nf | 159 ++++++++++++++--------------------------
 1 file changed, 55 insertions(+), 104 deletions(-)

diff --git a/pipeline/cnv_calling.nf b/pipeline/cnv_calling.nf
index d42c61c..afe15fc 100644
--- a/pipeline/cnv_calling.nf
+++ b/pipeline/cnv_calling.nf
@@ -91,7 +91,7 @@ process check_ped_file {
    cat $ped_file
    """
 }
-  
+
 process run_exomedepth {
    label 'large'
 
@@ -101,53 +101,31 @@ process run_exomedepth {
 
    // Want to emit a channel of family IDs for next process to use so can find relevant files 
 
+   publishDir params.output, mode: 'copy'
+
    input:
    path(target_bed)
-   path(reference_genome) 
-   val(ped_file_info)
-   val(samplesheet_info)
+   path(reference_genome)
+   path(male_reference)
+   path(female_reference)
+   tuple val(ind_id), val(fam_id), val(father), val(mother), val(sex), val(affected), path(bam), path(bai)
 
    output:
-   stdout emit: result
+   tuple val(fam_id), val(ind_id), val(affected), path("exome_calls_*.csv"), emit: exome_cnv_calls_csv
 
    script:
             
    """
-   ind_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f1)
-   fam_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f2 | cut -c2-)
-   sex=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f5 | cut -c2- )
-   
-   # echo ${ped_file_info}
-   # echo ${samplesheet_info}
-   
-   bam_path=\$(echo ${samplesheet_info} | cut -d',' -f4 | cut -c3- | rev | cut -c3- | rev)
-      
-   # bam_path 
-   # take the pedfile info of a sample, if sex = 1 use male reference if sex = 2 use female reference 
-   # create an input file, adding the path to target to the reference, using cat [bam path from sample_sheet_info] and reference file 
-   # take the last element in path to target from sample sheet, use it as the target
-   # if it doesn't already exist, create a folder for the outputs of the same family, using the family name in the ped file 
-    
-   echo \${ind_id}
-   echo \${fam_id}
-   echo \${sex}
-   echo \${bam_path}
-
-   bam_file=\$(echo \${bam_path} | awk -F"/" '{print \$NF}')
-   
-   # if directory for family doesnt exist, create it
-    
-   mkdir -p "$workflow.projectDir/\${fam_id}"
-
-   # make reference file for sample, store in fam directory, depending on sex, if sex is not known use both, maybe return a warning
+   # if sex = 1 use male reference if sex = 2 use female reference 
+   # make count file for sample, store in fam directory, depending on sex, if sex is not known return a warning
    
-   if [ \${sex} == 1 ]; then
+   if [ ${sex} == 1 ]; then
        
-       Rscript $workflow.projectDir/pipeline/ExomeDepth_assets/ExomeDepth_run.R -r $workflow.projectDir/pipeline/ExomeDepth_assets/male_reference_dataframe.csv -i \${bam_path} -o $workflow.projectDir/\${fam_id} -b ${target_bed} -f ${reference_genome} -t \${bam_file}
+       Rscript $workflow.projectDir/pipeline/ExomeDepth_assets/ExomeDepth_run.R -r ${male_reference} -i ${bam} -o . -b ${target_bed} -f ${reference_genome} -t ${ind_id}-ready.bam
 
-   else [ \${sex} == 2 ]
+   else [ ${sex} == 2 ]
 
-       Rscript $workflow.projectDir/pipeline/ExomeDepth_assets/ExomeDepth_run.R -r $workflow.projectDir/pipeline/ExomeDepth_assets/female_reference_dataframe.csv -i \${bam_path} -o  $workflow.projectDir/\${fam_id} -b ${target_bed} -f ${reference_genome} -t \${bam_file}
+       Rscript $workflow.projectDir/pipeline/ExomeDepth_assets/ExomeDepth_run.R -r ${female_reference} -i ${bam} -o . -b ${target_bed} -f ${reference_genome} -t ${ind_id}-ready.bam
 
    fi
   
@@ -156,72 +134,50 @@ process run_exomedepth {
 }
 
 process convert_to_bed {
-  
+
+   publishDir params.output, mode: 'copy'
+
    input:
-   val(ped_file_info)
-   val(go) 
+   tuple val(fam_id), val(ind_id), val(affected), path(exome_cnv_calls_csv)
 
    output:
-   stdout emit: bed_result
+   tuple val(fam_id), val(ind_id), val(affected), path("*cnv_calls_all_chr.bed"), emit: exome_cnv_calls_bed
 
    script:
    """
-   fam_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f2 | cut -c2-)
-   ind_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f1)
-
-   awk -F"," '{print \$7, \$5, \$6, \$3}' $workflow.projectDir/\${fam_id}/exome_calls_\${ind_id}.recal.bam.csv | sed 's/"//g' | awk -F" " -vOFS='\t' '\$1="chr"\$1' | sed "s/deletion/<DEL>/" | sed "s/duplication/<DUP>/" | tail -n +2 > $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls.bed
-
-      # extract relevant columns from output of exome depth, (column 7 = chrom, 5 = start, 6 = end)
-      # turn into a bed file
-      # awk -F"," '{print \$7, \$5, \$6, \$3}' FAM01/exome_calls_HG003.recal.bam.csv | sed 's/"//g' | awk -F" " -vOFS='\t' '\$1="chr"\$1'
-
-      # make sure all chromosomes are present, otherwise an error is produced 
+   awk -F"," '{print \$7 "\t" \$5 "\t" \$6 "\t" \$3}' $exome_cnv_calls_csv | sed 's/"//g' | sed "s/deletion/<DEL>/" | sed "s/duplication/<DUP>/" | tail -n +2 > ${ind_id}_cnv_calls.bed
 
+   # make sure all chromosomes are present, otherwise an error is produced 
    for chrom in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY
    do 
-      grep -q \${chrom} $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls.bed || echo "\${chrom}"'	0	1	NA' >> $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls.bed
+      grep -q \${chrom} ${ind_id}_cnv_calls.bed || echo "\${chrom}"'	0	1	NA' >> ${ind_id}_cnv_calls.bed
    done  
-   sort -V $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls.bed > $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls_all_chr.bed
-   rm $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls.bed
+   sort -V ${ind_id}_cnv_calls.bed > ${ind_id}_cnv_calls_all_chr.bed
    """
-
 }
 
 process collect_proband_cnvs {
 
+   publishDir params.output, mode: 'copy'
+
    input:
-   val(ped_file_info)
-   val(go)
+   tuple val(fam_id), val(ind_id), val(affected), path(exome_cnv_calls_bed)
 
    output:
-   stdout emit: cnv_result
-   
-   script:
-   """
-  
-   fam_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f2 | cut -c2-)
-   ind_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f1)
-   affected_status=\$(echo ${ped_file_info} | cut -d',' -f6 | cut -c2- | rev | cut -c2- | rev )
- 
-   mkdir -p "$workflow.projectDir/\${fam_id}/unique_proband"
-
-   if [ \${affected_status} == 2 ]; then
+   tuple val(fam_id), env(affected_ind_id_bash), path("*intersects.txt"), path("*proband_only.bed")   
 
-      files=\$(ls $workflow.projectDir/\${fam_id}/*all_chr.bed | grep -vi \${ind_id} | tr '\n' '\t ' )
-
-      bedtools intersect -a $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls_all_chr.bed -b \${files} -loj -f 0.50 -sorted -filenames > $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_intersects.txt
-      bedtools intersect -a $workflow.projectDir/\${fam_id}/\${ind_id}_cnv_calls_all_chr.bed -b \${files} -v -f 0.50 > $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_proband_only.bed
-       
-      # use the individual with affected status (maybe loop if multiple?) as query for bedtools
-      # get individual id, and use to get files named the same 
-
-      # bed tools intersect -a proband -b parent1 parent2 - find where they overlap
+   script:
+   // expecting only a single affected individual
+   def affected_idx = affected.findIndexOf{it == '2'}
+   def affected_ind_id = ind_id[affected_idx]
+   def affected_ind_file = exome_cnv_calls_bed[affected_idx]
 
-      # bedtools intersect -a A.bed -b B.bed -loj -sorted -filenames
-      # will report overlaps from proband with parents
-      # extract with 0 in column to find those without overlaps 
+   """
+   affected_ind_id_bash=${affected_ind_id}
+   unaffected_ind_files=\$(ls *all_chr.bed | grep -vi ${affected_ind_id} | tr '\n' '\t ' )
 
-   fi
+   bedtools intersect -a ${affected_ind_file} -b \${unaffected_ind_files} -loj -f 0.50 -sorted -filenames > ${affected_ind_id}_intersects.txt
+   bedtools intersect -a ${affected_ind_file} -b \${unaffected_ind_files} -v -f 0.50 > ${affected_ind_id}_proband_only.bed
    """
 
 }
@@ -229,41 +185,36 @@ process collect_proband_cnvs {
 process run_VEP {
    label 'large'
 
+   publishDir params.output, mode: 'copy'
+
    input:
    path(reference_genome)
-   val(ped_file_info)
-   val(go)
+   tuple val(fam_id), val(ind_id), path(intersects_txt), path(proband_only_bed)
 
+   output:
+   path("*VEP_output.vcf")
+ 
    script:
    """
-   fam_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f2 | cut -c2-)
-   ind_id=\$(echo ${ped_file_info} |cut -c2- | cut -d',' -f1)
-   affected_status=\$(echo ${ped_file_info} | cut -d',' -f6 | cut -c2- | rev | cut -c2- | rev )
-
-   # use databases to produce a VEP file, also us G2P plugin, following arguments specified following the example in
-   # https://git.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome/-/blob/master/docs/Setup_variant_prioritization.md#parameter-values-for-g2p-call 
-
-   # vep --cache --dir_cache /exports/eddie/scratch/s1734289/.vep/ --fork 6 -i $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_proband_only.bed -o $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_VEP.tsv --cache_version 107 --filter_common --symbol --canonical --pick
-   # only do this for affected individuals
-
-   if [ \${affected_status} == 2 ]; then
-     
-      vep --cache --fork 6 --dir_cache /exports/igmm/eddie/IGMM-VariantAnalysis/software/bcbio-1.0.9/genomes/Hsapiens/hg38/vep/ -i $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_proband_only.bed -o $workflow.projectDir/\${fam_id}/unique_proband/\${ind_id}_VEP_output.vcf --cache_version 92 --plugin G2P,file='/exports/igmm/eddie/IGMM-VariantAnalysis/mike/work_folder/DDG2P.20220721.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys='gnomADe_GRCh38|gnomADg_r3.0_GRCh38' --assembly GRCh38 --individual all --transcript_filter "gene_symbol in /exports/igmm/eddie/IGMM-VariantAnalysis/resources/G2P/genes_in_DDG2P.30082018.txt" --dir_plugins /exports/igmm/eddie/IGMM-VariantAnalysis/software/bcbio-1.0.9/anaconda/share/ensembl-vep-92.0-1 --merged --fasta ${reference_genome} --use_given_ref --vcf
-
-   fi
+   vep --cache --fork 6 --dir_cache ${params.vep_cache} \
+       -i ${proband_only_bed} -o ${ind_id}_VEP_output.vcf \
+       --plugin G2P,file='${params.g2p_panel}',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys='gnomADe_GRCh38|gnomADg_r3.0_GRCh38' \
+       --assembly GRCh38 --individual all --transcript_filter "gene_symbol in ${params.g2p_genes}" \
+       --dir_plugins ${params.vep_plugins} --merged --fasta ${reference_genome} --use_given_ref --vcf
    """
 
 }
 
+
 // future process for gnomAD
 
 workflow check_inputs {
 
     read_inputs()
-    check_sample_sheet(read_inputs.out.ch_samplesheet)
-    check_ped_file(read_inputs.out.ch_ped_file)
-    run_exomedepth(params.target_bed, params.reference_genome, read_inputs.out.ch_ped_file_info, read_inputs.out.ch_samplesheet_info)
-    convert_to_bed(read_inputs.out.ch_ped_file_info, run_exomedepth.out.result.collect())
-    collect_proband_cnvs(read_inputs.out.ch_ped_file_info, convert_to_bed.out.bed_result.collect())
-    run_VEP(params.reference_genome, read_inputs.out.ch_ped_file_info, collect_proband_cnvs.out.cnv_result.collect())
+//    check_sample_sheet(read_inputs.out.ch_samplesheet)
+//    check_ped_file(read_inputs.out.ch_ped_file)
+    run_exomedepth(params.target_bed, params.reference_genome, params.male_reference, params.female_reference, read_inputs.out.ch_individuals)
+    convert_to_bed(run_exomedepth.out.exome_cnv_calls_csv)
+    collect_proband_cnvs(convert_to_bed.out.exome_cnv_calls_bed.groupTuple())
+    run_VEP(params.reference_genome, collect_proband_cnvs.out)
 }
-- 
GitLab