diff --git a/bin/peddy_validation.pl b/bin/peddy_validation.pl
index 02482134378b4249a992d7808995547e7a4c7814..c78682786fb0444124ac433709446bb6231a4b25 100755
--- a/bin/peddy_validation.pl
+++ b/bin/peddy_validation.pl
@@ -25,28 +25,21 @@ $0 [--help]
   --output Output file
   --families Family directory
   --ped Pedigree file for project
-  --project Project id
-  --batch Batch id
-  --version Analysis run version (v1, v2, etc)
 };
 
 my $help = 0;
 my $ped_file;
 my $fam_dir;
-my $project_id;
-my $version;
 my $out_file;
 
 GetOptions(
     'help' => \$help,
-    'project=s' => \$project_id,
     'ped=s' => \$ped_file,
     'output=s' => \$out_file,
     'families=s' => \$fam_dir,
-    'version=s' => \$version
 ) or die $usage;
 
-if ($help || !$project_id || !$ped_file || !$out_file || !$version || !$fam_dir)
+if ($help || !$ped_file || !$out_file || !$fam_dir)
 {
     print $usage;
     exit(0);
@@ -75,7 +68,7 @@ $in_fh->close();
 
 my $out_fh = new IO::File;
 $out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
-printf $out_fh "project_id\tsample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n";
+printf $out_fh "sample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n";
 
 foreach my $family_id (sort keys %ped)
 {
@@ -113,22 +106,17 @@ foreach my $family_id (sort keys %ped)
     {
         my ($sample_a, $sample_b) = split(/\t/, $sample_pair);
 
-        $sample_a =~ /(.+)_$family_id/;
-        my $sample_a_nofam = $1;
-        $sample_b =~ /(.+)_$family_id/;
-        my $sample_b_nofam = $1;
-
-        if ($ped{$family_id}{$sample_a_nofam}{'father'} eq $sample_b_nofam ||
-            $ped{$family_id}{$sample_a_nofam}{'mother'} eq $sample_b_nofam ||
-            $ped{$family_id}{$sample_b_nofam}{'father'} eq $sample_a_nofam ||
-            $ped{$family_id}{$sample_b_nofam}{'mother'} eq $sample_a_nofam)
+        if ($ped{$family_id}{$sample_a}{'father'} eq $sample_b ||
+            $ped{$family_id}{$sample_a}{'mother'} eq $sample_b ||
+            $ped{$family_id}{$sample_b}{'father'} eq $sample_a ||
+            $ped{$family_id}{$sample_b}{'mother'} eq $sample_a)
         {
             $info{'pedigree_parents'}{$sample_pair} = 'True';
         }
 
         $info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq $info{'predicted_parents'}{$sample_pair} ? 'False' : 'True';
 
-        printf $out_fh "$project_id\t$sample_pair\t%s\t%s\t%s\n",
+        printf $out_fh "$sample_pair\t%s\t%s\t%s\n",
             $info{'pedigree_parents'}{$sample_pair},
             $info{'predicted_parents'}{$sample_pair},
             $info{'parent_error'}{$sample_pair};
diff --git a/bin/sample_id_cleanup.pl b/bin/sample_id_cleanup.pl
new file mode 100755
index 0000000000000000000000000000000000000000..8abfc2dabce11da6c618828e4d6ed863d8bbf7d6
--- /dev/null
+++ b/bin/sample_id_cleanup.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+sample_id_cleanup.pl
+
+=head1 AUTHOR
+
+Alison Meynert (alison.meynert@ed.ac.uk)
+
+=head1 DESCRIPTION
+
+Fixes batch, maternal, and paternal ids that have had underscores stripped unintentionally, e.g.:
+
+  batch: 19875457300
+  maternal_id: 134114457300
+  paternal_id: 134116457300
+
+
+=cut
+
+use strict;
+
+use Cwd;
+use Getopt::Long;
+use IO::File;
+
+my $usage = qq{USAGE:
+$0 [--help]
+  --map Id map
+  --input Input YAML
+};
+
+my $help = 0;
+my $map_file;
+my $input_file;
+
+GetOptions(
+    'help' => \$help,
+    'map=s' => \$map_file,
+    'input=s' => \$input_file
+) or die $usage;
+
+if ($help || !$map_file || !$input_file)
+{
+    print $usage;
+    exit(0);
+}
+
+# Read in the map file
+my $in_fh = new IO::File;
+$in_fh->open($map_file, "r") or die "Could not open $map_file\n$!";
+
+my $i = 0;
+`cp $input_file temp.yaml.$i`;
+while (my $line = <$in_fh>)
+{
+    chomp $line;
+    my ($old_id, $new_id) = split(/\s+/, $line);
+
+    my $j = $i;
+    $i++;
+    `sed -e 's/$old_id/$new_id/' temp.yaml.$j > temp.yaml.$i`;
+}
+
+
+$in_fh->close();
+
+`mv temp.yaml.$i $input_file`;
+`rm temp.yaml.*`;
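Not part of the patch, purely illustrative: a minimal sketch of how sample_id_cleanup.pl is expected to be driven. The map file holds whitespace-separated "<stripped_id> <correct_id>" pairs, one per line, which is the format bcbio_family_processing writes to id_map.txt below; all ids and file names here are made up.

  # hypothetical map of ids with underscores stripped -> correct underscored ids,
  # one "<stripped_id> <correct_id>" pair per line
  printf '%s\n' '19875457300 198754_57300' \
                '134114457300 134114_457300' \
                '134116457300 134116_457300' > id_map.txt

  # rewrite the mangled ids in the generated bcbio YAML in place (path is illustrative)
  bin/sample_id_cleanup.pl --map id_map.txt --input config/198754_57300.yaml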
diff --git a/pipeline/var_calling.nf b/pipeline/var_calling.nf
index 2e0210e4b90d4f55b707e379d2555ed9733d998b..f2c9846ae382ca324411150270dfc1b2aef7fda2 100644
--- a/pipeline/var_calling.nf
+++ b/pipeline/var_calling.nf
@@ -8,20 +8,20 @@ process merge_fastqs {
     label 'medium'
 
     input:
-    tuple(val(indv_id), path(r1), path(r2))
+    tuple(val(indv_id), val(family_id), path(r1), path(r2))
 
     output:
     tuple(
-        val(indv_id),
-        path("${indv_id}_merged_r1.fastq.gz"),
-        path("${indv_id}_merged_r2.fastq.gz")
+        val(family_id),
+        path("${indv_id}_R1.fastq.gz"),
+        path("${indv_id}_R2.fastq.gz")
     )
 
     script:
     // todo: pigz if gzip becomes a bottleneck
     """
-    zcat ${r1.join(' ')} | gzip -c > ${indv_id}_merged_r1.fastq.gz &
-    zcat ${r2.join(' ')} | gzip -c > ${indv_id}_merged_r2.fastq.gz
+    zcat ${r1.join(' ')} | gzip -c > ${indv_id}_R1.fastq.gz &
+    zcat ${r2.join(' ')} | gzip -c > ${indv_id}_R2.fastq.gz
     """
 }
 
@@ -45,7 +45,7 @@ process write_bcbio_csv {
     lines = individual_info.lstrip('[').rstrip(']').split('], [')
 
     with open('${family_id}.csv', 'w') as f:
-        f.write('samplename,description,batch,sex,phenotype,variant_regions\\n')
+        f.write('samplename,description,batch,sex,phenotype,paternal_id,maternal_id,variant_regions\\n')
         for l in lines:
             f.write(l.replace(', ', ',') + ',' + target_bed + '\\n')
     """
@@ -56,29 +56,45 @@ process bcbio_family_processing {
     label 'large'
 
     input:
-    tuple(val(family_id), val(individuals), path(family_csv))
+    tuple(val(family_id), val(individuals), path(family_csv), path(merged_fastq1), path(merged_fastq2))
     path(bcbio)
     path(bcbio_template)
 
     output:
-    tuple(val(family_id), val(individuals), path("${family_id}-merged"))
+    tuple(val(family_id), val(individuals), path("${family_id}"))
 
     script:
     """
-    ${bcbio}/anaconda/bin/bcbio_prepare_samples.py --out . --csv $family_csv &&
-    ${bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${bcbio_template} ${family_csv.getBaseName()}-merged.csv ${individuals.collect({"${it}.fastq.gz"}).join(' ')} &&
-
-    cd ${family_id}-merged &&
-    ../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}-merged.yaml -n 16 -t local
+    ${bcbio}/anaconda/bin/bcbio_nextgen.py -w template ${bcbio_template} $family_csv $merged_fastq1 $merged_fastq2 &&
+    cd ${family_id}
+
+    # fix numeric ids where underscores have been stripped out
+
+    # family id
+    family_id_1=`echo $family_id | cut -f 1 -d '_'`
+    family_id_2=`echo $family_id | cut -f 2 -d '_'`
+    echo \${family_id_1}\${family_id_2} ${family_id} > id_map.txt
+
+    # individual ids
+    for indv_id in `grep -v samplename ../$family_csv | cut -f 1 -d ','`
+    do
+        indv_id_1=`echo \$indv_id | cut -f 1 -d '_'`
+        indv_id_2=`echo \$indv_id | cut -f 2 -d '_'`
+        echo \${indv_id_1}\${indv_id_2} \${indv_id} >> id_map.txt
+    done
+
+    sample_id_cleanup.pl --map id_map.txt --input config/${family_id}.yaml
+
+    ../${bcbio}/anaconda/bin/bcbio_nextgen.py config/${family_id}.yaml -n $task.cpus -t local
     """
 
     stub:
     """
-    output_dir=${family_id}-merged/results
-    family_dir="${family_id}-merged/results/\$(date '+%Y-%m-%d')_${family_id}-merged"
+    output_dir=${family_id}/results
+    family_dir="${family_id}/results/\$(date '+%Y-%m-%d')_${family_id}"
     mkdir -p \$family_dir
-    mkdir ${family_id}-merged/config
-    touch ${family_id}-merged/config/${family_id}-merged{.csv,.yaml,-template.yaml}
+    mkdir ${family_id}/config
+    touch ${family_id}/config/${family_id}{.csv,.yaml,-template.yaml}
     cd \$family_dir
     touch "\$(echo ${family_id} | sed 's/_//g')-gatk-haplotype-annotated.vcf.gz{,.tbi}" bcbio-nextgen{,-commands}.log data_versions.csv
     touch project-summary.yaml metadata.csv programs.txt
@@ -264,8 +280,6 @@
         peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
         peddy_validation.pl \
             --output \$peddy_validation_output \
-            --project ${params.pipeline_project_id} \
-            --version ${params.pipeline_project_version} \
            --ped ../../${ped_file} \
            --families . &&
 
@@ -315,8 +329,6 @@
         peddy_validation_output=../qc/${params.pipeline_project_id}_${params.pipeline_project_version}.ped_check.txt &&
         peddy_validation.pl \
            --output \$peddy_validation_output \
-            --project ${params.pipeline_project_id} \
-            --version ${params.pipeline_project_version} \
            --ped ../../${ped_file} \
            --families . &&
 
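Not part of the patch, purely illustrative: with --project and --version dropped, the ped_check table written by peddy_validation.pl now starts directly at the sample pair. A spot check on a finished run could look like the sketch below; the qc path and sample ids are made up, and the column names come from the printf in the peddy_validation.pl hunk above.

  head -n 2 qc/my_project_v1.ped_check.txt
  # sample_a        sample_b         pedigree_parents    predicted_parents    parent_error
  # 123456_78900    134114_457300    True                True                 False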
@@ -361,36 +373,26 @@ workflow process_families {
         ch_merged_fastqs = merge_fastqs(
             ch_individuals.map(
                 { indv, family, father, mother, sex, affected, r1, r2 ->
-                    [ indv, r1, r2 ]
+                    [ indv, family, r1, r2 ]
                 })
-        )
+        ).groupTuple()
 
-        ch_joined_indv_info = ch_individuals.join(ch_merged_fastqs)
-        ch_joined_indv_info.map(
-            { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
-                [family_id, sample_id, father, mother, sex, phenotype, merged_r1]
-            })
-            .tap {ch_read1_meta} // I hate this syntax so much
+        ch_families = ch_individuals.map(
+            { sample_id, family_id, father, mother, sex, phenotype, r1, r2 ->
+                [family_id, [sample_id, sample_id, family_id, sex, phenotype, father, mother]]
+            }).groupTuple()
 
-        ch_joined_indv_info.map(
-            { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
-                [family_id, sample_id, father, mother, sex, phenotype, merged_r2]
-            })
-            .tap {ch_read2_meta}
+        ch_indv_by_family = ch_individuals.map(
+            { sample_id, family_id, father, mother, sex, phenotype, r1, r2 ->
+                [family_id, sample_id]
+            }).groupTuple()
 
         ch_bcbio_csvs = write_bcbio_csv(
-            ch_read1_meta.mix(ch_read2_meta).map(
-                { family_id, sample_id, father, mother, sex, phenotype, merged_fastq ->
-                    [family_id, [merged_fastq, sample_id, family_id, sex, phenotype]]
-                }
-            ).groupTuple(),
+            ch_families,
             ch_target_bed
         )
 
-        ch_bcbio_inputs = ch_joined_indv_info.map(
-            { sample_id, family_id, father, mother, sex, phenotype, r1s, r2s, merged_r1, merged_r2 ->
-                [family_id, sample_id]
-            }).groupTuple().join(ch_bcbio_csvs)
+        ch_bcbio_inputs = ch_indv_by_family.join(ch_bcbio_csvs.join(ch_merged_fastqs))
 
         ch_bcbio_family_outputs = bcbio_family_processing(
             ch_bcbio_inputs,
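Not part of the patch, purely illustrative: each element of ch_families is [sample_id, sample_id, family_id, sex, phenotype, father, mother] and write_bcbio_csv appends the target BED, so the per-family CSV handed to bcbio should come out roughly as below (ids, the sex/phenotype codes, and the BED file name are made up).

  cat 198754_57300.csv
  # samplename,description,batch,sex,phenotype,paternal_id,maternal_id,variant_regions
  # 123456_78900,123456_78900,198754_57300,2,2,134116_457300,134114_457300,targets.bed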