diff --git a/trio_whole_exome_parse_peddy_ped_csv.pl b/trio_whole_exome_parse_peddy_ped_csv.pl new file mode 100644 index 0000000000000000000000000000000000000000..be9546dbeb16a4f15f8987d10cb9edb6b066c2c6 --- /dev/null +++ b/trio_whole_exome_parse_peddy_ped_csv.pl @@ -0,0 +1,136 @@ +#!/usr/bin/perl -w + +=head1 NAME + +trio_whole_exome_parse_peddy_ped_csv.pl + +=head1 AUTHOR + +Alison Meynert (alison.meynert@igmm.ed.ac.uk) + +=head1 DESCRIPTION + +Checks the parent-child and parent-parent relationships from peddy output. + +=cut + +use strict; + +use Getopt::Long; +use IO::File; + +my $usage = qq{USAGE: +$0 [--help] + --output Output directory + --ped Pedigree file for project + --project Project id + --batch Batch id +}; + +my $help = 0; +my $ped_file; +my $project_id; +my $out_dir; +my $batch_id; + +GetOptions( + 'help' => \$help, + 'project=s' => \$project_id, + 'ped=s' => \$ped_file, + 'output=s' => \$out_dir, + 'batch=s' => \$batch_id +) or die $usage; + +if ($help || !$project_id || !$ped_file || !$out_dir || !$batch_id) +{ + print $usage; + exit(0); +} + +# Read in the pedigree file +my $in_fh = new IO::File; +$in_fh->open($ped_file, "r") or die "Could not open $ped_file\n$!"; + +my %ped; +while (my $line = <$in_fh>) +{ + chomp $line; + my ($family_id, $individual_id, $father_id, $mother_id, $sex, $aff) = split(/\t/, $line); + $ped{$family_id}{'count'}++; + $ped{$family_id}{$individual_id}{'father'} = $father_id; + $ped{$family_id}{$individual_id}{'mother'} = $mother_id; + if ($aff == 2) + { + $ped{$family_id}{'aff'} = $individual_id; + } +} + +$in_fh->close(); + +my $out_file = "$out_dir/qc/$project_id.ped_check.txt"; +my $out_fh = new IO::File; +$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!"; + +printf $out_fh "project_id\tbatch_id\tsample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n"; + +foreach my $family_id (sort keys %ped) +{ + #my $peddy_file = "$out_dir/2020-01-15_11972_Diamond_Austin_17304_402965/103195_402965/qc/peddy/17304402965.ped_check.csv"; + my @peddy_glob = glob(sprintf("$out_dir/*_%s_%s_%s/%s_%s/qc/peddy/%s%s.ped_check.csv", + $project_id, $batch_id, $family_id, $ped{$family_id}{'aff'}, $family_id, $batch_id, $family_id)); + + my $peddy_fh = new IO::File; + $peddy_fh->open($peddy_glob[0], "r") or die "Could not open $peddy_glob[0]\n$!"; + + my @headers; + my %info; + my @sample_pairs; + while (my $line = <$peddy_fh>) + { + chomp $line; + if ($line =~ /^sample_a/) + { + @headers = split(/,/, $line); + } + else + { + my @data = split(/,/, $line); + push(@sample_pairs, sprintf("%s\t%s", $data[0], $data[1])); + for (my $i = 2; $i < scalar(@headers); $i++) + { + $info{$headers[$i]}{sprintf("%s\t%s", $data[0], $data[1])} = $data[$i]; + } + } + } + + $peddy_fh->close(); + + foreach my $sample_pair (@sample_pairs) + { + my ($sample_a, $sample_b) = split(/\t/, $sample_pair); + + $sample_a =~ /(.+)_$family_id/; + my $sample_a_nofam = $1; + $sample_b =~ /(.+)_$family_id/; + my $sample_b_nofam = $1; + + if ($ped{$family_id}{$sample_a_nofam}{'father'} eq $sample_b_nofam || + $ped{$family_id}{$sample_a_nofam}{'mother'} eq $sample_b_nofam || + $ped{$family_id}{$sample_b_nofam}{'father'} eq $sample_a_nofam || + $ped{$family_id}{$sample_b_nofam}{'mother'} eq $sample_a_nofam) + { + $info{'pedigree_parents'}{$sample_pair} = 'True'; + } + + $info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq $info{'predicted_parents'}{$sample_pair} ? 'False' : 'True'; + + printf $out_fh "$project_id\t$batch_id\t$sample_pair\t%s\t%s\t%s\n", + $info{'pedigree_parents'}{$sample_pair}, + $info{'predicted_parents'}{$sample_pair}, + $info{'parent_error'}{$sample_pair}; + } +} + +$out_fh->close(); + +