From 04064420f7a919a3dd9aecf2dae9a82b1b45f8f0 Mon Sep 17 00:00:00 2001 From: ameyner2 <alison.meynert@igmm.ed.ac.uk> Date: Thu, 29 Apr 2021 08:18:54 +0100 Subject: [PATCH] Versions of the scripts for Santosh samples --- prepare_bcbio_config_santosh.sh | 90 ++++++++++++ ...hole_exome_parse_peddy_ped_csv_no_batch.pl | 136 ++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100755 prepare_bcbio_config_santosh.sh create mode 100644 trio_whole_exome_parse_peddy_ped_csv_no_batch.pl diff --git a/prepare_bcbio_config_santosh.sh b/prepare_bcbio_config_santosh.sh new file mode 100755 index 0000000..7c0fe58 --- /dev/null +++ b/prepare_bcbio_config_santosh.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# prepare_bcbio_config_crf.sh <config.sh> <project_id> <version> +# +# Adaptation of prepare_bcbio_config.sh for data from the Santosh set +# +# Given a <project_id>.ped file for a set of trios (families) in the +# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt +# and <project>.sample_ids.txt in the same folder. +# +# Assumes that reads for the samples are in the path +# $READS_DIR/<project_id>/*.gz, +# and that no samples other than those with reads are listed in the +# PED file. $READS_DIR is specified in the <config.sh> file. +# +# Assumes that the sample names in the PED file match those +# specifying the read files. +# +# All samples must be annotated with sex (1=male, 2=female) in the +# 5th column and phenotype (1=unaffected, 2=affected) in the 6th +# column of the PED file. +# +# Runs bcbio sample preparation and configuration file generation, +# assuming the template configuration file is at $BCBIO_TEMPLATE, +# specified in the <config.sh> file. +# +# Assumes bcbio is on the PATH (set in <config.sh>). +# + +CONFIG_SH=$1 +PROJECT_ID=$2 +VERSION=$3 + +source $CONFIG_SH + +# +# Create the files: +# $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id> +# $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family, +# prefix <family_id> with <pcr_plate_id> and +# add suffix <family_id> to <individual_id> +# +cd $PARAMS_DIR + +# remove DOS newline characters if necessary +perl -pi -e 's/\r//' $PROJECT_ID.ped + +# create reads directory for project +mkdir -p $READS_DIR/$PROJECT_ID + +# generate the family_ids list - makes strong assumption about relative paths! +cut -f 1 $PROJECT_ID.ped | sort -u > $PROJECT_ID.family_ids.txt + +for FAMILY_ID in `cat $PROJECT_ID.family_ids.txt` +do + grep ^$FAMILY_ID $PROJECT_ID.ped > ${PROJECT_ID}_${FAMILY_ID}.ped + + echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv + COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'` + + for ((i=1; i<=$COUNT; i=i+1)) + do + SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2` + SEX=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 5` + PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6` + + for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*${SAMPLE}*.gz` + do + echo "$FILE,${SAMPLE}_${FAMILY_ID},$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv + done + + done + + bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv + + mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv + + BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2` + + bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz + + mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/ + + COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"` + + perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml + + rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID} + +done diff --git a/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl b/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl new file mode 100644 index 0000000..6ef53f1 --- /dev/null +++ b/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl @@ -0,0 +1,136 @@ +#!/usr/bin/perl -w + +=head1 NAME + +trio_whole_exome_parse_peddy_ped_csv_no_batch.pl + +=head1 AUTHOR + +Alison Meynert (alison.meynert@igmm.ed.ac.uk) + +=head1 DESCRIPTION + +Checks the parent-child and parent-parent relationships from peddy output. + +=cut + +use strict; + +use Getopt::Long; +use IO::File; + +my $usage = qq{USAGE: +$0 [--help] + --output Output directory + --ped Pedigree file for project + --project Project id + --version Analysis run version (v1, v2, etc) +}; + +my $help = 0; +my $ped_file; +my $project_id; +my $version; +my $out_dir; + +GetOptions( + 'help' => \$help, + 'project=s' => \$project_id, + 'ped=s' => \$ped_file, + 'output=s' => \$out_dir, + 'version=s' => \$version +) or die $usage; + +if ($help || !$project_id || !$ped_file || !$out_dir || !$version) +{ + print $usage; + exit(0); +} + +# Read in the pedigree file +my $in_fh = new IO::File; +$in_fh->open($ped_file, "r") or die "Could not open $ped_file\n$!"; + +my %ped; +while (my $line = <$in_fh>) +{ + chomp $line; + my ($family_id, $individual_id, $father_id, $mother_id, $sex, $aff) = split(/\t/, $line); + $ped{$family_id}{'count'}++; + $ped{$family_id}{$individual_id}{'father'} = $father_id; + $ped{$family_id}{$individual_id}{'mother'} = $mother_id; + if ($aff == 2) + { + $ped{$family_id}{'aff'} = $individual_id; + } +} + +$in_fh->close(); + +my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id); +my $out_fh = new IO::File; +$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!"; + +printf $out_fh "project_id\tsample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n"; + +foreach my $family_id (sort keys %ped) +{ + my @peddy_glob = glob(sprintf("$out_dir/*_%s_%s_%s/%s_%s/qc/peddy/*.ped_check.csv", + $version, $project_id, $family_id, $ped{$family_id}{'aff'}, $family_id)); + next if (scalar(@peddy_glob) == 0); + + my $peddy_fh = new IO::File; + $peddy_fh->open($peddy_glob[0], "r") or die "Could not open $peddy_glob[0]\n$!"; + + my @headers; + my %info; + my @sample_pairs; + while (my $line = <$peddy_fh>) + { + chomp $line; + if ($line =~ /^sample_a/) + { + @headers = split(/,/, $line); + } + else + { + my @data = split(/,/, $line); + push(@sample_pairs, sprintf("%s\t%s", $data[0], $data[1])); + for (my $i = 2; $i < scalar(@headers); $i++) + { + $info{$headers[$i]}{sprintf("%s\t%s", $data[0], $data[1])} = $data[$i]; + } + } + } + + $peddy_fh->close(); + + foreach my $sample_pair (@sample_pairs) + { + my ($sample_a, $sample_b) = split(/\t/, $sample_pair); + + $sample_a =~ /(.+)_$family_id/; + my $sample_a_nofam = $1; + $sample_b =~ /(.+)_$family_id/; + my $sample_b_nofam = $1; + + if ($ped{$family_id}{$sample_a_nofam}{'father'} eq $sample_b_nofam || + $ped{$family_id}{$sample_a_nofam}{'mother'} eq $sample_b_nofam || + $ped{$family_id}{$sample_b_nofam}{'father'} eq $sample_a_nofam || + $ped{$family_id}{$sample_b_nofam}{'mother'} eq $sample_a_nofam) + { + $info{'pedigree_parents'}{$sample_pair} = 'True'; + } + + $info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq $info{'predicted_parents'}{$sample_pair} ? 'False' : 'True'; + + printf $out_fh "$project_id\t$sample_pair\t%s\t%s\t%s\n", + $info{'pedigree_parents'}{$sample_pair}, + $info{'predicted_parents'}{$sample_pair}, + $info{'parent_error'}{$sample_pair}; + } +} + +$out_fh->close(); + + -- GitLab