Versions of the scripts for Santosh samples

04064420 · ameyner2 · bafadff4 · 04064420 · 04064420
Commit 04064420 authored 3 years ago by ameyner2
--- a/prepare_bcbio_config_santosh.sh
+++ b/prepare_bcbio_config_santosh.sh
+#!/bin/bash
+#
+# prepare_bcbio_config_crf.sh <config.sh> <project_id> <version>
+# 
+# Adaptation of prepare_bcbio_config.sh for data from the Santosh set
+#
+# Given a <project_id>.ped file for a set of trios (families) in the 
+# folder $PARAMS_DIR, creates the files <project_id>.family_ids.txt
+# and <project>.sample_ids.txt in the same folder.
+#
+# Assumes that reads for the samples are in the path
+# $READS_DIR/<project_id>/*.gz,
+# and that no samples other than those with reads are listed in the 
+# PED file. $READS_DIR is specified in the <config.sh> file.
+#
+# Assumes that the sample names in the PED file match those 
+# specifying the read files.
+#
+# All samples must be annotated with sex (1=male, 2=female) in the
+# 5th column and phenotype (1=unaffected, 2=affected) in the 6th
+# column of the PED file.
+#
+# Runs bcbio sample preparation and configuration file generation,
+# assuming the template configuration file is at $BCBIO_TEMPLATE,
+# specified in the <config.sh> file.
+#
+# Assumes bcbio is on the PATH (set in <config.sh>).
+#
+
+CONFIG_SH=$1
+PROJECT_ID=$2
+VERSION=$3
+
+source $CONFIG_SH
+
+#
+# Create the files:
+#  $PROJECT_ID.family_ids.txt - format <pcr_plate_id>_<family_id>
+#  $PROJECT_ID.$FAMILY_ID.ped - select only the individuals in a given family, 
+#                               prefix <family_id> with <pcr_plate_id> and
+#                               add suffix <family_id> to <individual_id> 
+#
+cd $PARAMS_DIR
+
+# remove DOS newline characters if necessary
+perl -pi -e 's/\r//' $PROJECT_ID.ped
+
+# create reads directory for project
+mkdir -p $READS_DIR/$PROJECT_ID
+
+# generate the family_ids list - makes strong assumption about relative paths!
+cut -f 1 $PROJECT_ID.ped | sort -u > $PROJECT_ID.family_ids.txt
+
+for FAMILY_ID in `cat $PROJECT_ID.family_ids.txt`
+do
+  grep ^$FAMILY_ID $PROJECT_ID.ped > ${PROJECT_ID}_${FAMILY_ID}.ped
+
+  echo "samplename,description,batch,sex,phenotype,variant_regions" > ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+  COUNT=`wc -l ${PROJECT_ID}_${FAMILY_ID}.ped | awk '{ print $1 }'`
+
+  for ((i=1; i<=$COUNT; i=i+1))
+  do
+    SAMPLE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 2`
+    SEX=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 5`
+    PHENOTYPE=`head -n $i ${PROJECT_ID}_${FAMILY_ID}.ped | tail -n 1 | cut -f 6`
+
+    for FILE in `ls $DOWNLOAD_DIR/$PROJECT_ID/*${SAMPLE}*.gz`
+    do
+      echo "$FILE,${SAMPLE}_${FAMILY_ID},$FAMILY_ID,$SEX,$PHENOTYPE,$TARGET" >> ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+    done
+
+  done
+
+  bcbio_prepare_samples.py --out $READS_DIR/$PROJECT_ID --csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+
+  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}-merged.csv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv
+
+  BARE_FAMILY_ID=`echo $FAMILY_ID | cut -d '_' -f 2`
+
+  bcbio_nextgen.py -w template $BCBIO_TEMPLATE ${VERSION}_${PROJECT_ID}_${FAMILY_ID}.csv $READS_DIR/$PROJECT_ID/*_${BARE_FAMILY_ID}_R[12].fastq.gz
+
+  mv ${VERSION}_${PROJECT_ID}_${FAMILY_ID}/config/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml $CONFIG_DIR/
+
+  COMPRESSED_ID=`echo "$FAMILY_ID" | perl -pe "s/\_//"`
+
+  perl -i -pe "s/${COMPRESSED_ID}/${FAMILY_ID}/" $CONFIG_DIR/${VERSION}_${PROJECT_ID}_${FAMILY_ID}.yaml
+
+  rm -r ${VERSION}_${PROJECT_ID}_${FAMILY_ID}
+
+done
--- a/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
+++ b/trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+trio_whole_exome_parse_peddy_ped_csv_no_batch.pl
+
+=head1 AUTHOR
+
+Alison Meynert (alison.meynert@igmm.ed.ac.uk)
+
+=head1 DESCRIPTION
+
+Checks the parent-child and parent-parent relationships from peddy output.
+
+=cut
+
+use strict;
+
+use Getopt::Long;
+use IO::File;
+
+my $usage = qq{USAGE:
+$0 [--help]
+  --output  Output directory
+  --ped     Pedigree file for project
+  --project Project id
+  --version Analysis run version (v1, v2, etc)
+};
+
+my $help = 0;
+my $ped_file;
+my $project_id;
+my $version;
+my $out_dir;
+
+GetOptions(
+    'help'      => \$help,
+    'project=s' => \$project_id,
+    'ped=s'     => \$ped_file,
+    'output=s'  => \$out_dir,
+    'version=s' => \$version
+) or die $usage;
+
+if ($help || !$project_id || !$ped_file || !$out_dir || !$version)
+{
+    print $usage;
+    exit(0);
+}
+
+# Read in the pedigree file
+my $in_fh = new IO::File;
+$in_fh->open($ped_file, "r") or die "Could not open $ped_file\n$!";
+
+my %ped;
+while (my $line = <$in_fh>)
+{
+	chomp $line;
+	my ($family_id, $individual_id, $father_id, $mother_id, $sex, $aff) = split(/\t/, $line);
+	$ped{$family_id}{'count'}++;
+	$ped{$family_id}{$individual_id}{'father'} = $father_id;
+	$ped{$family_id}{$individual_id}{'mother'} = $mother_id;
+	if ($aff == 2)
+	{
+		$ped{$family_id}{'aff'} = $individual_id;
+	}
+}
+
+$in_fh->close();
+
+my $out_file = sprintf("$out_dir/qc/%s_%s.ped_check.txt", $version, $project_id);
+my $out_fh = new IO::File;
+$out_fh->open($out_file, "w") or die "Could not open $out_file\n$!";
+
+printf $out_fh "project_id\tsample_a\tsample_b\tpedigree_parents\tpredicted_parents\tparent_error\n";
+
+foreach my $family_id (sort keys %ped)
+{
+	my @peddy_glob = glob(sprintf("$out_dir/*_%s_%s_%s/%s_%s/qc/peddy/*.ped_check.csv", 
+		$version, $project_id, $family_id, $ped{$family_id}{'aff'}, $family_id));
+	next if (scalar(@peddy_glob) == 0);
+
+	my $peddy_fh = new IO::File;
+	$peddy_fh->open($peddy_glob[0], "r") or die "Could not open $peddy_glob[0]\n$!";
+
+	my @headers;
+	my %info;
+	my @sample_pairs;
+	while (my $line = <$peddy_fh>)
+	{
+		chomp $line;
+		if ($line =~ /^sample_a/)
+		{
+			@headers = split(/,/, $line);
+		}
+		else
+		{
+			my @data = split(/,/, $line);
+			push(@sample_pairs, sprintf("%s\t%s", $data[0], $data[1]));
+			for (my $i = 2; $i < scalar(@headers); $i++)
+			{
+				$info{$headers[$i]}{sprintf("%s\t%s", $data[0], $data[1])} = $data[$i];
+			}
+		}
+	}
+
+	$peddy_fh->close();
+
+	foreach my $sample_pair (@sample_pairs)
+	{
+		my ($sample_a, $sample_b) = split(/\t/, $sample_pair);
+
+		$sample_a =~ /(.+)_$family_id/;
+		my $sample_a_nofam = $1;
+		$sample_b =~ /(.+)_$family_id/;
+		my $sample_b_nofam = $1;
+
+		if ($ped{$family_id}{$sample_a_nofam}{'father'} eq $sample_b_nofam ||
+		    $ped{$family_id}{$sample_a_nofam}{'mother'} eq $sample_b_nofam ||
+		    $ped{$family_id}{$sample_b_nofam}{'father'} eq $sample_a_nofam ||
+		    $ped{$family_id}{$sample_b_nofam}{'mother'} eq $sample_a_nofam)
+		{
+			$info{'pedigree_parents'}{$sample_pair} = 'True';
+		}
+
+		$info{'parent_error'}{$sample_pair} = $info{'pedigree_parents'}{$sample_pair} eq $info{'predicted_parents'}{$sample_pair} ? 'False' : 'True';
+
+		printf $out_fh "$project_id\t$sample_pair\t%s\t%s\t%s\n", 
+		    $info{'pedigree_parents'}{$sample_pair}, 
+		    $info{'predicted_parents'}{$sample_pair},
+		    $info{'parent_error'}{$sample_pair};
+	}
+}
+
+$out_fh->close();
+
+