From ce1b6a183812ea31cb329a49240b8b1e76e59c65 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@igmm.ed.ac.uk>
Date: Mon, 23 Sep 2019 09:15:53 +0100
Subject: [PATCH] Initial commit of CRAM compression script and addition of
 reference genome path to config file

---
 submit_trio_wes_cram_compression.sh | 46 +++++++++++++++++++++++++++++
 trio_whole_exome_config.sh          |  1 +
 2 files changed, 47 insertions(+)
 create mode 100755 submit_trio_wes_cram_compression.sh

diff --git a/submit_trio_wes_cram_compression.sh b/submit_trio_wes_cram_compression.sh
new file mode 100755
index 0000000..5843f06
--- /dev/null
+++ b/submit_trio_wes_cram_compression.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#PBS -l walltime=48:00:00
+#PBS -l ncpus=16,mem=8gb
+#PBS -q sgp
+#PBS -N trio_whole_exome_cram_compression
+#PBS -j oe
+
+# enable running singletons
+if [ -z $PBS_ARRAY_INDEX ]
+then
+  if [ -z $INDEX ]
+  then
+    export PBS_ARRAY_INDX=1
+  else
+    export PBS_ARRAY_INDEX=$INDEX
+  fi
+fi
+
+# Expects environment variables to be set
+# PROJECT_ID - e.g. 12345_LastnameFirstname
+# CONFIG_SH - absolute path to configuration script setting environment variables
+
+source $CONFIG_SH
+
+FAMILY_ID=`head -n $PBS_ARRAY_INDEX $PARAMS_DIR/$PROJECT_ID.family_ids.txt | tail -n 1`
+
+# This assumes that ${PROJECT_ID}_${FAMILY_ID} is unique, and it should be - if there was
+# a re-run of a family, it should have a new project id.
+cd $OUTPUT_DIR/*${PROJECT_ID}_${FAMILY_ID}*
+
+for BAM in */*.bam
+do
+
+  # 1. Compress to CRAM format without quality score binning
+  CRAM=${BAM%.bam}.cram
+  samtools view -@ 16 -T $REFERENCE_GENOME -C -o $CRAM $BAM
+
+  # 2. Index the CRAM file - good sanity check
+  samtools index $CRAM
+  
+  # 3. Compare the stats from the BAM and CRAM files
+  samtools flagstat $BAM > $BAM.flagstat.txt
+  samtools flagstat $CRAM > $CRAM.flagstat.txt
+  diff $BAM.flagstat.txt $CRAM.flagstat.txt
+
+done
diff --git a/trio_whole_exome_config.sh b/trio_whole_exome_config.sh
index bc0b78a..46e2978 100644
--- a/trio_whole_exome_config.sh
+++ b/trio_whole_exome_config.sh
@@ -7,6 +7,7 @@ SCRIPTS=/home/u027/project/scripts
 BCBIO_TEMPLATE=$SCRIPTS/trio_whole_exome_bcbio_template.yaml
 TARGET=/home/u027/project/resources/Twist_Exome_Target_hg38.bed
 DOWNLOAD_DIR=/scratch/u027/project/trio_whole_exome/data
+REFERENCE_GENOME=/home/u027/project/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa
 
 BASE=/scratch/u027/project/trio_whole_exome/analysis
 PARAMS_DIR=$BASE/params
-- 
GitLab