From 012db0fb2cde31b623f4c45c85658cc3e54761fe Mon Sep 17 00:00:00 2001
From: user name <kdonnel2@sdf-cs1.eidf.epcc.ed.ac.uk>
Date: Thu, 21 Nov 2024 16:33:20 +0000
Subject: [PATCH] Moved nextflow testing scripts to new bcbio vs nextflow
 directory

---
 .../bqsr_identity_check.sh                    | 31 ++++++++++++
 .../bwa_mem_identity_check.sh                 | 28 +++++++++++
 .../bwa_stochasticity_check.sh                | 49 +++++++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh
 create mode 100644 Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh
 create mode 100644 Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh

diff --git a/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh b/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh
new file mode 100644
index 0000000..f506e6d
--- /dev/null
+++ b/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=10GB
+#SBATCH --time=48:00:00
+#SBATCH --job-name=bqsr_test
+#SBATCH --output=bqsr_test.out
+#SBATCH --error=bqsr_test.err
+
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+
+#The outputs of bcbio and nf-core bqsr initially appear different
+#However, this can be accounted for by arbitrary ordering of reads with identical coordinates
+#They can be made identical by a sort operation - samtools sort is not suitable as they are already adequately sorted by samtools' definition
+
+#First, convert to sam
+
+#bcbio recal bam
+$SAMTOOLS view 158063_519317-sort-recal.bcbio.bam > 158063_519317-sort-recal.bcbio.sam
+
+#nf-core recal bam
+$SAMTOOLS view 158063_519317-recal.nf-core.bam > 158063_519317-recal.nf-core.sam
+
+#Sort the sam files using bash sort to confirm that they are indentical
+sort 158063_519317-recal.nf-core.sort.sam > 158063_519317-recal.nf-core.bashsort.sam 
+sort 158063_519317-sort-recal.bcbio.sort.sam > 158063_519317-sort-recal.bcbio.bashsort.sam  
+
+#Generate md5sums - these should be identical
+md5sum 158063_519317-recal.nf-core.bashsort.sam > 158063_519317-recal.nf-core.bashsort.sam.md5
+md5sum 158063_519317-sort-recal.bcbio.bashsort.sam > 158063_519317-sort-recal.bcbio.bashsort.sam.md5
+
+
diff --git a/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh b/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh
new file mode 100644
index 0000000..67d6775
--- /dev/null
+++ b/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=10GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=bwa_identity
+#SBATCH --output=bwa_identity.out
+#SBATCH --error=bwa_identity.err
+
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+
+#We know that running bwa mem multiple times on identical input produces identical output
+#We also wish to test that the nextflow and bcbio pipeline produce identical bam files when provided with identical fastqs
+#In this example, nextflow is run with the trimmed fastqs produced by the bcbio pipeline (via fastp)
+#We now wish to test that the resulting bam files are identical following bamsormadup (sort and mark duplicates)
+
+#Convert bams to sam - remove read group tags as naming convention differs slightly between pipelines
+
+#bcbio bam
+$SAMTOOLS view -x RG 158063_519317-sort.bam > 158063_519317-sort.noRG.sam
+
+#nextflow bam
+$SAMTOOLS view -x RG 158063.bam > 158063.noRG.sam
+
+#Generate md5sums - these should be identical
+md5sum 158063_519317-sort.noRG.sam > 158063_519317-sort.noRG.sam.md5
+md5sum 158063.noRG.sam > 158063.noRG.sam.md5
+
+
diff --git a/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh b/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh
new file mode 100644
index 0000000..59c1cff
--- /dev/null
+++ b/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=72GB
+#SBATCH --time=24:00:00
+#SBATCH --job-name=bwa_test
+#SBATCH --output=bwa_test.out
+#SBATCH --error=bwa_test.err
+
+BWA=/home/u035/u035/shared/software/bcbio/anaconda/bin/bwa
+SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
+
+#Is bwa mem deterministic - i.e. will it produce the same alignment given identical commands and input
+#The script below is designed to test this (the answer is 'yes')
+
+
+#Point to appropriate bwa index
+#In the pipeline, a softlink to this would be provided in the working directory
+INDEX=`find -L ./ -name "*.amb" | sed 's/\.amb$//'`
+
+#Run bwa on identical fastq input three times 
+#These example input files are from 20240902_Ansari_Morad
+#We are picking them up from the pipeline immediately after outpu by fastp
+
+for i in $(seq 1 3)
+do
+$BWA mem \
+    -R '@RG\tID:158063\tPL:illumina\tPU:158063\tSM:158063' -c 250 -M \
+    -t 16 \
+    $INDEX \
+    subset_158063_1.fastq.gz subset_158063_2.fastq.gz \
+    | $SAMTOOLS view  --threads 16 -o "${i}_158063_.bam" -
+done
+
+#We expect the headers to differ 
+#To be confident, let's compare headerless sam
+for i in $(seq 1 3)
+do
+$SAMTOOLS view "${i}_158063_.bam" > "${i}_158063_.sam" 
+done
+
+#Finally, generate md5 checksums for each sam 
+#These should be identical
+for i in $(seq 1 3)
+do
+md5sum "${i}_158063_.sam" > "${i}_158063_.sam.md5"
+done
+
+
+
-- 
GitLab