From 012db0fb2cde31b623f4c45c85658cc3e54761fe Mon Sep 17 00:00:00 2001 From: user name <kdonnel2@sdf-cs1.eidf.epcc.ed.ac.uk> Date: Thu, 21 Nov 2024 16:33:20 +0000 Subject: [PATCH] Moved nextflow testing scripts to new bcbio vs nextflow directory --- .../bqsr_identity_check.sh | 31 ++++++++++++ .../bwa_mem_identity_check.sh | 28 +++++++++++ .../bwa_stochasticity_check.sh | 49 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh create mode 100644 Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh create mode 100644 Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh diff --git a/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh b/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh new file mode 100644 index 0000000..f506e6d --- /dev/null +++ b/Nextflow_vs_Bcbio_Testing/bqsr_identity_check.sh @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=10GB +#SBATCH --time=48:00:00 +#SBATCH --job-name=bqsr_test +#SBATCH --output=bqsr_test.out +#SBATCH --error=bqsr_test.err + +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools + +#The outputs of bcbio and nf-core bqsr initially appear different +#However, this can be accounted for by arbitrary ordering of reads with identical coordinates +#They can be made identical by a sort operation - samtools sort is not suitable as they are already adequately sorted by samtools' definition + +#First, convert to sam + +#bcbio recal bam +$SAMTOOLS view 158063_519317-sort-recal.bcbio.bam > 158063_519317-sort-recal.bcbio.sam + +#nf-core recal bam +$SAMTOOLS view 158063_519317-recal.nf-core.bam > 158063_519317-recal.nf-core.sam + +#Sort the sam files using bash sort to confirm that they are indentical +sort 158063_519317-recal.nf-core.sort.sam > 158063_519317-recal.nf-core.bashsort.sam +sort 158063_519317-sort-recal.bcbio.sort.sam > 158063_519317-sort-recal.bcbio.bashsort.sam + +#Generate md5sums - these should be identical +md5sum 158063_519317-recal.nf-core.bashsort.sam > 158063_519317-recal.nf-core.bashsort.sam.md5 +md5sum 158063_519317-sort-recal.bcbio.bashsort.sam > 158063_519317-sort-recal.bcbio.bashsort.sam.md5 + + diff --git a/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh b/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh new file mode 100644 index 0000000..67d6775 --- /dev/null +++ b/Nextflow_vs_Bcbio_Testing/bwa_mem_identity_check.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=10GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=bwa_identity +#SBATCH --output=bwa_identity.out +#SBATCH --error=bwa_identity.err + +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools + +#We know that running bwa mem multiple times on identical input produces identical output +#We also wish to test that the nextflow and bcbio pipeline produce identical bam files when provided with identical fastqs +#In this example, nextflow is run with the trimmed fastqs produced by the bcbio pipeline (via fastp) +#We now wish to test that the resulting bam files are identical following bamsormadup (sort and mark duplicates) + +#Convert bams to sam - remove read group tags as naming convention differs slightly between pipelines + +#bcbio bam +$SAMTOOLS view -x RG 158063_519317-sort.bam > 158063_519317-sort.noRG.sam + +#nextflow bam +$SAMTOOLS view -x RG 158063.bam > 158063.noRG.sam + +#Generate md5sums - these should be identical +md5sum 158063_519317-sort.noRG.sam > 158063_519317-sort.noRG.sam.md5 +md5sum 158063.noRG.sam > 158063.noRG.sam.md5 + + diff --git a/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh b/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh new file mode 100644 index 0000000..59c1cff --- /dev/null +++ b/Nextflow_vs_Bcbio_Testing/bwa_stochasticity_check.sh @@ -0,0 +1,49 @@ +#!/bin/bash +#SBATCH --cpus-per-task=16 +#SBATCH --mem=72GB +#SBATCH --time=24:00:00 +#SBATCH --job-name=bwa_test +#SBATCH --output=bwa_test.out +#SBATCH --error=bwa_test.err + +BWA=/home/u035/u035/shared/software/bcbio/anaconda/bin/bwa +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools + +#Is bwa mem deterministic - i.e. will it produce the same alignment given identical commands and input +#The script below is designed to test this (the answer is 'yes') + + +#Point to appropriate bwa index +#In the pipeline, a softlink to this would be provided in the working directory +INDEX=`find -L ./ -name "*.amb" | sed 's/\.amb$//'` + +#Run bwa on identical fastq input three times +#These example input files are from 20240902_Ansari_Morad +#We are picking them up from the pipeline immediately after outpu by fastp + +for i in $(seq 1 3) +do +$BWA mem \ + -R '@RG\tID:158063\tPL:illumina\tPU:158063\tSM:158063' -c 250 -M \ + -t 16 \ + $INDEX \ + subset_158063_1.fastq.gz subset_158063_2.fastq.gz \ + | $SAMTOOLS view --threads 16 -o "${i}_158063_.bam" - +done + +#We expect the headers to differ +#To be confident, let's compare headerless sam +for i in $(seq 1 3) +do +$SAMTOOLS view "${i}_158063_.bam" > "${i}_158063_.sam" +done + +#Finally, generate md5 checksums for each sam +#These should be identical +for i in $(seq 1 3) +do +md5sum "${i}_158063_.sam" > "${i}_158063_.sam.md5" +done + + + -- GitLab