GIAB test data script downloads & extracts small sets of reads for 3 trios

2f6dc35d · not populated not populated · af6a1054 · 2f6dc35d
Commit 2f6dc35d authored 3 years ago by not populated not populated
--- a/tests/assets/input_data/scripts/giab.sh
+++ b/tests/assets/input_data/scripts/giab.sh
@@ -5,35 +5,104 @@
 #
 # Requires:
 # samtools
-#
+# bedtools
+# bazam (run via java), seqtk, wget
+
+TWIST_TARGET=Twist_Exome_RefSeq_targets_hg38.plus15bp.bed

-CHR22_TWIST=$1
+###
+# Prepare the chr22 target subset (chr-prefixed and unprefixed contig names)
+###
+mkdir -p giab
+cd giab
+echo "chr22 0 50818468" > chr22.bed
+perl -pi -e 's/ /\t/g' chr22.bed
+bedtools intersect -a chr22.bed -b ../$TWIST_TARGET > TWIST.chr22.bed
+sed -e 's/chr//' TWIST.chr22.bed > TWIST.22.bed

 ###
-# AshkenazimTrio: Illumina whole exome
-## 
+# AshkenazimTrio: Illumina whole exome (for testing this script)
+###

 # Fetch the index
-mkdir -p giab/raw_data/AshkenazimTrio
-cd giab/raw_data/AshkenazimTrio
-#wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
+mkdir -p raw_data/AshkenazimTrioExome
+cd raw_data/AshkenazimTrioExome
+wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
+index=alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015

-# Download the BAM, BAI, and md5 checksums
 for ((i = 2; i <= 4; i = i + 1))
 do
-    bam=`head -n $i alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 | tail -n 1 | cut -f 1`
-    bam_md5=`head -n $i alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 | tail -n 1 | cut -f 2`
-    bai=`head -n $i alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 | tail -n 1 | cut -f 3`
-    bai_md5=`head -n $i alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 | tail -n 1 | cut -f 4`
+    # Line $i of the index: column 1 is the BAM URL, column 2 its md5 checksum
+    bam=`head -n $i $index | tail -n 1 | cut -f 1`
+    bam_md5=`head -n $i $index | tail -n 1 | cut -f 2`
+
+    bam_base=`basename $bam`
+
+    # Write the expected checksum, download the BAM, then verify it
+    echo $bam_md5 $bam_base > $bam_base.md5
+    wget $bam
+    md5sum --check $bam_base.md5
+
+    # Freshly index
+    samtools index $bam_base
+
+    # Extract paired FASTQ for reads overlapping the chr22 targets (unprefixed contig names)
+    java -jar ../../../../../software/bazam.jar -bam $bam_base -L ../../TWIST.22.bed -r1 ${bam_base%.bam}_R1.fastq -r2 ${bam_base%.bam}_R2.fastq
+
+    # Gzip the FASTQ reads
+    gzip *.fastq
+
+    # Subset to 10K reads per sample; seqtk emits plain FASTQ, so gzip it to match the .gz name
+    sample=`echo $bam_base | cut -f 6 -d '_'`
+    seqtk sample -s 100 ${bam_base%.bam}_R1.fastq.gz 10000 | gzip > ${sample}_R1.fastq.gz
+    seqtk sample -s 100 ${bam_base%.bam}_R2.fastq.gz 10000 | gzip > ${sample}_R2.fastq.gz
+
+    # Move to output
+    mkdir -p ../../AshkenazimTrioExome
+    mv ${sample}_R{1,2}.fastq.gz ../../AshkenazimTrioExome
+done
+
+# Move back up to giab working folder
+cd ../..
+
+###
+# AshkenazimTrio: Illumina 2x250bp (novoalign)
+###
+mkdir -p raw_data/AshkenazimTrio
+cd raw_data/AshkenazimTrio
+wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_Illumina_2x250bps_novoalign_GRCh37_GRCh38_NHGRI_06062016
+index=alignment.index.AJtrio_Illumina_2x250bps_novoalign_GRCh37_GRCh38_NHGRI_06062016
+
+for ((i = 2; i <= 6; i = i + 2))
+do
+    # Line $i of the index: column 1 is the BAM URL, column 2 its md5 checksum
+    bam=`head -n $i $index | tail -n 1 | cut -f 1`
+    bam_md5=`head -n $i $index | tail -n 1 | cut -f 2`

    bam_base=`basename $bam`
-    bai_base=`basename $bai`

+    # Write the expected checksum, download the BAM, then verify it
    echo $bam_md5 $bam_base > $bam_base.md5
-#    wget $bam
+    wget $bam
    md5sum --check $bam_base.md5

+    # Freshly index
    samtools index $bam_base
+
+    # Extract paired FASTQ for reads overlapping the chr22 targets (unprefixed contig names)
+    java -jar ../../../../../software/bazam.jar -bam $bam_base -L ../../TWIST.22.bed -r1 ${bam_base%.bam}_R1.fastq -r2 ${bam_base%.bam}_R2.fastq
+
+    # Gzip the FASTQ reads
+    gzip *.fastq
+
+    # Subset to 10K reads per sample; seqtk emits plain FASTQ, so gzip it to match the .gz name
+    sample=`echo $bam_base | cut -f 1 -d '.'`
+    seqtk sample -s 100 ${bam_base%.bam}_R1.fastq.gz 10000 | gzip > ${sample}_R1.fastq.gz
+    seqtk sample -s 100 ${bam_base%.bam}_R2.fastq.gz 10000 | gzip > ${sample}_R2.fastq.gz
+
+    # Move to output folder
+    mkdir -p ../../AshkenazimTrio
+    mv ${sample}_R{1,2}.fastq.gz ../../AshkenazimTrio
 done

 # Move back up to giab working folder
@@ -46,16 +115,40 @@ cd ../..
 # Fetch the index
 mkdir -p raw_data/ChineseTrio
 cd raw_data/ChineseTrio
-#wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/ChineseTrio/alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
-
-# Download the BAM, BAI, and md5 checksums
-#for line in `grep -v BAM_MD5 alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016 | grep GRCh38`
-#do
-#    for file in $line
-#    do
-#	wget $file
-#    done
-#done
+wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/ChineseTrio/alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
+index=alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
+
+for ((i = 2; i <= 6; i = i + 2))
+do
+    # Line $i of the index: column 1 is the BAM URL, column 2 its md5 checksum
+    bam=`head -n $i $index | tail -n 1 | cut -f 1`
+    bam_md5=`head -n $i $index | tail -n 1 | cut -f 2`
+
+    bam_base=`basename $bam`
+
+    # Write the expected checksum, download the BAM, then verify it
+    echo $bam_md5 $bam_base > $bam_base.md5
+    wget $bam
+    md5sum --check $bam_base.md5
+
+    # Freshly index
+    samtools index $bam_base
+
+    # Extract paired FASTQ for reads overlapping the chr22 targets (unprefixed contig names)
+    java -jar ../../../../../software/bazam.jar -bam $bam_base -L ../../TWIST.22.bed -r1 ${bam_base%.bam}_R1.fastq -r2 ${bam_base%.bam}_R2.fastq
+
+    # Gzip the FASTQ reads
+    gzip *.fastq
+
+    # Subset to 10K reads per sample; seqtk emits plain FASTQ, so gzip it to match the .gz name
+    sample=`echo $bam_base | cut -f 1 -d '.'`
+    seqtk sample -s 100 ${bam_base%.bam}_R1.fastq.gz 10000 | gzip > ${sample}_R1.fastq.gz
+    seqtk sample -s 100 ${bam_base%.bam}_R2.fastq.gz 10000 | gzip > ${sample}_R2.fastq.gz
+
+    # Move to output folder
+    mkdir -p ../../ChineseTrio
+    mv ${sample}_R{1,2}.fastq.gz ../../ChineseTrio
+done

 # Move back up to enclosing folder
 cd ../../..