Skip to content
Snippets Groups Projects
Commit 2f6dc35d authored by not populated not populated's avatar not populated not populated
Browse files

GIAB test data script downloads & extracts small sets of reads for 3 trios

parent af6a1054
No related branches found
No related tags found
2 merge requests!2Giab test data,!1NextFlow
Pipeline #8457 failed
......@@ -5,35 +5,104 @@
#
# Requires:
#   samtools
#   bedtools
#   bazam (bazam.jar, run via java)
#   seqtk
#   wget
# BED file of Twist exome targets; it is looked up one level above giab/,
# i.e. in the directory the script is started from.
TWIST_TARGET=Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
# First positional argument (not referenced in this section).
CHR22_TWIST=$1

###
# Prepare the chr22 target subset
###
# Every later step uses paths relative to giab/, so stop if it cannot be entered.
mkdir -p giab
cd giab || exit 1

# One-interval BED (tab-separated chrom, start, end) spanning chr22.
printf 'chr22\t0\t50818468\n' > chr22.bed

# Restrict the chr22 interval to the regions it shares with the Twist targets.
bedtools intersect -a chr22.bed -b "../${TWIST_TARGET}" > TWIST.chr22.bed

# Copy of the targets with the first "chr" on each line removed
# (presumably to match contig names without the prefix -- confirm against the BAMs).
sed -e 's/chr//' TWIST.chr22.bed > TWIST.22.bed
###
# AshkenazimTrio: Illumina whole exome (for testing this script)
##
# Work in raw_data/AshkenazimTrioExome below the giab working folder; the
# relative paths used further down (../../TWIST.22.bed, cd ../..) assume
# exactly this depth.
mkdir -p raw_data/AshkenazimTrioExome
cd raw_data/AshkenazimTrioExome || exit 1

# Fetch the index
index=alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
wget "https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/${index}"

# Process index rows 2, 3 and 4 (row 1 is skipped; presumably the header).
for ((i = 2; i <= 4; i = i + 1))
do
  # Column 1 of the row is the BAM URL, column 2 its md5 checksum
  row=$(sed -n "${i}p" "$index")
  bam=$(printf '%s\n' "$row" | cut -f 1)
  bam_md5=$(printf '%s\n' "$row" | cut -f 2)
  if [[ -z "$bam" || -z "$bam_md5" ]]; then
    echo "No BAM entry on line $i of $index" >&2
    exit 1
  fi
  bam_base=$(basename "$bam")
  prefix=${bam_base%.bam}

  # Download the BAM and validate it against the md5 checksum; do not
  # continue with a missing or corrupted download.
  printf '%s  %s\n' "$bam_md5" "$bam_base" > "$bam_base.md5"
  wget "$bam"
  if ! md5sum --check "$bam_base.md5"; then
    echo "md5 check failed for $bam_base" >&2
    exit 1
  fi

  # Freshly index
  samtools index "$bam_base"

  # Extract FASTQ for the chr22 target regions
  java -jar ../../../../../software/bazam.jar -bam "$bam_base" -L ../../TWIST.22.bed \
    -r1 "${prefix}_R1.fastq" -r2 "${prefix}_R2.fastq"

  # Gzip the FASTQ reads
  gzip "${prefix}_R1.fastq" "${prefix}_R2.fastq"

  # Subset to 10K reads per sample. The sample name is the 6th
  # underscore-separated field of the BAM file name. Both mates use the same
  # seed (-s 100). seqtk writes plain FASTQ to stdout, so compress it to
  # match the .fastq.gz file name.
  sample=$(printf '%s\n' "$bam_base" | cut -f 6 -d '_')
  seqtk sample -s 100 "${prefix}_R1.fastq.gz" 10000 | gzip > "${sample}_R1.fastq.gz"
  seqtk sample -s 100 "${prefix}_R2.fastq.gz" 10000 | gzip > "${sample}_R2.fastq.gz"

  # Move to output
  mkdir -p ../../AshkenazimTrioExome
  mv "${sample}_R1.fastq.gz" "${sample}_R2.fastq.gz" ../../AshkenazimTrioExome
done

# Move back up to giab working folder
cd ../..
###
# AshkenazimTrio:
###
# Work in raw_data/AshkenazimTrio below the giab working folder; the relative
# paths used further down (../../TWIST.22.bed) assume exactly this depth.
mkdir -p raw_data/AshkenazimTrio
cd raw_data/AshkenazimTrio || exit 1

# Fetch the index
index=alignment.index.AJtrio_Illumina_2x250bps_novoalign_GRCh37_GRCh38_NHGRI_06062016
wget "https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/${index}"

# Process index rows 2, 4 and 6.
# NOTE(review): presumably every other row is the alignment wanted for each
# trio member -- confirm against the index layout.
for ((i = 2; i <= 6; i = i + 2))
do
  # Column 1 of the row is the BAM URL, column 2 its md5 checksum
  row=$(sed -n "${i}p" "$index")
  bam=$(printf '%s\n' "$row" | cut -f 1)
  bam_md5=$(printf '%s\n' "$row" | cut -f 2)
  if [[ -z "$bam" || -z "$bam_md5" ]]; then
    echo "No BAM entry on line $i of $index" >&2
    exit 1
  fi
  bam_base=$(basename "$bam")
  prefix=${bam_base%.bam}

  # Download the BAM and validate it against the md5 checksum; do not
  # continue with a missing or corrupted download.
  printf '%s  %s\n' "$bam_md5" "$bam_base" > "$bam_base.md5"
  wget "$bam"
  if ! md5sum --check "$bam_base.md5"; then
    echo "md5 check failed for $bam_base" >&2
    exit 1
  fi

  # Freshly index
  samtools index "$bam_base"

  # Extract FASTQ for the chr22 target regions
  java -jar ../../../../../software/bazam.jar -bam "$bam_base" -L ../../TWIST.22.bed \
    -r1 "${prefix}_R1.fastq" -r2 "${prefix}_R2.fastq"

  # Gzip the FASTQ reads
  gzip "${prefix}_R1.fastq" "${prefix}_R2.fastq"

  # Subset to 10K reads per sample. The sample name is the part of the BAM
  # file name before the first '.'. Both mates use the same seed (-s 100).
  # seqtk writes plain FASTQ to stdout, so compress it to match the
  # .fastq.gz file name.
  sample=$(printf '%s\n' "$bam_base" | cut -f 1 -d '.')
  seqtk sample -s 100 "${prefix}_R1.fastq.gz" 10000 | gzip > "${sample}_R1.fastq.gz"
  seqtk sample -s 100 "${prefix}_R2.fastq.gz" 10000 | gzip > "${sample}_R2.fastq.gz"

  # Move to output folder
  mkdir -p ../../AshkenazimTrio
  mv "${sample}_R1.fastq.gz" "${sample}_R2.fastq.gz" ../../AshkenazimTrio
done
# Move back up to giab working folder
......@@ -46,16 +115,40 @@ cd ../..
# Work in raw_data/ChineseTrio below the giab working folder; the relative
# paths used further down (../../TWIST.22.bed, cd ../../..) assume exactly
# this depth.
mkdir -p raw_data/ChineseTrio
cd raw_data/ChineseTrio || exit 1

# Fetch the index
index=alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
wget "https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/ChineseTrio/${index}"

# Process index rows 2, 4 and 6.
# NOTE(review): an earlier, disabled version of this step skipped the
# BAM_MD5 header row and downloaded every file on rows matching "GRCh38";
# this loop takes fixed row numbers instead -- confirm they are the
# intended alignments.
for ((i = 2; i <= 6; i = i + 2))
do
  # Column 1 of the row is the BAM URL, column 2 its md5 checksum
  row=$(sed -n "${i}p" "$index")
  bam=$(printf '%s\n' "$row" | cut -f 1)
  bam_md5=$(printf '%s\n' "$row" | cut -f 2)
  if [[ -z "$bam" || -z "$bam_md5" ]]; then
    echo "No BAM entry on line $i of $index" >&2
    exit 1
  fi
  bam_base=$(basename "$bam")
  prefix=${bam_base%.bam}

  # Download the BAM and validate it against the md5 checksum; do not
  # continue with a missing or corrupted download.
  printf '%s  %s\n' "$bam_md5" "$bam_base" > "$bam_base.md5"
  wget "$bam"
  if ! md5sum --check "$bam_base.md5"; then
    echo "md5 check failed for $bam_base" >&2
    exit 1
  fi

  # Freshly index
  samtools index "$bam_base"

  # Extract FASTQ for the chr22 target regions
  java -jar ../../../../../software/bazam.jar -bam "$bam_base" -L ../../TWIST.22.bed \
    -r1 "${prefix}_R1.fastq" -r2 "${prefix}_R2.fastq"

  # Gzip the FASTQ reads
  gzip "${prefix}_R1.fastq" "${prefix}_R2.fastq"

  # Subset to 10K reads per sample. The sample name is the part of the BAM
  # file name before the first '.'. Both mates use the same seed (-s 100).
  # seqtk writes plain FASTQ to stdout, so compress it to match the
  # .fastq.gz file name.
  sample=$(printf '%s\n' "$bam_base" | cut -f 1 -d '.')
  seqtk sample -s 100 "${prefix}_R1.fastq.gz" 10000 | gzip > "${sample}_R1.fastq.gz"
  seqtk sample -s 100 "${prefix}_R2.fastq.gz" 10000 | gzip > "${sample}_R2.fastq.gz"

  # Move to output folder
  mkdir -p ../../ChineseTrio
  mv "${sample}_R1.fastq.gz" "${sample}_R2.fastq.gz" ../../ChineseTrio
done

# Move back up to enclosing folder
cd ../../..
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment