Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
trio-whole-exome
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
igmmbioinformatics
trio-whole-exome
Commits
2f6dc35d
Commit
2f6dc35d
authored
3 years ago
by
not populated not populated
Browse files
Options
Downloads
Patches
Plain Diff
GIAB test data script downloads & extracts small sets of reads for 3 trios
parent
af6a1054
No related branches found
Branches containing commit
No related tags found
2 merge requests
!2
Giab test data
,
!1
NextFlow
Pipeline
#8457
failed
3 years ago
Stage: test
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
tests/assets/input_data/scripts/giab.sh
+117
-24
117 additions, 24 deletions
tests/assets/input_data/scripts/giab.sh
with
117 additions
and
24 deletions
tests/assets/input_data/scripts/giab.sh
+
117
−
24
View file @
2f6dc35d
...
...
@@ -5,35 +5,104 @@
#
# Requires:
# samtools
#
# bedtools
# bazam
TWIST_TARGET
=
Twist_Exome_RefSeq_targets_hg38.plus15bp.bed
CHR22_TWIST
=
$1
###
# Prepare the chr22 target subset
###
mkdir
-p
giab
cd
giab
echo
"chr22 0 50818468"
>
chr22.bed
perl
-pi
-e
's/ /\t/g'
chr22.bed
bedtools intersect
-a
chr22.bed
-b
../
$TWIST_TARGET
>
TWIST.chr22.bed
sed
-e
's/chr//'
TWIST.chr22.bed
>
TWIST.22.bed
###
# AshkenazimTrio: Illumina whole exome
##
# AshkenazimTrio: Illumina whole exome
(for testing this script)
##
# Fetch the index
mkdir
-p
giab/raw_data/AshkenazimTrio
cd
giab/raw_data/AshkenazimTrio
#wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
mkdir
-p
raw_data/AshkenazimTrioExome
cd
raw_data/AshkenazimTrioExome
wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
index
=
alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015
# Download the BAM, BAI, and md5 checksums
for
((
i
=
2
;
i <
=
4
;
i
=
i + 1
))
do
bam
=
`
head
-n
$i
alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 |
tail
-n
1 |
cut
-f
1
`
bam_md5
=
`
head
-n
$i
alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 |
tail
-n
1 |
cut
-f
2
`
bai
=
`
head
-n
$i
alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 |
tail
-n
1 |
cut
-f
3
`
bai_md5
=
`
head
-n
$i
alignment.index.AJtrio_OsloUniversityHospital_IlluminaExome_bwamem_GRCh37_11252015 |
tail
-n
1 |
cut
-f
4
`
# Download the BAM and its md5 checksum
bam
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
1
`
bam_md5
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
2
`
bam_base
=
`
basename
$bam
`
# Validate the md5 checksum
echo
$bam_md5
$bam_base
>
$bam_base
.md5
wget
$bam
md5sum
--check
$bam_base
.md5
# Freshly index
samtools index
$bam_base
# Extract FASTQ
java
-jar
../../../../../software/bazam.jar
-bam
$bam_base
-L
../../TWIST.22.bed
-r1
${
bam_base
%.bam
}
_R1.fastq
-r2
${
bam_base
%.bam
}
_R2.fastq
# Gzip the FASTQ reads
gzip
*
.fastq
# Subset to 10K reads per sample
sample
=
`
echo
$bam_base
|
cut
-f
6
-d
'_'
`
seqtk sample
-s
100
${
bam_base
%.bam
}
_R1.fastq.gz 10000
>
${
sample
}
_R1.fastq.gz
seqtk sample
-s
100
${
bam_base
%.bam
}
_R2.fastq.gz 10000
>
${
sample
}
_R2.fastq.gz
# Move to output
mkdir
-p
../../AshkenazimTrioExome
mv
${
sample
}
_R
{
1,2
}
.fastq.gz ../../AshkenazimTrioExome
done
# Move back up to giab working folder
cd
../..
###
# AshkenazimTrio:
###
mkdir
-p
raw_data/AshkenazimTrio
cd
raw_data/AshkenazimTrio
wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/AshkenazimTrio/alignment.index.AJtrio_Illumina_2x250bps_novoalign_GRCh37_GRCh38_NHGRI_06062016
index
=
alignment.index.AJtrio_Illumina_2x250bps_novoalign_GRCh37_GRCh38_NHGRI_06062016
for
((
i
=
2
;
i <
=
6
;
i
=
i + 2
))
do
# Download the BAM and its md5 checksum
bam
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
1
`
bam_md5
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
2
`
bam_base
=
`
basename
$bam
`
bai_base
=
`
basename
$bai
`
# Validate the md5 checksum
echo
$bam_md5
$bam_base
>
$bam_base
.md5
#
wget $bam
wget
$bam
md5sum
--check
$bam_base
.md5
# Freshly index
samtools index
$bam_base
# Extract FASTQ
java
-jar
../../../../../software/bazam.jar
-bam
$bam_base
-L
../../TWIST.22.bed
-r1
${
bam_base
%.bam
}
_R1.fastq
-r2
${
bam_base
%.bam
}
_R2.fastq
# Gzip the FASTQ reads
gzip
*
.fastq
# Subset to 10K reads per sample
sample
=
`
echo
$bam_base
|
cut
-f
1
-d
'.'
`
seqtk sample
-s
100
${
bam_base
%.bam
}
_R1.fastq.gz 10000
>
${
sample
}
_R1.fastq.gz
seqtk sample
-s
100
${
bam_base
%.bam
}
_R2.fastq.gz 10000
>
${
sample
}
_R2.fastq.gz
# Move to output folder
mkdir
-p
../../AshkenazimTrio
mv
${
sample
}
_R
{
1,2
}
.fastq.gz ../../AshkenazimTrio
done
# Move back up to giab working folder
...
...
@@ -46,16 +115,40 @@ cd ../..
# Fetch the index
mkdir
-p
raw_data/ChineseTrio
cd
raw_data/ChineseTrio
#wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/ChineseTrio/alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
# Download the BAM, BAI, and md5 checksums
#for line in `grep -v BAM_MD5 alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016 | grep GRCh38`
#do
# for file in $line
# do
# wget $file
# done
#done
wget https://raw.githubusercontent.com/genome-in-a-bottle/giab_data_indexes/master/ChineseTrio/alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
index
=
alignment.index.ChineseTrio_Illumina300X100X_wgs_novoalign_GRCh37_GRCh38_NHGRI_04062016
for
((
i
=
2
;
i <
=
6
;
i
=
i + 2
))
do
# Download the BAM and its md5 checksum
bam
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
1
`
bam_md5
=
`
head
-n
$i
$index
|
tail
-n
1 |
cut
-f
2
`
bam_base
=
`
basename
$bam
`
# Validate the md5 checksum
echo
$bam_md5
$bam_base
>
$bam_base
.md5
wget
$bam
md5sum
--check
$bam_base
.md5
# Freshly index
samtools index
$bam_base
# Extract FASTQ
java
-jar
../../../../../software/bazam.jar
-bam
$bam_base
-L
../../TWIST.22.bed
-r1
${
bam_base
%.bam
}
_R1.fastq
-r2
${
bam_base
%.bam
}
_R2.fastq
# Gzip the FASTQ reads
gzip
*
.fastq
# Subset to 10K reads per sample
sample
=
`
echo
$bam_base
|
cut
-f
1
-d
'.'
`
seqtk sample
-s
100
${
bam_base
%.bam
}
_R1.fastq.gz 10000
>
${
sample
}
_R1.fastq.gz
seqtk sample
-s
100
${
bam_base
%.bam
}
_R2.fastq.gz 10000
>
${
sample
}
_R2.fastq.gz
# Move to output folder
mkdir
-p
../../ChineseTrio
mv
${
sample
}
_R
{
1,2
}
.fastq.gz ../../ChineseTrio
done
# Move back up to enclosing folder
cd
../../..
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment