diff --git a/docs/Software_installation.md b/docs/Software_installation.md
index 984718cdb44cb2464810b5eac69048e3b9be9b73..9d68329cedde6b0b01e8e181514a42202caf2856 100644
--- a/docs/Software_installation.md
+++ b/docs/Software_installation.md
@@ -48,6 +48,65 @@ Increase JVM memory for GATK in galaxy/bcbio_system.yaml
   jvm_opts: ["-Xms500m", "-Xmx5g"]
 ```
 
+### Verifybamid custom panel for exomes
+
+```
+source /home/u035/project/scripts/trio_whole_exome_config.sh
+
+mkdir -p /home/u035/project/software/install/1000G_phase3_hg38
+cd /home/u035/project/software/install/1000G_phase3_hg38
+
+# download the 1000 Genomes autosomes + X site VCFs together with their tabix indexes
+for ((i = 1; i <= 22; i = i + 1))
+do
+  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
+done
+wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
+cd ..
+
+# create bare to prefixed chromosome map: one "old_name new_name" pair per line, rewritten from scratch on every run
+for ((i = 1; i <= 22; i = i + 1))
+do
+  echo "$i chr$i"
+done > chr_prefix_map.txt
+echo "X chrX" >> chr_prefix_map.txt
+
+# use the TWIST kit to subset the variants and add the chr prefix at the same time; outputs go to the current directory with a .chr suffix
+for file in 1000G_phase3_hg38/*vcf.gz
+do
+  bname=$(basename "$file" .vcf.gz)
+  bcftools view -R /home/u035/project/resources/Twist_Exome_Target_hg38.bed -m2 -M2 -v snps -i 'AF >= 0.01' "$file" | bcftools annotate --rename-chrs chr_prefix_map.txt |
+    bgzip -c > "${bname}.chr.biallelic.snps.minAF0.01.vcf.gz"
+  tabix "${bname}.chr.biallelic.snps.minAF0.01.vcf.gz"
+done
+
+# concatenate all the files in the correct order (chr1-9, then chr10-22, then chrX)
+bcftools concat -o ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz -O z \
+  ALL.chr[1-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chr[12][0-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
+tabix ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
+
+# use VerifyBamID to create the new panel (note: the --Reference path is relative to the current directory)
+/home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0/VerifyBamID \
+  --RefVCF ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  --Reference bcbio-1.1.5/genomes/Hsapiens/hg38/seq/hg38.fa
+
+# rename the files to the correct format
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.bed 1000g.phase3.100k.b38.vcf.gz.dat.bed
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.mu 1000g.phase3.100k.b38.vcf.gz.dat.mu
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.PC 1000g.phase3.100k.b38.vcf.gz.dat.V
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.UD 1000g.phase3.100k.b38.vcf.gz.dat.UD
+
+# move them into the correct location (they were created in the install directory), backing up the original resource folder
+cd /home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0
+mv resource resource.bak
+mkdir resource
+mv /home/u035/project/software/install/1000g.phase3.100k.b38* resource/
+```
+
 ## VASE
 
 VASE was installed following the instructions at https://github.com/david-a-parry/vase, downloaded 27 August 2020.