From 537f6c09e16dcddd8ae193f1a70c600c6bd274c6 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@ed.ac.uk>
Date: Fri, 13 Aug 2021 10:43:50 +0100
Subject: [PATCH] Initial documentation of software installation on ultra2
 (sdf-cs1) at EIDF/EPCC. Got as far as executing bcbio base installation.

---
 docs/Software_installation_ultra2.md | 176 +++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 docs/Software_installation_ultra2.md

diff --git a/docs/Software_installation_ultra2.md b/docs/Software_installation_ultra2.md
new file mode 100644
index 0000000..19fe100
--- /dev/null
+++ b/docs/Software_installation_ultra2.md
@@ -0,0 +1,176 @@
+# Installation of software for trio whole exome project
+
+## Aspera
+
+Downloaded the Aspera Connect version 3.9.6.1467 installer script from https://downloads.asperasoft.com to /home/u035/project/software/install and ran it. This installs the software in ~/.aspera, so it needs to be moved to the shared folder.
+
+```
+bash ibm-aspera-cli-3.9.6.1467.159c5b1-linux-64-release.sh
+mv ~/.aspera ../aspera
+```
+
+## bcbio
+
+Version 1.2.8 (14 April 2021).
+
+Start with installing the base software, and add datatargets.
+
+This will take a long time, and may require multiple runs if it fails on a step. It will resume if needed. Run in a screen session and log each attempt. It's important to set the limit on the number of concurrently open files as high as possible (4096).
+
+```
+cd /home/u035/u035/shared/software/install
+mkdir bcbio_install_logs
+
+wget https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
+
+ulimit -n 4096
+
+DATE=`date +%Y%m%d%H%M`
+python3 bcbio_nextgen_install.py /home/u035/u035/shared/software/bcbio \
+  --tooldir /home/u035/u035/shared/software/bcbio/tools \
+  --genomes hg38 --aligners bwa \
+  --cores 128 &> bcbio_install_logs/bcbio_install_base_${DATE}.log
+```
+
+Note: the issue described below may already be fixed in version 1.2.8. Check the htslib version after the base install to decide whether the upgrade below is still needed.
+
+Fix an issue with bcbio & vt/samtools/htslib. See https://github.com/bcbio/bcbio-nextgen/issues/3327 and https://github.com/bcbio/bcbio-nextgen/issues/3328.
+
+```
+DATE=`date +%Y%m%d%H%M`
+/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u development --tools &> bcbio_install_logs/bcbio_install_upgrade_tools_${DATE}.log
+```
+
+Install datatarget vep
+
+```
+DATE=`date +%Y%m%d%H%M`
+/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget vep &> bcbio_install_logs/bcbio_install_datatarget_vep_${DATE}.log
+```
+
+gnomAD 3.1.1 installation will take about 6 days (old ultra). Might be faster on ultra2.
+
+```
+DATE=`date +%Y%m%d%H%M`
+/home/u035/project/software/bcbio/tools/bin/bcbio_nextgen.py upgrade -u skip --datatarget gnomad &> bcbio_install_logs/bcbio_install_datatarget_gnomad_${DATE}.log
+```
+
+Increase JVM memory for GATK in galaxy/bcbio_system.yaml
+
+```
+  gatk:
+    jvm_opts: ["-Xms500m", "-Xmx5g"]
+```
+
+### Patch Ensembl VEP 100.4
+
+See https://github.com/Ensembl/ensembl-variation/pull/621/files
+
+Edit /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0/Bio/EnsEMBL/Variation/BaseAnnotation.pm accordingly.
+
+### Verifybamid custom panel for exomes
+
+```
+source /home/u035/project/scripts/trio_whole_exome_config.sh
+
+mkdir /home/u035/project/software/install/1000G_phase3_hg38
+cd /home/u035/project/software/install/1000G_phase3_hg38
+
+# download the 1000 Genomes autosomes + X site VCFs
+for ((i = 1; i <= 22; i = i + 1))
+do
+  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz;
+  wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
+done
+wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz.tbi
+cd ..
+
+# create bare to prefixed chromosome map
+for ((i = 1; i <= 22; i = i + 1))
+do
+  echo $i "chr"$i >> chr_prefix_map.txt
+done
+echo X chrX >> chr_prefix_map.txt
+
+# use the TWIST kit to subset the variants and add the chr prefix at the same time
+for file in 1000G_phase3_hg38/*vcf.gz
+do
+  bname=`basename $file`
+  bcftools view -R /home/u035/project/resources/Twist_Exome_Target_hg38.bed -m2 -M2 -v snps -i 'AF >= 0.01' $file | bcftools annotate --rename-chrs chr_prefix_map.txt | \
+    bgzip -c > ${bname%.vcf.gz}.chr.biallelic.snps.minAF0.01.vcf.gz
+  tabix ${bname%.vcf.gz}.chr.biallelic.snps.minAF0.01.vcf.gz
+done
+
+# concatenate all the files in the correct order
+bcftools concat -o ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz -O z \
+  ALL.chr[1-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chr[12][0-9].shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
+tabix ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz
+
+# use VerifyBamID to create the new panel
+/home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0/VerifyBamID \
+  --RefVCF ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz \
+  --Reference bcbio-1.1.5/genomes/Hsapiens/hg38/seq/hg38.fa
+
+# rename the files to the correct format
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.bed 1000g.phase3.100k.b38.vcf.gz.dat.bed
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.mu 1000g.phase3.100k.b38.vcf.gz.dat.mu
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.PC 1000g.phase3.100k.b38.vcf.gz.dat.V
+mv ALL.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.chr.biallelic.snps.minAF0.01.vcf.gz.UD 1000g.phase3.100k.b38.vcf.gz.dat.UD
+
+# move them into the correct location, backing up the original resource folder
+cd /home/u035/project/software/bcbio/anaconda/share/verifybamid2-1.0.6-0
+mv resource resource.bak
+mkdir resource
+mv /home/u035/project/software/install/1000g.phase3.100k.b38* resource/
+```
+
+## Python modules
+
+### VASE
+
+VASE v0.4 was installed 28 August 2020.
+
+```
+cd /home/u035/project/software
+./bcbio/anaconda/bin/pip3 install "git+https://github.com/david-a-parry/vase.git#egg=project[BGZIP,REPORTER,MYGENE]"
+```
+
+### XlsxWriter
+
+XlsxWriter 1.3.3 was installed 28 August 2020.
+
+```
+cd /home/u035/project/software
+./bcbio/anaconda/bin/pip3 install XlsxWriter
+```
+
+## GATK 3.8
+
+```
+cd /home/u035/project/software/install
+wget https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2
+bzip2 -d GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 
+tar -xf GenomeAnalysisTK-3.8-0-ge9d806836.tar
+mv GenomeAnalysisTK-3.8-0-ge9d806836 ../GenomeAnalysisTK-3.8
+```
+
+## RTG tools
+
+```
+cd /home/u035/project/software
+wget https://github.com/RealTimeGenomics/rtg-tools/releases/download/3.11/rtg-tools-3.11-linux-x64.zip
+unzip rtg-tools-3.11-linux-x64.zip
+rm rtg-tools-3.11-linux-x64.zip
+```
+
+## IGV
+
+```
+cd /home/u035/project/software
+wget https://data.broadinstitute.org/igv/projects/downloads/2.8/IGV_Linux_2.8.9.zip
+unzip IGV_Linux_2.8.9.zip
+rm IGV_Linux_2.8.9.zip
+```
-- 
GitLab