From bd20f643ea7f3613de52a811dc1bf1570bf00b90 Mon Sep 17 00:00:00 2001 From: s1734289 <s1734289@sms.ed.ac.uk> Date: Wed, 22 Jun 2022 10:24:57 +0100 Subject: [PATCH] Update run_on_eddie_guide.md --- docs/run_on_eddie_guide.md | 92 +++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/docs/run_on_eddie_guide.md b/docs/run_on_eddie_guide.md index 1a1af33..496bd1e 100644 --- a/docs/run_on_eddie_guide.md +++ b/docs/run_on_eddie_guide.md @@ -2,5 +2,97 @@ A guide for any new users logging on to eddie and running the GIAB tests for the first time. This is based on my experience learning how to run the tests on eddie during my master's project. It took me 6 weeks to get the tests to successfully run. Hopefully this guide will make the process much easier for anyone in the future. The work was done in branch that was created off of master commit 6d2af8344dcecd588635019ffe94f629eeb10cb3 on the 4th of May. +## Login and clone repository +- Log in to eddie using the command `ssh [user]@eddie.ecdf.ed.ac.uk`. The user should be your id, such as a student number +- Go to the shared directory `cd /exports/igmm/eddie/IGMM-VariantAnalysis/` and create a folder to work in, e.g. `mkdir emma` +- Enter your folder `cd emma` and clone the git repository `git clone https://git.ecdf.ed.ac.uk/igmmbioinformatics/trio-whole-exome.git` +This will clone the master branch of the git repository into your folder + +- Create a new branch to work on `git branch [new branch]` +- Switch branch so you are on your new branch `git checkout [new branch]` + +## Create a conda environment + +A specific conda environment is needed for running the pipeline. Luckily the specifications for this environment are provided in environment.yml + +- If this is your first time creating a conda environment on eddie, you will likely need to configure your .condarc file. 
+Instructions on how to do this are available at https://www.wiki.ed.ac.uk/display/ResearchServices/Anaconda +- Log in to an interactive session: `qlogin -l h_vmem=16G`. Eddie needs at least 4G of memory to load modules, so create an interactive session and request enough +- Load the anaconda module `module load anaconda` +- Create the conda environment using environment.yml `conda env create -n trio-pipe-env -f environment.yml` +- Activate the conda environment `source activate trio-pipe-env` + +## Create a config file for running nextflow on eddie + +Here is an example eddie.config file. It will likely need to be changed so paths to files reflect the environment they are being run in. + +``` +params { + // path to BCBio - should contain anaconda/bin/bcbio_nextgen.py + bcbio = '/exports/igmm/eddie/IGMM-VariantAnalysis/software/bcbio-1.0.9' + + // this will just be `<pipeline_repo>/trio_whole_exome_parse_peddy_ped_csv.pl`. Won't need this once + // Alison's merged some stuff + parse_peddy_output = '/exports/igmm/eddie/IGMM-VariantAnalysis/emma/trio-whole-exome/trio_whole_exome_parse_peddy_ped_csv.pl' + + // base BCBio variant calling template + bcbio_template = '/exports/igmm/eddie/IGMM-VariantAnalysis/emma/trio-whole-exome/tests/assets/bcbio/bcbio_template.yaml' + + // exome target BED file + target_bed = '/exports/igmm/eddie/IGMM-VariantAnalysis/emma/trio-whole-exome/tests/assets/input_data/Twist_Exome_RefSeq_targets_hg38.plus15bp.bed' + + // HG38 reference genome + reference_genome = '/exports/igmm/eddie/IGMM-VariantAnalysis/software/bcbio-1.0.9/genomes/Hsapiens/hg38/seq/hg38.fa' + + // pipeline outputs + output_dir = '/exports/igmm/eddie/IGMM-VariantAnalysis/emma/trio-whole-exome/tests/outputs' +} + +executor { + name = "local" + queueSize = "100" +} + +process { + // Set h_vmem to memory / cpus, and use the correct parallel environment (not set if cpus = 1) + clusterOptions = {"-l h_vmem=${(task.memory + 8.GB).bytes/task.cpus}"} + penv = { task.cpus > 1 ? 
+ "sharedmem" : null } + + // Use $TMPDIR for process execution + scratch = true + + // Attempt to retry up to 3 times if one of these common SGE error statuses is returned + // errorStrategy = {task.exitStatus in [143,137,104,134,139,140] ? 'retry' : 'finish'} + + errorStrategy = 'retry' + maxRetries = 1 + + // Allow at most 5 errors for a process across all instances + maxErrors = 5 + + //see if I can turn off parallel garbage collection for java + + + // Load Singularity and correctly set the container unpacking $TMPDIR environment variable + beforeScript = + """ + . /etc/profile.d/modules.sh + module load 'anaconda/5.3.1' + module load 'singularity' + module load igmm/apps/BEDTools + module load igmm/apps/samtools/1.6 + export SINGULARITY_TMPDIR="\$TMPDIR" + export SINGULARITY_BIND="/gpfs" + """ +} + + singularity { + envWhitelist = "SINGULARITY_TMPDIR,TMPDIR,SINGULARITY_BIND" + runOptions = '-p -B "$TMPDIR"' + enabled = true + autoMounts = true +} +``` +This setup allows nextflow to run singularity, -- GitLab