diff --git a/README.md b/README.md index e241b539eb9583ef4950cdad3c48c3f025f35120..7edbc7775fa22fa577e1b723dea62bfb61380708 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool ## Resources -Download the EGA Cryptor JAR file. It's at bin/ega-cryptor-2.0.0.jar in this repository if the EGA link doesn't work. +The EGA-Cryptor JAR file is from ega-archive.org and stored at `/modules/local/ega/encrypt/resources`. ``` wget https://ega-archive.org/files/EgaCryptor.zip @@ -18,22 +18,34 @@ rm EgaCryptor.zip ## Running -The CSV file used to upload sample metadata to EGA must be provided. It links the internal EGA sample alias to its name. This pipeline assumes that the FASTQ files for upload are named in the format sample_R1.fastq.gz, sample_R2.fastq.gz. +The CSV file used to upload sample metadata to EGA must be provided. It links the internal EGA sample alias to its name. This pipeline assumes that the FASTQ files for upload are named in the format `sample_R1.fastq.gz`, `sample_R2.fastq.gz`, where `sample` is the entry in the `subjectId` field of the sample CSV file. 
+ +To run and upload automatically: ``` -nextflow run https://git.ecdf.ed.ac.uk/igmmbioinformatics/ega-submission-via-portal \ +nextflow run https://git.ecdf.ed.ac.uk/igmmbioinformatics/ega-submission-via-portal \ -profile conda \ --reads '*_R{1,2}.fastq.gz' \ --samples /absolute/path/to/samples.csv \ - --ega_cryptor /absolute/path/to/ega-cryptor-2.0.0.jar \ --outdir output \ --ega_user ega-box-1234 \ - --ega_password password + --egapass /absolute/path/to/egapass +``` + +To encrypt and produce a `runs.csv` file without uploading: + +``` +nextflow run ameynert/ega-submission-via-portal \ + -profile conda \ + --reads '*_R{1,2}.fastq.gz' \ + --samples /absolute/path/to/samples.csv \ + --outdir output ``` The CSV file for connecting uploaded paired-end FASTQ files to their sample aliases in the EGA Submitter Portal will be in the specified output folder as runs.csv. ## Credits -ega-submission-via-portal was originally written by Alison Meynert (alison.meynert@igmm.ed.ac.uk). +Alison Meynert (alison.meynert@ed.ac.uk) +Murray Wham (murray.wham@ed.ac.uk) diff --git a/conf/eddie.config b/conf/eddie.config deleted file mode 100644 index fa62c115057d77d1f827bc06cde768c6eaa40a24..0000000000000000000000000000000000000000 --- a/conf/eddie.config +++ /dev/null @@ -1,40 +0,0 @@ -/* - * ---------------------------------------------------- - * University of Edinburgh eddie config file - * ---------------------------------------------------- - */ - -executor = "local" - -process { - - beforeScript = """ - . /etc/profile.d/modules.sh - sleep 2; - """ - - penv = "sharedmem" - - cpus = 2 - memory = 4.GB - time = 4.h - clusterOptions = "-l h_vmem=${memory.toString().replaceAll(/[\sB]/,'')}" - - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'terminate' } - maxRetries = 1 - maxErrors = '-1' - - withName: encrypt { - cpus = { check_max( 8, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } - } - -} - -params { - // Defaults only, expecting to be overwritten - max_memory = 256.GB - max_cpus = 16 - max_time = 240.h -} diff --git a/main.nf b/main.nf index 5e7c66d34082abf80c0dc4a29368c7d98c91f364..71684064467639dbcc76f2fdf71b6716785891bc 100644 --- a/main.nf +++ b/main.nf @@ -1,4 +1,6 @@ #!/usr/bin/env nextflow +nextflow.enable.dsl=2 + /* ======================================================================================== ega-submission-via-portal @@ -7,130 +9,49 @@ ---------------------------------------------------------------------------------------- */ -def helpMessage() { - log.info""" +include { EGA_ENCRYPT } from './modules/local/ega/encrypt' +include { EGA_COLLECTRUNCSVS } from './modules/local/ega/collectruncsvs' +include { EGA_UPLOAD } from './modules/local/ega/upload' + +def helpMessage = """ Usage: The typical command for running the pipeline is as follows: - nextflow run https://git.ecdf.ed.ac.uk/igmmbioinformatics/ega-submission-via-portal --reads '*_R{1,2}.fastq.gz' -profile conda + nextflow run https://git.ecdf.ed.ac.uk/igmmbioinformatics/ega-submission-via-portal + --samples samples.csv + --reads '*_R{1,2}.fastq.gz' + -profile conda + Mandatory arguments: - --reads [file] Path to input data (must be surrounded with quotes) - --samples [file] Path to EGA sample.csv file - --ega_cryptor [file] Absolute path to EGA Cryptor JAR file (included in bin/ega-cryptor-2.0.0.jar) - --ega_user [str] EGA upload box account (e.g. ega-box-1234) - --ega_pass [str] Password for EGA upload box account (TODO: securely pass this through to the upload process) + --samples [file] Path to samples CSV file + --reads [file] Path to input data (must be surrounded with quotes, e.g. 
'*_R{1,2}.fastq.gz') -profile [str] Configuration profile to use. Can use multiple (comma separated) - Available: conda + Available: conda, stubs Other options: - --outdir [file] The output directory where the results will be saved - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic - - """.stripIndent() -} - -// Show help message -if (params.help) { - helpMessage() - exit 0 -} - -/* - * SET UP CONFIGURATION VARIABLES - */ - -// Has the run name been specified by the user? -// this has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName -} - -/* - * Create a channel for input read files - */ -ch_read_files = Channel - .fromFilePairs(params.reads, size : 2) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!" } - -/* - * STEP 1 - Encrypt - */ -process encrypt { - tag "$name" - - input: - set val(name), file(reads) from ch_read_files - - output: - set val(name), file('*') into ch_encrypt_results - - script: - """ - java -Xmx8g -jar ${params.ega_cryptor} -i ${reads[0]} -t 8 -o . - java -Xmx8g -jar ${params.ega_cryptor} -i ${reads[1]} -t 8 -o . - """ -} - -/* - * Duplicate the encrypted reads channel - */ -ch_encrypt_results.into { ch_runs_csv_input; ch_upload_input } - -/* - * STEP 2 - Generate a line of CSV output for runs - */ -process runs_csv { - - input: - set sample, file(files) from ch_runs_csv_input - - output: - file "*.csv" into ch_runs_csv_output - - script: - """ - echo "${sample},${sample}_R1.fastq.gz,`cat ${files[1]}`,`cat ${files[2]}`,${sample}_R2.fastq.gz,`cat ${files[4]}`,`cat ${files[5]}`" > ${sample}.csv - """ -} + --outdir [file] The output directory where the results will be saved + --ega_user [str] EGA upload box account (e.g. 
ega-box-1234) + --egapass [str] Absolute path to a file containing password for EGA upload box account, must be specified if --ega_user is specified + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic +""".stripIndent() -/* - * STEP 3 - Collect the CSV output for runs - */ -process collect_runs_csv { - publishDir "${params.outdir}", mode: 'copy' +workflow { + if (params.help) { + log.info(helpMessage) + exit 0 + } - input: - file(files) from ch_runs_csv_output.collect() + ch_read_files = Channel + .fromFilePairs(params.reads, size: 2) + .ifEmpty({ exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!" }) - output: - file(runs) + ch_egapass = params.egapass && file(params.egapass).exists() ? Channel.fromPath(params.egapass).collect() : Channel.value('') - script: - runs = "runs.csv" - """ - echo \"Sample alias\",\"First Fastq File\",\"First Checksum\",\"First Unencrypted checksum\",\"Second Fastq File\",\"Second Checksum\",\"Second Unencrypted checksum\" > runs_pre.csv - cat ${files} >> runs_pre.csv - map_sample_alias.pl -i runs_pre.csv -s ${params.samples} -o ${runs} - """ + EGA_ENCRYPT(ch_read_files) + EGA_COLLECTRUNCSVS(EGA_ENCRYPT.out.csv.map({sample, csv -> csv}).collect()) + EGA_UPLOAD(EGA_ENCRYPT.out.all, ch_egapass) } -/* - * STEP 4 - Upload output via Aspera to EGA box - */ -/*process upload { - - input: - set sample, file(files) from ch_upload_input - - output: - - script: - """ - export ASPERA_SCP_PASS=${params.ega_pass} - ascp -T -P 33001 -O 33001 -l 300M -QT -L- -k 1 ${sample}* ${params.ega_user}@fasp.ega.ebi.ac.uk:/. 
- """ -}*/ diff --git a/modules/local/ega/collectruncsvs.nf b/modules/local/ega/collectruncsvs.nf new file mode 100644 index 0000000000000000000000000000000000000000..9d1796e6170a36b9c50b73e0738e98f5be698878 --- /dev/null +++ b/modules/local/ega/collectruncsvs.nf @@ -0,0 +1,18 @@ +/* + * STEP 2 - Collect the CSV output for runs + */ + +process EGA_COLLECTRUNCSVS { + input: + path(files) + + output: + path("runs.csv") + + script: + """ + echo \"Sample alias\",\"First Fastq File\",\"First Checksum\",\"First Unencrypted checksum\",\"Second Fastq File\",\"Second Checksum\",\"Second Unencrypted checksum\" > runs.csv + cat ${files} >> runs.csv + """ +} + diff --git a/modules/local/ega/encrypt/environment.yml b/modules/local/ega/encrypt/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..f19269428713e4a676640d4a0c2e6834fc810e69 --- /dev/null +++ b/modules/local/ega/encrypt/environment.yml @@ -0,0 +1,8 @@ +--- +name: ega-encrypt +channels: + - conda-forge + - defaults +dependencies: + - conda-forge::openjdk + diff --git a/modules/local/ega/encrypt/main.nf b/modules/local/ega/encrypt/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..25b9f142c22bb8addc3e7790e7dfef1e517ae4d1 --- /dev/null +++ b/modules/local/ega/encrypt/main.nf @@ -0,0 +1,34 @@ +/* + * STEP 1 - Encrypt the FASTQ files. Generate a line of CSV output for runs, if not uploading to EGA, move the + * encrypted files and md5 checksums to the output directory. + */ +process EGA_ENCRYPT { + tag "$sample" + conda "${moduleDir}/environment.yml" + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path('*.csv'), emit: csv + tuple val(sample), path('*.*'), emit: all + + script: + """ + java -Xmx8g -jar ${moduleDir}/resources/ega-cryptor-2.0.0.jar -i ${reads[0]} -t 8 -o . + java -Xmx8g -jar ${moduleDir}/resources/ega-cryptor-2.0.0.jar -i ${reads[1]} -t 8 -o . 
+ + echo "sample_${sample},${sample}_R1.fastq.gz,`cat ${sample}_R1.fastq.gz.gpg.md5`,`cat ${sample}_R1.fastq.gz.md5`,${sample}_R2.fastq.gz,`cat ${sample}_R2.fastq.gz.gpg.md5`,`cat ${sample}_R2.fastq.gz.md5`" > ${sample}.csv + """ + + stub: + """ + for f in ${reads} + do + touch \${f}.{md5,gpg,gpg.md5} + done + + echo "sample_${sample},${sample}_R1.fastq.gz,`cat ${sample}_R1.fastq.gz.gpg.md5`,`cat ${sample}_R1.fastq.gz.md5`,${sample}_R2.fastq.gz,`cat ${sample}_R2.fastq.gz.gpg.md5`,`cat ${sample}_R2.fastq.gz.md5`" > ${sample}.csv + """ +} + diff --git a/bin/ega-cryptor-2.0.0.jar b/modules/local/ega/encrypt/resources/ega-cryptor-2.0.0.jar similarity index 100% rename from bin/ega-cryptor-2.0.0.jar rename to modules/local/ega/encrypt/resources/ega-cryptor-2.0.0.jar diff --git a/modules/local/ega/upload/environment.yml b/modules/local/ega/upload/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..5430bb468c61f0666a53f91b26bcc2a8a989a2b2 --- /dev/null +++ b/modules/local/ega/upload/environment.yml @@ -0,0 +1,9 @@ +--- +name: ega-upload +channels: + - hcc + - conda-forge + - defaults +dependencies: + - aspera-cli + diff --git a/modules/local/ega/upload/main.nf b/modules/local/ega/upload/main.nf new file mode 100644 index 0000000000000000000000000000000000000000..03ef357ed1ad37176215efdae51be4af85aa3015 --- /dev/null +++ b/modules/local/ega/upload/main.nf @@ -0,0 +1,27 @@ +/* + * STEP 3 - Upload output via Aspera to EGA box + */ +process EGA_UPLOAD { + tag "${sample}" + conda "${moduleDir}/environment.yml" + + when: params.ega_user + + input: + tuple val(sample), path(files) + val(egapass) // not a path to avoid excessive linking and accidental copying of pass file if stageInMode 'copy' is used + + script: + """ + ls ${egapass[0]} + export ASPERA_SCP_PASS=\$(cat ${egapass[0]}) + ascp -T -P 33001 -O 33001 -l 300M -QT -L- -k 1 ${sample}* ${params.ega_user}@fasp.ega.ebi.ac.uk:/. 
+ """ + + stub: + """ + echo 'Would run:' + echo 'export ASPERA_SCP_PASS=\$(cat ${egapass[0]})' + echo 'ascp -T -P 33001 -O 33001 -l 300M -QT -L- -k 1 ${sample}* ${params.ega_user}@fasp.ega.ebi.ac.uk:/.' + """ +} diff --git a/nextflow.config b/nextflow.config index 12b87ba24f7e387f1a9047807f97ed45b8c507a3..75513aeeaab3903f6f8c7d1bf1bd3aec57098684 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,9 +10,8 @@ params { // Workflow flags reads = "data/*.fastq.gz" - ega_cryptor = "bin/ega-cryptor-2.0.0.jar" - ega_user = "ega-box-1234" - ega_pass = "password" + ega_user = "" + egapass = "" outdir = './results' samples = 'samples.csv' @@ -31,11 +30,41 @@ params { } -// Load eddie.config by default for all pipelines -includeConfig 'conf/eddie.config' +process { + withName: EGA_ENCRYPT { + cpus = 8 + memory = 12.GB + time = 2.h + + publishDir = [ + path: { params.outdir }, + mode: 'symlink', + pattern: '*.{gpg,md5}', + enabled: params.ega_user == null || params.ega_user == '' + ] + } + + withName: EGA_COLLECTRUNCSVS { + cpus = 1 + memory = 2.GB + time = 10.m + + publishDir = [ + path: { params.outdir }, + mode: 'copy' + ] + } + + withName: EGA_UPLOAD { + cpus = 1 + memory = 2.GB + time = 8.h + } +} profiles { - conda { process.conda = "$baseDir/environment.yml" } + conda { conda.enabled = true } + stubs { conda.enabled = false } } // Export this variable to prevent local Python libraries from conflicting with those in the container