// validation.nf

nextflow.enable.dsl = 2

include {read_inputs} from './inputs.nf'

process observed_md5 {
    /*
    Compute the checksum actually observed for a file by running md5sum on it.

    Emits a tuple of (file name, raw stdout). The stdout holds the first
    whitespace-separated field of md5sum's output, i.e. the checksum, and still
    carries its trailing newline.
    */

    input:
    path(downloaded_file)

    output:
    tuple val("${downloaded_file.getName()}"), stdout

    script:
    """
    md5sum $downloaded_file | awk '{ print \$1 }'
    """
}

process expected_md5 {
    /*
    Look up downloaded_file's expected checksum in md5sum_file. Assumes each
    line of md5sum_file has the form '<checksum> path/to/downloaded.file',
    i.e. the checksum, one or more spaces, then the path (optionally prefixed
    with md5sum's '*' binary-mode marker).

    The lookup compares the final path component of each entry against the
    file name exactly. A plain grep would treat the name as a regular
    expression and as a substring, so 'a.fastq.gz' would also pick up the line
    for 'sample_a.fastq.gz'.

    Emits a tuple of (file name, raw stdout). The stdout is empty when no entry
    matches and contains one checksum per line when several entries share the
    same file name; either case compares unequal to a single observed checksum.
    */

    input:
    path(downloaded_file)
    path(md5sum_file)

    output:
    tuple(val("${downloaded_file.getName()}"), stdout)

    script:
    """
    awk -v target=$downloaded_file '{
        entry = \$0
        sub(/^[^ ]+ +[*]?/, "", entry)
        n = split(entry, parts, "/")
        if (parts[n] == target) print \$1
    }' $md5sum_file
    """
}

process raise_errors {
    /*
    Abort the run with exit status 1 when checksum mismatches were identified.

    errors: the mismatch description(s) to report; a single value or a list of
    values. They are appended to the abort message so the failure names the
    offending files.
    */

    input:
    val(errors)

    exec:
    // Accept either a single description or a list of them.
    def details = [errors].flatten().join('\n')
    exit 1, "MD5 mismatches found:\n${details}"
}

workflow validation {
    /*
    Take a parsed samplesheet, flatten it into a channel of fastq files and
    compare each file's observed checksum against its expected one. Any
    mismatches are printed and passed to raise_errors above, which aborts the
    run.

    take:
        ch_samplesheet_info - items of the form [indv, r1, r2], where r1 and
        r2 are fastq paths.

    The expected checksums are read from 'md5sums.txt' located two directory
    levels above each fastq.

    NOTE(review): observed and expected checksums are paired by file name
    only, so two fastqs sharing a basename in different directories would be
    grouped together - confirm basenames are unique across the samplesheet.
    */

    take:
        ch_samplesheet_info

    main:
        // One channel item per fastq file (r1 and r2 of every sample).
        ch_fastqs = ch_samplesheet_info
            .map { indv, r1, r2 -> [r1, r2] }
            .flatten()
            .map { file(it) }

        // Build the md5sums.txt path with Path.resolve rather than by
        // concatenating a string onto a Path.
        ch_md5_files = ch_fastqs.map { fastq ->
            fastq.getParent().getParent().resolve('md5sums.txt')
        }

        ch_obs = observed_md5(ch_fastqs)
        ch_exp = expected_md5(ch_fastqs, ch_md5_files)

        // concat emits every observed checksum before any expected one, so
        // after groupTuple it[1][0] is the observed value and it[1][1] the
        // expected value. collect emits nothing for an empty channel, so
        // raise_errors only runs when at least one mismatch exists.
        ch_mismatches = ch_obs.concat(ch_exp)
            .map { fastq, md5 -> [fastq, md5.strip()] }
            .groupTuple()
            .filter { it[1][0] != it[1][1] }
            .collect { "${it[0]}: ${it[1][0]} != ${it[1][1]}" }

        ch_mismatches.view { "\nChecksum mismatches:\n${it.join('\n')}" }
        raise_errors(ch_mismatches)
}