From 59bec6ab61372c291560f56e2ce12909bf143ed5 Mon Sep 17 00:00:00 2001 From: Murray Wham <murray.wham@ed.ac.uk> Date: Fri, 25 Oct 2024 13:24:17 +0100 Subject: [PATCH] Initial commit of metadata submission script --- .gitignore | 4 +- metadata/ega_metadata.py | 375 +++++++++++++++++++++++ metadata/requirements.txt | 4 + metadata/templates/analysis.xml | 23 ++ metadata/templates/dac.xml | 23 ++ metadata/templates/dataset.xml | 25 ++ metadata/templates/experiment.xml | 30 ++ metadata/templates/policy.xml | 37 +++ metadata/templates/run.xml | 13 + metadata/templates/sample.xml | 38 +++ metadata/templates/study.xml | 17 + metadata/templates/submission.xml | 20 ++ metadata/tests/abstract.txt | 1 + metadata/tests/ega_upload.yaml | 47 +++ metadata/tests/expected/dac.xml | 8 + metadata/tests/expected/dataset.xml | 10 + metadata/tests/expected/experiment-1.xml | 77 +++++ metadata/tests/expected/experiment-2.xml | 77 +++++ metadata/tests/expected/experiment-3.xml | 52 ++++ metadata/tests/expected/policy.xml | 14 + metadata/tests/expected/run-1.xml | 29 ++ metadata/tests/expected/run-2.xml | 29 ++ metadata/tests/expected/run-3.xml | 20 ++ metadata/tests/expected/sample.xml | 216 +++++++++++++ metadata/tests/expected/study.xml | 15 + metadata/tests/expected/submission.xml | 16 + metadata/tests/receipt.xml | 28 ++ metadata/tests/runs.csv | 9 + metadata/tests/samples.csv | 9 + metadata/tests/study_attributes.csv | 2 + metadata/tests/test_ega_metadata.py | 59 ++++ 31 files changed, 1326 insertions(+), 1 deletion(-) create mode 100644 metadata/ega_metadata.py create mode 100644 metadata/requirements.txt create mode 100644 metadata/templates/analysis.xml create mode 100644 metadata/templates/dac.xml create mode 100644 metadata/templates/dataset.xml create mode 100644 metadata/templates/experiment.xml create mode 100644 metadata/templates/policy.xml create mode 100644 metadata/templates/run.xml create mode 100644 metadata/templates/sample.xml create mode 100644 
"""Generate EGA submission XMLs (submission, DAC, policy, study, samples, runs,
experiments, dataset) from Jinja2 templates.

Configuration is merged from three sources, in order of precedence: command-line
arguments, a YAML file ($EGA_UPLOAD_CONFIG or ./ega_upload.yaml), and the defaults
declared in `options`.
"""
import argparse
import logging
import math
import os
import re
import sys
import uuid
import xml.dom.minidom

import jinja2
import pandas
import yaml

# rendered templates contain blank lines wherever a Jinja block tag stood - collapse them
multi_newline = re.compile(r'\n{2,}')

# Declarative description of every option. Top-level keys are parser names (the root
# parser is keyed by __name__, each subcommand by its own name). Each inner dict is passed
# to argparse's add_argument(), except for 'alias', which is a short flag handled by cmd_args().
options = {
    # universal options - root parser has prog=__name__
    __name__: {
        'debug': {'help': 'Set the logging level to DEBUG', 'action': 'store_true', 'default': False, 'alias': '-d'},
        # the default must be the boolean False: the string 'False' is truthy and made every run overwrite
        'force': {'help': 'Always overwrite output XMLs', 'action': 'store_true', 'default': False, 'alias': '-f'},
        'output_dir': {'help': 'Directory to write output XMLs', 'alias': '-o'},
        'submission_contacts': {'help': "Colon-separated name and email to add to the submission for notifications, e.g. 'Murray Wham:murray.wham@ed.ac.uk'. Can be specified multiple times.", 'nargs': '+'}
    },
    'createdac': {
        'dac_alias': {'help': 'Alias for the DAC object to create'},
        'dac_title': {'help': 'Short text to show in searches and displays'},
        'dac_contacts': {'help': "Colon-separated name, email and organisation to add to the dac, e.g. 'Murray Wham:murray.wham@ed.ac.uk:IGC'. Can be specified multiple times.", 'nargs': '+'},
        'dac_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the DAC. Can be specified multiple times.", 'nargs': '+'}
    },
    'createpolicy': {
        'policy_alias': {'help': 'Alias for the policy object to create'},
        'policy_title': {'help': 'Policy title'},
        'dac_accession': {'help': 'EGA accession number for the DAC object to link this policy to'},
        'policy_text': {'help': 'Policy text. This or policy_file_url is required.'},
        'policy_file_url': {'help': 'URL of the policy document. This or policy_text is required'},
        'policy_links': {'help': "Semicolon-separated description and URL of policy materials to link to, e.g. 'IGC Data Access Agreement;https://www.ed.ac.uk/some-policy'. Can be specified multiple times.", 'nargs': '+'},
        'policy_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the policy. Can be specified multiple times.", 'nargs': '+'}
    },
    'createstudy': {
        'study_alias': {'help': 'Unique ID for the study. Used to refer to the study during the submission process and is supposed to be globally unique within the submission account, but is not shared with anyone.'},
        'study_type': {'help': 'Study type accepted by EGA, e.g. Whole Genome Sequencing', 'choices': ('Whole Genome Sequencing', 'Metagenomics', 'Transcriptome Analysis', 'Resequencing', 'Epigenetics', 'Synthetic Genomics', 'Forensic or Paleo-genomics', 'Gene Regulation Study', 'Cancer Genomics', 'Population Genomics', 'RNASeq', 'Exome Sequencing', 'Pooled Clone Sequencing', 'Transcriptome Sequencing', 'Other (Study type not listed)')},
        'study_title': {'help': 'Study title'},
        'study_abstract': {'help': 'Path to plain-text file containing the abstract for this study'},
        'study_attributes': {'help': "CSV or Excel table with two columns ('tag' and 'value'), describing study attributes to add to study.xml"}
    },
    'createsamples': {
        'samples': {'help': "CSV or Excel table with at least 8 columns ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype'), describing samples to add. Any additional columns will be included as sample attributes"},
    },
    'createrunsandexperiments': {
        # type=int so that a command-line value is usable as a number; default 1 means 'do not split'
        'nbatches': {'help': 'Set to a number larger than 1 to split paired-end fastqs into smaller batches. This may be required if uploading large datasets - ega-box accounts should not exceed 8Tb and must not exceed 10Tb.', 'type': int, 'default': 1},
        'runs': {'help': "Path to CSV file linking samples to files. Must have the columns 'Sample alias', 'First Fastq File', 'First Checksum', 'First Unencrypted checksum', 'Second Fastq File', 'Second Checksum', 'Second Unencrypted checksum'", 'default': 'runs.csv'},
        'experiment_title': {'help': 'Title to apply to all created experiment objects'},
        'library_strategy': {'help': 'Value to apply to LIBRARY_STRATEGY, e.g. WGS'},
        'library_source': {'help': 'Value to apply to LIBRARY_SOURCE, e.g. GENOMIC'},
        'library_selection': {'help': 'Value to apply to LIBRARY_SELECTION', 'default': 'other'},
        'library_nominal_length': {'help': 'Library nominal length, e.g. 450'},
        'library_protocol': {'help': 'Description of the library preparation process'},
        'platform_type': {'help': 'Platform type, e.g. illumina'},
        'platform_instrument': {'help': 'Instrument type, e.g. HiSeq X Ten'},
        'file_box_base': {'help': 'Base file path to apply to uploaded files when passing to `filename`, e.g. if files are uploaded to a folder structure on the ega-box FTP server', 'default': ''}
    },
    'createdataset': {
        'receipt_xml': {'help': 'XML receipt file from running stage 1, containing run accesion numbers from EGA'},
        'policy_accession': {'help': "'EGAPxxx...' accession number of the data access policy to apply to this dataset"},
        'dataset_title': {'help': 'Dataset title'},
        'dataset_type': {'help': 'Dataset type accepted by EGA, e.g. Whole genome sequencing', 'choices': ('Whole genome sequencing', 'Exome sequencing', 'Genotyping by array', 'Transcriptome profiling by high-throughput sequencing', 'Transcriptome profiling by array', 'Amplicon sequencing', 'Methylation binding domain sequencing', 'Methylation profiling by high-throughput sequencing', 'Phenotype information', 'Study summary information', 'Genomic variant calling', 'Chromatin accessibility profiling by high-throughput sequencing', 'Histone modification profiling by high-throughput sequencing', 'Chip-Seq')}
    }
}

# templates live in the 'templates' directory next to this module
env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.join(os.path.dirname(__file__), 'templates')),
    autoescape=jinja2.select_autoescape()
)

formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s] %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# module-level state set by configure(): where XMLs are written, and whether to overwrite them
xml_dir = ''
force = False


def cmd_args(argv=None):
    """
    Build the root parser plus one subparser per subcommand from `options`, and parse `argv`.

    :param argv: List of arguments to parse, or None to let argparse read sys.argv
    :return: The parsed namespace. Its `entry` attribute holds the `prog` of the parser that was selected.
    """
    a = argparse.ArgumentParser(__name__)
    subparsers = a.add_subparsers()
    for n, parser_options in options.items():
        p = a if n == __name__ else subparsers.add_parser(n)
        p.set_defaults(entry=p.prog)

        for k, v in parser_options.items():
            # work on a copy - popping 'alias' from the shared dict would lose the short flag on later calls
            kwargs = dict(v)
            alias = kwargs.pop('alias', None)  # e.g. -o for --output_dir
            names = ['--' + k]
            if alias:
                names.append(alias)

            p.add_argument(*names, **kwargs)

    return a.parse_args(argv)


def file_args():
    """
    Load config from the first YAML file found: $EGA_UPLOAD_CONFIG, then ./ega_upload.yaml.

    :return: The parsed YAML content, or an empty dict if no file is found or the file is empty
    """
    for f in (os.getenv('EGA_UPLOAD_CONFIG'), os.path.join(os.getcwd(), 'ega_upload.yaml')):
        if f and os.path.isfile(f):
            with open(f, encoding='utf-8') as h:
                # safe_load returns None for an empty document, which would break the merge step
                return yaml.safe_load(h) or {}
    return {}


def _merge_file_and_cmd_args(file_config, args):
    """
    Combine command-line arguments, file config and declared defaults into a single dict.

    Only the universal options and those of the selected subcommand are considered. A command-line value
    equal to the declared default is indistinguishable from 'not given', so in that case the file config wins.
    Options with no value from any source are left out of the result.

    :param file_config: Dict as returned by file_args()
    :param args: Namespace as returned by cmd_args()
    :return: Dict of option name to resolved value
    """
    cmd_config = vars(args)
    config = {}
    for k in {__name__, args.entry.replace(__name__ + ' ', '')}:
        for k2, v in options[k].items():
            # first populate config from command line, but only when different from the default
            cmd_v = cmd_config.get(k2)
            if cmd_v is not None and cmd_v != v.get('default'):
                config[k2] = cmd_v

            # next with file config
            elif k2 in file_config:
                config[k2] = file_config[k2]

            # finally with declared defaults
            elif 'default' in v:
                config[k2] = v['default']

    return config


def configure(argv=None):
    """
    Resolve the config, set up logging and create the output directory.

    Sets the module-level `xml_dir` and `force`.

    :param argv: List of command-line arguments, or None to use sys.argv
    :return: Tuple of the selected parser's `prog` and the merged config dict
    :raises KeyError: If no output_dir is configured
    """
    args = cmd_args(argv)
    file_conf = file_args()
    config = _merge_file_and_cmd_args(file_conf, args)

    if config['debug']:
        handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)

    if not config.get('output_dir'):
        raise KeyError('output_dir is required - pass --output_dir/-o or set it in the config file')

    logger.info('Output dir: %s', config['output_dir'])
    global xml_dir
    xml_dir = config['output_dir']
    os.makedirs(xml_dir, exist_ok=True)

    global force
    if config['force']:
        force = True

    return args.entry, config


def main(argv=None):
    """
    Entry point: write submission.xml, then the XML(s) for the selected subcommand, if any.

    :param argv: List of command-line arguments, or None to use sys.argv
    """
    logger.addHandler(handler)
    entry, config = configure(argv)

    # we'll always need this
    createsubmission(config)

    commands = (
        createdac,
        createpolicy,
        createstudy,
        createsamples,
        createrunsandexperiments,
        createdataset
    )
    for command in commands:
        if entry.endswith(command.__name__):
            command(config)
            break


def createsubmission(config):
    """Write submission.xml, listing any 'name:email' submission contacts from the config."""
    write_xml(
        'submission.xml',
        config=config,
        submission_contacts=[c.split(':') for c in config.get('submission_contacts') or ()]
    )


def _split_attributes(attributes, option_name):
    """
    Parse 'tag:value' or 'tag:value:unit' strings into [tag, value, unit] lists, with unit None when absent.

    :param attributes: Iterable of strings, or None
    :param option_name: Name of the option being parsed, for error messages
    :raises ValueError: If an entry does not have two or three colon-separated fields
    """
    parsed = []
    for attr in attributes or ():
        split_attr = attr.split(':')
        if len(split_attr) == 2:  # no unit
            split_attr.append(None)
        elif len(split_attr) != 3:
            raise ValueError("Bad value '%s' for %s - expected 'tag:value' or 'tag:value:unit'" % (attr, option_name))

        parsed.append(split_attr)

    return parsed


def createdac(config):
    """Write dac.xml with the configured 'name:email:organisation' contacts and extra attributes."""
    write_xml(
        'dac.xml',
        config=config,
        dac_contacts=[c.split(':') for c in config.get('dac_contacts') or ()],
        dac_attributes=_split_attributes(config.get('dac_attributes'), 'dac_attributes')
    )


def createpolicy(config):
    """
    Write policy.xml.

    :raises KeyError: If neither policy_text nor policy_file_url is configured
    :raises ValueError: If a policy link is not in the form 'description;url'
    """
    # use .get() - unset options are absent from config, and either one of the two is sufficient
    if not config.get('policy_text') and not config.get('policy_file_url'):
        raise KeyError('policy_text or policy_file_url is required')

    # the template iterates policy_links.items(), so build a description -> URL mapping. Split on the
    # first semicolon only, as documented - URLs contain colons
    policy_links = {}
    for link in config.get('policy_links') or ():
        description, separator, url = link.partition(';')
        if not separator:
            raise ValueError("Bad value '%s' for policy_links - expected 'description;url'" % link)

        policy_links[description] = url

    write_xml(
        'policy.xml',
        config=config,
        policy_links=policy_links,
        policy_attributes=_split_attributes(config.get('policy_attributes'), 'policy_attributes')
    )


def createstudy(config):
    """
    Write study.xml, with study attributes read from the table at config['study_attributes'].

    :raises ValueError: If the same tag appears more than once in the table
    """
    # NOTE(review): the study.xml template reads config.abstract, but only the file path 'study_abstract' is
    # ever configured and it is never read here, so STUDY_ABSTRACT presumably renders empty - confirm and fix
    # together with the template and the expected test output.
    df = read_table(config['study_attributes'])
    # comparing against list(set(...)) depends on arbitrary set ordering - check for repeats directly
    if df['tag'].duplicated().any():
        raise ValueError('Duplicate tags found in %s' % config['study_attributes'])

    study_attributes = {row.tag: row.value for row in df.itertuples()}
    write_xml('study.xml', config=config, study_attributes=study_attributes)


def createsamples(config):
    """
    Write sample.xml from the table at config['samples'].

    The first 8 columns must be the mandatory ones, in order. Any further columns become extra sample
    attributes. Empty cells are left out, so that the template can apply its defaults.

    :raises ValueError: If the first 8 columns are not the mandatory columns
    """
    df = read_table(config['samples'])
    mandatory_columns = ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype')
    extra_cols = tuple(c for c in df.columns if c not in mandatory_columns)
    if tuple(df.columns[:8]) != mandatory_columns:
        raise ValueError('First 8 columns of %s must be %s' % (config['samples'], mandatory_columns))

    samples = []
    for _, s in df.iterrows():
        new_sample = {k: s[k] for k in mandatory_columns if pandas.notna(s[k])}
        new_sample['alias'] = new_sample['id']
        # pandas represents empty cells as NaN rather than None - skip them instead of rendering 'nan'
        new_sample['attributes'] = {c: s[c] for c in extra_cols if pandas.notna(s[c])}
        samples.append(new_sample)

    write_xml('sample.xml', config=config, samples=samples)


def createrunsandexperiments(config):
    """
    Write experiment-N.xml and run-N.xml for each batch of rows in the table at config['runs'].

    Every row gets a new experiment ID and run ID. All experiment IDs are generated before any run ID.
    """
    # NOTE(review): the experiment.xml template reads a top-level `experiment_title`, which is not passed
    # below, so the configured title presumably never reaches the XML - confirm and fix together with the
    # template and the expected test output.
    idgen = IDGenerator(xml_dir)
    df = read_table(config['runs'])
    df['experiment_id'] = [idgen.new() for _ in range(len(df))]
    df['run_id'] = [idgen.new() for _ in range(len(df))]

    file_box_base = config.get('file_box_base', '')
    for column, fastq_column in (('r1', 'First Fastq File'), ('r2', 'Second Fastq File')):
        df[column] = df[fastq_column].map(lambda f: os.path.join(file_box_base, f))

    df['file_type_r1'] = df['First Fastq File'].map(get_file_type)
    df['file_type_r2'] = df['Second Fastq File'].map(get_file_type)

    nsamples = len(df)
    # config built by hand or from the command line may carry a string, or no value at all
    nbatches = int(config.get('nbatches', 1))
    logger.info('Splitting %i samples into %i batches', nsamples, nbatches)

    for i, (j, k) in enumerate(batches(nsamples, nbatches), start=1):
        batch = [line for _, line in df[j:k].iterrows()]
        write_xml('experiment.xml', 'experiment-%i.xml' % i, id_map=batch, config=config)
        write_xml('run.xml', 'run-%i.xml' % i, id_map=batch, config=config)


def createdataset(config):
    """Write dataset.xml, referencing the accession of every RUN element in the receipt XML."""
    dom = xml.dom.minidom.parse(config['receipt_xml'])
    run_accessions = [r.attributes['accession'].value for r in dom.getElementsByTagName('RUN')]
    idgen = IDGenerator(xml_dir)
    write_xml('dataset.xml', config=config, dataset_alias=idgen.new(), runs=run_accessions, analyses=())


def read_table(f: str) -> pandas.DataFrame:
    """
    Read a .csv or .xlsx file into a DataFrame.

    :param f: Path to the file
    :raises NameError: If the file extension is neither .csv nor .xlsx
    """
    if f.endswith('.csv'):
        return pandas.read_csv(f)
    elif f.endswith('.xlsx'):
        return pandas.read_excel(f, engine='openpyxl')
    else:
        raise NameError('Unrecognised file format for %s - must be csv or xlsx' % f)


def render_xml(template, **kwargs):
    """Render the named template with `kwargs`, collapsing runs of newlines into one."""
    return multi_newline.sub('\n', env.get_template(template).render(**kwargs))


def write_xml(template_name, xml_name_out=None, **kwargs):
    """
    Render a template into `xml_dir`. An existing file is only overwritten if `force` is set.

    :param template_name: Name of the template to render
    :param xml_name_out: Output file name, defaulting to `template_name`
    :param kwargs: Variables to pass to the template
    """
    xml_name_out = xml_name_out or template_name
    path = os.path.join(xml_dir, xml_name_out)
    if not force and os.path.isfile(path):
        logger.info('%s already exists', xml_name_out)
        return

    content = render_xml(template_name, **kwargs)
    # submission.xml declares itself as UTF-8, so don't depend on the platform's default encoding
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

    logger.info('Written %s', xml_name_out)
    logger.debug(content)


class IDGenerator:
    """Generates UUID-based aliases that do not clash with aliases already present in a directory of XMLs."""

    def __init__(self, xml_dir=None):
        """
        :param xml_dir: Directory to scan for aliases already in use. If not given, no aliases are
            considered taken.
        """
        if xml_dir:
            self.existing_ids = self.scrape_xml_dir(xml_dir)
        else:
            self.existing_ids = set()

    def new(self):
        """
        Return a new ID and record it as taken, trying up to 3 times to find one not already in use.

        :raises ValueError: If every attempt produced an existing ID
        """
        candidate = None
        for _ in range(3):
            candidate = str(uuid.uuid1())

            if candidate in self.existing_ids:
                logger.warning('Attempted to generate an ID %s that already existed - retrying', candidate)
            else:
                self.existing_ids.add(candidate)
                return candidate

        raise ValueError('Failed to generate an experiment ID. Last tried: %s' % candidate)

    @staticmethod
    def scrape_xml_dir(xml_dir):
        """Return the set of `alias` attribute values found on known elements in all .xml files under `xml_dir`."""
        ids = set()
        tags = ('DAC', 'POLICY', 'EXPERIMENT', 'RUN', 'STUDY', 'SAMPLE')
        for root, dirs, files in os.walk(xml_dir):
            for f in files:
                if not f.endswith('.xml'):
                    continue

                dom = xml.dom.minidom.parse(os.path.join(root, f))
                for t in tags:
                    for e in dom.getElementsByTagName(t):
                        if e.hasAttribute('alias'):
                            ids.add(e.getAttribute('alias'))

        return ids


def get_file_type(filename):
    """
    Return the file type for a data file name. Only gzipped fastqs are recognised.

    :raises NameError: If the file name does not end in fq.gz or fastq.gz
    """
    if filename.endswith(('fq.gz', 'fastq.gz')):
        return 'fastq'

    raise NameError('Could not determine file type for file %s' % filename)


def batches(nitems, nbatches):
    """
    Split a number of items (`nitems`) into `nbatches` number of slices, taking
    remainders into account if needed. E.g, splitting 99 fastq pairs into 4 batches
    -> [25, 25, 25, 24] -> [(0, 25), (25, 50), (50, 75), (75, 99)].

    Batches are filled to ceil(nitems / nbatches) items each, so fewer than `nbatches`
    slices can be returned, e.g. 5 items in 4 batches -> [(0, 2), (2, 4), (4, 5)].

    :param nitems: Number of items to split
    :param nbatches: Maximum number of batches
    :return: List of (start, end) tuples, usable as slice bounds. Empty if there are no items.
    :raises ValueError: If `nbatches` is less than 1
    """
    if nbatches < 1:
        raise ValueError('nbatches must be at least 1, got %s' % nbatches)

    if nitems < 1:
        # nothing to split - and a batch size of 0 would divide by zero below
        return []

    full_batch = math.ceil(nitems / nbatches)  # full size batch
    nfull_batches = nitems // full_batch
    _batches = [
        (i * full_batch, (i * full_batch) + full_batch)
        for i in range(nfull_batches)
    ]
    remainder = nitems % full_batch
    if remainder:
        last_batch = (nitems - remainder, nitems)
        _batches.append(last_batch)

    return _batches


if __name__ == '__main__':
    main()
+<DAC_SET> + <DAC alias="{{ config.dac_alias }}" center_name="{{ center_name }}" broker_name="EGA"> + <TITLE>{{ config.dac_title }}</TITLE> + <CONTACTS> +{% for name, email, org in dac_contacts %} + <CONTACT name="{{ name }}" email="{{ email }}" organisation="{{ org }}" /> +{% endfor %} + </CONTACTS> +{% if dac_attributes %} + <DAC_ATTRIBUTES> +{% for k, v, unit in dac_attributes %} + <DAC_ATTRIBUTE> + <TAG>{{ k }}</TAG> + <VALUE>{{ v }}</VALUE> +{% if unit %} + <UNITS>{{ unit }}</UNITS> +{% endif %} + </DAC_ATTRIBUTE> +{% endfor %} + </DAC_ATTRIBUTES> +{% endif %} + </DAC> +</DAC_SET> diff --git a/metadata/templates/dataset.xml b/metadata/templates/dataset.xml new file mode 100644 index 0000000..60d84d0 --- /dev/null +++ b/metadata/templates/dataset.xml @@ -0,0 +1,25 @@ +<DATASETS> + <DATASET alias="{{ dataset_alias }}" broker_name="EGA"> + <TITLE>{{ config.dataset_title }}</TITLE> + <DATASET_TYPE>{{ config.dataset_type }}</DATASET_TYPE> +{% for r in runs %} + <RUN_REF accession="{{ r }}" /> +{% endfor %} +{% for a in analyses %} + <ANALYSIS_REF accession="{{ a }}" /> +{% endfor %} + <POLICY_REF accession="{{ config.policy_accession }}" /> +{% if links %} + <DATASET_LINKS> +{% for k, v in links.items() %} + <DATASET_LINK> + <URL_LINK> + <LABEL>{{ k }}</LABEL> + <URL>{{ v }}</URL> + </URL_LINK> + </DATASET_LINK> +{% endfor %} + </DATASET_LINKS> +{% endif %} + </DATASET> +</DATASETS> diff --git a/metadata/templates/experiment.xml b/metadata/templates/experiment.xml new file mode 100644 index 0000000..0873d23 --- /dev/null +++ b/metadata/templates/experiment.xml @@ -0,0 +1,30 @@ +<EXPERIMENT_SET> +{% for line in id_map %} + <EXPERIMENT alias="{{ line['experiment_id'] }}"> + <TITLE>{{ experiment_title }}</TITLE> + <STUDY_REF refname="{{ config.study_alias }}" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="{{ line['Sample alias'] }}" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>{{ config.library_strategy | 
default('WGS') }}</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>{{ config.library_source | default('GENOMIC') }}</LIBRARY_SOURCE> + <LIBRARY_SELECTION>{{ config.library_selection | default('other') }}</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="{{ config.library_nominal_length }}"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>{{ config.library_protocol }}</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> +{% if config.platform_type == 'illumina' %} + <ILLUMINA> + <INSTRUMENT_MODEL>{{ config.platform_instrument }}</INSTRUMENT_MODEL> + </ILLUMINA> +{% endif %} + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> +{% endfor %} +</EXPERIMENT_SET> diff --git a/metadata/templates/policy.xml b/metadata/templates/policy.xml new file mode 100644 index 0000000..001e7ac --- /dev/null +++ b/metadata/templates/policy.xml @@ -0,0 +1,37 @@ +<POLICY_SET> + <POLICY alias="{{ config.policy_alias }}" broker_name="EGA"> + <TITLE>{{ config.policy_title }}</TITLE> + <DAC_REF accession="{{ config.dac_accession }}" /> +{% if config.policy_text %} + <POLICY_TEXT>{{ config.policy_text }}</POLICY_TEXT> +{% endif %} +{% if config.policy_file_url %} + <POLICY_FILE>{{ config.policy_file_url }}</POLICY_FILE> +{% endif %} +{% if policy_links %} + <POLICY_LINKS> +{% for k, v in policy_links.items() %} + <POLICY_LINK> + <URL_LINK> + <LABEL>{{ k }}</LABEL> + <URL>{{ v }}</URL> + </URL_LINK> + </POLICY_LINK> +{% endfor %} + </POLICY_LINKS> +{% endif %} +{% if policy_attributes %} + <POLICY_ATTRIBUTES> +{% for tag, value, unit in policy_attributes %} + <POLICY_ATTRIBUTE> + <TAG>{{ tag }}</TAG> + <VALUE>{{ value }}</VALUE> +{% if unit %} + <UNITS>{{ unit }}</UNITS> +{% endif %} + </POLICY_ATTRIBUTE> +{% endfor %} + </POLICY_ATTRIBUTES> +{% endif %} + </POLICY> +</POLICY_SET> diff --git a/metadata/templates/run.xml b/metadata/templates/run.xml new file mode 100644 index 0000000..0598c21 --- /dev/null +++ b/metadata/templates/run.xml @@ -0,0 +1,13 @@ 
+<RUN_SET> +{% for line in id_map %} + <RUN alias="{{ line['run_id'] }}"> + <EXPERIMENT_REF refname="{{ line['experiment_id'] }}" /> + <DATA_BLOCK> + <FILES> + <FILE filename="{{ line['r1'] }}" filetype="{{ line['file_type_r1'] }}" checksum_method="MD5" checksum="{{ line['First Checksum'] }}" unencrypted_checksum="{{ line['First Unencrypted checksum'] }}" /> + <FILE filename="{{ line['r2'] }}" filetype="{{ line['file_type_r2'] }}" checksum_method="MD5" checksum="{{ line['Second Checksum'] }}" unencrypted_checksum="{{ line['Second Unencrypted checksum'] }}" /> + </FILES> + </DATA_BLOCK> + </RUN> +{% endfor %} +</RUN_SET> diff --git a/metadata/templates/sample.xml b/metadata/templates/sample.xml new file mode 100644 index 0000000..685a32e --- /dev/null +++ b/metadata/templates/sample.xml @@ -0,0 +1,38 @@ +<SAMPLE_SET> +{% for sample in samples %} + <SAMPLE alias="{{ sample.alias }}" center_name="{{ center_name }}"> + <TITLE>{{ sample.title }}</TITLE> + <SAMPLE_NAME> + <TAXON_ID>{{ sample.taxon_id | default(9606) }}</TAXON_ID> +{% if sample.scientific_name %} + <SCIENTIFIC_NAME>{{ sample.scientific_name }}</SCIENTIFIC_NAME> +{% endif %} +{% if sample.common_name %} + <COMMON_NAME>{{ sample.common_name }}</COMMON_NAME> +{% endif %} + </SAMPLE_NAME> + <DESCRIPTION>{{ sample.description }}</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>{{ sample.alias }}</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>{{ sample.sex | default('unknown') }}</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>{{ sample.phenotype }}</VALUE> + </SAMPLE_ATTRIBUTE> + +{% for k, v in sample.attributes.items() %} + <SAMPLE_ATTRIBUTE> + <TAG>{{ k }}</TAG> + <VALUE>{{ v }}</VALUE> + </SAMPLE_ATTRIBUTE> +{% endfor %} + </SAMPLE_ATTRIBUTES> + </SAMPLE> +{% endfor %} +</SAMPLE_SET> diff --git a/metadata/templates/study.xml b/metadata/templates/study.xml new file mode 100644 index 0000000..4f0a48c --- 
/dev/null +++ b/metadata/templates/study.xml @@ -0,0 +1,17 @@ +<STUDY_SET> + <STUDY alias="{{ config.study_alias }}" center_name="{{ config.center_name }}"> + <DESCRIPTOR> + <STUDY_TITLE>{{ config.study_title }}</STUDY_TITLE> + <STUDY_TYPE existing_study_type="{{ config.study_type }}"/> + <STUDY_ABSTRACT>{{ config.abstract }}</STUDY_ABSTRACT> + </DESCRIPTOR> + <STUDY_ATTRIBUTES> +{% for k, v in study_attributes.items() %} + <STUDY_ATTRIBUTE> + <TAG>{{ k }}</TAG> + <VALUE>{{ v }}</VALUE> + </STUDY_ATTRIBUTE> +{% endfor %} + </STUDY_ATTRIBUTES> + </STUDY> +</STUDY_SET> diff --git a/metadata/templates/submission.xml b/metadata/templates/submission.xml new file mode 100644 index 0000000..3d8b267 --- /dev/null +++ b/metadata/templates/submission.xml @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<SUBMISSION_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="SRA.submission.xsd"> + <SUBMISSION alias="" broker_name="EGA"> +{% if submission_contacts %} + <CONTACTS> +{% for name, email in submission_contacts %} + <CONTACT name="{{ name }}" inform_on_status="{{ email }}" inform_on_error="{{ email }}"/> +{% endfor %} + </CONTACTS> +{% endif %} + <ACTIONS> + <ACTION> + <ADD /> + </ACTION> + <ACTION> + <PROTECT/> + </ACTION> + </ACTIONS> + </SUBMISSION> +</SUBMISSION_SET> diff --git a/metadata/tests/abstract.txt b/metadata/tests/abstract.txt new file mode 100644 index 0000000..3b95e4b --- /dev/null +++ b/metadata/tests/abstract.txt @@ -0,0 +1 @@ +Some study abstract diff --git a/metadata/tests/ega_upload.yaml b/metadata/tests/ega_upload.yaml new file mode 100644 index 0000000..f63ce06 --- /dev/null +++ b/metadata/tests/ega_upload.yaml @@ -0,0 +1,47 @@ +--- + +output_dir: xml +force: true + +submission_contacts: + - 'Somebody:somebody@ed.ac.uk' + +dac_alias: test_dac +dac_title: Test DAC +dac_contacts: + - Somebody:somebody@ed.ac.uk:DAC +dac_attributes: + - this:that:other + +policy_alias: some_data_access_policy +policy_title: 
Some Data Access Policy +policy_text: Some data access policy text +dac_accession: EGAC01234 +policy_attributes: + - this:that:other + +study_alias: some_study +study_type: Whole Genome Sequencing +study_title: Some study title +study_abstract: abstract.txt +study_attributes: study_attributes.csv + +samples: samples.csv + +nbatches: 3 +runs: runs.csv +experiment_title: Whole genome sequencing for some dataset +library_strategy: WGS +library_source: GENOMIC +library_selection: other +library_nominal_length: 450 +library_protocol: | + Some library protocol text +platform_type: illumina +platform_instrument: HiSeq X Ten +file_box_base: some_dataset/fastq + +receipt_xml: receipt.xml +policy_accession: EGAP01234 +dataset_title: Test dataset +dataset_type: 'Exome sequencing' diff --git a/metadata/tests/expected/dac.xml b/metadata/tests/expected/dac.xml new file mode 100644 index 0000000..2459c34 --- /dev/null +++ b/metadata/tests/expected/dac.xml @@ -0,0 +1,8 @@ +<DAC_SET> + <DAC alias="IGC DAC" center_name="" broker_name="EGA"> + <TITLE>Test DAC</TITLE> + <CONTACTS> + <CONTACT name="Murray Wham" email="murray.wham@ed.ac.uk" organisation="IGC" /> + </CONTACTS> + </DAC> +</DAC_SET> \ No newline at end of file diff --git a/metadata/tests/expected/dataset.xml b/metadata/tests/expected/dataset.xml new file mode 100644 index 0000000..bcad8ee --- /dev/null +++ b/metadata/tests/expected/dataset.xml @@ -0,0 +1,10 @@ +<DATASETS> + <DATASET alias="00000000-0000-0000-0000-000000000011" broker_name="EGA"> + <TITLE>Test dataset</TITLE> + <DATASET_TYPE>Exome sequencing</DATASET_TYPE> + <RUN_REF accession="EGAR00001" /> + <RUN_REF accession="EGAR00002" /> + <RUN_REF accession="EGAR00003" /> + <POLICY_REF accession="EGAP01234" /> + </DATASET> +</DATASETS> \ No newline at end of file diff --git a/metadata/tests/expected/experiment-1.xml b/metadata/tests/expected/experiment-1.xml new file mode 100644 index 0000000..1696a5e --- /dev/null +++ b/metadata/tests/expected/experiment-1.xml @@ 
-0,0 +1,77 @@ +<EXPERIMENT_SET> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000001"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample1" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000002"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample2" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000003"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample3" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library 
protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> +</EXPERIMENT_SET> \ No newline at end of file diff --git a/metadata/tests/expected/experiment-2.xml b/metadata/tests/expected/experiment-2.xml new file mode 100644 index 0000000..0f2e3b1 --- /dev/null +++ b/metadata/tests/expected/experiment-2.xml @@ -0,0 +1,77 @@ +<EXPERIMENT_SET> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000004"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample4" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000005"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample5" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> + <EXPERIMENT 
alias="00000000-0000-0000-0000-000000000006"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample6" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> +</EXPERIMENT_SET> \ No newline at end of file diff --git a/metadata/tests/expected/experiment-3.xml b/metadata/tests/expected/experiment-3.xml new file mode 100644 index 0000000..9cdd701 --- /dev/null +++ b/metadata/tests/expected/experiment-3.xml @@ -0,0 +1,52 @@ +<EXPERIMENT_SET> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000007"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample7" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> + <EXPERIMENT alias="00000000-0000-0000-0000-000000000008"> + <TITLE></TITLE> + <STUDY_REF refname="some_study" /> + <DESIGN> + <DESIGN_DESCRIPTION /> + <SAMPLE_DESCRIPTOR refname="sample8" /> + <LIBRARY_DESCRIPTOR> + <LIBRARY_NAME></LIBRARY_NAME> + 
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY> + <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE> + <LIBRARY_SELECTION>other</LIBRARY_SELECTION> + <LIBRARY_LAYOUT> + <PAIRED NOMINAL_LENGTH="450"></PAIRED> + </LIBRARY_LAYOUT> + <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text +</LIBRARY_CONSTRUCTION_PROTOCOL> + </LIBRARY_DESCRIPTOR> + </DESIGN> + <PLATFORM> + <ILLUMINA> + <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL> + </ILLUMINA> + </PLATFORM> + <PROCESSING /> + </EXPERIMENT> +</EXPERIMENT_SET> \ No newline at end of file diff --git a/metadata/tests/expected/policy.xml b/metadata/tests/expected/policy.xml new file mode 100644 index 0000000..5a24f8e --- /dev/null +++ b/metadata/tests/expected/policy.xml @@ -0,0 +1,14 @@ +<POLICY_SET> + <POLICY alias="some_data_access_policy" broker_name="EGA"> + <TITLE>Some Data Access Policy</TITLE> + <DAC_REF accession="EGAC01234" /> + <POLICY_TEXT>Some data access policy text</POLICY_TEXT> + <POLICY_ATTRIBUTES> + <POLICY_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + <UNITS>other</UNITS> + </POLICY_ATTRIBUTE> + </POLICY_ATTRIBUTES> + </POLICY> +</POLICY_SET> \ No newline at end of file diff --git a/metadata/tests/expected/run-1.xml b/metadata/tests/expected/run-1.xml new file mode 100644 index 0000000..be93b45 --- /dev/null +++ b/metadata/tests/expected/run-1.xml @@ -0,0 +1,29 @@ +<RUN_SET> + <RUN alias="00000000-0000-0000-0000-000000000009"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000001" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample1_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="1r1md5" unencrypted_checksum="1r1md5u" /> + <FILE filename="some_dataset/fastq/sample1_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="1r2md5" unencrypted_checksum="1r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> + <RUN alias="00000000-0000-0000-0000-00000000000a"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000002" /> + <DATA_BLOCK> + <FILES> + <FILE 
filename="some_dataset/fastq/sample2_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="2r1md5" unencrypted_checksum="2r1md5u" /> + <FILE filename="some_dataset/fastq/sample2_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="2r2md5" unencrypted_checksum="2r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> + <RUN alias="00000000-0000-0000-0000-00000000000b"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000003" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample3_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="3r1md5" unencrypted_checksum="3r1md5u" /> + <FILE filename="some_dataset/fastq/sample3_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="3r2md5" unencrypted_checksum="3r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> +</RUN_SET> \ No newline at end of file diff --git a/metadata/tests/expected/run-2.xml b/metadata/tests/expected/run-2.xml new file mode 100644 index 0000000..821ecdb --- /dev/null +++ b/metadata/tests/expected/run-2.xml @@ -0,0 +1,29 @@ +<RUN_SET> + <RUN alias="00000000-0000-0000-0000-00000000000c"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000004" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample4_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="4r1md5" unencrypted_checksum="4r1md5u" /> + <FILE filename="some_dataset/fastq/sample4_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="4r2md5" unencrypted_checksum="4r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> + <RUN alias="00000000-0000-0000-0000-00000000000d"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000005" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample5_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="5r1md5" unencrypted_checksum="5r1md5u" /> + <FILE filename="some_dataset/fastq/sample5_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="5r2md5" unencrypted_checksum="5r2md5u" /> + </FILES> + 
</DATA_BLOCK> + </RUN> + <RUN alias="00000000-0000-0000-0000-00000000000e"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000006" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample6_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="6r1md5" unencrypted_checksum="6r1md5u" /> + <FILE filename="some_dataset/fastq/sample6_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="6r2md5" unencrypted_checksum="6r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> +</RUN_SET> \ No newline at end of file diff --git a/metadata/tests/expected/run-3.xml b/metadata/tests/expected/run-3.xml new file mode 100644 index 0000000..57ed785 --- /dev/null +++ b/metadata/tests/expected/run-3.xml @@ -0,0 +1,20 @@ +<RUN_SET> + <RUN alias="00000000-0000-0000-0000-00000000000f"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000007" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample7_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="7r1md5" unencrypted_checksum="7r1md5u" /> + <FILE filename="some_dataset/fastq/sample7_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="7r2md5" unencrypted_checksum="7r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> + <RUN alias="00000000-0000-0000-0000-000000000010"> + <EXPERIMENT_REF refname="00000000-0000-0000-0000-000000000008" /> + <DATA_BLOCK> + <FILES> + <FILE filename="some_dataset/fastq/sample8_R1.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="8r1md5" unencrypted_checksum="8r1md5u" /> + <FILE filename="some_dataset/fastq/sample8_R2.fastq.gz" filetype="fastq" checksum_method="MD5" checksum="8r2md5" unencrypted_checksum="8r2md5u" /> + </FILES> + </DATA_BLOCK> + </RUN> +</RUN_SET> \ No newline at end of file diff --git a/metadata/tests/expected/sample.xml b/metadata/tests/expected/sample.xml new file mode 100644 index 0000000..f089e8d --- /dev/null +++ b/metadata/tests/expected/sample.xml @@ -0,0 +1,216 @@ +<SAMPLE_SET> + <SAMPLE alias="sample1" 
center_name=""> + <TITLE>Sample 1</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample1</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample2" center_name=""> + <TITLE>Sample 2</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample2</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>male</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>other</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample3" center_name=""> + <TITLE>Sample 3</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo_sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample3</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>male</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>affected</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>another</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample4" center_name=""> + 
<TITLE>Sample 4</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample4</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample5" center_name=""> + <TITLE>Sample 5</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample5</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample6" center_name=""> + <TITLE>Sample 6</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample6</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + 
</SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample7" center_name=""> + <TITLE>Sample 7</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample7</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> + <SAMPLE alias="sample8" center_name=""> + <TITLE>Sample 8</TITLE> + <SAMPLE_NAME> + <TAXON_ID>9606</TAXON_ID> + <SCIENTIFIC_NAME>homo sapiens</SCIENTIFIC_NAME> + <COMMON_NAME>human</COMMON_NAME> + </SAMPLE_NAME> + <DESCRIPTION>A whole-genome sequenced human sample</DESCRIPTION> + <SAMPLE_ATTRIBUTES> + <SAMPLE_ATTRIBUTE> + <TAG>subject_id</TAG> + <VALUE>sample8</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>sex</TAG> + <VALUE>female</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>phenotype</TAG> + <VALUE>normal</VALUE> + </SAMPLE_ATTRIBUTE> + <SAMPLE_ATTRIBUTE> + <TAG>this</TAG> + <VALUE>that</VALUE> + </SAMPLE_ATTRIBUTE> + </SAMPLE_ATTRIBUTES> + </SAMPLE> +</SAMPLE_SET> \ No newline at end of file diff --git a/metadata/tests/expected/study.xml b/metadata/tests/expected/study.xml new file mode 100644 index 0000000..d25590e --- /dev/null +++ b/metadata/tests/expected/study.xml @@ -0,0 +1,15 @@ +<STUDY_SET> + <STUDY alias="some_study" center_name=""> + <DESCRIPTOR> + <STUDY_TITLE>Some study title</STUDY_TITLE> + <STUDY_TYPE existing_study_type="Whole Genome Sequencing"/> + <STUDY_ABSTRACT></STUDY_ABSTRACT> + </DESCRIPTOR> + <STUDY_ATTRIBUTES> + <STUDY_ATTRIBUTE> + <TAG>url</TAG> + <VALUE>https://www.ed.ac.uk</VALUE> + </STUDY_ATTRIBUTE> + 
</STUDY_ATTRIBUTES> + </STUDY> +</STUDY_SET> \ No newline at end of file diff --git a/metadata/tests/expected/submission.xml b/metadata/tests/expected/submission.xml new file mode 100644 index 0000000..6c7c6ec --- /dev/null +++ b/metadata/tests/expected/submission.xml @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8"?> +<SUBMISSION_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="SRA.submission.xsd"> + <SUBMISSION alias="" broker_name="EGA"> + <CONTACTS> + <CONTACT name="Somebody" inform_on_status="somebody@ed.ac.uk" inform_on_error="somebody@ed.ac.uk"/> + </CONTACTS> + <ACTIONS> + <ACTION> + <ADD /> + </ACTION> + <ACTION> + <PROTECT/> + </ACTION> + </ACTIONS> + </SUBMISSION> +</SUBMISSION_SET> \ No newline at end of file diff --git a/metadata/tests/receipt.xml b/metadata/tests/receipt.xml new file mode 100644 index 0000000..bf45dcd --- /dev/null +++ b/metadata/tests/receipt.xml @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8"?> +<?xml-stylesheet type="text/xsl" href="receipt.xsl"?> +<RECEIPT receiptDate="2024-09-06T10:39:38.697+01:00" submissionFile="submission.xml" success="true"> + <EXPERIMENT accession="EGAX00001" alias="EGAX00001_uuid" status="PRIVATE"/> + <EXPERIMENT accession="EGAX00002" alias="EGAX00002_uuid" status="PRIVATE"/> + <EXPERIMENT accession="EGAX00003" alias="EGAX00003_uuid" status="PRIVATE"/> + <RUN accession="EGAR00001" alias="EGAR00001_uuid" status="PRIVATE"/> + <RUN accession="EGAR00002" alias="EGAR00002_uuid" status="PRIVATE"/> + <RUN accession="EGAR00003" alias="EGAR00003_uuid" status="PRIVATE"/> + <SAMPLE accession="EGAN00001" alias="sample1" status="PRIVATE"> + <EXT_ID accession="SAMEA1" type="biosample"/> + </SAMPLE> + <SAMPLE accession="EGAN00002" alias="sample2" status="PRIVATE"> + <EXT_ID accession="SAMEA2" type="biosample"/> + </SAMPLE> + <SAMPLE accession="EGAN00003" alias="sample3" status="PRIVATE"> + <EXT_ID accession="SAMEA3" type="biosample"/> + </SAMPLE> + <STUDY 
accession="EGAS00001" alias="some_study" status="PRIVATE" holdUntilDate="2026-09-06+01:00"> + <EXT_ID accession="EGAS00001" type="Project"/> + </STUDY> + <SUBMISSION accession="EGA00002" alias="SUBMISSION-06-09-2024-10:39:38:069"/> + <MESSAGES> + <INFO>This submission is a TEST submission and will be discarded within 24 hours</INFO> + </MESSAGES> + <ACTIONS>ADD</ACTIONS> + <ACTIONS>PROTECT</ACTIONS> +</RECEIPT> diff --git a/metadata/tests/runs.csv b/metadata/tests/runs.csv new file mode 100644 index 0000000..78d83be --- /dev/null +++ b/metadata/tests/runs.csv @@ -0,0 +1,9 @@ +Sample alias,First Fastq File,First Checksum,First Unencrypted checksum,Second Fastq File,Second Checksum,Second Unencrypted checksum +sample1,sample1_R1.fastq.gz,1r1md5,1r1md5u,sample1_R2.fastq.gz,1r2md5,1r2md5u +sample2,sample2_R1.fastq.gz,2r1md5,2r1md5u,sample2_R2.fastq.gz,2r2md5,2r2md5u +sample3,sample3_R1.fastq.gz,3r1md5,3r1md5u,sample3_R2.fastq.gz,3r2md5,3r2md5u +sample4,sample4_R1.fastq.gz,4r1md5,4r1md5u,sample4_R2.fastq.gz,4r2md5,4r2md5u +sample5,sample5_R1.fastq.gz,5r1md5,5r1md5u,sample5_R2.fastq.gz,5r2md5,5r2md5u +sample6,sample6_R1.fastq.gz,6r1md5,6r1md5u,sample6_R2.fastq.gz,6r2md5,6r2md5u +sample7,sample7_R1.fastq.gz,7r1md5,7r1md5u,sample7_R2.fastq.gz,7r2md5,7r2md5u +sample8,sample8_R1.fastq.gz,8r1md5,8r1md5u,sample8_R2.fastq.gz,8r2md5,8r2md5u \ No newline at end of file diff --git a/metadata/tests/samples.csv b/metadata/tests/samples.csv new file mode 100644 index 0000000..dd4e155 --- /dev/null +++ b/metadata/tests/samples.csv @@ -0,0 +1,9 @@ +id,title,taxon_id,scientific_name,common_name,description,sex,phenotype,this +sample1,Sample 1,9606,homo sapiens,human,A whole-genome sequenced human sample,female,normal,that +sample2,Sample 2,9606,,,A whole-genome sequenced human sample,male,normal,other +sample3,Sample 3,9606,homo_sapiens,human,A whole-genome sequenced human sample,male,affected,another +sample4,Sample 4,9606,homo sapiens,human,A whole-genome sequenced human 
sample,female,normal,that
+sample5,Sample 5,9606,homo sapiens,human,A whole-genome sequenced human sample,female,normal,that
+sample6,Sample 6,9606,homo sapiens,human,A whole-genome sequenced human sample,female,normal,that
+sample7,Sample 7,9606,homo sapiens,human,A whole-genome sequenced human sample,female,normal,that
+sample8,Sample 8,9606,homo sapiens,human,A whole-genome sequenced human sample,female,normal,that
diff --git a/metadata/tests/study_attributes.csv b/metadata/tests/study_attributes.csv
new file mode 100644
index 0000000..ad6ebb7
--- /dev/null
+++ b/metadata/tests/study_attributes.csv
@@ -0,0 +1,2 @@
+tag,value
+url,https://www.ed.ac.uk
diff --git a/metadata/tests/test_ega_metadata.py b/metadata/tests/test_ega_metadata.py
new file mode 100644
index 0000000..63c7494
--- /dev/null
+++ b/metadata/tests/test_ega_metadata.py
@@ -0,0 +1,81 @@
+"""Check that ega_metadata renders the XML fixtures in tests/expected."""
+import os
+import uuid
+import hashlib
+import unittest
+from unittest import mock
+import ega_metadata
+
+uuid_idx = 0
+test_path = os.path.dirname(__file__)
+obs_dir = os.path.join(test_path, 'xml')
+exp_dir = os.path.join(test_path, 'expected')
+
+
+def fake_uuid1():
+    """Replacement for uuid.uuid1: returns UUID(int=1), UUID(int=2), ... in call order."""
+    global uuid_idx
+    uuid_idx += 1
+    return uuid.UUID(int=uuid_idx)
+
+
+def md5_file(f):
+    """Return the hex MD5 digest of the text file at path f.
+
+    The file is opened in text mode, so line endings pass through Python's
+    universal-newline translation before being hashed.
+    """
+    m = hashlib.md5()
+    with open(f) as h:
+        for line in h:
+            m.update(line.encode())
+
+    return m.hexdigest()
+
+
+class MyTestCase(unittest.TestCase):
+    def setUp(self):
+        global uuid_idx
+        # The expected aliases count up from ...0001, so restart the counter for every test.
+        uuid_idx = 0
+
+        # Cleanups run even when setUp fails part-way, unlike tearDown.
+        env_patch = mock.patch.dict(
+            os.environ, {'EGA_UPLOAD_CONFIG': os.path.join(test_path, 'ega_upload.yaml')}
+        )
+        env_patch.start()
+        self.addCleanup(env_patch.stop)
+        self.config = ega_metadata.file_args()
+
+        ega_metadata.xml_dir = 'xml'
+        ega_metadata.force = True
+        uuid_patch = mock.patch.object(uuid, 'uuid1', fake_uuid1)
+        uuid_patch.start()
+        self.addCleanup(uuid_patch.stop)
+
+        # metadata/tests/xml is git-ignored, so it is absent on a fresh checkout.
+        os.makedirs(obs_dir, exist_ok=True)
+        for f in os.listdir(obs_dir):
+            os.remove(os.path.join(obs_dir, f))
+
+    def test_metadata(self):
+        ega_metadata.createsubmission(self.config)
+        ega_metadata.createpolicy(self.config)
+        ega_metadata.createstudy(self.config)
+        ega_metadata.createsamples(self.config)
+        ega_metadata.createrunsandexperiments(self.config)
+        ega_metadata.createdataset(self.config)
+
+        observed = sorted(os.listdir(obs_dir))
+        # Otherwise the loop below passes vacuously when nothing was written.
+        self.assertTrue(observed, msg='No XML files were written to ' + obs_dir)
+
+        for f in observed:
+            with self.subTest(file=f):
+                obs_md5 = md5_file(os.path.join(obs_dir, f))
+                exp_md5 = md5_file(os.path.join(exp_dir, f))
+                self.assertEqual(exp_md5, obs_md5, msg='MD5 mismatch for file ' + f)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab