Initial commit of metadata submission script

59bec6ab · mwham · ba9f9a07 · 59bec6ab · 59bec6ab · 59bec6ab
Commit 59bec6ab authored 5 months ago by mwham
--- a/.gitignore
+++ b/.gitignore
 *~
 *#
+.idea
 *.nextflow*
+__pycache__
 results
 work
 reads
 output
-test*
+metadata/tests/xml
--- a/metadata/ega_metadata.py
+++ b/metadata/ega_metadata.py
+import os
+import re
+import sys
+import math
+import yaml
+import uuid
+import jinja2
+import pandas
+import logging
+import argparse
+import xml.dom.minidom
+
+# Matches runs of two or more consecutive newlines; render_xml() replaces each run with a single newline.
+multi_newline = re.compile(r'\n{2,}')
+
+# Declarative description of every command-line option, keyed by (sub)parser name.
+# Each inner dict maps an option name (exposed as `--<name>`) to the keyword arguments
+# passed to argparse's add_argument(); the extra 'alias' key holds an optional short flag.
+# The same structure drives the merge of command-line values, file config and defaults.
+options = {
+    # universal options - root parser has prog=__name__
+    __name__: {
+        'debug': {'help': 'Set the logging level to DEBUG', 'action': 'store_true', 'default': False, 'alias': '-d'},
+        # default must be the boolean False - the string 'False' is truthy and would always enable overwriting
+        'force': {'help': 'Always overwrite output XMLs', 'action': 'store_true', 'default': False, 'alias': '-f'},
+        'output_dir': {'help': 'Directory to write output XMLs', 'alias': '-o'},
+        'submission_contacts': {'help': "Colon-separated name and email to add to the submission for notifications, e.g. 'Murray Wham:murray.wham@ed.ac.uk'. Can be specified multiple times.", 'nargs': '+'}
+    },
+    'createdac': {
+        'dac_alias': {'help': 'Alias for the DAC object to create'},
+        'dac_title': {'help': 'Short text to show in searches and displays'},
+        'dac_contacts': {'help': "Colon-separated name, email and organisation to add to the dac, e.g. 'Murray Wham:murray.wham@ed.ac.uk:IGC'. Can be specified multiple times.", 'nargs': '+'},
+        'dac_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the DAC. Can be specified multiple times.", 'nargs': '+'}
+    },
+    'createpolicy': {
+        'policy_alias': {'help': 'Alias for the policy object to create'},
+        'policy_title': {'help': 'Policy title'},
+        'dac_accession': {'help': 'EGA accession number for the DAC object to link this policy to'},
+        'policy_text': {'help': 'Policy text. This or policy_file_url is required.'},
+        'policy_file_url': {'help': 'URL of the policy document. This or policy_text is required'},
+        'policy_links': {'help': "Semicolon-separated description and URL of policy materials to link to, e.g. 'IGC Data Access Agreement;https://www.ed.ac.uk/some-policy'. Can be specified multiple times.", 'nargs': '+'},
+        'policy_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the policy. Can be specified multiple times.", 'nargs': '+'}
+    },
+    'createstudy': {
+        'study_alias': {'help': 'Unique ID for the study. Used to refer to the study during the submission process and is supposed to be globally unique within the submission account, but is not shared with anyone.'},
+        'study_type': {'help': 'Study type accepted by EGA, e.g. Whole Genome Sequencing', 'choices': ('Whole Genome Sequencing', 'Metagenomics', 'Transcriptome Analysis', 'Resequencing', 'Epigenetics', 'Synthetic Genomics', 'Forensic or Paleo-genomics', 'Gene Regulation Study', 'Cancer Genomics', 'Population Genomics', 'RNASeq', 'Exome Sequencing', 'Pooled Clone Sequencing', 'Transcriptome Sequencing', 'Other (Study type not listed)')},
+        'study_title': {'help': 'Study title'},
+        'study_abstract': {'help': 'Path to plain-text file containing the abstract for this study'},
+        'study_attributes': {'help': "CSV or Excel table with two columns ('tag' and 'value'), describing study attributes to add to study.xml"}
+    },
+    'createsamples': {
+        'samples': {'help': "CSV or Excel table with at least 8 columns ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype'), describing samples to add. Any additional columns will be included as sample attributes"},
+    },
+    'createrunsandexperiments': {
+        # parsed as int so that command-line and YAML values behave the same; defaults to a single batch
+        'nbatches': {'help': 'Set to a number larger than 1 to split paired-end fastqs into smaller batches. This may be required if uploading large datasets - ega-box accounts should not exceed 8Tb and must not exceed 10Tb.', 'type': int, 'default': 1},
+        'runs': {'help': "Path to CSV file linking samples to files. Must have the columns 'Sample alias', 'First Fastq File', 'First Checksum', 'First Unencrypted checksum', 'Second Fastq File', 'Second Checksum', 'Second Unencrypted checksum'", 'default': 'runs.csv'},
+        'experiment_title': {'help': 'Title to apply to all created experiment objects'},
+        'library_strategy': {'help': 'Value to apply to LIBRARY_STRATEGY, e.g. WGS'},
+        'library_source': {'help': 'Value to apply to LIBRARY_SOURCE, e.g. GENOMIC'},
+        'library_selection': {'help': 'Value to apply to LIBRARY_SELECTION', 'default': 'other'},
+        'library_nominal_length': {'help': 'Library nominal length, e.g. 450'},
+        'library_protocol': {'help': 'Description of the library preparation process'},
+        'platform_type': {'help': 'Platform type, e.g. illumina'},
+        'platform_instrument': {'help': 'Instrument type, e.g. HiSeq X Ten'},
+        'file_box_base': {'help': 'Base file path to apply to uploaded files when passing to `filename`, e.g. if files are uploaded to a folder structure on the ega-box FTP server', 'default': ''}
+    },
+    'createdataset': {
+        'receipt_xml': {'help': 'XML receipt file from running stage 1, containing run accesion numbers from EGA'},
+        'policy_accession': {'help': "'EGAPxxx...' accession number of the data access policy to apply to this dataset"},
+        'dataset_title': {'help': 'Dataset title'},
+        'dataset_type': {'help': 'Dataset type accepted by EGA, e.g. Whole genome sequencing', 'choices': ('Whole genome sequencing', 'Exome sequencing', 'Genotyping by array', 'Transcriptome profiling by high-throughput sequencing', 'Transcriptome profiling by array', 'Amplicon sequencing', 'Methylation binding domain sequencing', 'Methylation profiling by high-throughput sequencing', 'Phenotype information', 'Study summary information', 'Genomic variant calling', 'Chromatin accessibility profiling by high-throughput sequencing', 'Histone modification profiling by high-throughput sequencing', 'Chip-Seq')}
+    }
+}
+
+# Jinja2 environment that loads templates from the 'templates' directory next to this file.
+env = jinja2.Environment(
+    loader=jinja2.FileSystemLoader(os.path.join(os.path.dirname(__file__), 'templates')),
+    autoescape=jinja2.select_autoescape()
+)
+
+# Logging: a stdout handler at INFO level; main() attaches it to the logger and
+# configure() lowers both handler and logger to DEBUG when the 'debug' option is set.
+formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s] %(message)s')
+handler = logging.StreamHandler(sys.stdout)
+handler.setFormatter(formatter)
+handler.setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Module-level state set by configure(): the output directory for XMLs and
+# whether write_xml() may overwrite files that already exist.
+xml_dir = ''
+force = False
+
+
+def cmd_args(argv=None):
+    """
+    Build the argument parser from `options` and parse the command line.
+
+    The root parser takes the universal options (keyed by `__name__`); every other
+    key in `options` becomes a subcommand. Each parser records its own `prog` as the
+    `entry` default so that callers can tell which subcommand was selected.
+
+    :param argv: Argument list to parse, or None to use sys.argv.
+    :return: The parsed argparse.Namespace.
+    """
+    root = argparse.ArgumentParser(__name__)
+    subparsers = root.add_subparsers()
+    for section, section_options in options.items():
+        parser = root if section == __name__ else subparsers.add_parser(section)
+        parser.set_defaults(entry=parser.prog)
+
+        for name, spec in section_options.items():
+            # work on a copy - popping 'alias' from the shared `options` dict would
+            # silently drop the short flags the next time this function is called
+            kwargs = dict(spec)
+            alias = kwargs.pop('alias', None)  # e.g. -d for --debug
+            flags = ['--' + name]
+            if alias:
+                flags.append(alias)
+
+            parser.add_argument(*flags, **kwargs)
+
+    return root.parse_args(argv)
+
+
+def file_args():
+    """
+    Load configuration from the first YAML file found.
+
+    Looks at the path in the EGA_UPLOAD_CONFIG environment variable first, then at
+    'ega_upload.yaml' in the current working directory.
+
+    :return: The parsed YAML content, or an empty dict if no file exists or the file is empty.
+    """
+    candidates = (os.getenv('EGA_UPLOAD_CONFIG'), os.path.join(os.getcwd(), 'ega_upload.yaml'))
+    for f in candidates:
+        if f and os.path.isfile(f):
+            with open(f, encoding='utf-8') as h:
+                # safe_load returns None for an empty document; callers expect a mapping
+                return yaml.safe_load(h) or {}
+    return {}
+
+
+def _merge_file_and_cmd_args(file_config, args):
+    """
+    Combine command-line values, file config and declared defaults into one dict.
+
+    Only the universal options and those of the selected subcommand are considered.
+    For each option, precedence is: a command-line value that is set and differs from
+    the declared default, then the file config, then the declared default. Options
+    with no value from any source are left out of the result.
+
+    :param file_config: Mapping loaded from the YAML config file.
+    :param args: Namespace returned by cmd_args().
+    :return: Dict of option name to resolved value.
+    """
+    cmd_values = args.__dict__
+    subcommand = args.entry.replace(__name__ + ' ', '')
+    merged = {}
+    for section in {__name__, subcommand}:
+        for name, spec in options[section].items():
+            from_cmd = cmd_values.get(name)
+            if from_cmd is not None and from_cmd != spec.get('default'):
+                # explicitly given on the command line
+                merged[name] = from_cmd
+            elif name in file_config:
+                merged[name] = file_config[name]
+            elif 'default' in spec:
+                merged[name] = spec['default']
+
+    return merged
+
+
+def configure(argv=None):
+    """
+    Resolve the run configuration and apply its global side effects.
+
+    Parses the command line, merges it with the file config, switches logging to DEBUG
+    if requested, creates the output directory and sets the module-level `xml_dir` and
+    `force` values.
+
+    :param argv: Argument list to parse, or None to use sys.argv.
+    :return: Tuple of (entry, config) - the selected parser's prog and the merged config dict.
+    """
+    global xml_dir, force
+
+    args = cmd_args(argv)
+    config = _merge_file_and_cmd_args(file_args(), args)
+
+    if config['debug']:
+        for target in (handler, logger):
+            target.setLevel(logging.DEBUG)
+
+    output_dir = config['output_dir']
+    logger.info('Output dir: %s', output_dir)
+    xml_dir = output_dir
+    os.makedirs(xml_dir, exist_ok=True)
+
+    # only ever switched on here, never reset to False
+    if config['force']:
+        force = True
+
+    return args.entry, config
+
+
+def main(argv=None):
+    """
+    Entry point: configure the run, write submission.xml, then run the selected stage.
+
+    :param argv: Argument list to parse, or None to use sys.argv.
+    """
+    logger.addHandler(handler)
+    entry, config = configure(argv)
+
+    # submission.xml is needed whichever stage is being run
+    createsubmission(config)
+
+    # checked in order; the first suffix that matches the entry wins
+    stages = (
+        ('createdac', createdac),
+        ('createpolicy', createpolicy),
+        ('createstudy', createstudy),
+        ('createsamples', createsamples),
+        ('createrunsandexperiments', createrunsandexperiments),
+        ('createdataset', createdataset),
+    )
+    for suffix, stage in stages:
+        if entry.endswith(suffix):
+            stage(config)
+            break
+
+
+def createsubmission(config):
+    """
+    Write submission.xml, passing each 'submission_contacts' entry split on ':' to the template.
+
+    :param config: Merged configuration dict.
+    """
+    contacts = []
+    for contact in config.get('submission_contacts', ()):
+        contacts.append(contact.split(':'))
+
+    write_xml('submission.xml', config=config, submission_contacts=contacts)
+
+
+def createdac(config):
+    """
+    Write dac.xml from the DAC contacts and attributes in the config.
+
+    Attributes are colon-separated 'tag:value[:unit]' strings; a two-part attribute
+    gets None as its unit. Contacts are split on ':' and passed through as-is.
+
+    :param config: Merged configuration dict.
+    """
+    def with_unit(parts):
+        # pad 'tag:value' to three items so the template can always unpack a unit
+        return parts + [None] if len(parts) == 2 else parts
+
+    attributes = [with_unit(raw.split(':')) for raw in config.get('dac_attributes', ())]
+    contacts = [raw.split(':') for raw in config.get('dac_contacts', ())]
+
+    write_xml('dac.xml', config=config, dac_contacts=contacts, dac_attributes=attributes)
+
+
+def createpolicy(config):
+    """
+    Write policy.xml from the policy options in the config.
+
+    Attributes are colon-separated 'tag:value[:unit]' strings; a two-part attribute
+    gets None as its unit. Links are semicolon-separated 'description;URL' strings and
+    are passed to the template as a {description: URL} mapping, which is the shape the
+    template iterates over with `.items()`.
+
+    :param config: Merged configuration dict.
+    :raises KeyError: If neither policy_text nor policy_file_url is set.
+    :raises ValueError: If a policy link does not contain a ';' separator.
+    """
+    # .get() so that supplying only one of the two does not fail on the missing key
+    if not config.get('policy_text') and not config.get('policy_file_url'):
+        raise KeyError('policy_text or policy_file_url is required')
+
+    policy_attributes = []
+    for attr in config.get('policy_attributes', ()):
+        split_attr = attr.split(':')
+        if len(split_attr) == 2:  # no unit
+            split_attr.append(None)
+
+        policy_attributes.append(split_attr)
+
+    policy_links = {}
+    for link in config.get('policy_links', ()):
+        # split on the first ';' only - URLs contain ':' and may contain further ';'
+        description, sep, url = link.partition(';')
+        if not sep:
+            raise ValueError("Policy link %r must be in the form 'description;URL'" % link)
+
+        policy_links[description] = url
+
+    write_xml(
+        'policy.xml',
+        config=config,
+        policy_links=policy_links,
+        policy_attributes=policy_attributes
+    )
+
+
+def createstudy(config):
+    """
+    Write study.xml from the study attributes table and the abstract file.
+
+    The table at config['study_attributes'] must have 'tag' and 'value' columns with
+    unique tags. If config['study_abstract'] is set, the file it points to is read and
+    its text is exposed to the template as `config.abstract`.
+
+    :param config: Merged configuration dict.
+    :raises ValueError: If the attributes table contains duplicate tags.
+    """
+    df = read_table(config['study_attributes'])
+    # a list-vs-list(set) comparison depends on set ordering and reports false duplicates
+    if df['tag'].duplicated().any():
+        raise ValueError('Duplicate tags found in %s' % config['study_attributes'])
+
+    study_attributes = {row.tag: row.value for row in df.itertuples()}
+
+    abstract_path = config.get('study_abstract')
+    if abstract_path:
+        with open(abstract_path, encoding='utf-8') as h:
+            # copy rather than mutate the caller's config
+            config = dict(config, abstract=h.read().strip())
+
+    write_xml('study.xml', config=config, study_attributes=study_attributes)
+
+
+def createsamples(config):
+    """
+    Write sample.xml from the samples table at config['samples'].
+
+    The first 8 columns must be the mandatory columns, in order. Any further columns
+    are passed to the template as per-sample attributes. Empty cells are left out of
+    both the mandatory fields and the extra attributes.
+
+    :param config: Merged configuration dict.
+    :raises ValueError: If the first 8 columns are not the mandatory columns.
+    """
+    df = read_table(config['samples'])
+    mandatory_columns = ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype')
+    extra_cols = tuple(c for c in df.columns if c not in mandatory_columns)
+    if tuple(df.columns[:8]) != mandatory_columns:
+        raise ValueError('First 8 columns of %s must be %s' % (config['samples'], mandatory_columns))
+
+    samples = []
+    for _, row in df.iterrows():
+        new_sample = {k: row[k] for k in mandatory_columns if pandas.notna(row[k])}
+        new_sample['alias'] = new_sample['id']
+        # pandas represents empty cells as NaN, not None, so `is not None` would keep them
+        new_sample['attributes'] = {c: row[c] for c in extra_cols if pandas.notna(row[c])}
+        samples.append(new_sample)
+
+    write_xml('sample.xml', config=config, samples=samples)
+
+
+def createrunsandexperiments(config):
+    """
+    Write experiment-N.xml and run-N.xml files from the runs table at config['runs'].
+
+    Each table row gets a freshly generated experiment ID and run ID, the fastq paths
+    prefixed with config['file_box_base'], and a file type per fastq. The rows are then
+    split into batches and one experiment/run XML pair is written per batch.
+
+    :param config: Merged configuration dict.
+    """
+    idgen = IDGenerator(xml_dir)
+    df = read_table(config['runs'])
+    df['experiment_id'] = [idgen.new() for _ in range(len(df))]
+    df['run_id'] = [idgen.new() for _ in range(len(df))]
+    df['r1'] = df['First Fastq File'].map(lambda f: os.path.join(config['file_box_base'], f))
+    df['r2'] = df['Second Fastq File'].map(lambda f: os.path.join(config['file_box_base'], f))
+    df['file_type_r1'] = df['First Fastq File'].map(get_file_type)
+    df['file_type_r2'] = df['Second Fastq File'].map(get_file_type)
+
+    nsamples = len(df)
+    if nsamples == 0:
+        logger.warning('No rows found in %s - nothing to write', config['runs'])
+        return
+
+    # nbatches may be missing, or a string when it comes from the command line
+    nbatches = int(config.get('nbatches') or 1)
+    logger.info('Splitting %i samples into %i batches', nsamples, nbatches)
+    slices = batches(nsamples, nbatches)
+
+    # the experiment template reads `experiment_title` as a top-level variable
+    experiment_title = config.get('experiment_title') or ''
+
+    for batch_no, (start, end) in enumerate(slices, start=1):
+        batch = [line for _, line in df[start:end].iterrows()]
+        write_xml(
+            'experiment.xml',
+            'experiment-%i.xml' % batch_no,
+            id_map=batch,
+            config=config,
+            experiment_title=experiment_title
+        )
+        write_xml('run.xml', 'run-%i.xml' % batch_no, id_map=batch, config=config)
+
+
+def createdataset(config):
+    """
+    Write dataset.xml referencing every RUN accession found in the receipt XML.
+
+    :param config: Merged configuration dict; config['receipt_xml'] is the receipt to parse.
+    """
+    receipt = xml.dom.minidom.parse(config['receipt_xml'])
+    run_accessions = []
+    for run in receipt.getElementsByTagName('RUN'):
+        run_accessions.append(run.attributes['accession'].value)
+
+    dataset_alias = IDGenerator(xml_dir).new()
+    write_xml(
+        'dataset.xml',
+        config=config,
+        dataset_alias=dataset_alias,
+        runs=run_accessions,
+        analyses=()
+    )
+
+
+def read_table(f: str) -> pandas.DataFrame:
+    """
+    Read a CSV or Excel table into a DataFrame, choosing the reader by file extension.
+
+    The extension is matched case-insensitively, so 'samples.CSV' and 'samples.XLSX'
+    are accepted.
+
+    :param f: Path to a .csv or .xlsx file.
+    :return: The table as a pandas DataFrame.
+    :raises NameError: If the extension is neither .csv nor .xlsx.
+    """
+    ext = os.path.splitext(f)[1].lower()
+    if ext == '.csv':
+        return pandas.read_csv(f)
+    if ext == '.xlsx':
+        return pandas.read_excel(f, engine='openpyxl')
+
+    raise NameError('Unrecognised file format for %s - must be csv or xlsx' % f)
+
+
+def render_xml(template, **kwargs):
+    """
+    Render the named template with `kwargs` and collapse runs of newlines into one.
+
+    :param template: Template name to load from the Jinja2 environment.
+    :return: The rendered text.
+    """
+    rendered = env.get_template(template).render(**kwargs)
+    return multi_newline.sub('\n', rendered)
+
+
+def write_xml(template_name, xml_name_out=None, **kwargs):
+    """
+    Render a template and write it into the output directory.
+
+    An existing file is left untouched unless the module-level `force` flag is set.
+
+    :param template_name: Name of the template to render.
+    :param xml_name_out: Output file name; falls back to `template_name` when not given.
+    :param kwargs: Variables passed to the template.
+    """
+    if not xml_name_out:
+        xml_name_out = template_name
+
+    path = os.path.join(xml_dir, xml_name_out)
+    if not force and os.path.isfile(path):
+        logger.info('%s already exists', xml_name_out)
+        return
+
+    content = render_xml(template_name, **kwargs)
+    with open(path, 'w') as f:
+        f.write(content)
+
+    logger.info('Written %s', xml_name_out)
+    logger.debug(content)
+
+
+class IDGenerator:
+    """
+    Generates UUID-based IDs that do not clash with aliases already present in
+    the XML files of a directory, nor with IDs handed out earlier by this instance.
+    """
+
+    def __init__(self, xml_dir=None):
+        """
+        :param xml_dir: Directory to scan for existing aliases; if falsy, start with none.
+        """
+        self.existing_ids = self.scrape_xml_dir(xml_dir) if xml_dir else set()
+
+    def new(self):
+        """
+        Return a new ID and remember it, trying up to three candidates.
+
+        :raises ValueError: If all three candidates already exist.
+        """
+        candidate = None
+        attempts = 0
+        while attempts < 3:
+            attempts += 1
+            candidate = str(uuid.uuid1())
+
+            if candidate not in self.existing_ids:
+                self.existing_ids.add(candidate)
+                return candidate
+
+            logger.warning('Attempted to generate an ID %s that already existed - retrying', candidate)
+
+        raise ValueError('Failed to generate an experiment ID. Last tried: %s' % candidate)
+
+    @staticmethod
+    def scrape_xml_dir(xml_dir):
+        """
+        Collect the 'alias' attribute of every DAC, POLICY, EXPERIMENT, RUN, STUDY and
+        SAMPLE element in the .xml files under `xml_dir` (searched recursively).
+
+        :return: Set of alias strings.
+        """
+        found = set()
+        for root, _dirs, files in os.walk(xml_dir):
+            for name in files:
+                if name.endswith('.xml'):
+                    dom = xml.dom.minidom.parse(os.path.join(root, name))
+                    for tag in ('DAC', 'POLICY', 'EXPERIMENT', 'RUN', 'STUDY', 'SAMPLE'):
+                        for element in dom.getElementsByTagName(tag):
+                            if element.hasAttribute('alias'):
+                                found.add(element.getAttribute('alias'))
+
+        return found
+
+
+def get_file_type(filename):
+    """
+    Return the file type for a file name, judged by its suffix.
+
+    :param filename: File name or path.
+    :return: 'fastq' for names ending in 'fq.gz' or 'fastq.gz'.
+    :raises NameError: For any other suffix.
+    """
+    if not filename.endswith(('fq.gz', 'fastq.gz')):
+        raise NameError('Could not determine file type for file %s' % filename)
+
+    return 'fastq'
+
+
+def batches(nitems, nbatches):
+    """
+    Split a number of items (`nitems`) into at most `nbatches` slices, taking
+    remainders into account if needed. E.g, splitting 99 fastq pairs into 4 batches
+    -> [25, 25, 25, 24] -> [(0, 25), (25, 50), (50, 75), (75, 99)].
+
+    Every batch except possibly the last has ceil(nitems / nbatches) items, so fewer
+    than `nbatches` slices may be returned (e.g. 9 items in 4 batches gives 3 slices of 3).
+
+    :param nitems: Total number of items to split.
+    :param nbatches: Maximum number of batches; must be at least 1.
+    :return: List of (start, end) tuples usable as slice bounds; empty if `nitems` is 0.
+    :raises ValueError: If `nbatches` is less than 1.
+    """
+    if nbatches < 1:
+        raise ValueError('nbatches must be at least 1, got %s' % nbatches)
+
+    if nitems <= 0:
+        # nothing to split - also avoids dividing by a zero batch size below
+        return []
+
+    full_batch = math.ceil(nitems / nbatches)  # full size batch
+    return [
+        (start, min(start + full_batch, nitems))
+        for start in range(0, nitems, full_batch)
+    ]
+
+
+if __name__ == '__main__':
+    main()
--- a/metadata/requirements.txt
+++ b/metadata/requirements.txt
+pyYAML>=6.0.2
+pandas>=2.2.2
+jinja2>=3.1.4
+openpyxl>=3.1.5
--- a/metadata/templates/analysis.xml
+++ b/metadata/templates/analysis.xml
+<ANALYSIS_SET>
+<ANALYSIS alias="{{ analysis.alias }}" center_name="{{ center_name }}" broker_name="EGA" >
+        <TITLE>{{ analysis.title }}</TITLE>
+        <DESCRIPTION>{{ analysis.description }}</DESCRIPTION>
+        <STUDY_REF refname="{{ study_alias }}" refcenter="{{ center_name }}"/>
+        <SAMPLE_REF refname="{{ analysis.sample_alias }}" refcenter="{{ center_name }}" label="{{ analysis.sample_alias }}"/>
+        <ANALYSIS_TYPE>
+            <REFERENCE_ALIGNMENT>
+                <ASSEMBLY>
+                    <STANDARD refname="{{ analysis.assembly_name }}" accession="{{ analysis.assembly_accession }}"/>
+                </ASSEMBLY>
+{% for chr in analysis.assembly_chromosomes %}
+                <SEQUENCE accession="{{ chr.accession }}" label="{{ chr.name }}"/>
+{% endfor %}
+            </REFERENCE_ALIGNMENT>
+        </ANALYSIS_TYPE>
+        <FILES>
+{% for f in analysis.files %}
+            <FILE filename="{{ f.path }}" filetype="{{ f.type }}" checksum_method="MD5" checksum="{{ f.checksum }}" unencrypted_checksum="{{ f.unencrypted_checksum }}"/>
+{% endfor %}
+        </FILES>
+    </ANALYSIS>
+</ANALYSIS_SET>
--- a/metadata/templates/dac.xml
+++ b/metadata/templates/dac.xml
+<DAC_SET>
+    <DAC alias="{{ config.dac_alias }}" center_name="{{ center_name }}" broker_name="EGA">
+        <TITLE>{{ config.dac_title }}</TITLE>
+        <CONTACTS>
+{% for name, email, org in dac_contacts %}
+            <CONTACT name="{{ name }}" email="{{ email }}" organisation="{{ org }}" />
+{% endfor %}
+        </CONTACTS>
+{% if dac_attributes %}
+        <DAC_ATTRIBUTES>
+{% for k, v, unit in dac_attributes %}
+            <DAC_ATTRIBUTE>
+				<TAG>{{ k }}</TAG>
+				<VALUE>{{ v }}</VALUE>
+{% if unit %}
+				<UNITS>{{ unit }}</UNITS>
+{% endif %}
+			</DAC_ATTRIBUTE>
+{% endfor %}
+        </DAC_ATTRIBUTES>
+{% endif %}
+    </DAC>
+</DAC_SET>
--- a/metadata/templates/dataset.xml
+++ b/metadata/templates/dataset.xml
+<DATASETS>
+    <DATASET alias="{{ dataset_alias }}" broker_name="EGA">
+        <TITLE>{{ config.dataset_title }}</TITLE>
+        <DATASET_TYPE>{{ config.dataset_type }}</DATASET_TYPE>
+{% for r in runs %}
+        <RUN_REF accession="{{ r }}" />
+{% endfor %}
+{% for a in analyses %}
+        <ANALYSIS_REF accession="{{ a }}" />
+{% endfor %}
+        <POLICY_REF accession="{{ config.policy_accession }}" />
+{% if links %}
+        <DATASET_LINKS>
+{% for k, v in links.items() %}
+            <DATASET_LINK>
+                <URL_LINK>
+                    <LABEL>{{ k }}</LABEL>
+                    <URL>{{ v }}</URL>
+                </URL_LINK>
+            </DATASET_LINK>
+{% endfor %}
+        </DATASET_LINKS>
+{% endif %}
+    </DATASET>
+</DATASETS>
--- a/metadata/templates/experiment.xml
+++ b/metadata/templates/experiment.xml
+<EXPERIMENT_SET>
+{% for line in id_map %}
+  <EXPERIMENT alias="{{ line['experiment_id'] }}">
+    <TITLE>{{ experiment_title }}</TITLE>
+    <STUDY_REF refname="{{ config.study_alias }}" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="{{ line['Sample alias'] }}" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>{{ config.library_strategy | default('WGS') }}</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>{{ config.library_source | default('GENOMIC') }}</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>{{ config.library_selection | default('other') }}</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="{{ config.library_nominal_length }}"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>{{ config.library_protocol }}</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+{% if config.platform_type == 'illumina' %}
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>{{ config.platform_instrument }}</INSTRUMENT_MODEL>
+      </ILLUMINA>
+{% endif %}
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+{% endfor %}
+</EXPERIMENT_SET>
--- a/metadata/templates/policy.xml
+++ b/metadata/templates/policy.xml
+<POLICY_SET>
+    <POLICY alias="{{ config.policy_alias }}" broker_name="EGA">
+        <TITLE>{{ config.policy_title }}</TITLE>
+        <DAC_REF accession="{{ config.dac_accession }}" />
+{% if config.policy_text %}
+        <POLICY_TEXT>{{ config.policy_text }}</POLICY_TEXT>
+{% endif %}
+{% if config.policy_file_url %}
+        <POLICY_FILE>{{ config.policy_file_url }}</POLICY_FILE>
+{% endif %}
+{% if policy_links %}
+        <POLICY_LINKS>
+{% for k, v in policy_links.items() %}
+            <POLICY_LINK>
+                <URL_LINK>
+                    <LABEL>{{ k }}</LABEL>
+                    <URL>{{ v }}</URL>
+                </URL_LINK>
+            </POLICY_LINK>
+{% endfor %}
+        </POLICY_LINKS>
+{% endif %}
+{% if policy_attributes %}
+        <POLICY_ATTRIBUTES>
+{% for tag, value, unit in policy_attributes %}
+            <POLICY_ATTRIBUTE>
+                <TAG>{{ tag }}</TAG>
+                <VALUE>{{ value }}</VALUE>
+{% if unit %}
+                <UNITS>{{ unit }}</UNITS>
+{% endif %}
+            </POLICY_ATTRIBUTE>
+{% endfor %}
+        </POLICY_ATTRIBUTES>
+{% endif %}
+    </POLICY>
+</POLICY_SET>
--- a/metadata/templates/run.xml
+++ b/metadata/templates/run.xml
+<RUN_SET>
+{% for line in id_map %}
+  <RUN alias="{{ line['run_id'] }}">
+    <EXPERIMENT_REF refname="{{ line['experiment_id'] }}" />
+    <DATA_BLOCK>
+      <FILES>
+        <FILE filename="{{ line['r1'] }}" filetype="{{ line['file_type_r1'] }}" checksum_method="MD5" checksum="{{ line['First Checksum'] }}" unencrypted_checksum="{{ line['First Unencrypted checksum'] }}" />
+        <FILE filename="{{ line['r2'] }}" filetype="{{ line['file_type_r2'] }}" checksum_method="MD5" checksum="{{ line['Second Checksum'] }}" unencrypted_checksum="{{ line['Second Unencrypted checksum'] }}" />
+      </FILES>
+    </DATA_BLOCK>
+  </RUN>
+{% endfor %}
+</RUN_SET>
--- a/metadata/templates/sample.xml
+++ b/metadata/templates/sample.xml
+<SAMPLE_SET>
+{% for sample in samples %}
+    <SAMPLE alias="{{ sample.alias }}" center_name="{{ center_name }}">
+        <TITLE>{{ sample.title }}</TITLE>
+        <SAMPLE_NAME>
+            <TAXON_ID>{{ sample.taxon_id | default(9606) }}</TAXON_ID>
+{% if sample.scientific_name %}
+            <SCIENTIFIC_NAME>{{ sample.scientific_name }}</SCIENTIFIC_NAME>
+{% endif %}
+{% if sample.common_name %}
+            <COMMON_NAME>{{ sample.common_name }}</COMMON_NAME>
+{% endif %}
+        </SAMPLE_NAME>
+        <DESCRIPTION>{{ sample.description }}</DESCRIPTION>
+        <SAMPLE_ATTRIBUTES>
+            <SAMPLE_ATTRIBUTE>
+                <TAG>subject_id</TAG>
+                <VALUE>{{ sample.alias }}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+            <SAMPLE_ATTRIBUTE>
+                <TAG>sex</TAG>
+                <VALUE>{{ sample.sex | default('unknown') }}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+            <SAMPLE_ATTRIBUTE>
+                <TAG>phenotype</TAG>
+                <VALUE>{{ sample.phenotype }}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+
+{% for k, v in sample.attributes.items() %}
+            <SAMPLE_ATTRIBUTE>
+                <TAG>{{ k }}</TAG>
+                <VALUE>{{ v }}</VALUE>
+            </SAMPLE_ATTRIBUTE>
+{% endfor %}
+        </SAMPLE_ATTRIBUTES>
+    </SAMPLE>
+{% endfor %}
+</SAMPLE_SET>
--- a/metadata/templates/study.xml
+++ b/metadata/templates/study.xml
+<STUDY_SET>
+    <STUDY alias="{{ config.study_alias }}" center_name="{{ config.center_name }}">
+        <DESCRIPTOR>
+            <STUDY_TITLE>{{ config.study_title }}</STUDY_TITLE>
+            <STUDY_TYPE existing_study_type="{{ config.study_type }}"/>
+            <STUDY_ABSTRACT>{{ config.abstract }}</STUDY_ABSTRACT>
+        </DESCRIPTOR>
+        <STUDY_ATTRIBUTES>
+{% for k, v in study_attributes.items() %}
+            <STUDY_ATTRIBUTE>
+                <TAG>{{ k }}</TAG>
+                <VALUE>{{ v }}</VALUE>
+            </STUDY_ATTRIBUTE>
+{% endfor %}
+        </STUDY_ATTRIBUTES>
+    </STUDY>
+</STUDY_SET>
--- a/metadata/templates/submission.xml
+++ b/metadata/templates/submission.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<SUBMISSION_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="SRA.submission.xsd">
+    <SUBMISSION alias="" broker_name="EGA">
+{% if submission_contacts %}
+        <CONTACTS>
+{% for name, email in submission_contacts %}
+            <CONTACT name="{{ name }}" inform_on_status="{{ email }}" inform_on_error="{{ email }}"/>
+{% endfor %}
+        </CONTACTS>
+{% endif %}
+        <ACTIONS>
+            <ACTION>
+                <ADD />
+            </ACTION>
+            <ACTION>
+                <PROTECT/>
+            </ACTION>
+        </ACTIONS>
+    </SUBMISSION>
+</SUBMISSION_SET>
--- a/metadata/tests/abstract.txt
+++ b/metadata/tests/abstract.txt
+Some study abstract
--- a/metadata/tests/ega_upload.yaml
+++ b/metadata/tests/ega_upload.yaml
+---
+
+output_dir: xml
+force: true
+
+submission_contacts:
+  - 'Somebody:somebody@ed.ac.uk'
+
+dac_alias: test_dac
+dac_title: Test DAC
+dac_contacts:
+  - Somebody:somebody@ed.ac.uk:DAC
+dac_attributes:
+  - this:that:other
+
+policy_alias: some_data_access_policy
+policy_title: Some Data Access Policy
+policy_text: Some data access policy text
+dac_accession: EGAC01234
+policy_attributes:
+  - this:that:other
+
+study_alias: some_study
+study_type: Whole Genome Sequencing
+study_title: Some study title
+study_abstract: abstract.txt
+study_attributes: study_attributes.csv
+
+samples: samples.csv
+
+nbatches: 3
+runs: runs.csv
+experiment_title: Whole genome sequencing for some dataset
+library_strategy: WGS
+library_source: GENOMIC
+library_selection: other
+library_nominal_length: 450
+library_protocol: |
+  Some library protocol text
+platform_type: illumina
+platform_instrument: HiSeq X Ten
+file_box_base: some_dataset/fastq
+
+receipt_xml: receipt.xml
+policy_accession: EGAP01234
+dataset_title: Test dataset
+dataset_type: 'Exome sequencing'
--- a/metadata/tests/expected/dac.xml
+++ b/metadata/tests/expected/dac.xml
+<DAC_SET>
+    <DAC alias="IGC DAC" center_name="" broker_name="EGA">
+        <TITLE>Test DAC</TITLE>
+        <CONTACTS>
+            <CONTACT name="Murray Wham" email="murray.wham@ed.ac.uk" organisation="IGC" />
+        </CONTACTS>
+    </DAC>
+</DAC_SET>
\ No newline at end of file
--- a/metadata/tests/expected/dataset.xml
+++ b/metadata/tests/expected/dataset.xml
+<DATASETS>
+    <DATASET alias="00000000-0000-0000-0000-000000000011" broker_name="EGA">
+        <TITLE>Test dataset</TITLE>
+        <DATASET_TYPE>Exome sequencing</DATASET_TYPE>
+        <RUN_REF accession="EGAR00001" />
+        <RUN_REF accession="EGAR00002" />
+        <RUN_REF accession="EGAR00003" />
+        <POLICY_REF accession="EGAP01234" />
+    </DATASET>
+</DATASETS>
\ No newline at end of file
--- a/metadata/tests/expected/experiment-1.xml
+++ b/metadata/tests/expected/experiment-1.xml
+<EXPERIMENT_SET>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000001">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample1" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000002">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample2" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000003">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample3" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+</EXPERIMENT_SET>
\ No newline at end of file
--- a/metadata/tests/expected/experiment-2.xml
+++ b/metadata/tests/expected/experiment-2.xml
+<EXPERIMENT_SET>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000004">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample4" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000005">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample5" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000006">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample6" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+</EXPERIMENT_SET>
\ No newline at end of file
--- a/metadata/tests/expected/experiment-3.xml
+++ b/metadata/tests/expected/experiment-3.xml
+<EXPERIMENT_SET>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000007">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample7" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+  <EXPERIMENT alias="00000000-0000-0000-0000-000000000008">
+    <TITLE></TITLE>
+    <STUDY_REF refname="some_study" />
+    <DESIGN>
+      <DESIGN_DESCRIPTION />
+      <SAMPLE_DESCRIPTOR refname="sample8" />
+      <LIBRARY_DESCRIPTOR>
+        <LIBRARY_NAME></LIBRARY_NAME>
+        <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
+        <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
+        <LIBRARY_SELECTION>other</LIBRARY_SELECTION>
+        <LIBRARY_LAYOUT>
+          <PAIRED NOMINAL_LENGTH="450"></PAIRED>
+        </LIBRARY_LAYOUT>
+        <LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
+</LIBRARY_CONSTRUCTION_PROTOCOL>
+      </LIBRARY_DESCRIPTOR>
+    </DESIGN>
+    <PLATFORM>
+      <ILLUMINA>
+        <INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
+      </ILLUMINA>
+    </PLATFORM>
+    <PROCESSING />
+  </EXPERIMENT>
+</EXPERIMENT_SET>
\ No newline at end of file
--- a/metadata/tests/expected/policy.xml
+++ b/metadata/tests/expected/policy.xml
+<POLICY_SET>
+    <POLICY alias="some_data_access_policy" broker_name="EGA">
+        <TITLE>Some Data Access Policy</TITLE>
+        <DAC_REF accession="EGAC01234" />
+        <POLICY_TEXT>Some data access policy text</POLICY_TEXT>
+        <POLICY_ATTRIBUTES>
+            <POLICY_ATTRIBUTE>
+                <TAG>this</TAG>
+                <VALUE>that</VALUE>
+                <UNITS>other</UNITS>
+            </POLICY_ATTRIBUTE>
+        </POLICY_ATTRIBUTES>
+    </POLICY>
+</POLICY_SET>
\ No newline at end of file