Skip to content
Snippets Groups Projects
Commit 59bec6ab authored by mwham's avatar mwham
Browse files

Initial commit of metadata submission script

parent ba9f9a07
Branches metadata
No related tags found
1 merge request!2Metadata
Showing
with 894 additions and 1 deletion
*~
*#
.idea
*.nextflow*
__pycache__
results
work
reads
output
test*
metadata/tests/xml
import os
import re
import sys
import math
import yaml
import uuid
import jinja2
import pandas
import logging
import argparse
import xml.dom.minidom
# Matches two or more consecutive newlines; render_xml() uses it to collapse them into one.
multi_newline = re.compile(r'\n{2,}')

# Declarative option table. The top-level key is the parser the options belong to
# (__name__ for the root parser, otherwise the subcommand name). Each option maps to the
# keyword arguments handed to argparse's add_argument(), plus an optional 'alias' entry
# holding a short flag such as '-d'.
options = {
    # universal options - root parser has prog=__name__
    __name__: {
        'debug': {'help': 'Set the logging level to DEBUG', 'action': 'store_true', 'default': False, 'alias': '-d'},
        # The default has to be the boolean False: the string 'False' is truthy, which
        # would switch overwriting on for every run.
        'force': {'help': 'Always overwrite output XMLs', 'action': 'store_true', 'default': False, 'alias': '-f'},
        'output_dir': {'help': 'Directory to write output XMLs', 'alias': '-o'},
        'submission_contacts': {'help': "Colon-separated name and email to add to the submission for notifications, e.g. 'Murray Wham:murray.wham@ed.ac.uk'. Can be specified multiple times.", 'nargs': '+'}
    },
    'createdac': {
        'dac_alias': {'help': 'Alias for the DAC object to create'},
        'dac_title': {'help': 'Short text to show in searches and displays'},
        'dac_contacts': {'help': "Colon-separated name, email and organisation to add to the dac, e.g. 'Murray Wham:murray.wham@ed.ac.uk:IGC'. Can be specified multiple times.", 'nargs': '+'},
        'dac_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the DAC. Can be specified multiple times.", 'nargs': '+'}
    },
    'createpolicy': {
        'policy_alias': {'help': 'Alias for the policy object to create'},
        'policy_title': {'help': 'Policy title'},
        'dac_accession': {'help': 'EGA accession number for the DAC object to link this policy to'},
        'policy_text': {'help': 'Policy text. This or policy_file_url is required.'},
        'policy_file_url': {'help': 'URL of the policy document. This or policy_text is required'},
        'policy_links': {'help': "Semicolon-separated description and URL of policy materials to link to, e.g. 'IGC Data Access Agreement;https://www.ed.ac.uk/some-policy'. Can be specified multiple times.", 'nargs': '+'},
        'policy_attributes': {'help': "Colon-separated tag, value and unit of extra attributes to list with the policy. Can be specified multiple times.", 'nargs': '+'}
    },
    'createstudy': {
        'study_alias': {'help': 'Unique ID for the study. Used to refer to the study during the submission process and is supposed to be globally unique within the submission account, but is not shared with anyone.'},
        'study_type': {
            'help': 'Study type accepted by EGA, e.g. Whole Genome Sequencing',
            'choices': (
                'Whole Genome Sequencing', 'Metagenomics', 'Transcriptome Analysis', 'Resequencing',
                'Epigenetics', 'Synthetic Genomics', 'Forensic or Paleo-genomics', 'Gene Regulation Study',
                'Cancer Genomics', 'Population Genomics', 'RNASeq', 'Exome Sequencing',
                'Pooled Clone Sequencing', 'Transcriptome Sequencing', 'Other (Study type not listed)'
            )
        },
        'study_title': {'help': 'Study title'},
        'study_abstract': {'help': 'Path to plain-text file containing the abstract for this study'},
        'study_attributes': {'help': "CSV or Excel table with two columns ('tag' and 'value'), describing study attributes to add to study.xml"}
    },
    'createsamples': {
        # The column list mirrors the mandatory columns checked in createsamples()
        'samples': {'help': "CSV or Excel table with at least 8 columns ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype'), describing samples to add. Any additional columns will be included as sample attributes"},
    },
    'createrunsandexperiments': {
        # Parsed as an int and defaulted to a single batch so that the value is always usable for arithmetic
        'nbatches': {'help': 'Set to a number larger than 1 to split paired-end fastqs into smaller batches. This may be required if uploading large datasets - ega-box accounts should not exceed 8Tb and must not exceed 10Tb.', 'type': int, 'default': 1},
        'runs': {'help': "Path to CSV file linking samples to files. Must have the columns 'Sample alias', 'First Fastq File', 'First Checksum', 'First Unencrypted checksum', 'Second Fastq File', 'Second Checksum', 'Second Unencrypted checksum'", 'default': 'runs.csv'},
        'experiment_title': {'help': 'Title to apply to all created experiment objects'},
        'library_strategy': {'help': 'Value to apply to LIBRARY_STRATEGY, e.g. WGS'},
        'library_source': {'help': 'Value to apply to LIBRARY_SOURCE, e.g. GENOMIC'},
        'library_selection': {'help': 'Value to apply to LIBRARY_SELECTION', 'default': 'other'},
        'library_nominal_length': {'help': 'Library nominal length, e.g. 450'},
        'library_protocol': {'help': 'Description of the library preparation process'},
        'platform_type': {'help': 'Platform type, e.g. illumina'},
        'platform_instrument': {'help': 'Instrument type, e.g. HiSeq X Ten'},
        'file_box_base': {'help': 'Base file path to apply to uploaded files when passing to `filename`, e.g. if files are uploaded to a folder structure on the ega-box FTP server', 'default': ''}
    },
    'createdataset': {
        'receipt_xml': {'help': 'XML receipt file from running stage 1, containing run accession numbers from EGA'},
        'policy_accession': {'help': "'EGAPxxx...' accession number of the data access policy to apply to this dataset"},
        'dataset_title': {'help': 'Dataset title'},
        'dataset_type': {
            'help': 'Dataset type accepted by EGA, e.g. Whole genome sequencing',
            'choices': (
                'Whole genome sequencing', 'Exome sequencing', 'Genotyping by array',
                'Transcriptome profiling by high-throughput sequencing', 'Transcriptome profiling by array',
                'Amplicon sequencing', 'Methylation binding domain sequencing',
                'Methylation profiling by high-throughput sequencing', 'Phenotype information',
                'Study summary information', 'Genomic variant calling',
                'Chromatin accessibility profiling by high-throughput sequencing',
                'Histone modification profiling by high-throughput sequencing', 'Chip-Seq'
            )
        }
    }
}
# Module logger; it starts at INFO and configure() lowers it to DEBUG on request.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Log records go to stdout. The handler is only attached to the logger inside main().
formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s] %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# Templates are loaded from the 'templates' folder that sits next to this file.
env = jinja2.Environment(
    autoescape=jinja2.select_autoescape(),
    loader=jinja2.FileSystemLoader(
        os.path.join(os.path.dirname(__file__), 'templates')
    ),
)

# Run-wide settings, overwritten by configure(): the output directory for XML files and
# whether files that already exist may be overwritten.
force = False
xml_dir = ''
def cmd_args(argv=None):
    """
    Build the argument parser from the `options` table and parse `argv`.

    Options listed under `__name__` are added to the root parser; every other top-level
    key becomes a subcommand carrying its own options. Each parser stores its `prog` in
    the `entry` attribute of the result, so callers can tell which subcommand was chosen.

    :param argv: List of argument strings, or None to let argparse read sys.argv.
    :return: The argparse.Namespace produced by parse_args().
    """
    a = argparse.ArgumentParser(__name__)
    subparsers = a.add_subparsers()

    for n in options:
        p = a if n == __name__ else subparsers.add_parser(n)
        p.set_defaults(entry=p.prog)

        for k, v in options[n].items():
            # Work on a copy: popping 'alias' from the shared table would strip the short
            # flags from every later call in the same process.
            kwargs = dict(v)
            alias = kwargs.pop('alias', None)  # e.g. -s for --study_alias
            names = ['--' + k]
            if alias:
                names.append(alias)

            p.add_argument(*names, **kwargs)

    return a.parse_args(argv)
def file_args():
    """
    Load options from a YAML config file.

    Two locations are tried in order: the path held in the EGA_UPLOAD_CONFIG environment
    variable, then 'ega_upload.yaml' in the current working directory. Only the first one
    that exists is read.

    :return: The parsed file content, or an empty dict when no file exists or the file
             holds no YAML document.
    """
    candidates = (
        os.getenv('EGA_UPLOAD_CONFIG'),
        os.path.join(os.getcwd(), 'ega_upload.yaml'),
    )
    for f in candidates:
        if f and os.path.isfile(f):
            with open(f) as h:
                # safe_load gives None for an empty file; callers do membership tests on
                # the result, so hand back a dict instead.
                return yaml.safe_load(h) or {}

    return {}
def _merge_file_and_cmd_args(file_config, args):
    """
    Combine command-line values, file values and declared defaults into one dict.

    Only the universal options and those of the selected subcommand are considered. For
    each of them the first available source wins: a command-line value that is set and
    differs from the declared default, then the file config, then the declared default.
    Options with none of these are left out of the result.
    """
    cmd_values = vars(args)
    subcommand = args.entry.replace(__name__ + ' ', '')
    merged = {}

    for section in {__name__, subcommand}:
        for name, spec in options[section].items():
            from_cmd = cmd_values.get(name)

            # A command-line value equal to the default is treated as "not given"
            if from_cmd is not None and from_cmd != spec.get('default'):
                merged[name] = from_cmd
                continue

            if name in file_config:
                merged[name] = file_config[name]
                continue

            if 'default' in spec:
                merged[name] = spec['default']

    return merged
def configure(argv=None):
    """
    Parse all configuration sources and apply the run-wide settings.

    Sets the logging level, creates the output directory and updates the module-level
    `xml_dir` and `force` variables.

    :param argv: Argument strings passed on to cmd_args(), or None for sys.argv.
    :return: Tuple of the selected parser's `entry` string and the merged config dict.
    :raises KeyError: If no output directory was configured.
    """
    global xml_dir, force

    args = cmd_args(argv)
    file_conf = file_args()
    config = _merge_file_and_cmd_args(file_conf, args)

    if config['debug']:
        handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)

    if not config.get('output_dir'):
        raise KeyError('output_dir is required - pass --output_dir or set it in the config file')

    logger.info('Output dir: %s', config['output_dir'])
    xml_dir = config['output_dir']
    os.makedirs(xml_dir, exist_ok=True)

    # A textual value such as 'False' is truthy, so interpret strings explicitly rather
    # than relying on their truthiness.
    force_option = config['force']
    if isinstance(force_option, str):
        force_option = force_option.strip().lower() in ('true', 'yes', '1')

    # Always assign, so a second call in the same process can switch overwriting off again
    force = bool(force_option)

    return args.entry, config
def main(argv=None):
    """
    Command-line entry point: configure the run, write submission.xml, then run the
    stage whose name the selected parser entry ends with (if any).
    """
    logger.addHandler(handler)
    entry, config = configure(argv)

    # we'll always need this
    createsubmission(config)

    stages = (
        ('createdac', createdac),
        ('createpolicy', createpolicy),
        ('createstudy', createstudy),
        ('createsamples', createsamples),
        ('createrunsandexperiments', createrunsandexperiments),
        ('createdataset', createdataset),
    )
    for suffix, stage in stages:
        if entry.endswith(suffix):
            stage(config)
            break
def createsubmission(config):
    """
    Write submission.xml, passing each configured submission contact to the template as
    the list of its colon-separated parts.
    """
    contacts = []
    for contact in config.get('submission_contacts', ()):
        contacts.append(contact.split(':'))

    write_xml('submission.xml', config=config, submission_contacts=contacts)
def createdac(config):
    """
    Write dac.xml from the configured DAC contacts and attributes.

    Contacts and attributes are colon-separated strings. An attribute with exactly two
    parts gets None appended in place of the unit.
    """
    attributes = []
    for raw in config.get('dac_attributes', ()):
        fields = raw.split(':')
        # tag and value only - pad with an empty unit
        attributes.append(fields + [None] if len(fields) == 2 else fields)

    contacts = [c.split(':') for c in config.get('dac_contacts', ())]

    write_xml('dac.xml', config=config, dac_contacts=contacts, dac_attributes=attributes)
def createpolicy(config):
    """
    Write policy.xml from the configured policy text/URL, links and attributes.

    :param config: Merged configuration dict. Options that were never set are absent
                   from it, so every lookup here uses .get().
    :raises KeyError: If neither policy_text nor policy_file_url is set.
    :raises ValueError: If a policy link has no ';' between description and URL.
    """
    if not config.get('policy_text') and not config.get('policy_file_url'):
        raise KeyError('policy_text or policy_file_url is required')

    policy_attributes = []
    for attr in config.get('policy_attributes', ()):
        split_attr = attr.split(':')
        if len(split_attr) == 2:  # no unit
            split_attr.append(None)

        policy_attributes.append(split_attr)

    # Links are 'description;URL'. The separator is a semicolon because URLs contain
    # colons, and the template iterates policy_links.items(), so a mapping is required.
    policy_links = {}
    for link in config.get('policy_links', ()):
        description, separator, url = link.partition(';')
        if not separator:
            raise ValueError("Policy link '%s' must be a semicolon-separated description and URL" % link)

        policy_links[description] = url

    write_xml(
        'policy.xml',
        config=config,
        policy_links=policy_links,
        policy_attributes=policy_attributes
    )
def createstudy(config):
    """
    Write study.xml from the study attributes table and the study abstract file.

    :param config: Merged configuration dict. 'study_attributes' is the path of a table
                   with 'tag' and 'value' columns; 'study_abstract' is the path of a text
                   file whose content is exposed to the template as `config.abstract`.
    :raises ValueError: If the attributes table contains the same tag more than once.
    """
    df = read_table(config['study_attributes'])

    # Comparing against list(set(...)) depends on arbitrary set ordering and reports
    # duplicates that are not there, so ask pandas directly.
    if df['tag'].duplicated().any():
        raise ValueError('Duplicate tags found in %s' % config['study_attributes'])

    study_attributes = {row.tag: row.value for row in df.itertuples()}

    # Copy before adding the abstract so the caller's config is left untouched
    template_config = dict(config)
    abstract_path = config.get('study_abstract')
    if abstract_path and 'abstract' not in template_config:
        # NOTE(review): assumes the abstract file is UTF-8 encoded - confirm
        with open(abstract_path, encoding='utf-8') as f:
            template_config['abstract'] = f.read().strip()

    write_xml('study.xml', config=template_config, study_attributes=study_attributes)
def createsamples(config):
    """
    Write sample.xml from the samples table.

    The first eight columns of the table must be the mandatory columns, in order. Every
    other column is passed to the template as an extra sample attribute. Cells without a
    value are left out of the sample.

    :param config: Merged configuration dict; 'samples' is the path of the table.
    :raises ValueError: If the leading columns are not the mandatory columns.
    """
    df = read_table(config['samples'])
    mandatory_columns = ('id', 'title', 'taxon_id', 'scientific_name', 'common_name', 'description', 'sex', 'phenotype')

    if tuple(df.columns[:len(mandatory_columns)]) != mandatory_columns:
        raise ValueError('First 8 columns of %s must be %s' % (config['samples'], mandatory_columns))

    extra_cols = tuple(c for c in df.columns if c not in mandatory_columns)

    samples = []
    for _, s in df.iterrows():
        new_sample = {k: s[k] for k in mandatory_columns if pandas.notna(s[k])}
        new_sample['alias'] = new_sample['id']

        # pandas marks empty cells as NaN rather than None, so test with notna() - the
        # same check used for the mandatory columns above.
        new_sample['attributes'] = {c: s[c] for c in extra_cols if pandas.notna(s[c])}
        samples.append(new_sample)

    write_xml('sample.xml', config=config, samples=samples)
def createrunsandexperiments(config):
    """
    Write one experiment XML and one run XML per batch of rows in the runs table.

    Every row gets a newly generated experiment ID and run ID, the box path of both
    fastq files and their file types. The rows are then split into batches and written
    to experiment-<n>.xml and run-<n>.xml, numbered from 1.

    :param config: Merged configuration dict; 'runs' is the path of the runs table.
    """
    idgen = IDGenerator(xml_dir)
    df = read_table(config['runs'])
    nsamples = len(df)

    df['experiment_id'] = [idgen.new() for _ in range(nsamples)]
    df['run_id'] = [idgen.new() for _ in range(nsamples)]

    file_box_base = config.get('file_box_base', '')
    df['r1'] = df['First Fastq File'].map(lambda f: os.path.join(file_box_base, f))
    df['r2'] = df['Second Fastq File'].map(lambda f: os.path.join(file_box_base, f))
    df['file_type_r1'] = df['First Fastq File'].map(get_file_type)
    df['file_type_r2'] = df['Second Fastq File'].map(get_file_type)

    # nbatches may be absent, or a string when it came straight from the command line
    nbatches = int(config.get('nbatches') or 1)
    logger.info('Splitting %i samples into %i batches', nsamples, nbatches)

    # Nothing to split for an empty table
    slices = batches(nsamples, nbatches) if nsamples else []

    for batch_number, (start, stop) in enumerate(slices, start=1):
        batch = [line for _, line in df[start:stop].iterrows()]

        # The experiment template reads `experiment_title` as a top-level variable, so it
        # has to be passed explicitly or every TITLE element is rendered empty.
        write_xml(
            'experiment.xml',
            'experiment-%i.xml' % batch_number,
            id_map=batch,
            config=config,
            experiment_title=config.get('experiment_title', '')
        )
        write_xml('run.xml', 'run-%i.xml' % batch_number, id_map=batch, config=config)
def createdataset(config):
    """
    Write dataset.xml, referencing the 'accession' of every RUN element found in the
    receipt XML and using a newly generated ID as the dataset alias.
    """
    receipt = xml.dom.minidom.parse(config['receipt_xml'])

    run_accessions = []
    for run in receipt.getElementsByTagName('RUN'):
        run_accessions.append(run.attributes['accession'].value)

    alias = IDGenerator(xml_dir).new()
    write_xml('dataset.xml', config=config, dataset_alias=alias, runs=run_accessions, analyses=())
def read_table(f: str) -> pandas.DataFrame:
    """
    Read a table into a DataFrame, picking the reader from the end of the file name.

    :param f: Path of a file ending in '.csv' or 'xlsx'.
    :raises NameError: If the file name ends in neither.
    """
    if f.endswith('.csv'):
        return pandas.read_csv(f)

    if f.endswith('xlsx'):
        return pandas.read_excel(f, engine='openpyxl')

    raise NameError('Unrecognised file format for %s - must be csv or xlsx' % f)
def render_xml(template, **kwargs):
    """
    Render the named template with `kwargs` and return the text with every run of
    consecutive newlines reduced to a single newline.
    """
    rendered = env.get_template(template).render(**kwargs)
    return multi_newline.sub('\n', rendered)
def write_xml(template_name, xml_name_out=None, **kwargs):
    """
    Render a template and write the result into the output directory.

    An existing file is left alone unless the module-level `force` flag is set.

    :param template_name: Name of the template to render.
    :param xml_name_out: File name to write; defaults to the template name.
    :param kwargs: Variables passed to the template.
    """
    xml_name_out = xml_name_out or template_name
    path = os.path.join(xml_dir, xml_name_out)

    if os.path.isfile(path) and not force:
        logger.info('%s already exists', xml_name_out)
        return

    content = render_xml(template_name, **kwargs)

    # XML is UTF-8 unless it declares otherwise, so do not depend on the locale's
    # default encoding when writing.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

    logger.info('Written %s', xml_name_out)
    logger.debug(content)
class IDGenerator:
    """
    Hands out UUID-based IDs that differ from every alias already present in a
    directory of XML files and from every ID this instance has handed out before.
    """

    def __init__(self, xml_dir=None):
        """
        :param xml_dir: Directory to scan for aliases that are already in use. When
                        empty or None, the generator starts with no known IDs.
        """
        if xml_dir:
            self.existing_ids = self.scrape_xml_dir(xml_dir)
        else:
            self.existing_ids = set()

    def new(self):
        """
        Return a new ID and remember it as used.

        :raises ValueError: If three attempts in a row produce an ID that is already known.
        """
        candidate = None
        for _ in range(3):
            candidate = str(uuid.uuid1())
            if candidate not in self.existing_ids:
                self.existing_ids.add(candidate)
                return candidate

            logger.warning('Attempted to generate an ID %s that already existed - retrying', candidate)

        raise ValueError('Failed to generate an experiment ID. Last tried: %s' % candidate)

    @staticmethod
    def scrape_xml_dir(xml_dir):
        """
        Collect the 'alias' attribute of every known object element in all '.xml' files
        found under `xml_dir`, including its subdirectories.

        :return: Set of alias strings.
        """
        ids = set()
        # DATASET is included because dataset aliases are generated by this class as well
        tags = ('DAC', 'POLICY', 'EXPERIMENT', 'RUN', 'STUDY', 'SAMPLE', 'DATASET')

        for root, _dirs, files in os.walk(xml_dir):
            for f in files:
                if not f.endswith('.xml'):
                    continue

                dom = xml.dom.minidom.parse(os.path.join(root, f))
                for t in tags:
                    for e in dom.getElementsByTagName(t):
                        if e.hasAttribute('alias'):
                            ids.add(e.getAttribute('alias'))

        return ids
def get_file_type(filename):
    """
    Return 'fastq' for file names ending in 'fq.gz' or 'fastq.gz'.

    :raises NameError: For any other file name.
    """
    fastq_endings = ('fq.gz', 'fastq.gz')
    if not filename.endswith(fastq_endings):
        raise NameError('Could not determine file type for file %s' % filename)

    return 'fastq'
def batches(nitems, nbatches):
    """
    Split a number of items (`nitems`) into at most `nbatches` slices, taking
    remainders into account if needed. E.g, splitting 99 fastq pairs into 4 batches
    -> [25, 25, 25, 24] -> [(0, 25), (25, 50), (50, 75), (75, 99)].

    Every slice except possibly the last holds ceil(nitems / nbatches) items, so fewer
    than `nbatches` slices can come back (e.g. 5 items in 4 batches gives 3 slices).

    :param nitems: Number of items to split.
    :param nbatches: Requested number of batches; an int or a string holding one.
    :return: List of (start, stop) tuples; empty when there are no items.
    :raises ValueError: If `nbatches` is smaller than 1 or is not a whole number.
    """
    nbatches = int(nbatches)  # the value may arrive as a string from the command line
    if nbatches < 1:
        raise ValueError('nbatches must be at least 1, got %s' % nbatches)

    if nitems <= 0:
        return []

    full_batch = math.ceil(nitems / nbatches)  # full size batch
    return [
        (start, min(start + full_batch, nitems))
        for start in range(0, nitems, full_batch)
    ]
# Run the command-line interface only when this file is executed as a script, not
# when it is imported.
if __name__ == '__main__':
    main()
pyYAML>=6.0.2
pandas>=2.2.2
jinja2>=3.1.4
openpyxl>=3.1.5
<ANALYSIS_SET>
<ANALYSIS alias="{{ analysis.alias }}" center_name="{{ center_name }}" broker_name="EGA" >
<TITLE>{{ analysis.title }}</TITLE>
<DESCRIPTION>{{ analysis.description }}</DESCRIPTION>
<STUDY_REF refname="{{ study_alias }}" refcenter="{{ center_name }}"/>
<SAMPLE_REF refname="{{ analysis.sample_alias }}" refcenter="{{ center_name }}" label="{{ analysis.sample_alias }}"/>
<ANALYSIS_TYPE>
<REFERENCE_ALIGNMENT>
<ASSEMBLY>
<STANDARD refname="{{ analysis.assembly_name }}" accession="{{ analysis.assembly_accession }}"/>
</ASSEMBLY>
{% for chr in analysis.assembly_chromosomes %}
<SEQUENCE accession="{{ chr.accession }}" label="{{ chr.name }}"/>
{% endfor %}
</REFERENCE_ALIGNMENT>
</ANALYSIS_TYPE>
<FILES>
{% for f in analysis.files %}
<FILE filename="{{ f.path }}" filetype="{{ f.type }}" checksum_method="MD5" checksum="{{ f.checksum }}" unencrypted_checksum="{{ f.unencrypted_checksum }}"/>
{% endfor %}
</FILES>
</ANALYSIS>
</ANALYSIS_SET>
<DAC_SET>
<DAC alias="{{ config.dac_alias }}" center_name="{{ center_name }}" broker_name="EGA">
<TITLE>{{ config.dac_title }}</TITLE>
<CONTACTS>
{% for name, email, org in dac_contacts %}
<CONTACT name="{{ name }}" email="{{ email }}" organisation="{{ org }}" />
{% endfor %}
</CONTACTS>
{% if dac_attributes %}
<DAC_ATTRIBUTES>
{% for k, v, unit in dac_attributes %}
<DAC_ATTRIBUTE>
<TAG>{{ k }}</TAG>
<VALUE>{{ v }}</VALUE>
{% if unit %}
<UNITS>{{ unit }}</UNITS>
{% endif %}
</DAC_ATTRIBUTE>
{% endfor %}
</DAC_ATTRIBUTES>
{% endif %}
</DAC>
</DAC_SET>
<DATASETS>
<DATASET alias="{{ dataset_alias }}" broker_name="EGA">
<TITLE>{{ config.dataset_title }}</TITLE>
<DATASET_TYPE>{{ config.dataset_type }}</DATASET_TYPE>
{% for r in runs %}
<RUN_REF accession="{{ r }}" />
{% endfor %}
{% for a in analyses %}
<ANALYSIS_REF accession="{{ a }}" />
{% endfor %}
<POLICY_REF accession="{{ config.policy_accession }}" />
{% if links %}
<DATASET_LINKS>
{% for k, v in links.items() %}
<DATASET_LINK>
<URL_LINK>
<LABEL>{{ k }}</LABEL>
<URL>{{ v }}</URL>
</URL_LINK>
</DATASET_LINK>
{% endfor %}
</DATASET_LINKS>
{% endif %}
</DATASET>
</DATASETS>
<EXPERIMENT_SET>
{% for line in id_map %}
<EXPERIMENT alias="{{ line['experiment_id'] }}">
<TITLE>{{ experiment_title }}</TITLE>
<STUDY_REF refname="{{ config.study_alias }}" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="{{ line['Sample alias'] }}" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>{{ config.library_strategy | default('WGS') }}</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>{{ config.library_source | default('GENOMIC') }}</LIBRARY_SOURCE>
<LIBRARY_SELECTION>{{ config.library_selection | default('other') }}</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="{{ config.library_nominal_length }}"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>{{ config.library_protocol }}</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
{% if config.platform_type == 'illumina' %}
<ILLUMINA>
<INSTRUMENT_MODEL>{{ config.platform_instrument }}</INSTRUMENT_MODEL>
</ILLUMINA>
{% endif %}
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
{% endfor %}
</EXPERIMENT_SET>
<POLICY_SET>
<POLICY alias="{{ config.policy_alias }}" broker_name="EGA">
<TITLE>{{ config.policy_title }}</TITLE>
<DAC_REF accession="{{ config.dac_accession }}" />
{% if config.policy_text %}
<POLICY_TEXT>{{ config.policy_text }}</POLICY_TEXT>
{% endif %}
{% if config.policy_file_url %}
<POLICY_FILE>{{ config.policy_file_url }}</POLICY_FILE>
{% endif %}
{% if policy_links %}
<POLICY_LINKS>
{% for k, v in policy_links.items() %}
<POLICY_LINK>
<URL_LINK>
<LABEL>{{ k }}</LABEL>
<URL>{{ v }}</URL>
</URL_LINK>
</POLICY_LINK>
{% endfor %}
</POLICY_LINKS>
{% endif %}
{% if policy_attributes %}
<POLICY_ATTRIBUTES>
{% for tag, value, unit in policy_attributes %}
<POLICY_ATTRIBUTE>
<TAG>{{ tag }}</TAG>
<VALUE>{{ value }}</VALUE>
{% if unit %}
<UNITS>{{ unit }}</UNITS>
{% endif %}
</POLICY_ATTRIBUTE>
{% endfor %}
</POLICY_ATTRIBUTES>
{% endif %}
</POLICY>
</POLICY_SET>
<RUN_SET>
{% for line in id_map %}
<RUN alias="{{ line['run_id'] }}">
<EXPERIMENT_REF refname="{{ line['experiment_id'] }}" />
<DATA_BLOCK>
<FILES>
<FILE filename="{{ line['r1'] }}" filetype="{{ line['file_type_r1'] }}" checksum_method="MD5" checksum="{{ line['First Checksum'] }}" unencrypted_checksum="{{ line['First Unencrypted checksum'] }}" />
<FILE filename="{{ line['r2'] }}" filetype="{{ line['file_type_r2'] }}" checksum_method="MD5" checksum="{{ line['Second Checksum'] }}" unencrypted_checksum="{{ line['Second Unencrypted checksum'] }}" />
</FILES>
</DATA_BLOCK>
</RUN>
{% endfor %}
</RUN_SET>
<SAMPLE_SET>
{% for sample in samples %}
<SAMPLE alias="{{ sample.alias }}" center_name="{{ center_name }}">
<TITLE>{{ sample.title }}</TITLE>
<SAMPLE_NAME>
<TAXON_ID>{{ sample.taxon_id | default(9606) }}</TAXON_ID>
{% if sample.scientific_name %}
<SCIENTIFIC_NAME>{{ sample.scientific_name }}</SCIENTIFIC_NAME>
{% endif %}
{% if sample.common_name %}
<COMMON_NAME>{{ sample.common_name }}</COMMON_NAME>
{% endif %}
</SAMPLE_NAME>
<DESCRIPTION>{{ sample.description }}</DESCRIPTION>
<SAMPLE_ATTRIBUTES>
<SAMPLE_ATTRIBUTE>
<TAG>subject_id</TAG>
<VALUE>{{ sample.alias }}</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>sex</TAG>
<VALUE>{{ sample.sex | default('unknown') }}</VALUE>
</SAMPLE_ATTRIBUTE>
<SAMPLE_ATTRIBUTE>
<TAG>phenotype</TAG>
<VALUE>{{ sample.phenotype }}</VALUE>
</SAMPLE_ATTRIBUTE>
{% for k, v in sample.attributes.items() %}
<SAMPLE_ATTRIBUTE>
<TAG>{{ k }}</TAG>
<VALUE>{{ v }}</VALUE>
</SAMPLE_ATTRIBUTE>
{% endfor %}
</SAMPLE_ATTRIBUTES>
</SAMPLE>
{% endfor %}
</SAMPLE_SET>
<STUDY_SET>
<STUDY alias="{{ config.study_alias }}" center_name="{{ config.center_name }}">
<DESCRIPTOR>
<STUDY_TITLE>{{ config.study_title }}</STUDY_TITLE>
<STUDY_TYPE existing_study_type="{{ config.study_type }}"/>
<STUDY_ABSTRACT>{{ config.abstract }}</STUDY_ABSTRACT>
</DESCRIPTOR>
<STUDY_ATTRIBUTES>
{% for k, v in study_attributes.items() %}
<STUDY_ATTRIBUTE>
<TAG>{{ k }}</TAG>
<VALUE>{{ v }}</VALUE>
</STUDY_ATTRIBUTE>
{% endfor %}
</STUDY_ATTRIBUTES>
</STUDY>
</STUDY_SET>
<?xml version="1.0" encoding="UTF-8"?>
<SUBMISSION_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="SRA.submission.xsd">
<SUBMISSION alias="" broker_name="EGA">
{% if submission_contacts %}
<CONTACTS>
{% for name, email in submission_contacts %}
<CONTACT name="{{ name }}" inform_on_status="{{ email }}" inform_on_error="{{ email }}"/>
{% endfor %}
</CONTACTS>
{% endif %}
<ACTIONS>
<ACTION>
<ADD />
</ACTION>
<ACTION>
<PROTECT/>
</ACTION>
</ACTIONS>
</SUBMISSION>
</SUBMISSION_SET>
Some study abstract
---
output_dir: xml
force: true
submission_contacts:
- 'Somebody:somebody@ed.ac.uk'
dac_alias: test_dac
dac_title: Test DAC
dac_contacts:
- Somebody:somebody@ed.ac.uk:DAC
dac_attributes:
- this:that:other
policy_alias: some_data_access_policy
policy_title: Some Data Access Policy
policy_text: Some data access policy text
dac_accession: EGAC01234
policy_attributes:
- this:that:other
study_alias: some_study
study_type: Whole Genome Sequencing
study_title: Some study title
study_abstract: abstract.txt
study_attributes: study_attributes.csv
samples: samples.csv
nbatches: 3
runs: runs.csv
experiment_title: Whole genome sequencing for some dataset
library_strategy: WGS
library_source: GENOMIC
library_selection: other
library_nominal_length: 450
library_protocol: |
Some library protocol text
platform_type: illumina
platform_instrument: HiSeq X Ten
file_box_base: some_dataset/fastq
receipt_xml: receipt.xml
policy_accession: EGAP01234
dataset_title: Test dataset
dataset_type: 'Exome sequencing'
<DAC_SET>
<DAC alias="IGC DAC" center_name="" broker_name="EGA">
<TITLE>Test DAC</TITLE>
<CONTACTS>
<CONTACT name="Murray Wham" email="murray.wham@ed.ac.uk" organisation="IGC" />
</CONTACTS>
</DAC>
</DAC_SET>
\ No newline at end of file
<DATASETS>
<DATASET alias="00000000-0000-0000-0000-000000000011" broker_name="EGA">
<TITLE>Test dataset</TITLE>
<DATASET_TYPE>Exome sequencing</DATASET_TYPE>
<RUN_REF accession="EGAR00001" />
<RUN_REF accession="EGAR00002" />
<RUN_REF accession="EGAR00003" />
<POLICY_REF accession="EGAP01234" />
</DATASET>
</DATASETS>
\ No newline at end of file
<EXPERIMENT_SET>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000001">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample1" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000002">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample2" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000003">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample3" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
</EXPERIMENT_SET>
\ No newline at end of file
<EXPERIMENT_SET>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000004">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample4" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000005">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample5" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000006">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample6" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
</EXPERIMENT_SET>
\ No newline at end of file
<EXPERIMENT_SET>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000007">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample7" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
<EXPERIMENT alias="00000000-0000-0000-0000-000000000008">
<TITLE></TITLE>
<STUDY_REF refname="some_study" />
<DESIGN>
<DESIGN_DESCRIPTION />
<SAMPLE_DESCRIPTOR refname="sample8" />
<LIBRARY_DESCRIPTOR>
<LIBRARY_NAME></LIBRARY_NAME>
<LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>
<LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>
<LIBRARY_SELECTION>other</LIBRARY_SELECTION>
<LIBRARY_LAYOUT>
<PAIRED NOMINAL_LENGTH="450"></PAIRED>
</LIBRARY_LAYOUT>
<LIBRARY_CONSTRUCTION_PROTOCOL>Some library protocol text
</LIBRARY_CONSTRUCTION_PROTOCOL>
</LIBRARY_DESCRIPTOR>
</DESIGN>
<PLATFORM>
<ILLUMINA>
<INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL>
</ILLUMINA>
</PLATFORM>
<PROCESSING />
</EXPERIMENT>
</EXPERIMENT_SET>
\ No newline at end of file
<POLICY_SET>
<POLICY alias="some_data_access_policy" broker_name="EGA">
<TITLE>Some Data Access Policy</TITLE>
<DAC_REF accession="EGAC01234" />
<POLICY_TEXT>Some data access policy text</POLICY_TEXT>
<POLICY_ATTRIBUTES>
<POLICY_ATTRIBUTE>
<TAG>this</TAG>
<VALUE>that</VALUE>
<UNITS>other</UNITS>
</POLICY_ATTRIBUTE>
</POLICY_ATTRIBUTES>
</POLICY>
</POLICY_SET>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment