diff --git a/src/agora/io/metadata.py b/src/agora/io/metadata.py
index 8f152b27d6ba6231b9b79c92d069908d73d6e2b4..b90d2db39243119741bf5561df566a0d33aaa76b 100644
--- a/src/agora/io/metadata.py
+++ b/src/agora/io/metadata.py
@@ -138,8 +138,16 @@ def get_minimal_meta_swainlab(parsed_metadata: dict):
     """
     channels_dict = find_channels_by_position(parsed_metadata["group_config"])
     channels = parsed_metadata["image_config"]["Image config"].values.tolist()
-    ntps = parsed_metadata["group_time"]["frames"].max()
-    timeinterval = parsed_metadata["group_time"]["interval"].min()
+    parsed_ntps = parsed_metadata["group_time"]["frames"]
+    if isinstance(parsed_ntps, int):
+        ntps = parsed_ntps
+    else:
+        ntps = parsed_ntps.max()
+    parsed_tinterval = parsed_metadata["group_time"]["interval"]
+    if isinstance(parsed_tinterval, int):
+        timeinterval = parsed_tinterval
+    else:
+        timeinterval = parsed_tinterval.min()
     minimal_meta = {
         "channels_by_group": channels_dict,
         "channels": channels,
diff --git a/src/aliby/io/image.py b/src/aliby/io/image.py
index b87bcdc1a3a3116beabc14230a15ee7c6ee94757..65aa90b7e3d41363073ee4cf9410f6537708a6b4 100644
--- a/src/aliby/io/image.py
+++ b/src/aliby/io/image.py
@@ -26,7 +26,7 @@ from dask.array.image import imread
 from importlib_resources import files
 from tifffile import TiffFile
 
-from agora.io.metadata import dir_to_meta, dispatch_metadata_parser
+from agora.io.metadata import parse_metadata
 
 
 def instantiate_image(
@@ -71,11 +71,47 @@ def dispatch_image(source: t.Union[str, int, t.Dict[str, str], Path]):
     return instantiator
 
 
+def files_to_image_sizes(path: Path, suffix="tiff"):
+    """Deduce image sizes from the naming convention of tiff files."""
+    filenames = list(path.glob(f"*.{suffix}"))
+    try:
+        # deduce order from filenames
+        dimorder = "".join(
+            map(lambda x: x[0], filenames[0].stem.split("_")[1:])
+        )
+        dim_value = list(
+            map(
+                lambda f: filename_to_dict_indices(f.stem),
+                path.glob(f"*.{suffix}"),
+            )
+        )
+        maxes = [max(map(lambda x: x[dim], dim_value)) for dim in dimorder]
+        mins = [min(map(lambda x: x[dim], dim_value)) for dim in dimorder]
+        dim_shapes = [
+            max_val - min_val + 1 for max_val, min_val in zip(maxes, mins)
+        ]
+        meta = {
+            "size_" + dim: shape for dim, shape in zip(dimorder, dim_shapes)
+        }
+    except Exception as e:
+        print("Warning: files_to_image_sizes failed." 
f"\nError: {e}") + meta = {} + return meta + + +def filename_to_dict_indices(stem: str): + """Split string into a dict.""" + return { + dim_number[0]: int(dim_number[1:]) + for dim_number in stem.split("_")[1:] + } + + class BaseLocalImage(ABC): """Set path and provide method for context management.""" # default image order - _default_dimorder = "tczyx" + default_dimorder = "tczyx" def __init__(self, path: t.Union[str, Path]): # If directory, assume contents are naturally sorted @@ -98,8 +134,8 @@ class BaseLocalImage(ABC): 1, 1, 1, - self._meta["size_y"], - self._meta["size_x"], + self.meta["size_y"], + self.meta["size_x"], ), ) return self._rechunked_img @@ -112,11 +148,15 @@ class BaseLocalImage(ABC): @property def metadata(self): """Get metadata.""" - return self._meta + return self.meta def set_meta(self): """Load metadata using parser dispatch.""" - self._meta = dispatch_metadata_parser(self.path) + parsed_meta = parse_metadata(self.path) + if parsed_meta is None: + # try to deduce metadata + parsed_meta = files_to_image_sizes(self.path) + self.meta = parsed_meta @abstractmethod def get_data_lazy(self) -> da.Array: @@ -153,40 +193,37 @@ class ImageLocalOME(BaseLocalImage): meta = dict() try: with TiffFile(path) as f: - self._meta = xmltodict.parse(f.ome_metadata)["OME"] + self.meta = xmltodict.parse(f.ome_metadata)["OME"] for dim in self.dimorder: meta["size_" + dim.lower()] = int( - self._meta["Image"]["Pixels"]["@Size" + dim] + self.meta["Image"]["Pixels"]["@Size" + dim] ) meta["channels"] = [ - x["@Name"] - for x in self._meta["Image"]["Pixels"]["Channel"] + x["@Name"] for x in self.meta["Image"]["Pixels"]["Channel"] ] - meta["name"] = self._meta["Image"]["@Name"] - meta["type"] = self._meta["Image"]["Pixels"]["@Type"] + meta["name"] = self.meta["Image"]["@Name"] + meta["type"] = self.meta["Image"]["Pixels"]["@Type"] except Exception as e: # images not in OMEXML print("Warning:Metadata not found: {}".format(e)) print( "Warning: No dimensional info provided. 
" - f"Assuming {self._default_dimorder}" + f"Assuming {self.default_dimorder}" ) # mark non-existent dimensions for padding - self.base = self._default_dimorder - # self.ids = [self.index(i) for i in dimorder] - self._dimorder = self.base - self._meta = meta - # self._meta["name"] = Path(path).name.split(".")[0] + self.base = self.default_dimorder + self.dimorder = self.base + self.meta = meta @property def name(self): - return self._meta["name"] + return self.meta["name"] @property def date(self): date_str = [ x - for x in self._meta["StructuredAnnotations"]["TagAnnotation"] + for x in self.meta["StructuredAnnotations"]["TagAnnotation"] if x["Description"] == "Date" ][0]["Value"] return datetime.strptime(date_str, "%d-%b-%Y") @@ -194,14 +231,14 @@ class ImageLocalOME(BaseLocalImage): @property def dimorder(self): """Return order of dimensions in the image.""" - if not hasattr(self, "_dimorder"): - self._dimorder = self._meta["Image"]["Pixels"]["@DimensionOrder"] - return self._dimorder + if not hasattr(self, "dimorder"): + self.dimorder = self.meta["Image"]["Pixels"]["@DimensionOrder"] + return self.dimorder @dimorder.setter def dimorder(self, order: str): - self._dimorder = order - return self._dimorder + self.dimorder = order + return self.dimorder def get_data_lazy(self) -> da.Array: """Return 5D dask array via lazy-loading of tiff files.""" @@ -212,8 +249,8 @@ class ImageLocalOME(BaseLocalImage): else: # bespoke order, so rearrange axes for compatibility img = imread(str(self.path))[0] - for i, d in enumerate(self._dimorder): - self._meta["size_" + d.lower()] = img.shape[i] + for i, d in enumerate(self.dimorder): + self.meta["size_" + d.lower()] = img.shape[i] target_order = ( *self.ids, *[ @@ -254,7 +291,7 @@ class ImageDir(BaseLocalImage): """Initialise using file name.""" super().__init__(path) self.image_id = str(self.path.stem) - self._meta = dir_to_meta(self.path) + self.meta = files_to_image_sizes(self.path) def get_data_lazy(self) -> da.Array: """Return 5D dask array.""" @@ -262,17 +299,16 @@ class ImageDir(BaseLocalImage): # If extra channels, pick the first stack of the last dimensions while len(img.shape) > 3: img = img[..., 0] - if self._meta: - self._meta["size_x"], self._meta["size_y"] = img.shape[-2:] + if self.meta: + self.meta["size_x"], self.meta["size_y"] = img.shape[-2:] # Reshape using metadata - # img = da.reshape(img, (*self._meta, *img.shape[1:])) - img = da.reshape(img, self._meta.values()) + img = da.reshape(img, self.meta.values()) original_order = [ - i[-1] for i in self._meta.keys() if i.startswith("size") + i[-1] for i in self.meta.keys() if i.startswith("size") ] # Swap axis to conform with normal order target_order = [ - self._default_dimorder.index(x) for x in original_order + self.default_dimorder.index(x) for x in original_order ] img = da.moveaxis( img, @@ -291,7 +327,7 @@ class ImageDir(BaseLocalImage): def dimorder(self): # Assumes only dimensions start with "size" return [ - k.split("_")[-1] for k in self._meta.keys() if k.startswith("size") + k.split("_")[-1] for k in self.meta.keys() if k.startswith("size") ] @@ -319,7 +355,7 @@ class ImageZarr(BaseLocalImage): def add_size_to_meta(self): """Add shape of image array to metadata.""" - self._meta.update( + self.meta.update( { f"size_{dim}": shape for dim, shape in zip(self.dimorder, self._img.shape) @@ -335,116 +371,3 @@ class ImageZarr(BaseLocalImage): def dimorder(self): """Impose a hard-coded order of dimensions based on the zarr compression script.""" return "TCZYX" - - -class 
ImageDummy(BaseLocalImage): - """ - Dummy Image class. - - ImageDummy mimics the other Image classes in such a way that it is accepted - by Tiler. The purpose of this class is for testing, in particular, - identifying silent failures. If something goes wrong, we should be able to - know whether it is because of bad parameters or bad input data. - - For the purposes of testing parameters, ImageDummy assumes that we already - know the tiler parameters before Image instances are instantiated. This is - true for a typical pipeline run. - """ - - def __init__(self, tiler_parameters: dict): - """Builds image instance - - Parameters - ---------- - tiler_parameters : dict - Tiler parameters, in dict form. Following - aliby.tile.tiler.TilerParameters, the keys are: "tile_size" (size of - tile), "ref_channel" (reference channel for tiling), and "ref_z" - (reference z-stack, 0 to choose a default). - """ - self.ref_channel = tiler_parameters["ref_channel"] - self.ref_z = tiler_parameters["ref_z"] - - # Goal: make Tiler happy. - @staticmethod - def pad_array( - image_array: da.Array, - dim: int, - n_empty_slices: int, - image_position: int = 0, - ): - """Extends a dimension in a dask array and pads with zeros - - Extends a dimension in a dask array that has existing content, then pads - with zeros. - - Parameters - ---------- - image_array : da.Array - Input dask array - dim : int - Dimension in which to extend the dask array. - n_empty_slices : int - Number of empty slices to extend the dask array by, in the specified - dimension/axis. - image_position : int - Position within the new dimension to place the input arary, default 0 - (the beginning). - - Examples - -------- - ``` - extended_array = pad_array( - my_da_array, dim = 2, n_empty_slices = 4, image_position = 1) - ``` - Extends a dask array called `my_da_array` in the 3rd dimension - (dimensions start from 0) by 4 slices, filled with zeros. And puts the - original content in slice 1 of the 3rd dimension - """ - # Concats zero arrays with same dimensions as image_array, and puts - # image_array as first element in list of arrays to be concatenated - zeros_array = da.zeros_like(image_array) - return da.concatenate( - [ - *([zeros_array] * image_position), - image_array, - *([zeros_array] * (n_empty_slices - image_position)), - ], - axis=dim, - ) - - # Logic: We want to return a image instance - def get_data_lazy(self) -> da.Array: - """Return 5D dask array. For lazy-loading multidimensional tiff files. Dummy image.""" - examples_dir = get_examples_dir() - # TODO: Make this robust to having multiple TIFF images, one for each z-section, - # all falling under the same "pypipeline_unit_test_00_000001_Brightfield_*.tif" - # naming scheme. The aim is to create a multidimensional dask array that stores - # the z-stacks. 
- img_filename = "pypipeline_unit_test_00_000001_Brightfield_003.tif" - img_path = examples_dir / img_filename - # img is a dask array has three dimensions: z, x, y - # TODO: Write a test to confirm this: If everything worked well, - # z = 1, x = 1200, y = 1200 - img = imread(str(img_path)) - # Adds t & c dimensions - img = da.reshape( - img, (1, 1, img.shape[-3], img.shape[-2], img.shape[-1]) - ) - # Pads t, c, and z dimensions - img = self.pad_array( - img, dim=0, n_empty_slices=199 - ) # 200 timepoints total - img = self.pad_array(img, dim=1, n_empty_slices=2) # 3 channels - img = self.pad_array( - img, dim=2, n_empty_slices=4, image_position=self.ref_z - ) # 5 z-stacks - return img - - @property - def name(self): - pass - - @property - def dimorder(self): - pass diff --git a/src/logfile_parser/swainlab_parser.py b/src/logfile_parser/swainlab_parser.py index a461bb3a120fab4226c092ec201af4b4ee469916..702b0adca188c442bc2d481f66317b6907da383b 100644 --- a/src/logfile_parser/swainlab_parser.py +++ b/src/logfile_parser/swainlab_parser.py @@ -1,5 +1,4 @@ #!/usr/bin/env jupyter -# TODO should this be merged to the regular logfile_parser structure? """ Description of new logfile: @@ -25,15 +24,11 @@ Data to extract: - GIT commit - (Not working as of 2022/10/03, but projects and tags) * Basic information - - - -New grammar - -- Tables are assumed to end with an empty line. """ import logging import typing as t +from itertools import product from pathlib import PosixPath import pandas as pd @@ -53,36 +48,36 @@ from pyparsing import ( atomic = t.Union[str, int, float, bool] -# Grammar specification -grammar = { +# grammar specification +sl_grammar = { "general": { - "start_trigger": Literal("Swain Lab microscope experiment log file"), "type": "fields", + "start_trigger": Literal("Swain Lab microscope experiment log file"), "end_trigger": "-----Acquisition settings-----", }, "image_config": { - "start_trigger": "Image Configs:", "type": "table", + "start_trigger": "Image Configs:", }, "device_properties": { - "start_trigger": "Device properties:", "type": "table", + "start_trigger": "Device properties:", }, "group": { "position": { + "type": "table", "start_trigger": Group( Group(Literal("group:") + Word(printables)) + Group(Literal("field:") + "position") ), - "type": "table", }, **{ key: { + "type": "fields", "start_trigger": Group( Group(Literal("group:") + Word(printables)) + Group(Literal("field:") + key) ), - "type": "fields", } for key in ("time", "config") }, @@ -91,15 +86,14 @@ grammar = { HEADER_END = "-----Experiment started-----" -MAX_NLINES = 2000 # In case of malformed logfile +MAX_NLINES = 2000 # in case of malformed logfile ParserElement.setDefaultWhitespaceChars(" \t") -def extract_header(filepath: PosixPath): +def extract_header(filepath: PosixPath, **kwargs): """Extract content of log file upto HEADER_END.""" - with open(filepath, "r", encoding="latin1") as f: - # with open(filepath, "r", errors="ignore", encoding="unicode_escape") as f: + with open(filepath, "r", **kwargs) as f: try: header = "" for _ in range(MAX_NLINES): @@ -112,17 +106,33 @@ def extract_header(filepath: PosixPath): return header -def parse_from_grammar(filepath: str, grammar: t.Dict): - """Parse a file using the specified grammar.""" - header = extract_header(filepath) +def parse_from_swainlab_grammar(filepath: t.Union[str, PosixPath]): + """Parse using a grammar for the Swain lab.""" + try: + header = extract_header(filepath, encoding="latin1") + res = parse_from_grammar(header, sl_grammar) + except 
Exception:
+        # remove unwanted Windows characters
+        header = extract_header(
+            filepath, errors="ignore", encoding="unicode_escape"
+        )
+        res = parse_from_grammar(header, sl_grammar)
+    return res
+
+
+def parse_from_grammar(header: str, grammar: t.Dict):
+    """Parse a string using the specified grammar."""
     d = {}
     for key, values in grammar.items():
         try:
             if "type" in values:
+                # use values to find parsing function
                 d[key] = parse_x(header, **values)
-            else:  # Use subkeys to parse groups
+            else:
+                # for group, use subkeys to parse
                 for subkey, subvalues in values.items():
                     subkey = "_".join((key, subkey))
+                    # use subvalues to find parsing function
                     d[subkey] = parse_x(header, **subvalues)
         except Exception as e:
             logging.getLogger("aliby").critical(
@@ -132,18 +142,13 @@
     return d
 
 
-def parse_x(string: str, type: str, **kwargs):
+def parse_x(string_input: str, type: str, **kwargs):
     """Parse a string using the parser function specified by type."""
-    return eval(f"parse_{type}(string, **kwargs)")
-
-
-def parse_from_swainlab_grammar(filepath: t.Union[str, PosixPath]):
-    """Parse using a grammar for the Swain lab."""
-    return parse_from_grammar(filepath, grammar)
+    return eval(f"parse_{type}(string_input, **kwargs)")
 
 
 def parse_table(
-    string: str,
+    string_input: str,
     start_trigger: t.Union[str, Keyword],
 ) -> pd.DataFrame:
     """
@@ -172,13 +177,8 @@ def parse_table(
     line = LineStart() + Group(
         OneOrMore(field + Literal(",").suppress()) + field + EOL
     )
-    parser = (
-        start_trigger
-        + EOL
-        + Group(OneOrMore(line))
-        + EOL  # end_trigger.suppress()
-    )
-    parser_result = parser.search_string(string)
+    parser = start_trigger + EOL + Group(OneOrMore(line)) + EOL
+    parser_result = parser.search_string(string_input)
     assert all(
         [len(row) == len(parser_result[0]) for row in parser_result]
     ), f"Table {start_trigger} has unequal number of columns"
@@ -187,7 +187,7 @@
 
 
 def parse_fields(
-    string: str, start_trigger, end_trigger=None
+    string_input: str, start_trigger, end_trigger=None
 ) -> t.Union[pd.DataFrame, t.Dict[str, atomic]]:
     """
     Parse fields are parsed as key-value pairs. 
@@ -215,16 +215,16 @@ def parse_fields(
     parser = (
         start_trigger + EOL + Group(OneOrMore(line)) + end_trigger.suppress()
     )
-    parser_result = parser.search_string(string)
+    parser_result = parser.search_string(string_input)
     results = parser_result.as_list()
     assert len(results), "Parsing returned nothing"
-    return fields_to_dict_or_table(results)
+    return fields_to_dict_or_df(results)
 
 
 def table_to_df(result: t.List[t.List]):
-    if len(result) > 1:  # Multiple tables with ids to append
-        from itertools import product
-
+    """Convert a parsed table to a data frame."""
+    if len(result) > 1:
+        # multiple tables with ids to append
         group_name = [
             product((table[0][0][1],), (row[0] for row in table[1][1:]))
             for table in result
@@ -237,37 +237,38 @@
             index=multiindices,
         )
         df.name = result[0][0][1][1]
-    else:  # If it is a single table
+    else:
+        # a single table
         df = pd.DataFrame(result[0][1][1:], columns=result[0][1][0])
     return df
 
 
-def fields_to_dict_or_table(result: t.List[t.List]):
+def fields_to_dict_or_df(result: t.List[t.List]):
+    """Convert parsed fields to a dict or a data frame."""
     if len(result) > 1:
         formatted = pd.DataFrame(
             [[row[1] for row in pr[1]] for pr in result],
             columns=[x[0] for x in result[0][1]],
             index=[x[0][0][1] for x in result],
         )
-        formatted.name = result[0][0][1][1]
-    else:  # If it is a single table
-        formatted = {k: _cast_type(v) for k, v in dict(result[0][1]).items()}
-
+    else:
+        # a single block of fields: convert to a dict
+        formatted = {k: cast_type(v) for k, v in dict(result[0][1]).items()}
     return formatted
 
 
-def _cast_type(x: str) -> t.Union[str, int, float, bool]:
-    # Convert to any possible when possible
+def cast_type(x: str) -> t.Union[str, int, float, bool]:
+    """Convert a string to an int, float, or bool when possible."""
     x = x.strip()
     if x.isdigit():
         x = int(x)
     else:
         try:
             x = float(x)
-        except:
+        except Exception:
            try:
                x = ("false", "true").index(x.lower())
-            except:
+            except Exception:
                pass
    return x
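For reference, the sketch below (not part of the patch) illustrates the tiff naming convention that the new files_to_image_sizes and filename_to_dict_indices helpers rely on; the stems are hypothetical examples of the assumed "<name>_t<idx>_c<idx>_z<idx>" pattern.

# filename_to_dict_indices is copied verbatim from the patch above;
# the stems are made-up examples of the assumed naming convention.
def filename_to_dict_indices(stem: str):
    return {
        dim_number[0]: int(dim_number[1:])
        for dim_number in stem.split("_")[1:]
    }

stems = [
    "experiment_t0001_c01_z001",
    "experiment_t0001_c01_z002",
    "experiment_t0002_c01_z001",
    "experiment_t0002_c01_z002",
]
dim_value = [filename_to_dict_indices(s) for s in stems]
# the first stem fixes the dimension order, here "tcz"
dimorder = "".join(x[0] for x in stems[0].split("_")[1:])
# each dimension size is max index - min index + 1, as in files_to_image_sizes
sizes = {
    "size_" + dim: max(v[dim] for v in dim_value)
    - min(v[dim] for v in dim_value)
    + 1
    for dim in dimorder
}
print(sizes)  # {'size_t': 2, 'size_c': 1, 'size_z': 2}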