
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects compared: swain-lab/aliby/aliby-mirror and swain-lab/aliby/alibylite
Commits on Source (70). Showing with 2819 additions and 2462 deletions.
This diff is collapsed.
...@@ -33,7 +33,7 @@ pathos = "^0.2.8" # Lambda-friendly multithreading ...@@ -33,7 +33,7 @@ pathos = "^0.2.8" # Lambda-friendly multithreading
p-tqdm = "^1.3.3" p-tqdm = "^1.3.3"
pandas = ">=1.3.3" pandas = ">=1.3.3"
py-find-1st = "^1.1.5" # Fast indexing py-find-1st = "^1.1.5" # Fast indexing
scikit-learn = ">=1.0.2" # Used for an extraction metric scikit-learn = ">=1.0.2, <1.3" # Used for an extraction metric
scipy = ">=1.7.3" scipy = ">=1.7.3"
# Pipeline + I/O # Pipeline + I/O
...@@ -46,14 +46,11 @@ xmltodict = "^0.13.0" # read ome-tiff metadata ...@@ -46,14 +46,11 @@ xmltodict = "^0.13.0" # read ome-tiff metadata
zarr = "^2.14.0" zarr = "^2.14.0"
GitPython = "^3.1.27" GitPython = "^3.1.27"
h5py = "2.10" # File I/O h5py = "2.10" # File I/O
aliby-baby = "^0.1.17"
# Networking # Networking
omero-py = { version = ">=5.6.2", optional = true } # contact omero server omero-py = { version = ">=5.6.2", optional = true } # contact omero server
# Baby segmentation
aliby-baby = {version = "^0.1.17", optional=true}
# Postprocessing # Postprocessing
[tool.poetry.group.pp.dependencies] [tool.poetry.group.pp.dependencies]
leidenalg = "^0.8.8" leidenalg = "^0.8.8"
...@@ -113,7 +110,6 @@ grid-strategy = {version = "^0.0.1", optional=true} ...@@ -113,7 +110,6 @@ grid-strategy = {version = "^0.0.1", optional=true}
[tool.poetry.extras] [tool.poetry.extras]
omero = ["omero-py"] omero = ["omero-py"]
baby = ["aliby-baby"]
[tool.black] [tool.black]
line-length = 79 line-length = 79
......
...@@ -17,16 +17,14 @@ atomic = t.Union[int, float, str, bool] ...@@ -17,16 +17,14 @@ atomic = t.Union[int, float, str, bool]
class ParametersABC(ABC): class ParametersABC(ABC):
""" """
Defines parameters as attributes and allows parameters to Define parameters as attributes and allow parameters to
be converted to either a dictionary or to yaml. be converted to either a dictionary or to yaml.
No attribute should be called "parameters"! No attribute should be called "parameters"!
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs):
""" """Define parameters as attributes."""
Defines parameters as attributes
"""
assert ( assert (
"parameters" not in kwargs "parameters" not in kwargs
), "No attribute should be named parameters" ), "No attribute should be named parameters"
...@@ -35,8 +33,9 @@ class ParametersABC(ABC): ...@@ -35,8 +33,9 @@ class ParametersABC(ABC):
def to_dict(self, iterable="null") -> t.Dict: def to_dict(self, iterable="null") -> t.Dict:
""" """
Recursive function to return a nested dictionary of the Return a nested dictionary of the attributes of the class instance.
attributes of the class instance.
Use recursion.
""" """
if isinstance(iterable, dict): if isinstance(iterable, dict):
if any( if any(
...@@ -62,7 +61,8 @@ class ParametersABC(ABC): ...@@ -62,7 +61,8 @@ class ParametersABC(ABC):
def to_yaml(self, path: Union[Path, str] = None): def to_yaml(self, path: Union[Path, str] = None):
""" """
Returns a yaml stream of the attributes of the class instance. Return a yaml stream of the attributes of the class instance.
If path is provided, the yaml stream is saved there. If path is provided, the yaml stream is saved there.
Parameters Parameters
...@@ -81,9 +81,7 @@ class ParametersABC(ABC): ...@@ -81,9 +81,7 @@ class ParametersABC(ABC):
@classmethod @classmethod
def from_yaml(cls, source: Union[Path, str]): def from_yaml(cls, source: Union[Path, str]):
""" """Return instance from a yaml filename or stdin."""
Returns instance from a yaml filename or stdin
"""
is_buffer = True is_buffer = True
try: try:
if Path(source).exists(): if Path(source).exists():
...@@ -107,7 +105,8 @@ class ParametersABC(ABC): ...@@ -107,7 +105,8 @@ class ParametersABC(ABC):
def update(self, name: str, new_value): def update(self, name: str, new_value):
""" """
Update values recursively Update values recursively.
if name is a dictionary, replace data where existing found or add if not. if name is a dictionary, replace data where existing found or add if not.
It warns against type changes. It warns against type changes.
...@@ -116,11 +115,10 @@ class ParametersABC(ABC): ...@@ -116,11 +115,10 @@ class ParametersABC(ABC):
If a leaf node that is to be changed is a collection, it adds the new elements. If a leaf node that is to be changed is a collection, it adds the new elements.
""" """
assert name not in ( assert name not in (
"parameters", "parameters",
"params", "params",
), "Attribute can't be named params or parameters" ), "Attribute cannot be named params or parameters."
if name in self.__dict__: if name in self.__dict__:
if check_type_recursive(getattr(self, name), new_value): if check_type_recursive(getattr(self, name), new_value):
...@@ -179,7 +177,8 @@ def add_to_collection( ...@@ -179,7 +177,8 @@ def add_to_collection(
class ProcessABC(ABC): class ProcessABC(ABC):
""" """
Base class for processes. Base class for processes.
Defines parameters as attributes and requires run method to be defined.
Define parameters as attributes and requires a run method.
""" """
def __init__(self, parameters): def __init__(self, parameters):
...@@ -190,8 +189,8 @@ class ProcessABC(ABC): ...@@ -190,8 +189,8 @@ class ProcessABC(ABC):
""" """
self._parameters = parameters self._parameters = parameters
# convert parameters to dictionary # convert parameters to dictionary
# and then define each parameter as an attribute
for k, v in parameters.to_dict().items(): for k, v in parameters.to_dict().items():
# define each parameter as an attribute
setattr(self, k, v) setattr(self, k, v)
@property @property
...@@ -243,11 +242,9 @@ class StepABC(ProcessABC): ...@@ -243,11 +242,9 @@ class StepABC(ProcessABC):
@timer @timer
def run_tp(self, tp: int, **kwargs): def run_tp(self, tp: int, **kwargs):
""" """Time and log the timing of a step."""
Time and log the timing of a step.
"""
return self._run_tp(tp, **kwargs) return self._run_tp(tp, **kwargs)
def run(self): def run(self):
# Replace run with run_tp # Replace run with run_tp
raise Warning("Steps use run_tp instead of run") raise Warning("Steps use run_tp instead of run.")
...@@ -23,20 +23,19 @@ class BridgeH5: ...@@ -23,20 +23,19 @@ class BridgeH5:
"""Initialise with the name of the h5 file.""" """Initialise with the name of the h5 file."""
self.filename = filename self.filename = filename
if flag is not None: if flag is not None:
self._hdf = h5py.File(filename, flag) self.hdf = h5py.File(filename, flag)
self._filecheck assert (
"cell_info" in self.hdf
), "Invalid file. No 'cell_info' found."
def _log(self, message: str, level: str = "warn"): def _log(self, message: str, level: str = "warn"):
# Log messages in the corresponding level # Log messages in the corresponding level
logger = logging.getLogger("aliby") logger = logging.getLogger("aliby")
getattr(logger, level)(f"{self.__class__.__name__}: {message}") getattr(logger, level)(f"{self.__class__.__name__}: {message}")
def _filecheck(self):
assert "cell_info" in self._hdf, "Invalid file. No 'cell_info' found."
def close(self): def close(self):
"""Close the h5 file.""" """Close the h5 file."""
self._hdf.close() self.hdf.close()
@property @property
def meta_h5(self) -> t.Dict[str, t.Any]: def meta_h5(self) -> t.Dict[str, t.Any]:
...@@ -83,7 +82,7 @@ class BridgeH5: ...@@ -83,7 +82,7 @@ class BridgeH5:
def get_npairs_over_time(self, nstepsback=2): def get_npairs_over_time(self, nstepsback=2):
tree = self.cell_tree tree = self.cell_tree
npairs = [] npairs = []
for tp in self._hdf["cell_info"]["processed_timepoints"][()]: for tp in self.hdf["cell_info"]["processed_timepoints"][()]:
tmp_tree = { tmp_tree = {
k: {k2: v2 for k2, v2 in v.items() if k2 <= tp} k: {k2: v2 for k2, v2 in v.items() if k2 <= tp}
for k, v in tree.items() for k, v in tree.items()
...@@ -115,7 +114,7 @@ class BridgeH5: ...@@ -115,7 +114,7 @@ class BridgeH5:
---------- ----------
Nested dictionary where keys (or branches) are the upper levels and the leaves are the last element of :fields:. Nested dictionary where keys (or branches) are the upper levels and the leaves are the last element of :fields:.
""" """
zipped_info = (*zip(*[self._hdf["cell_info"][f][()] for f in fields]),) zipped_info = (*zip(*[self.hdf["cell_info"][f][()] for f in fields]),)
return recursive_groupsort(zipped_info) return recursive_groupsort(zipped_info)
......
This diff is collapsed.
...@@ -6,17 +6,19 @@ import typing as t ...@@ -6,17 +6,19 @@ import typing as t
from functools import wraps from functools import wraps
def _first_arg_str_to_df( def _first_arg_str_to_raw_df(
fn: t.Callable, fn: t.Callable,
): ):
"""Enable Signal-like classes to convert strings to data sets.""" """Enable Signal-like classes to convert strings to data sets."""
@wraps(fn) @wraps(fn)
def format_input(*args, **kwargs): def format_input(*args, **kwargs):
cls = args[0] cls = args[0]
data = args[1] data = args[1]
if isinstance(data, str): if isinstance(data, str):
# get data from h5 file # get data from h5 file using Signal's get_raw
data = cls.get_raw(data) data = cls.get_raw(data)
# replace path in the undecorated function with data # replace path in the undecorated function with data
return fn(cls, data, *args[2:], **kwargs) return fn(cls, data, *args[2:], **kwargs)
return format_input return format_input
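To see the renamed decorator's pattern in isolation, here is a self-contained sketch of the same idea with a toy class; ToySignal and its get_raw are made-up stand-ins, not part of aliby.

from functools import wraps

import pandas as pd


def first_arg_str_to_df(fn):
    """If the first argument after self is a string, fetch the data frame first."""

    @wraps(fn)
    def format_input(self, data, *args, **kwargs):
        if isinstance(data, str):
            # in Signal this would be self.get_raw(data)
            data = self.get_raw(data)
        return fn(self, data, *args, **kwargs)

    return format_input


class ToySignal:
    """Hypothetical stand-in for Signal with a trivial get_raw."""

    def get_raw(self, name: str) -> pd.DataFrame:
        return pd.DataFrame({0: [1.0, 2.0]})

    @first_arg_str_to_df
    def head(self, df: pd.DataFrame):
        return df.head()


print(ToySignal().head("extraction/GFP/max/mean"))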
""" """
Anthology of interfaces fordispatch_metadata_parse different parsers and lack of them. Aliby decides on using different metadata parsers based on two elements:
1. The parameter given by PipelineParameters (either True/False or a string
ALIBY decides on using different metadata parsers based on two elements: pointing to the metadata file)
2. The available files in the root folder where images are found (either
1. The parameter given by PipelineParameters (Either True/False, or a string pointing to the metadata file) remote or locally).
2. The available files in the root folder where images are found (remote or locally)
If parameters is a string pointing to a metadata file, Aliby picks a parser
If parameters is a string pointing to a metadata file, ALIBY picks a parser based on the file format. based on the file format.
If parameters is True (as a boolean), ALIBY searches for any available file and uses the first valid one. If parameters is True, Aliby searches for any available file and uses the
If there are no metadata files, ALIBY requires indicating indices for tiler, segmentation and extraction. first valid one.
If there are no metadata files, Aliby requires indices in the tiff file names
for tiler, segmentation, and extraction.
WARNING: grammars depend on the directory structure of a local log-file_parser
repository.
""" """
import glob import glob
import logging import logging
...@@ -27,28 +31,32 @@ from logfile_parser.swainlab_parser import parse_from_swainlab_grammar ...@@ -27,28 +31,32 @@ from logfile_parser.swainlab_parser import parse_from_swainlab_grammar
class MetaData: class MetaData:
"""Small metadata Process that loads log.""" """Metadata process that loads and parses log files."""
def __init__(self, log_dir, store): def __init__(self, log_dir, store):
"""Initialise with log-file directory and h5 location to write."""
self.log_dir = log_dir self.log_dir = log_dir
self.store = store self.store = store
self.metadata_writer = Writer(self.store) self.metadata_writer = Writer(self.store)
def __getitem__(self, item): def __getitem__(self, item):
"""Load log and access item in resulting meta data dictionary."""
return self.load_logs()[item] return self.load_logs()[item]
def load_logs(self): def load_logs(self):
# parsed_flattened = parse_logfiles(self.log_dir) """Load log using a hierarchy of parsers."""
parsed_flattened = dispatch_metadata_parser(self.log_dir) parsed_flattened = dispatch_metadata_parser(self.log_dir)
return parsed_flattened return parsed_flattened
def run(self, overwrite=False): def run(self, overwrite=False):
"""Load and parse logs and write to h5 file."""
metadata_dict = self.load_logs() metadata_dict = self.load_logs()
self.metadata_writer.write( self.metadata_writer.write(
path="/", meta=metadata_dict, overwrite=overwrite path="/", meta=metadata_dict, overwrite=overwrite
) )
def add_field(self, field_name, field_value, **kwargs): def add_field(self, field_name, field_value, **kwargs):
"""Write a field and its values to the h5 file."""
self.metadata_writer.write( self.metadata_writer.write(
path="/", path="/",
meta={field_name: field_value}, meta={field_name: field_value},
...@@ -56,96 +64,87 @@ class MetaData: ...@@ -56,96 +64,87 @@ class MetaData:
) )
def add_fields(self, fields_values: dict, **kwargs): def add_fields(self, fields_values: dict, **kwargs):
"""Write a dict of fields and values to the h5 file."""
for field, value in fields_values.items(): for field, value in fields_values.items():
self.add_field(field, value) self.add_field(field, value)
# Paradigm: able to do something with all datatypes present in log files,
# then pare down on what specific information is really useful later.
# Needed because HDF5 attributes do not support dictionaries
def flatten_dict(nested_dict, separator="/"): def flatten_dict(nested_dict, separator="/"):
""" """
Flattens nested dictionary. If empty return as-is. Flatten nested dictionary because h5 attributes cannot be dicts.
If empty return as-is.
""" """
flattened = {} flattened = {}
if nested_dict: if nested_dict:
df = pd.json_normalize(nested_dict, sep=separator) df = pd.json_normalize(nested_dict, sep=separator)
flattened = df.to_dict(orient="records")[0] or {} flattened = df.to_dict(orient="records")[0] or {}
return flattened return flattened
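For reference, a quick standalone check of what this flattening produces; the nested dict below is made up.

import pandas as pd

# made-up nested metadata; h5 attributes cannot store nested dicts directly
nested = {
    "channels": {"channel": ["GFP", "mCherry"]},
    "time_settings": {"ntimepoints": 10},
}
df = pd.json_normalize(nested, sep="/")
flattened = df.to_dict(orient="records")[0] or {}
print(flattened)
# {'channels/channel': ['GFP', 'mCherry'], 'time_settings/ntimepoints': 10}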
# Needed because HDF5 attributes do not support datetime objects
# Takes care of time zones & daylight saving
def datetime_to_timestamp(time, locale="Europe/London"): def datetime_to_timestamp(time, locale="Europe/London"):
""" """Convert datetime object to UNIX timestamp."""
Convert datetime object to UNIX timestamp # h5 attributes do not support datetime objects
"""
return timezone(locale).localize(time).timestamp() return timezone(locale).localize(time).timestamp()
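A rough illustration of the conversion, assuming the module's timezone comes from pytz; the date is arbitrary.

from datetime import datetime

from pytz import timezone

# arbitrary naive datetime, localised to Europe/London (BST in June, so UTC+1)
naive = datetime(2023, 6, 1, 12, 0, 0)
print(timezone("Europe/London").localize(naive).timestamp())  # 1685617200.0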
def find_file(root_dir, regex): def find_file(root_dir, regex):
"""Find files in a directory using regex."""
# ignore aliby.log files
file = [ file = [
f f
for f in glob.glob(os.path.join(str(root_dir), regex)) for f in glob.glob(os.path.join(str(root_dir), regex))
if Path(f).name != "aliby.log" # Skip filename reserved for aliby if Path(f).name != "aliby.log"
] ]
if len(file) > 1:
print(
"Warning:Metadata: More than one logfile found. Defaulting to first option."
)
file = [sorted(file)[0]]
if len(file) == 0: if len(file) == 0:
logging.getLogger("aliby").log( logging.getLogger("aliby").log(
logging.WARNING, "Metadata: No valid swainlab .log found." logging.WARNING, "Metadata: No valid swainlab .log found."
) )
return None
elif len(file) > 1:
print(
"Warning:Metadata: More than one log file found."
" Defaulting to first option."
)
return sorted(file)[0]
else: else:
return file[0] return file[0]
return None
# TODO: re-write this as a class if appropriate
# WARNING: grammars depend on the directory structure of a locally installed
# logfile_parser repo
def parse_logfiles( def parse_logfiles(
root_dir, root_dir,
acq_grammar="multiDGUI_acq_format.json", acq_grammar="multiDGUI_acq_format.json",
log_grammar="multiDGUI_log_format.json", log_grammar="multiDGUI_log_format.json",
): ):
""" """
Parse acq and log files depending on the grammar specified, then merge into Parse acq and log files using the grammar specified.
single dict.
Merge results into a single dict.
""" """
# Both acq and log files contain useful information.
# ACQ_FILE = 'flavin_htb2_glucose_long_ramp_DelftAcq.txt'
# LOG_FILE = 'flavin_htb2_glucose_long_ramp_Delftlog.txt'
log_parser = Parser(log_grammar) log_parser = Parser(log_grammar)
acq_parser = Parser(acq_grammar) acq_parser = Parser(acq_grammar)
# an example acq file is 'flavin_htb2_glucose_long_ramp_DelftAcq.txt'
log_file = find_file(root_dir, "*log.txt") log_file = find_file(root_dir, "*log.txt")
# an example log file is 'flavin_htb2_glucose_long_ramp_Delftlog.txt'
acq_file = find_file(root_dir, "*[Aa]cq.txt") acq_file = find_file(root_dir, "*[Aa]cq.txt")
# parse into a single dict
parsed = {} parsed = {}
if log_file and acq_file: if log_file and acq_file:
with open(log_file, "r") as f: with open(log_file, "r") as f:
log_parsed = log_parser.parse(f) log_parsed = log_parser.parse(f)
with open(acq_file, "r") as f: with open(acq_file, "r") as f:
acq_parsed = acq_parser.parse(f) acq_parsed = acq_parser.parse(f)
parsed = {**acq_parsed, **log_parsed} parsed = {**acq_parsed, **log_parsed}
# convert data to having time stamps
for key, value in parsed.items(): for key, value in parsed.items():
if isinstance(value, datetime): if isinstance(value, datetime):
parsed[key] = datetime_to_timestamp(value) parsed[key] = datetime_to_timestamp(value)
# flatten dict
parsed_flattened = flatten_dict(parsed) parsed_flattened = flatten_dict(parsed)
for k, v in parsed_flattened.items(): for k, v in parsed_flattened.items():
if isinstance(v, list): if isinstance(v, list):
# replace None with 0
parsed_flattened[k] = [0 if el is None else el for el in v] parsed_flattened[k] = [0 if el is None else el for el in v]
return parsed_flattened return parsed_flattened
...@@ -153,109 +152,98 @@ def get_meta_swainlab(parsed_metadata: dict): ...@@ -153,109 +152,98 @@ def get_meta_swainlab(parsed_metadata: dict):
""" """
Convert raw parsing of Swainlab logfile to the metadata interface. Convert raw parsing of Swainlab logfile to the metadata interface.
Input: Parameters
-------- --------
parsed_metadata: Dict[str, str or int or DataFrame or Dict] parsed_metadata: dict[str, str or int or DataFrame or Dict]
default['general', 'image_config', 'device_properties', 'group_position', 'group_time', 'group_config'] default['general', 'image_config', 'device_properties',
'group_position', 'group_time', 'group_config']
Returns: Returns
-------- --------
Dictionary with metadata following the standard Dict with channels metadata
""" """
channels = parsed_metadata["image_config"]["Image config"].values.tolist() channels = parsed_metadata["image_config"]["Image config"].values.tolist()
# nframes = int(parsed_metadata["group_time"]["frames"].max())
# return {"channels": channels, "nframes": nframes}
return {"channels": channels} return {"channels": channels}
def get_meta_from_legacy(parsed_metadata: dict): def get_meta_from_legacy(parsed_metadata: dict):
"""Fix naming convention for channels in legacy .txt log files."""
result = parsed_metadata result = parsed_metadata
result["channels"] = result["channels/channel"] result["channels"] = result["channels/channel"]
return result return result
def parse_swainlab_metadata(filedir: t.Union[str, Path]): def parse_swainlab_metadata(filedir: t.Union[str, Path]):
""" """Parse new, .log, and old, .txt, files in a directory into a dict."""
Dispatcher function that determines which parser to use based on the file ending.
Input:
--------
filedir: Directory where the logfile is located.
Returns:
--------
Dictionary with minimal metadata
"""
filedir = Path(filedir) filedir = Path(filedir)
filepath = find_file(filedir, "*.log") filepath = find_file(filedir, "*.log")
if filepath: if filepath:
# new log files ending in .log
raw_parse = parse_from_swainlab_grammar(filepath) raw_parse = parse_from_swainlab_grammar(filepath)
minimal_meta = get_meta_swainlab(raw_parse) minimal_meta = get_meta_swainlab(raw_parse)
else: else:
# old log files ending in .txt
if filedir.is_file() or str(filedir).endswith(".zarr"): if filedir.is_file() or str(filedir).endswith(".zarr"):
# log file is in parent directory
filedir = filedir.parent filedir = filedir.parent
legacy_parse = parse_logfiles(filedir) legacy_parse = parse_logfiles(filedir)
minimal_meta = ( minimal_meta = (
get_meta_from_legacy(legacy_parse) if legacy_parse else {} get_meta_from_legacy(legacy_parse) if legacy_parse else {}
) )
return minimal_meta return minimal_meta
def dispatch_metadata_parser(filepath: t.Union[str, Path]): def dispatch_metadata_parser(filepath: t.Union[str, Path]):
""" """
Function to dispatch different metadata parsers that convert logfiles into a Dispatch different metadata parsers that convert logfiles into a dictionary.
basic metadata dictionary. Currently only contains the swainlab log parsers.
Currently only contains the swainlab log parsers.
Input: Parameters
-------- --------
filepath: str existing file containing metadata, or folder containing naming conventions filepath: str
File containing metadata or folder containing naming conventions.
""" """
parsed_meta = parse_swainlab_metadata(filepath) parsed_meta = parse_swainlab_metadata(filepath)
if parsed_meta is None: if parsed_meta is None:
# try to deduce metadata
parsed_meta = dir_to_meta parsed_meta = dir_to_meta
return parsed_meta return parsed_meta
def dir_to_meta(path: Path, suffix="tiff"): def dir_to_meta(path: Path, suffix="tiff"):
"""Deduce meta data from the naming convention of tiff files."""
filenames = list(path.glob(f"*.{suffix}")) filenames = list(path.glob(f"*.{suffix}"))
try: try:
# Deduct order from filenames # deduce order from filenames
dimorder = "".join( dim_order = "".join(
map(lambda x: x[0], filenames[0].stem.split("_")[1:]) map(lambda x: x[0], filenames[0].stem.split("_")[1:])
) )
dim_value = list( dim_value = list(
map( map(
lambda f: filename_to_dict_indices(f.stem), lambda f: filename_to_dict_indices(f.stem),
path.glob("*.tiff"), path.glob("*.tiff"),
) )
) )
maxes = [max(map(lambda x: x[dim], dim_value)) for dim in dimorder] maxs = [max(map(lambda x: x[dim], dim_value)) for dim in dim_order]
mins = [min(map(lambda x: x[dim], dim_value)) for dim in dimorder] mins = [min(map(lambda x: x[dim], dim_value)) for dim in dim_order]
_dim_shapes = [ dim_shapes = [
max_val - min_val + 1 for max_val, min_val in zip(maxes, mins) max_val - min_val + 1 for max_val, min_val in zip(maxs, mins)
] ]
meta = { meta = {
"size_" + dim: shape for dim, shape in zip(dimorder, _dim_shapes) "size_" + dim: shape for dim, shape in zip(dim_order, dim_shapes)
} }
except Exception as e: except Exception as e:
print( print(
f"Warning:Metadata: Cannot extract dimensions from filenames. Empty meta set {e}" "Warning:Metadata: Cannot extract dimensions from filenames."
f" Empty meta set {e}"
) )
meta = {} meta = {}
return meta return meta
def filename_to_dict_indices(stem: str): def filename_to_dict_indices(stem: str):
"""Convert a file name into a dict by splitting."""
return { return {
dim_number[0]: int(dim_number[1:]) dim_number[0]: int(dim_number[1:])
for dim_number in stem.split("_")[1:] for dim_number in stem.split("_")[1:]
......
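The naming convention that dir_to_meta and filename_to_dict_indices rely on can be seen with a made-up tiff stem: everything after the first underscore is a dimension letter followed by an index.

# hypothetical file stem following the "<name>_t<idx>_c<idx>_z<idx>" convention
stem = "experiment_t0005_c02_z001"
indices = {
    dim_number[0]: int(dim_number[1:])
    for dim_number in stem.split("_")[1:]
}
print(indices)  # {'t': 5, 'c': 2, 'z': 1}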
...@@ -5,7 +5,7 @@ import h5py ...@@ -5,7 +5,7 @@ import h5py
import numpy as np import numpy as np
from agora.io.bridge import groupsort from agora.io.bridge import groupsort
from agora.io.writer import load_attributes from agora.io.writer import load_meta
class DynamicReader: class DynamicReader:
...@@ -13,7 +13,7 @@ class DynamicReader: ...@@ -13,7 +13,7 @@ class DynamicReader:
def __init__(self, file: str): def __init__(self, file: str):
self.file = file self.file = file
self.metadata = load_attributes(file) self.metadata = load_meta(file)
class StateReader(DynamicReader): class StateReader(DynamicReader):
......
...@@ -9,9 +9,10 @@ import h5py ...@@ -9,9 +9,10 @@ import h5py
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import aliby.global_parameters as global_parameters
from agora.io.bridge import BridgeH5 from agora.io.bridge import BridgeH5
from agora.io.decorators import _first_arg_str_to_df from agora.io.decorators import _first_arg_str_to_raw_df
from agora.utils.indexing import validate_association from agora.utils.indexing import validate_lineage
from agora.utils.kymograph import add_index_levels from agora.utils.kymograph import add_index_levels
from agora.utils.merge import apply_merges from agora.utils.merge import apply_merges
...@@ -20,11 +21,14 @@ class Signal(BridgeH5): ...@@ -20,11 +21,14 @@ class Signal(BridgeH5):
""" """
Fetch data from h5 files for post-processing. Fetch data from h5 files for post-processing.
Signal assumes that the metadata and data are accessible to perform time-adjustments and apply previously recorded post-processes. Signal assumes that the metadata and data are accessible to
perform time-adjustments and apply previously recorded
post-processes.
""" """
def __init__(self, file: t.Union[str, Path]): def __init__(self, file: t.Union[str, Path]):
"""Define index_names for dataframes, candidate fluorescence channels, and composite statistics.""" """Define index_names for dataframes, candidate fluorescence channels,
and composite statistics."""
super().__init__(file, flag=None) super().__init__(file, flag=None)
self.index_names = ( self.index_names = (
"experiment", "experiment",
...@@ -33,22 +37,13 @@ class Signal(BridgeH5): ...@@ -33,22 +37,13 @@ class Signal(BridgeH5):
"cell_label", "cell_label",
"mother_label", "mother_label",
) )
self.candidate_channels = ( self.candidate_channels = global_parameters.possible_imaging_channels
"GFP",
"GFPFast",
"mCherry",
"Flavin",
"Citrine",
"mKO2",
"Cy5",
"pHluorin405",
)
def __getitem__(self, dsets: t.Union[str, t.Collection]): def __getitem__(self, dsets: t.Union[str, t.Collection]):
"""Get and potentially pre-process data from h5 file and return as a dataframe.""" """Get and potentially pre-process data from h5 file and return as a dataframe."""
if isinstance(dsets, str): # no pre-processing if isinstance(dsets, str):
return self.get(dsets) return self.get(dsets)
elif isinstance(dsets, list): # pre-processing elif isinstance(dsets, list):
is_bgd = [dset.endswith("imBackground") for dset in dsets] is_bgd = [dset.endswith("imBackground") for dset in dsets]
# Check we are not comparing tile-indexed and cell-indexed data # Check we are not comparing tile-indexed and cell-indexed data
assert sum(is_bgd) == 0 or sum(is_bgd) == len( assert sum(is_bgd) == 0 or sum(is_bgd) == len(
...@@ -58,22 +53,23 @@ class Signal(BridgeH5): ...@@ -58,22 +53,23 @@ class Signal(BridgeH5):
else: else:
raise Exception(f"Invalid type {type(dsets)} to get datasets") raise Exception(f"Invalid type {type(dsets)} to get datasets")
def get(self, dsets: t.Union[str, t.Collection], **kwargs): def get(self, dset_name: t.Union[str, t.Collection], **kwargs):
"""Get and potentially pre-process data from h5 file and return as a dataframe.""" """Return pre-processed data as a dataframe."""
if isinstance(dsets, str): # no pre-processing if isinstance(dset_name, str):
df = self.get_raw(dsets, **kwargs) dsets = self.get_raw(dset_name, **kwargs)
prepost_applied = self.apply_prepost(dsets, **kwargs) prepost_applied = self.apply_prepost(dsets, **kwargs)
return self.add_name(prepost_applied, dset_name)
return self.add_name(prepost_applied, dsets) else:
raise Exception("Error in Signal.get")
@staticmethod @staticmethod
def add_name(df, name): def add_name(df, name):
"""Add column of identical strings to a dataframe.""" """Add name of the Signal as an attribute to its corresponding dataframe."""
df.name = name df.name = name
return df return df
def cols_in_mins(self, df: pd.DataFrame): def cols_in_mins(self, df: pd.DataFrame):
# Convert numerical columns in a dataframe to minutes """Convert numerical columns in a dataframe to minutes."""
try: try:
df.columns = (df.columns * self.tinterval // 60).astype(int) df.columns = (df.columns * self.tinterval // 60).astype(int)
except Exception as e: except Exception as e:
...@@ -94,14 +90,15 @@ class Signal(BridgeH5): ...@@ -94,14 +90,15 @@ class Signal(BridgeH5):
if tinterval_location in f.attrs: if tinterval_location in f.attrs:
return f.attrs[tinterval_location][0] return f.attrs[tinterval_location][0]
else: else:
logging.getlogger("aliby").warn( logging.getLogger("aliby").warn(
f"{str(self.filename).split('/')[-1]}: using default time interval of 5 minutes" f"{str(self.filename).split('/')[-1]}: using default time interval of 5 minutes"
) )
return 5 return 5
@staticmethod @staticmethod
def get_retained(df, cutoff): def get_retained(df, cutoff):
"""Return a fraction of the df, one without later time points.""" """Return rows of df with at least cutoff fraction of the total number
of time points."""
return df.loc[bn.nansum(df.notna(), axis=1) > df.shape[1] * cutoff] return df.loc[bn.nansum(df.notna(), axis=1) > df.shape[1] * cutoff]
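The retention filter above keeps only rows observed in more than a cutoff fraction of time points. A small self-contained check, assuming bn in signal.py is the bottleneck package (as the bn.nansum call suggests) and using an invented two-row signal:

import bottleneck as bn
import numpy as np
import pandas as pd

# made-up signal: one cell observed at every time point, one at only a quarter
df = pd.DataFrame(
    [[1.0, 2.0, 3.0, 4.0], [1.0, np.nan, np.nan, np.nan]],
    index=["long_lived", "short_lived"],
)
cutoff = 0.8
retained = df.loc[bn.nansum(df.notna(), axis=1) > df.shape[1] * cutoff]
print(retained.index.tolist())  # ['long_lived']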
@property @property
...@@ -110,15 +107,17 @@ class Signal(BridgeH5): ...@@ -110,15 +107,17 @@ class Signal(BridgeH5):
with h5py.File(self.filename, "r") as f: with h5py.File(self.filename, "r") as f:
return list(f.attrs["channels"]) return list(f.attrs["channels"])
@_first_arg_str_to_df def retained(
def retained(self, signal, cutoff=0.8): self, signal, cutoff=global_parameters.signal_retained_cutoff
):
""" """
Load data (via decorator) and reduce the resulting dataframe. Load data (via decorator) and reduce the resulting dataframe.
Load data for a signal or a list of signals and reduce the resulting Load data for a signal or a list of signals and reduce the resulting
dataframes to a fraction of their original size, losing late time dataframes to rows with sufficient numbers of time points.
points.
""" """
if isinstance(signal, str):
signal = self.get_raw(signal)
if isinstance(signal, pd.DataFrame): if isinstance(signal, pd.DataFrame):
return self.get_retained(signal, cutoff) return self.get_retained(signal, cutoff)
elif isinstance(signal, list): elif isinstance(signal, list):
...@@ -131,17 +130,15 @@ class Signal(BridgeH5): ...@@ -131,17 +130,15 @@ class Signal(BridgeH5):
""" """
Get lineage data from a given location in the h5 file. Get lineage data from a given location in the h5 file.
Returns an array with three columns: the tile id, the mother label, and the daughter label. Returns an array with three columns: the tile id, the mother label,
and the daughter label.
""" """
if lineage_location is None: if lineage_location is None:
lineage_location = "modifiers/lineage_merged" lineage_location = "modifiers/lineage_merged"
with h5py.File(self.filename, "r") as f: with h5py.File(self.filename, "r") as f:
# if lineage_location not in f:
# lineage_location = lineage_location.split("_")[0]
if lineage_location not in f: if lineage_location not in f:
lineage_location = "postprocessing/lineage" lineage_location = "postprocessing/lineage"
tile_mo_da = f[lineage_location] tile_mo_da = f[lineage_location]
if isinstance(tile_mo_da, h5py.Dataset): if isinstance(tile_mo_da, h5py.Dataset):
lineage = tile_mo_da[()] lineage = tile_mo_da[()]
else: else:
...@@ -154,7 +151,7 @@ class Signal(BridgeH5): ...@@ -154,7 +151,7 @@ class Signal(BridgeH5):
).T ).T
return lineage return lineage
@_first_arg_str_to_df @_first_arg_str_to_raw_df
def apply_prepost( def apply_prepost(
self, self,
data: t.Union[str, pd.DataFrame], data: t.Union[str, pd.DataFrame],
...@@ -162,57 +159,40 @@ class Signal(BridgeH5): ...@@ -162,57 +159,40 @@ class Signal(BridgeH5):
picks: t.Union[t.Collection, bool] = True, picks: t.Union[t.Collection, bool] = True,
): ):
""" """
Apply modifier operations (picker or merger) to a dataframe. Apply picking and merging to a Signal data frame.
Parameters Parameters
---------- ----------
data : t.Union[str, pd.DataFrame] data : t.Union[str, pd.DataFrame]
DataFrame or path to one. A data frame or a path to one.
merges : t.Union[np.ndarray, bool] merges : t.Union[np.ndarray, bool]
(optional) 2-D array with three columns: the tile id, the mother label, and the daughter id. (optional) An array of pairs of (trap, cell) indices to merge.
If True, fetch merges from file. If True, fetch merges from file.
picks : t.Union[np.ndarray, bool] picks : t.Union[np.ndarray, bool]
(optional) 2-D array with two columns: the tiles and (optional) An array of (trap, cell) indices.
the cell labels.
If True, fetch picks from file. If True, fetch picks from file.
Examples
--------
FIXME: Add docs.
""" """
if isinstance(merges, bool): if isinstance(merges, bool):
merges: np.ndarray = self.load_merges() if merges else np.array([]) merges = self.load_merges() if merges else np.array([])
if merges.any(): if merges.any():
merged = apply_merges(data, merges) merged = apply_merges(data, merges)
else: else:
merged = copy(data) merged = copy(data)
if isinstance(picks, bool): if isinstance(picks, bool):
picks = ( picks = (
self.get_picks(names=merged.index.names) self.get_picks(
names=merged.index.names, path="modifiers/picks/"
)
if picks if picks
else set(merged.index) else merged.index
) )
with h5py.File(self.filename, "r") as f: if picks:
if "modifiers/picks" in f and picks: picked_indices = set(picks).intersection(
if picks: [tuple(x) for x in merged.index]
return merged.loc[ )
set(picks).intersection( return merged.loc[picked_indices]
[tuple(x) for x in merged.index] else:
) return merged
]
else:
if isinstance(merged.index, pd.MultiIndex):
empty_lvls = [[] for i in merged.index.names]
index = pd.MultiIndex(
levels=empty_lvls,
codes=empty_lvls,
names=merged.index.names,
)
else:
index = pd.Index([], name=merged.index.name)
merged = pd.DataFrame([], index=index)
return merged
@cached_property @cached_property
def p_available(self): def p_available(self):
...@@ -272,10 +252,11 @@ class Signal(BridgeH5): ...@@ -272,10 +252,11 @@ class Signal(BridgeH5):
Parameters Parameters
---------- ----------
dataset: str or list of strs dataset: str or list of strs
The name of the h5 file or a list of h5 file names The name of the h5 file or a list of h5 file names.
in_minutes: boolean in_minutes: boolean
If True, If True, convert column headings to times in minutes.
lineage: boolean lineage: boolean
If True, add mother_label to index.
""" """
try: try:
if isinstance(dataset, str): if isinstance(dataset, str):
...@@ -288,15 +269,17 @@ class Signal(BridgeH5): ...@@ -288,15 +269,17 @@ class Signal(BridgeH5):
self.get_raw(dset, in_minutes=in_minutes, lineage=lineage) self.get_raw(dset, in_minutes=in_minutes, lineage=lineage)
for dset in dataset for dset in dataset
] ]
if lineage: # assume that df is sorted if lineage:
# assume that df is sorted
mother_label = np.zeros(len(df), dtype=int) mother_label = np.zeros(len(df), dtype=int)
lineage = self.lineage() lineage = self.lineage()
a, b = validate_association( # information on buds
valid_lineage, valid_indices = validate_lineage(
lineage, lineage,
np.array(df.index.to_list()), np.array(df.index.to_list()),
match_column=1, "daughters",
) )
mother_label[b] = lineage[a, 1] mother_label[valid_indices] = lineage[valid_lineage, 1]
df = add_index_levels(df, {"mother_label": mother_label}) df = add_index_levels(df, {"mother_label": mother_label})
return df return df
except Exception as e: except Exception as e:
...@@ -316,13 +299,14 @@ class Signal(BridgeH5): ...@@ -316,13 +299,14 @@ class Signal(BridgeH5):
names: t.Tuple[str, ...] = ("trap", "cell_label"), names: t.Tuple[str, ...] = ("trap", "cell_label"),
path: str = "modifiers/picks/", path: str = "modifiers/picks/",
) -> t.Set[t.Tuple[int, str]]: ) -> t.Set[t.Tuple[int, str]]:
"""Get the relevant picks based on names.""" """Get picks from the h5 file."""
with h5py.File(self.filename, "r") as f: with h5py.File(self.filename, "r") as f:
picks = set()
if path in f: if path in f:
picks = set( picks = set(
zip(*[f[path + name] for name in names if name in f[path]]) zip(*[f[path + name] for name in names if name in f[path]])
) )
else:
picks = set()
return picks return picks
def dataset_to_df(self, f: h5py.File, path: str) -> pd.DataFrame: def dataset_to_df(self, f: h5py.File, path: str) -> pd.DataFrame:
...@@ -353,10 +337,7 @@ class Signal(BridgeH5): ...@@ -353,10 +337,7 @@ class Signal(BridgeH5):
fullname: str, fullname: str,
node: t.Union[h5py.Dataset, h5py.Group], node: t.Union[h5py.Dataset, h5py.Group],
): ):
""" """Store the name of a signal if it is a leaf node and if it starts with extraction."""
Store the name of a signal if it is a leaf node
(a group with no more groups inside) and if it starts with extraction.
"""
if isinstance(node, h5py.Group) and np.all( if isinstance(node, h5py.Group) and np.all(
[isinstance(x, h5py.Dataset) for x in node.values()] [isinstance(x, h5py.Dataset) for x in node.values()]
): ):
......
...@@ -15,9 +15,10 @@ from agora.io.bridge import BridgeH5 ...@@ -15,9 +15,10 @@ from agora.io.bridge import BridgeH5
#################### Dynamic version ################################## #################### Dynamic version ##################################
def load_attributes(file: str, group="/"): def load_meta(file: str, group="/"):
""" """
Load the metadata from an h5 file and convert to a dictionary, including the "parameters" field which is stored as YAML. Load the metadata from an h5 file and convert to a dictionary, including
the "parameters" field which is stored as YAML.
Parameters Parameters
---------- ----------
...@@ -26,8 +27,9 @@ def load_attributes(file: str, group="/"): ...@@ -26,8 +27,9 @@ def load_attributes(file: str, group="/"):
group: str, optional group: str, optional
The group in the h5 file from which to read the data The group in the h5 file from which to read the data
""" """
# load the metadata, stored as attributes, from the h5 file and return as a dictionary # load the metadata, stored as attributes, from the h5 file
with h5py.File(file, "r") as f: with h5py.File(file, "r") as f:
# return as a dict
meta = dict(f[group].attrs.items()) meta = dict(f[group].attrs.items())
if "parameters" in meta: if "parameters" in meta:
# convert from yaml format into dict # convert from yaml format into dict
...@@ -51,7 +53,7 @@ class DynamicWriter: ...@@ -51,7 +53,7 @@ class DynamicWriter:
self.file = file self.file = file
# the metadata is stored as attributes in the h5 file # the metadata is stored as attributes in the h5 file
if Path(file).exists(): if Path(file).exists():
self.metadata = load_attributes(file) self.metadata = load_meta(file)
def _log(self, message: str, level: str = "warn"): def _log(self, message: str, level: str = "warn"):
# Log messages in the corresponding level # Log messages in the corresponding level
......
#!/usr/bin/env jupyter
"""
Add general logging functions and decorators
"""
import logging import logging
from time import perf_counter from time import perf_counter
def timer(func): def timer(func):
# Log duration of a function into aliby logfile """Log duration of a function into the aliby log file."""
def wrap_func(*args, **kwargs): def wrap_func(*args, **kwargs):
t1 = perf_counter() t1 = perf_counter()
result = func(*args, **kwargs) result = func(*args, **kwargs)
......
...@@ -9,6 +9,162 @@ This can be: ...@@ -9,6 +9,162 @@ This can be:
import numpy as np import numpy as np
import typing as t import typing as t
# data type to link together trap and cell ids
i_dtype = {"names": ["trap_id", "cell_id"], "formats": [np.int64, np.int64]}
def validate_lineage(
lineage: np.ndarray, indices: np.ndarray, how: str = "families"
):
"""
Identify mother-bud pairs that exist both in lineage and a Signal's
indices.
We expect the lineage information to be unique: a bud should not have
two mothers.
Parameters
----------
lineage : np.ndarray
2D array of lineage associations where columns are
(trap, mother, daughter)
or
a 3D array, which is an array of 2 X 2 arrays comprising
[[trap_id, mother_label], [trap_id, daughter_label]].
indices : np.ndarray
A 2D array of cell indices from a Signal, (trap_id, cell_label).
This array should not include mother_label.
how: str
If "mothers", matches indicate mothers from mother-bud pairs;
If "daughters", matches indicate daughters from mother-bud pairs;
If "families", matches indicate mothers and daughters in mother-bud pairs.
Returns
-------
valid_lineage: boolean np.ndarray
1D array indicating matched elements in lineage.
valid_indices: boolean np.ndarray
1D array indicating matched elements in indices.
Examples
--------
>>> import numpy as np
>>> from agora.utils.indexing import validate_lineage
>>> lineage = np.array([ [[0, 1], [0, 3]], [[0, 1], [0, 4]], [[0, 1], [0, 6]], [[0, 4], [0, 7]] ])
>>> indices = np.array([ [0, 1], [0, 2], [0, 3]])
>>> valid_lineage, valid_indices = validate_lineage(lineage, indices)
>>> print(valid_lineage)
array([ True, False, False, False])
>>> print(valid_indices)
array([ True, False, True])
and
>>> lineage = np.array([[[0,3], [0,1]], [[0,2], [0,4]]])
>>> indices = np.array([[0,1], [0,2], [0,3]])
>>> valid_lineage, valid_indices = validate_lineage(lineage, indices)
>>> print(valid_lineage)
array([ True, False])
>>> print(valid_indices)
array([ True, False, True])
"""
if lineage.ndim == 2:
# [trap, mother, daughter] becomes [[trap, mother], [trap, daughter]]
lineage = assoc_indices_to_3d(lineage)
if how == "mothers":
c_index = 0
elif how == "daughters":
c_index = 1
# find valid lineage
valid_lineages = index_isin(lineage, indices)
if how == "families":
# both mother and bud must be in indices
valid_lineage = valid_lineages.all(axis=1)
else:
valid_lineage = valid_lineages[:, c_index, :]
flat_valid_lineage = valid_lineage.flatten()
# find valid indices
selected_lineages = lineage[flat_valid_lineage, ...]
if how == "families":
# select only pairs of mother and bud indices
valid_indices = index_isin(indices, selected_lineages)
else:
valid_indices = index_isin(indices, selected_lineages[:, c_index, :])
flat_valid_indices = valid_indices.flatten()
if (
indices[flat_valid_indices, :].size
!= np.unique(
lineage[flat_valid_lineage, :].reshape(-1, 2), axis=0
).size
):
# all unique indices in valid_lineages should be in valid_indices
raise Exception(
"Error in validate_lineage: "
"lineage information is likely not unique."
)
return flat_valid_lineage, flat_valid_indices
def index_isin(x: np.ndarray, y: np.ndarray) -> np.ndarray:
"""
Find those elements of x that are in y.
Both arrays must be arrays of integer indices,
such as (trap_id, cell_id).
"""
x = np.ascontiguousarray(x, dtype=np.int64)
y = np.ascontiguousarray(y, dtype=np.int64)
xv = x.view(i_dtype)
inboth = np.intersect1d(xv, y.view(i_dtype))
x_bool = np.isin(xv, inboth)
return x_bool
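The structured-dtype trick in index_isin can be checked on its own; the arrays below are invented (trap_id, cell_id) pairs.

import numpy as np

# viewing (N, 2) int64 arrays as a structured dtype lets np.isin compare rows
i_dtype = {"names": ["trap_id", "cell_id"], "formats": [np.int64, np.int64]}
x = np.ascontiguousarray([[0, 1], [0, 2], [1, 3]], dtype=np.int64)
y = np.ascontiguousarray([[0, 2], [1, 3], [2, 5]], dtype=np.int64)
xv = x.view(i_dtype)
inboth = np.intersect1d(xv, y.view(i_dtype))
print(np.isin(xv, inboth).flatten())  # [False  True  True]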
def assoc_indices_to_3d(ndarray: np.ndarray):
"""
Convert the last column to a new row and repeat first column's values.
For example: [trap, mother, daughter] becomes
[[trap, mother], [trap, daughter]].
Assumes the input array has shape (N,3).
"""
result = ndarray
if len(ndarray) and ndarray.ndim > 1:
# faster indexing for single positions
if ndarray.shape[1] == 3:
result = np.transpose(
np.hstack((ndarray[:, [0]], ndarray)).reshape(-1, 2, 2),
axes=[0, 2, 1],
)
else:
# 20% slower but more general indexing
columns = np.arange(ndarray.shape[1])
result = np.stack(
(
ndarray[:, np.delete(columns, -1)],
ndarray[:, np.delete(columns, -2)],
),
axis=1,
)
return result
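And the reshaping performed by assoc_indices_to_3d, shown on a made-up two-row lineage array:

import numpy as np

# [trap, mother, daughter] rows become [[trap, mother], [trap, daughter]] pairs
lineage_2d = np.array([[0, 1, 3], [0, 1, 4]])
lineage_3d = np.transpose(
    np.hstack((lineage_2d[:, [0]], lineage_2d)).reshape(-1, 2, 2),
    axes=[0, 2, 1],
)
print(lineage_3d[0].tolist())  # [[0, 1], [0, 3]]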
def assoc_indices_to_2d(array: np.ndarray):
"""Convert indices to 2d."""
result = array
if len(array):
result = np.concatenate(
(array[:, 0, :], array[:, 1, 1, np.newaxis]), axis=1
)
return result
###################################################################
def validate_association( def validate_association(
association: np.ndarray, association: np.ndarray,
...@@ -104,46 +260,6 @@ def validate_association( ...@@ -104,46 +260,6 @@ def validate_association(
return valid_association, valid_indices return valid_association, valid_indices
def _assoc_indices_to_3d(ndarray: np.ndarray):
"""
Convert the last column to a new row while repeating all previous indices.
This is useful when converting a signal multiindex before comparing association.
Assumes the input array has shape (N,3)
"""
result = ndarray
if len(ndarray) and ndarray.ndim > 1:
if ndarray.shape[1] == 3: # Faster indexing for single positions
result = np.transpose(
np.hstack((ndarray[:, [0]], ndarray)).reshape(-1, 2, 2),
axes=[0, 2, 1],
)
else: # 20% slower but more general indexing
columns = np.arange(ndarray.shape[1])
result = np.stack(
(
ndarray[:, np.delete(columns, -1)],
ndarray[:, np.delete(columns, -2)],
),
axis=1,
)
return result
def _3d_index_to_2d(array: np.ndarray):
"""
Opposite to _assoc_indices_to_3d.
"""
result = array
if len(array):
result = np.concatenate(
(array[:, 0, :], array[:, 1, 1, np.newaxis]), axis=1
)
return result
def compare_indices(x: np.ndarray, y: np.ndarray) -> np.ndarray: def compare_indices(x: np.ndarray, y: np.ndarray) -> np.ndarray:
""" """
Fetch two 2-D indices and return a binary 2-D matrix Fetch two 2-D indices and return a binary 2-D matrix
......
...@@ -86,16 +86,19 @@ def bidirectional_retainment_filter( ...@@ -86,16 +86,19 @@ def bidirectional_retainment_filter(
daughters_thresh: int = 7, daughters_thresh: int = 7,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Retrieve families where mothers are present for more than a fraction of the experiment, and daughters for longer than some number of time-points. Retrieve families where mothers are present for more than a fraction
of the experiment and daughters for longer than some number of
time-points.
Parameters Parameters
---------- ----------
df: pd.DataFrame df: pd.DataFrame
Data Data
mothers_thresh: float mothers_thresh: float
Minimum fraction of experiment's total duration for which mothers must be present. Minimum fraction of experiment's total duration for which mothers
must be present.
daughters_thresh: int daughters_thresh: int
Minimum number of time points for which daughters must be observed Minimum number of time points for which daughters must be observed.
""" """
# daughters # daughters
all_daughters = df.loc[df.index.get_level_values("mother_label") > 0] all_daughters = df.loc[df.index.get_level_values("mother_label") > 0]
...@@ -170,6 +173,7 @@ def slices_from_spans(spans: t.Tuple[int], df: pd.DataFrame) -> t.List[slice]: ...@@ -170,6 +173,7 @@ def slices_from_spans(spans: t.Tuple[int], df: pd.DataFrame) -> t.List[slice]:
def drop_mother_label(index: pd.MultiIndex) -> np.ndarray: def drop_mother_label(index: pd.MultiIndex) -> np.ndarray:
"""Remove mother_label level from a MultiIndex."""
no_mother_label = index no_mother_label = index
if "mother_label" in index.names: if "mother_label" in index.names:
no_mother_label = index.droplevel("mother_label") no_mother_label = index.droplevel("mother_label")
......
#!/usr/bin/env python3 #!/usr/bin/env python3
import re
import typing as t
import numpy as np import numpy as np
import pandas as pd
from agora.io.bridge import groupsort from agora.io.bridge import groupsort
from itertools import groupby
def mb_array_to_dict(mb_array: np.ndarray): def mb_array_to_dict(mb_array: np.ndarray):
...@@ -19,4 +15,3 @@ def mb_array_to_dict(mb_array: np.ndarray): ...@@ -19,4 +15,3 @@ def mb_array_to_dict(mb_array: np.ndarray):
for trap, mo_da in groupsort(mb_array).items() for trap, mo_da in groupsort(mb_array).items()
for mo, daughters in groupsort(mo_da).items() for mo, daughters in groupsort(mo_da).items()
} }
...@@ -3,90 +3,161 @@ ...@@ -3,90 +3,161 @@
Functions to efficiently merge rows in DataFrames. Functions to efficiently merge rows in DataFrames.
""" """
import typing as t import typing as t
from copy import copy
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from utils_find_1st import cmp_larger, find_1st from utils_find_1st import cmp_larger, find_1st
from agora.utils.indexing import compare_indices, validate_association from agora.utils.indexing import index_isin
def group_merges(merges: np.ndarray) -> t.List[t.Tuple]:
"""
Convert merges into a list of merges for traps requiring multiple
merges and then for traps requiring single merges.
"""
left_tracks = merges[:, 0]
right_tracks = merges[:, 1]
# find traps requiring multiple merges
linr = merges[index_isin(left_tracks, right_tracks).flatten(), :]
rinl = merges[index_isin(right_tracks, left_tracks).flatten(), :]
# make unique and order merges for each trap
multi_merge = np.unique(np.concatenate((linr, rinl)), axis=0)
# find traps requiring a singe merge
single_merge = merges[
~index_isin(merges, multi_merge).all(axis=1).flatten(), :
]
# convert to lists of arrays
single_merge_list = [[sm] for sm in single_merge]
multi_merge_list = [
multi_merge[multi_merge[:, 0, 0] == trap_id, ...]
for trap_id in np.unique(multi_merge[:, 0, 0])
]
res = [*multi_merge_list, *single_merge_list]
return res
def merge_lineage(
lineage: np.ndarray, merges: np.ndarray
) -> (np.ndarray, np.ndarray):
"""
Use merges to update lineage information.
Check if merging causes any buds to have multiple mothers and discard
those incorrect merges.
Return updated lineage and merge arrays.
"""
flat_lineage = lineage.reshape(-1, 2)
bud_mother_dict = {
tuple(bud): mother for bud, mother in zip(lineage[:, 1], lineage[:, 0])
}
left_tracks = merges[:, 0]
# find left tracks that are in lineages
valid_lineages = index_isin(flat_lineage, left_tracks).flatten()
# group into multi- and then single merges
grouped_merges = group_merges(merges)
# perform merges
if valid_lineages.any():
# indices of each left track -> indices of rightmost right track
replacement_dict = {
tuple(contig_pair[0]): merge[-1][1]
for merge in grouped_merges
for contig_pair in merge
}
# if both key and value are buds, they must have the same mother
buds = lineage[:, 1]
incorrect_merges = [
key
for key in replacement_dict
if np.any(index_isin(buds, replacement_dict[key]).flatten())
and np.any(index_isin(buds, key).flatten())
and not np.array_equal(
bud_mother_dict[key],
bud_mother_dict[tuple(replacement_dict[key])],
)
]
if incorrect_merges:
# reassign incorrect merges so that they have no affect
for key in incorrect_merges:
replacement_dict[key] = key
# find only correct merges
new_merges = merges[
~index_isin(
merges[:, 0], np.array(incorrect_merges)
).flatten(),
...,
]
else:
new_merges = merges
# correct lineage information
# replace mother or bud index with index of rightmost track
flat_lineage[valid_lineages] = [
replacement_dict[tuple(index)]
for index in flat_lineage[valid_lineages]
]
else:
new_merges = merges
# reverse flattening
new_lineage = flat_lineage.reshape(-1, 2, 2)
# remove any duplicates
new_lineage = np.unique(new_lineage, axis=0)
return new_lineage, new_merges
def apply_merges(data: pd.DataFrame, merges: np.ndarray): def apply_merges(data: pd.DataFrame, merges: np.ndarray):
"""Split data in two, one subset for rows relevant for merging and one """
without them. It uses an array of source tracklets and target tracklets Generate a new data frame containing merged tracks.
to efficiently merge them.
Parameters Parameters
---------- ----------
data : pd.DataFrame data : pd.DataFrame
Input DataFrame. A Signal data frame.
merges : np.ndarray merges : np.ndarray
3-D ndarray where dimensions are (X,2,2): nmerges, source-target An array of pairs of (trap, cell) indices to merge.
pair and single-cell identifiers, respectively.
Examples
--------
FIXME: Add docs.
""" """
indices = data.index indices = data.index
if "mother_label" in indices.names: if "mother_label" in indices.names:
indices = indices.droplevel("mother_label") indices = indices.droplevel("mother_label")
valid_merges, indices = validate_association( indices = np.array(list(indices))
merges, np.array(list(indices)) # merges in the data frame's indices
) valid_merges = index_isin(merges, indices).all(axis=1).flatten()
# corresponding indices for the data frame in merges
# Assign non-merged selected_merges = merges[valid_merges, ...]
merged = data.loc[~indices] valid_indices = index_isin(indices, selected_merges).flatten()
# data not requiring merging
# Implement the merges and drop source rows. merged = data.loc[~valid_indices]
# TODO Use matrices to perform merges in batch # merge tracks
# for ecficiency
if valid_merges.any(): if valid_merges.any():
to_merge = data.loc[indices] to_merge = data.loc[valid_indices].copy()
targets, sources = zip(*merges[valid_merges]) left_indices = merges[:, 0]
for source, target in zip(sources, targets): right_indices = merges[:, 1]
target = tuple(target) # join left track with right track
to_merge.loc[target] = join_tracks_pair( for left_index, right_index in zip(left_indices, right_indices):
to_merge.loc[target].values, to_merge.loc[tuple(left_index)] = join_two_tracks(
to_merge.loc[tuple(source)].values, to_merge.loc[tuple(left_index)].values,
to_merge.loc[tuple(right_index)].values,
) )
to_merge.drop(map(tuple, sources), inplace=True) # drop indices for right tracks
to_merge.drop(map(tuple, right_indices), inplace=True)
# add to data not requiring merges
merged = pd.concat((merged, to_merge), names=data.index.names) merged = pd.concat((merged, to_merge), names=data.index.names)
return merged return merged
def join_tracks_pair(target: np.ndarray, source: np.ndarray) -> np.ndarray: def join_two_tracks(
""" left_track: np.ndarray, right_track: np.ndarray
Join two tracks and return the new value of the target. ) -> np.ndarray:
""" """Join two tracks and return the new one."""
target_copy = target new_track = left_track.copy()
end = find_1st(target_copy[::-1], 0, cmp_larger) # find last positive element by inverting track
target_copy[-end:] = source[-end:] end = find_1st(left_track[::-1], 0, cmp_larger)
return target_copy # merge tracks into one
new_track[-end:] = right_track[-end:]
return new_track
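A minimal sketch of what join_two_tracks does, using py-find-1st as in the module's imports; the toy tracks below are invented. Values of the right track overwrite the left track from its last positive element onwards.

import numpy as np
from utils_find_1st import cmp_larger, find_1st

left_track = np.array([1.0, 2.0, 3.0, 0.0, 0.0])
right_track = np.array([0.0, 0.0, 0.0, 4.0, 5.0])
new_track = left_track.copy()
# find the last positive element by scanning the reversed track
end = find_1st(left_track[::-1], 0, cmp_larger)
new_track[-end:] = right_track[-end:]
print(new_track)  # [1. 2. 3. 4. 5.]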
def group_merges(merges: np.ndarray) -> t.List[t.Tuple]:
# Return a list where the cell is present as source and target
# (multimerges)
sources_targets = compare_indices(merges[:, 0, :], merges[:, 1, :])
is_multimerge = sources_targets.any(axis=0) | sources_targets.any(axis=1)
is_monomerge = ~is_multimerge
multimerge_subsets = union_find(zip(*np.where(sources_targets)))
merge_groups = [merges[np.array(tuple(x))] for x in multimerge_subsets]
sorted_merges = list(map(sort_association, merge_groups))
# Ensure that source and target are at the edges ##################################################################
return [
*sorted_merges,
*[[event] for event in merges[is_monomerge]],
]
def union_find(lsts): def union_find(lsts):
...@@ -120,27 +191,3 @@ def sort_association(array: np.ndarray): ...@@ -120,27 +191,3 @@ def sort_association(array: np.ndarray):
[res.append(x) for x in np.flip(order).flatten() if x not in res] [res.append(x) for x in np.flip(order).flatten() if x not in res]
sorted_array = array[np.array(res)] sorted_array = array[np.array(res)]
return sorted_array return sorted_array
def merge_association(
association: np.ndarray, merges: np.ndarray
) -> np.ndarray:
grouped_merges = group_merges(merges)
flat_indices = association.reshape(-1, 2)
comparison_mat = compare_indices(merges[:, 0], flat_indices)
valid_indices = comparison_mat.any(axis=0)
if valid_indices.any(): # Where valid, perform transformation
replacement_d = {}
for dataset in grouped_merges:
for k in dataset:
replacement_d[tuple(k[0])] = dataset[-1][1]
flat_indices[valid_indices] = [
replacement_d[tuple(i)] for i in flat_indices[valid_indices]
]
merged_indices = flat_indices.reshape(-1, 2, 2)
return merged_indices
# parameters to stop the pipeline when exceeded
earlystop = dict(
min_tp=100,
thresh_pos_clogged=0.4,
thresh_trap_ncells=8,
thresh_trap_area=0.9,
ntps_to_eval=5,
)
# imaging properties of the microscope
imaging_specifications = {
"pixel_size": 0.236,
"z_size": 0.6,
"spacing": 0.6,
}
# possible imaging channels
possible_imaging_channels = [
"Citrine",
"GFP",
"GFPFast",
"mCherry",
"Flavin",
"Citrine",
"mKO2",
"Cy5",
"pHluorin405",
"pHluorin488",
]
# functions to apply to the fluorescence of each cell
fluorescence_functions = [
"mean",
"median",
"std",
"imBackground",
"max5px_median",
]
# default fraction of time a cell must be in the experiment to be kept by Signal
signal_retained_cutoff = 0.8
...@@ -54,7 +54,7 @@ class DatasetLocalABC(ABC): ...@@ -54,7 +54,7 @@ class DatasetLocalABC(ABC):
Abstract Base class to find local files, either OME-XML or raw images. Abstract Base class to find local files, either OME-XML or raw images.
""" """
_valid_suffixes = ("tiff", "png", "zarr") _valid_suffixes = ("tiff", "png", "zarr", "tif")
_valid_meta_suffixes = ("txt", "log") _valid_meta_suffixes = ("txt", "log")
def __init__(self, dpath: t.Union[str, Path], *args, **kwargs): def __init__(self, dpath: t.Union[str, Path], *args, **kwargs):
......
...@@ -30,14 +30,14 @@ from agora.io.metadata import dir_to_meta, dispatch_metadata_parser
def get_examples_dir():
"""Get examples directory that stores dummy image for tiler."""
return files("aliby").parent.parent / "examples" / "tiler"
def instantiate_image(
source: t.Union[str, int, t.Dict[str, str], Path], **kwargs
):
"""Wrapper to instantiate the appropriate image
Parameters
----------
...@@ -55,26 +55,26 @@ def instantiate_image(
def dispatch_image(source: t.Union[str, int, t.Dict[str, str], Path]):
"""Pick the appropriate Image class depending on the source of data."""
if isinstance(source, (int, np.int64)):
from aliby.io.omero import Image
instantiator = Image
elif isinstance(source, dict) or (
isinstance(source, (str, Path)) and Path(source).is_dir()
):
if Path(source).suffix == ".zarr":
instantiator = ImageZarr
else:
instantiator = ImageDir
elif isinstance(source, Path) and source.is_file():
# a Path object pointing to a single image file (e.g. an OME-TIFF)
instantiator = ImageLocalOME
elif isinstance(source, str) and Path(source).is_file():
instantiator = ImageLocalOME
else:
raise Exception(f"Invalid data source at {source}")
return instantiator
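# Illustrative usage sketch (not part of the diffed source): the path below is
# hypothetical, and the context-manager and get_data_lazy interface is the one
# provided by the BaseLocalImage subclasses defined in this module.
from pathlib import Path

source = Path("/data/experiment_01/position_001.zarr")  # hypothetical path
ImageClass = dispatch_image(source)
with ImageClass(source) as image:
    lazy_stack = image.get_data_lazy()  # 5D dask array ordered (t, c, z, y, x)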
class BaseLocalImage(ABC):
...@@ -82,6 +82,7 @@ class BaseLocalImage(ABC):
Base Image class to set path and provide context management method.
"""
# default image order
_default_dimorder = "tczyx"
def __init__(self, path: t.Union[str, Path]):
...@@ -98,8 +99,7 @@ class BaseLocalImage(ABC):
return False
def rechunk_data(self, img):
"""Format image using x and y size from metadata."""
self._rechunked_img = da.rechunk(
img,
chunks=(
...@@ -145,16 +145,16 @@ class ImageLocalOME(BaseLocalImage):
in which a multidimensional tiff image contains the metadata.
"""
def __init__(self, path: str, dimorder=None, **kwargs):
super().__init__(path)
self._id = str(path)
self.set_meta(str(path))
def set_meta(self, path):
meta = dict()
try:
with TiffFile(path) as f:
self._meta = xmltodict.parse(f.ome_metadata)["OME"]
for dim in self.dimorder:
meta["size_" + dim.lower()] = int(
self._meta["Image"]["Pixels"]["@Size" + dim]
...@@ -165,21 +165,19 @@ class ImageLocalOME(BaseLocalImage):
]
meta["name"] = self._meta["Image"]["@Name"]
meta["type"] = self._meta["Image"]["Pixels"]["@Type"]
except Exception as e:
# images not in OMEXML
print("Warning:Metadata not found: {}".format(e))
print(
"Warning: No dimensional info provided. "
f"Assuming {self._default_dimorder}"
)
# mark non-existent dimensions for padding
self.base = self._default_dimorder
# self.ids = [self.index(i) for i in dimorder]
self._dimorder = self.base
self._meta = meta
@property
def name(self):
...@@ -246,7 +244,7 @@ class ImageDir(BaseLocalImage):
It inherits from BaseLocalImage so we only override methods that are critical.
Assumptions:
- One folder per position.
- Images are flat.
- Channel, Time, z-stack and the others are determined by filenames.
- Provides Dimorder as it is set in the filenames, or expects order during instantiation.
...@@ -318,7 +316,7 @@ class ImageZarr(BaseLocalImage):
print(f"Could not add size info to metadata: {e}")
def get_data_lazy(self) -> da.Array:
"""Return 5D dask array for lazy-loading local multidimensional zarr files."""
return self._img
def add_size_to_meta(self):
...
...@@ -131,7 +131,6 @@ class BridgeOmero:
FIXME: Add docs.
"""
bridge = BridgeH5(filepath)
meta = safe_load(bridge.meta_h5["parameters"])["general"]
server_info = {k: meta[k] for k in ("host", "username", "password")}
...@@ -268,7 +267,6 @@ class Dataset(BridgeOmero):
FIXME: Add docs.
"""
bridge = BridgeH5(filepath)
dataset_keys = ("omero_id", "omero_id,", "dataset_id")
for k in dataset_keys:
...@@ -301,21 +299,21 @@ class Image(BridgeOmero):
cls,
filepath: t.Union[str, Path],
):
"""
Instantiate Image from an h5 file.
Parameters
----------
cls : Image
Image class
filepath : t.Union[str, Path]
Location of h5 file.
Examples
--------
FIXME: Add docs.
"""
bridge = BridgeH5(filepath)
image_id = bridge.meta_h5["image_id"]
return cls(image_id, **cls.server_info_from_h5(filepath))
...
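# Illustrative usage sketch (not part of the diffed source): re-opening an
# OMERO Image from a previously written h5 file; the path is hypothetical and
# the server credentials are read from the file by server_info_from_h5 as
# shown above.
from pathlib import Path

h5_path = Path("/data/experiment_01/position_001.h5")  # hypothetical path
image = Image.from_h5(h5_path)  # reconstructs the Image with its omero image_id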