Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target projects: swain-lab/aliby/aliby-mirror, swain-lab/aliby/alibylite

Commits on Source (24)
Showing changes with 639 additions and 438 deletions
......@@ -17,16 +17,14 @@ atomic = t.Union[int, float, str, bool]
class ParametersABC(ABC):
"""
Defines parameters as attributes and allows parameters to
Define parameters as attributes and allow parameters to
be converted to either a dictionary or to yaml.
No attribute should be called "parameters"!
"""
def __init__(self, **kwargs):
"""
Defines parameters as attributes
"""
"""Define parameters as attributes."""
assert (
"parameters" not in kwargs
), "No attribute should be named parameters"
......@@ -35,8 +33,9 @@ class ParametersABC(ABC):
def to_dict(self, iterable="null") -> t.Dict:
"""
Recursive function to return a nested dictionary of the
attributes of the class instance.
Return a nested dictionary of the attributes of the class instance.
Uses recursion.
"""
if isinstance(iterable, dict):
if any(
......@@ -62,7 +61,8 @@ class ParametersABC(ABC):
def to_yaml(self, path: Union[Path, str] = None):
"""
Returns a yaml stream of the attributes of the class instance.
Return a yaml stream of the attributes of the class instance.
If path is provided, the yaml stream is saved there.
Parameters
......@@ -81,9 +81,7 @@ class ParametersABC(ABC):
@classmethod
def from_yaml(cls, source: Union[Path, str]):
"""
Returns instance from a yaml filename or stdin
"""
"""Return instance from a yaml filename or stdin."""
is_buffer = True
try:
if Path(source).exists():
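As an aside, a minimal sketch of how a ParametersABC subclass might be used; MyParameters, its keyword arguments, and the file name are hypothetical, not part of aliby:

class MyParameters(ParametersABC):
    pass

p = MyParameters(window=5, channels=["GFP", "mCherry"])
d = p.to_dict()                    # nested dictionary of the attributes
p.to_yaml("my_parameters.yaml")    # save the attributes as YAML
q = MyParameters.from_yaml("my_parameters.yaml")   # rebuild an instance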
......@@ -107,7 +105,8 @@ class ParametersABC(ABC):
def update(self, name: str, new_value):
"""
Update values recursively
Update values recursively.
If name is a dictionary, replace data where it already exists and add data where it does not.
It warns against type changes.
......@@ -179,7 +178,8 @@ def add_to_collection(
class ProcessABC(ABC):
"""
Base class for processes.
Defines parameters as attributes and requires run method to be defined.
Define parameters as attributes and require a run method.
"""
def __init__(self, parameters):
......@@ -243,11 +243,9 @@ class StepABC(ProcessABC):
@timer
def run_tp(self, tp: int, **kwargs):
"""
Time and log the timing of a step.
"""
"""Time and log the timing of a step."""
return self._run_tp(tp, **kwargs)
def run(self):
# Replace run with run_tp
raise Warning("Steps use run_tp instead of run")
raise Warning("Steps use run_tp instead of run.")
This diff is collapsed.
......@@ -6,17 +6,19 @@ import typing as t
from functools import wraps
def _first_arg_str_to_df(
def _first_arg_str_to_raw_df(
fn: t.Callable,
):
"""Enable Signal-like classes to convert strings to data sets."""
@wraps(fn)
def format_input(*args, **kwargs):
cls = args[0]
data = args[1]
if isinstance(data, str):
# get data from h5 file
# get data from h5 file using Signal's get_raw
data = cls.get_raw(data)
# replace path in the undecorated function with data
return fn(cls, data, *args[2:], **kwargs)
return format_input
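For context, a brief sketch of what the decorator enables: a Signal method wrapped with _first_arg_str_to_raw_df accepts either a dataframe or the name of a dataset in the h5 file. The file name below is hypothetical; the dataset path follows aliby's extraction naming.

signal = Signal("experiment.h5")
# equivalent calls: the string is converted to a raw dataframe via get_raw
out1 = signal.apply_prepost("extraction/general/None/volume")
out2 = signal.apply_prepost(signal.get_raw("extraction/general/None/volume"))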
......@@ -66,7 +66,7 @@ class MetaData:
# Needed because HDF5 attributes do not support dictionaries
def flatten_dict(nested_dict, separator="/"):
"""
Flattens nested dictionary. If empty return as-is.
Flatten a nested dictionary. If empty, return it as-is.
"""
flattened = {}
if nested_dict:
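Illustratively, and assuming nested keys are joined with the separator:

flatten_dict({"channels": {"GFP": {"exposure": 100}}})
# -> {"channels/GFP/exposure": 100}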
......@@ -79,9 +79,7 @@ def flatten_dict(nested_dict, separator="/"):
# Needed because HDF5 attributes do not support datetime objects
# Takes care of time zones & daylight saving
def datetime_to_timestamp(time, locale="Europe/London"):
"""
Convert datetime object to UNIX timestamp
"""
"""Convert datetime object to UNIX timestamp."""
return timezone(locale).localize(time).timestamp()
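For example:

from datetime import datetime

# localise a naive datetime to Europe/London (daylight saving handled)
# and convert it to a UNIX timestamp
ts = datetime_to_timestamp(datetime(2023, 6, 1, 12, 0))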
......@@ -189,36 +187,37 @@ def parse_swainlab_metadata(filedir: t.Union[str, Path]):
Dictionary with minimal metadata
"""
filedir = Path(filedir)
filepath = find_file(filedir, "*.log")
if filepath:
# new log files
raw_parse = parse_from_swainlab_grammar(filepath)
minimal_meta = get_meta_swainlab(raw_parse)
else:
# old log files
if filedir.is_file() or str(filedir).endswith(".zarr"):
# log file is in parent directory
filedir = filedir.parent
legacy_parse = parse_logfiles(filedir)
minimal_meta = (
get_meta_from_legacy(legacy_parse) if legacy_parse else {}
)
return minimal_meta
def dispatch_metadata_parser(filepath: t.Union[str, Path]):
"""
Function to dispatch different metadata parsers that convert logfiles into a
basic metadata dictionary. Currently only contains the swainlab log parsers.
Dispatch different metadata parsers that convert logfiles into a dictionary.
Currently only contains the swainlab log parsers.
Input:
--------
filepath: str existing file containing metadata, or folder containing naming conventions
filepath: str existing file containing metadata, or folder containing naming
conventions
"""
parsed_meta = parse_swainlab_metadata(filepath)
if parsed_meta is None:
# call the fallback parser (assuming dir_to_meta accepts the path)
parsed_meta = dir_to_meta(filepath)
return parsed_meta
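A hypothetical call, with the directory path made up for illustration:

meta = dispatch_metadata_parser("/data/2023_06_01_experiment")
# meta is a flat dictionary of minimal metadata parsed from the log files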
......
......@@ -10,7 +10,7 @@ import numpy as np
import pandas as pd
from agora.io.bridge import BridgeH5
from agora.io.decorators import _first_arg_str_to_df
from agora.io.decorators import _first_arg_str_to_raw_df
from agora.utils.indexing import validate_association
from agora.utils.kymograph import add_index_levels
from agora.utils.merge import apply_merges
......@@ -20,11 +20,14 @@ class Signal(BridgeH5):
"""
Fetch data from h5 files for post-processing.
Signal assumes that the metadata and data are accessible to perform time-adjustments and apply previously recorded post-processes.
Signal assumes that the metadata and data are accessible to
perform time-adjustments and apply previously recorded
post-processes.
"""
def __init__(self, file: t.Union[str, Path]):
"""Define index_names for dataframes, candidate fluorescence channels, and composite statistics."""
"""Define index_names for dataframes, candidate fluorescence channels,
and composite statistics."""
super().__init__(file, flag=None)
self.index_names = (
"experiment",
......@@ -46,9 +49,9 @@ class Signal(BridgeH5):
def __getitem__(self, dsets: t.Union[str, t.Collection]):
"""Get and potentially pre-process data from h5 file and return as a dataframe."""
if isinstance(dsets, str): # no pre-processing
if isinstance(dsets, str):
return self.get(dsets)
elif isinstance(dsets, list): # pre-processing
elif isinstance(dsets, list):
is_bgd = [dset.endswith("imBackground") for dset in dsets]
# Check we are not comparing tile-indexed and cell-indexed data
assert sum(is_bgd) == 0 or sum(is_bgd) == len(
......@@ -58,22 +61,23 @@ class Signal(BridgeH5):
else:
raise Exception(f"Invalid type {type(dsets)} to get datasets")
def get(self, dsets: t.Union[str, t.Collection], **kwargs):
"""Get and potentially pre-process data from h5 file and return as a dataframe."""
if isinstance(dsets, str): # no pre-processing
df = self.get_raw(dsets, **kwargs)
def get(self, dset_name: t.Union[str, t.Collection], **kwargs):
"""Return pre-processed data as a dataframe."""
if isinstance(dset_name, str):
dsets = self.get_raw(dset_name, **kwargs)
prepost_applied = self.apply_prepost(dsets, **kwargs)
return self.add_name(prepost_applied, dsets)
return self.add_name(prepost_applied, dset_name)
else:
raise Exception("Error in Signal.get")
@staticmethod
def add_name(df, name):
"""Add column of identical strings to a dataframe."""
"""Add name of the Signal as an attribute to its corresponding dataframe."""
df.name = name
return df
def cols_in_mins(self, df: pd.DataFrame):
# Convert numerical columns in a dataframe to minutes
"""Convert numerical columns in a dataframe to minutes."""
try:
df.columns = (df.columns * self.tinterval // 60).astype(int)
except Exception as e:
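To illustrate typical use of Signal (the file name is hypothetical; the dataset path is one of aliby's standard extraction locations):

s = Signal("experiment.h5")
raw = s.get_raw("extraction/general/None/volume")   # unprocessed dataframe
df = s["extraction/general/None/volume"]            # with recorded merges and picks applied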
......@@ -94,14 +98,15 @@ class Signal(BridgeH5):
if tinterval_location in f.attrs:
return f.attrs[tinterval_location][0]
else:
logging.getlogger("aliby").warn(
logging.getLogger("aliby").warn(
f"{str(self.filename).split('/')[-1]}: using default time interval of 5 minutes"
)
return 5
@staticmethod
def get_retained(df, cutoff):
"""Return a fraction of the df, one without later time points."""
"""Return rows of df with at least cutoff fraction of the total number
of time points."""
return df.loc[bn.nansum(df.notna(), axis=1) > df.shape[1] * cutoff]
@property
......@@ -110,15 +115,15 @@ class Signal(BridgeH5):
with h5py.File(self.filename, "r") as f:
return list(f.attrs["channels"])
@_first_arg_str_to_df
def retained(self, signal, cutoff=0.8):
"""
Load data (via decorator) and reduce the resulting dataframe.
Load data for a signal or a list of signals and reduce the resulting
dataframes to a fraction of their original size, losing late time
points.
dataframes to rows with sufficient numbers of time points.
"""
if isinstance(signal, str):
signal = self.get_raw(signal)
if isinstance(signal, pd.DataFrame):
return self.get_retained(signal, cutoff)
elif isinstance(signal, list):
......@@ -131,17 +136,15 @@ class Signal(BridgeH5):
"""
Get lineage data from a given location in the h5 file.
Returns an array with three columns: the tile id, the mother label, and the daughter label.
Returns an array with three columns: the tile id, the mother label,
and the daughter label.
"""
if lineage_location is None:
lineage_location = "modifiers/lineage_merged"
with h5py.File(self.filename, "r") as f:
# if lineage_location not in f:
# lineage_location = lineage_location.split("_")[0]
if lineage_location not in f:
lineage_location = "postprocessing/lineage"
tile_mo_da = f[lineage_location]
if isinstance(tile_mo_da, h5py.Dataset):
lineage = tile_mo_da[()]
else:
......@@ -154,7 +157,7 @@ class Signal(BridgeH5):
).T
return lineage
@_first_arg_str_to_df
@_first_arg_str_to_raw_df
def apply_prepost(
self,
data: t.Union[str, pd.DataFrame],
......@@ -272,7 +275,7 @@ class Signal(BridgeH5):
Parameters
----------
dataset: str or list of strs
The name of the h5 file or a list of h5 file names
The name of the h5 file or a list of h5 file names.
in_minutes: boolean
If True,
lineage: boolean
......@@ -288,15 +291,17 @@ class Signal(BridgeH5):
self.get_raw(dset, in_minutes=in_minutes, lineage=lineage)
for dset in dataset
]
if lineage: # assume that df is sorted
if lineage:
# assume that df is sorted
mother_label = np.zeros(len(df), dtype=int)
lineage = self.lineage()
a, b = validate_association(
valid_association, valid_indices = validate_association(
lineage,
np.array(df.index.to_list()),
#: are mothers not match_column=0?
match_column=1,
)
mother_label[b] = lineage[a, 1]
mother_label[valid_indices] = lineage[valid_association, 1]
df = add_index_levels(df, {"mother_label": mother_label})
return df
except Exception as e:
......@@ -353,10 +358,7 @@ class Signal(BridgeH5):
fullname: str,
node: t.Union[h5py.Dataset, h5py.Group],
):
"""
Store the name of a signal if it is a leaf node
(a group with no more groups inside) and if it starts with extraction.
"""
"""Store the name of a signal if it is a leaf node and if it starts with extraction."""
if isinstance(node, h5py.Group) and np.all(
[isinstance(x, h5py.Dataset) for x in node.values()]
):
......
#!/usr/bin/env jupyter
"""
Utilities based on association are used to efficiently acquire indices of
tracklets with some kind of relationship.
This can be:
- Cells that are to be merged.
- Cells that have a lineage relationship.
"""
import numpy as np
import typing as t
def validate_association_new(
association: np.ndarray,
indices: np.ndarray,
match_column: t.Optional[int] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
"""
Identify matches between two arrays by comparing rows.
We match the mother-bud pairs in the lineage data against all identified cells to keep only those cells that are part of mother-bud pairs.
We use broadcasting for speed.
Both a mother and bud in association must be in indices.
Parameters
----------
association : np.ndarray
2D array of lineage associations where columns are (trap, mother, daughter)
or
a 3D array, which is an array of 2 X 2 arrays comprising [[trap_id, mother_label], [trap_id, daughter_label]].
indices : np.ndarray
A 2D array where each column is a different level, such as (trap_id, cell_label), which typically is an index of a Signal
dataframe. This array should not include mother_label.
match_column: int
If 0, matches indicate mothers from mother-bud pairs;
If 1, matches indicate daughters from mother-bud pairs;
If None, matches indicate either mothers or daughters in mother-bud pairs.
Returns
-------
valid_association: boolean np.ndarray
1D array indicating elements in association with matches.
valid_indices: boolean np.ndarray
1D array indicating elements in indices with matches.
Examples
--------
>>> import numpy as np
>>> from agora.utils.indexing import validate_association
>>> association = np.array([ [[0, 1], [0, 3]], [[0, 1], [0, 4]], [[0, 1], [0, 6]], [[0, 4], [0, 7]] ])
>>> indices = np.array([ [0, 1], [0, 2], [0, 3]])
>>> print(indices.T)
>>> valid_association, valid_indices = validate_association(association, indices)
>>> print(valid_association)
array([ True, False, False, False])
>>> print(valid_indices)
array([ True, False, True])
and
>>> association = np.array([[[0,3], [0,1]], [[0,2], [0,4]]])
>>> indices = np.array([[0,1], [0,2], [0,3]])
>>> valid_association, valid_indices = validate_association(association, indices)
>>> print(valid_association)
array([ True, False])
>>> print(valid_indices)
array([ True, False, True])
"""
if association.ndim == 2:
# reshape into 3D array for broadcasting
# for each trap, [trap, mother, daughter] becomes
# [[trap, mother], [trap, daughter]]
association = _assoc_indices_to_3d(association)
# use broadcasting to compare association with indices
# swap trap and cell_label axes for correct broadcasting
indicesT = indices.T
# compare each of [[trap, mother], [trap, daughter]] for all traps
# in association with [trap, cell_label] for all traps in indices
valid_ndassociation = (
association[..., np.newaxis] == indicesT[np.newaxis, ...]
)
# find matches in association
###
# make True comparisons have both trap_ids and cell labels matching
valid_cell_ids = valid_ndassociation.all(axis=2)
if match_column is None:
# make True comparisons match at least one row in indices
va_intermediate = valid_cell_ids.any(axis=2)
# make True comparisons have both mother and bud matching rows in indices
valid_association = va_intermediate.all(axis=1)
else:
# match_column selects mothers if 0 and daughters if 1
# make True match at least one row in indices
valid_association = valid_cell_ids[:, match_column].any(axis=1)
# find matches in indices
###
# make True comparisons have a validated association for both the mother and bud
# make True comparisons have both trap_ids and cell labels matching
valid_cell_ids_va = valid_ndassociation[valid_association].all(axis=2)
if match_column is None:
# make True comparisons match either a mother or a bud in association
valid_indices = valid_cell_ids_va.any(axis=1)[0]
else:
valid_indices = valid_cell_ids_va[:, match_column][0]
# Alan's working code
# Compare existing association with available indices
# Swap trap and label axes for the association array to correctly cast
valid_ndassociation_a = association[..., None] == indices.T[None, ...]
# Broadcasting is confusing (but efficient):
# First we check the dimension across trap and cell id, to ensure both match
valid_cell_ids_a = valid_ndassociation_a.all(axis=2)
if match_column is None:
# Then we check the merge tuples to check which cases have both target and source
valid_association_a = valid_cell_ids_a.any(axis=2).all(axis=1)
# Finally we check the dimension that crosses all indices, to ensure the pair
# is present in a valid merge event.
valid_indices_a = (
valid_ndassociation_a[valid_association_a]
.all(axis=2)
.any(axis=(0, 1))
)
else: # We fetch specific indices if we aim for the ones with one present
valid_indices_a = valid_cell_ids_a[:, match_column].any(axis=0)
# Valid association then becomes a boolean array, true means that there is a
# match (match_column) between that cell and the index
valid_association_a = (
valid_cell_ids_a[:, match_column] & valid_indices
).any(axis=1)
# check that the new implementation agrees with the original one
assert np.array_equal(valid_association, valid_association_a), "valid_association error"
assert np.array_equal(valid_indices, valid_indices_a), "valid_indices error"
return valid_association, valid_indices
def _assoc_indices_to_3d(ndarray: np.ndarray):
"""
Reorganise an array of shape (N, 3) into one of shape (N, 2, 2).
Reorganise an array so that the last entry of each row is removed
and generates a new row. This new row retains all other entries of
the original row.
Example:
[ [0, 1, 3], [0, 1, 4] ]
becomes
[ [[0, 1], [0, 3]], [[0, 1], [0, 4]] ]
"""
result = ndarray
if len(ndarray) and ndarray.ndim > 1:
if ndarray.shape[1] == 3:
# faster indexing for single positions
result = np.transpose(
np.hstack((ndarray[:, [0]], ndarray)).reshape(-1, 2, 2),
axes=[0, 2, 1],
)
else:
# 20% slower, but more general indexing
columns = np.arange(ndarray.shape[1])
result = np.stack(
(
ndarray[:, np.delete(columns, -1)],
ndarray[:, np.delete(columns, -2)],
),
axis=1,
)
return result
def _3d_index_to_2d(array: np.ndarray):
"""Revert switch from _assoc_indices_to_3d."""
result = array
if len(array):
result = np.concatenate(
(array[:, 0, :], array[:, 1, 1, np.newaxis]), axis=1
)
return result
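A small round-trip example for these two helpers:

arr = np.array([[0, 1, 3], [0, 1, 4]])
arr3d = _assoc_indices_to_3d(arr)
# array([[[0, 1], [0, 3]],
#        [[0, 1], [0, 4]]])
back = _3d_index_to_2d(arr3d)   # recovers the original (N, 3) array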
def compare_indices(x: np.ndarray, y: np.ndarray) -> np.ndarray:
"""
Compare two 2D arrays using broadcasting.
Return a binary array where a True value links two cells where
all cells are the same.
"""
return (x[..., np.newaxis] == y.T[np.newaxis, ...]).all(axis=1)
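For example:

x = np.array([[0, 1], [0, 2]])
y = np.array([[0, 2], [1, 1]])
compare_indices(x, y)
# array([[False, False],
#        [ True, False]])
# entry [i, j] is True when row i of x equals row j of y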
......@@ -86,16 +86,19 @@ def bidirectional_retainment_filter(
daughters_thresh: int = 7,
) -> pd.DataFrame:
"""
Retrieve families where mothers are present for more than a fraction of the experiment, and daughters for longer than some number of time-points.
Retrieve families where mothers are present for more than a fraction
of the experiment and daughters for longer than some number of
time-points.
Parameters
----------
df: pd.DataFrame
Data
mothers_thresh: float
Minimum fraction of experiment's total duration for which mothers must be present.
Minimum fraction of experiment's total duration for which mothers
must be present.
daughters_thresh: int
Minimum number of time points for which daughters must be observed
Minimum number of time points for which daughters must be observed.
"""
# daughters
all_daughters = df.loc[df.index.get_level_values("mother_label") > 0]
......@@ -170,6 +173,7 @@ def slices_from_spans(spans: t.Tuple[int], df: pd.DataFrame) -> t.List[slice]:
def drop_mother_label(index: pd.MultiIndex) -> np.ndarray:
"""Remove mother_label level from a MultiIndex."""
no_mother_label = index
if "mother_label" in index.names:
no_mother_label = index.droplevel("mother_label")
......
#!/usr/bin/env python3
import re
import typing as t
import numpy as np
import pandas as pd
from agora.io.bridge import groupsort
from itertools import groupby
def mb_array_to_dict(mb_array: np.ndarray):
......@@ -19,4 +15,3 @@ def mb_array_to_dict(mb_array: np.ndarray):
for trap, mo_da in groupsort(mb_array).items()
for mo, daughters in groupsort(mo_da).items()
}
......@@ -13,8 +13,11 @@ from agora.utils.indexing import compare_indices, validate_association
def apply_merges(data: pd.DataFrame, merges: np.ndarray):
"""Split data in two, one subset for rows relevant for merging and one
without them. It uses an array of source tracklets and target tracklets
"""
Split data in two, one subset for rows relevant for merging and one
without them.
Use an array of source tracklets and target tracklets
to efficiently merge them.
Parameters
......@@ -43,7 +46,7 @@ def apply_merges(data: pd.DataFrame, merges: np.ndarray):
# Implement the merges and drop source rows.
# TODO Use matrices to perform merges in batch
# for ecficiency
# for efficiency
if valid_merges.any():
to_merge = data.loc[indices]
targets, sources = zip(*merges[valid_merges])
......
......@@ -54,7 +54,7 @@ class DatasetLocalABC(ABC):
Abstract Base class to find local files, either OME-XML or raw images.
"""
_valid_suffixes = ("tiff", "png", "zarr")
_valid_suffixes = ("tiff", "png", "zarr", "tif")
_valid_meta_suffixes = ("txt", "log")
def __init__(self, dpath: t.Union[str, Path], *args, **kwargs):
......
......@@ -30,14 +30,14 @@ from agora.io.metadata import dir_to_meta, dispatch_metadata_parser
def get_examples_dir():
"""Get examples directory which stores dummy image for tiler"""
"""Get examples directory that stores dummy image for tiler."""
return files("aliby").parent.parent / "examples" / "tiler"
def instantiate_image(
source: t.Union[str, int, t.Dict[str, str], Path], **kwargs
):
"""Wrapper to instatiate the appropiate image
"""Wrapper to instantiate the appropriate image
Parameters
----------
......@@ -55,26 +55,26 @@ def instantiate_image(
def dispatch_image(source: t.Union[str, int, t.Dict[str, str], Path]):
"""
Wrapper to pick the appropiate Image class depending on the source of data.
"""
"""Pick the appropriate Image class depending on the source of data."""
if isinstance(source, (int, np.int64)):
from aliby.io.omero import Image
instatiator = Image
instantiator = Image
elif isinstance(source, dict) or (
isinstance(source, (str, Path)) and Path(source).is_dir()
):
if Path(source).suffix == ".zarr":
instatiator = ImageZarr
instantiator = ImageZarr
else:
instatiator = ImageDir
instantiator = ImageDir
elif isinstance(source, Path) and source.is_file():
# my addition
instantiator = ImageLocalOME
elif isinstance(source, str) and Path(source).is_file():
instatiator = ImageLocalOME
instantiator = ImageLocalOME
else:
raise Exception(f"Invalid data source at {source}")
return instatiator
return instantiator
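A sketch of how dispatch_image might be used; the path is hypothetical and assumed to exist so that the zarr directory is detected:

ImageClass = dispatch_image("/data/position_001.zarr")   # -> ImageZarr
with ImageClass("/data/position_001.zarr") as image:
    data = image.get_data_lazy()   # 5D dask array (default order is tczyx)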
class BaseLocalImage(ABC):
......@@ -82,6 +82,7 @@ class BaseLocalImage(ABC):
Base Image class to set path and provide context management method.
"""
# default image order
_default_dimorder = "tczyx"
def __init__(self, path: t.Union[str, Path]):
......@@ -98,8 +99,7 @@ class BaseLocalImage(ABC):
return False
def rechunk_data(self, img):
# Format image using x and y size from metadata.
"""Format image using x and y size from metadata."""
self._rechunked_img = da.rechunk(
img,
chunks=(
......@@ -145,16 +145,16 @@ class ImageLocalOME(BaseLocalImage):
in which a multidimensional tiff image contains the metadata.
"""
def __init__(self, path: str, dimorder=None):
def __init__(self, path: str, dimorder=None, **kwargs):
super().__init__(path)
self._id = str(path)
self.set_meta(str(path))
def set_meta(self):
def set_meta(self, path):
meta = dict()
try:
with TiffFile(path) as f:
self._meta = xmltodict.parse(f.ome_metadata)["OME"]
for dim in self.dimorder:
meta["size_" + dim.lower()] = int(
self._meta["Image"]["Pixels"]["@Size" + dim]
......@@ -165,21 +165,19 @@ class ImageLocalOME(BaseLocalImage):
]
meta["name"] = self._meta["Image"]["@Name"]
meta["type"] = self._meta["Image"]["Pixels"]["@Type"]
except Exception as e: # Images not in OMEXML
except Exception as e:
# images not in OMEXML
print("Warning:Metadata not found: {}".format(e))
print(
f"Warning: No dimensional info provided. Assuming {self._default_dimorder}"
"Warning: No dimensional info provided. "
f"Assuming {self._default_dimorder}"
)
# Mark non-existent dimensions for padding
# mark non-existent dimensions for padding
self.base = self._default_dimorder
# self.ids = [self.index(i) for i in dimorder]
self._dimorder = base
self._dimorder = self.base
self._meta = meta
# self._meta["name"] = Path(path).name.split(".")[0]
@property
def name(self):
......@@ -246,7 +244,7 @@ class ImageDir(BaseLocalImage):
It inherits from BaseLocalImage so we only override methods that are critical.
Assumptions:
- One folders per position.
- One folder per position.
- Images are flat.
- Channel, Time, z-stack and the others are determined by filenames.
- Provides Dimorder as it is set in the filenames, or expects order during instantiation
......@@ -318,7 +316,7 @@ class ImageZarr(BaseLocalImage):
print(f"Could not add size info to metadata: {e}")
def get_data_lazy(self) -> da.Array:
"""Return 5D dask array. For lazy-loading local multidimensional zarr files"""
"""Return 5D dask array for lazy-loading local multidimensional zarr files."""
return self._img
def add_size_to_meta(self):
......
......@@ -154,6 +154,7 @@ class PipelineParameters(ParametersABC):
defaults["tiler"]["backup_ref_channel"] = backup_ref_channel
defaults["baby"] = BabyParameters.default(**baby).to_dict()
# why are BabyParameters here as an alternative?
defaults["extraction"] = (
exparams_from_meta(meta_d)
or BabyParameters.default(**extraction).to_dict()
......@@ -320,7 +321,7 @@ class Pipeline(ProcessABC):
)
# get log files, either locally or via OMERO
with dispatcher as conn:
image_ids = conn.get_images()
position_ids = conn.get_images()
directory = self.store or root_dir / conn.unique_name
if not directory.exists():
directory.mkdir(parents=True)
......@@ -330,29 +331,29 @@ class Pipeline(ProcessABC):
self.parameters.general["directory"] = str(directory)
config["general"]["directory"] = directory
self.setLogger(directory)
# pick particular images if desired
# pick particular positions if desired
if pos_filter is not None:
if isinstance(pos_filter, list):
image_ids = {
position_ids = {
k: v
for filt in pos_filter
for k, v in self.apply_filter(image_ids, filt).items()
for k, v in self.apply_filter(position_ids, filt).items()
}
else:
image_ids = self.apply_filter(image_ids, pos_filter)
assert len(image_ids), "No images to segment"
position_ids = self.apply_filter(position_ids, pos_filter)
assert len(position_ids), "No images to segment"
# create pipelines
if distributed != 0:
# multiple cores
with Pool(distributed) as p:
results = p.map(
lambda x: self.run_one_position(*x),
[(k, i) for i, k in enumerate(image_ids.items())],
[(k, i) for i, k in enumerate(position_ids.items())],
)
else:
# single core
results = []
for k, v in tqdm(image_ids.items()):
for k, v in tqdm(position_ids.items()):
r = self.run_one_position((k, v), 1)
results.append(r)
return results
......@@ -432,6 +433,7 @@ class Pipeline(ProcessABC):
if process_from["extraction"] < tps:
# TODO Move this parameter validation into Extractor
av_channels = set((*steps["tiler"].channels, "general"))
# overwrite extraction specified by PipelineParameters !!
config["extraction"]["tree"] = {
k: v
for k, v in config["extraction"]["tree"].items()
......@@ -453,13 +455,14 @@ class Pipeline(ProcessABC):
steps["extraction"] = Extractor.from_tiler(
exparams, store=filename, tiler=steps["tiler"]
)
# set up progress meter
# set up progress bar
pbar = tqdm(
range(min_process_from, tps),
desc=image.name,
initial=min_process_from,
total=tps,
)
# run through time points
for i in pbar:
if (
frac_clogged_traps
......@@ -469,9 +472,12 @@ class Pipeline(ProcessABC):
# run through steps
for step in self.pipeline_steps:
if i >= process_from[step]:
# perform step
result = steps[step].run_tp(
i, **run_kwargs.get(step, {})
)
# write to h5 file using writers
# extractor writes to h5 itself
if step in loaded_writers:
loaded_writers[step].write(
data=result,
......@@ -481,7 +487,7 @@ class Pipeline(ProcessABC):
tp=i,
meta={"last_processed": i},
)
# perform step
# clean up
if (
step == "tiler"
and i == min_process_from
......@@ -501,7 +507,7 @@ class Pipeline(ProcessABC):
tp=i,
)
elif step == "extraction":
# remove mask/label after extraction
# remove masks and labels after extraction
for k in ["masks", "labels"]:
run_kwargs[step][k] = None
# check and report clogging
......@@ -586,41 +592,6 @@ class Pipeline(ProcessABC):
)
return (traps_above_nthresh & traps_above_athresh).mean()
# FIXME: Remove this functionality. It used to be for
# older hdf5 file formats.
def _load_config_from_file(
self,
filename: Path,
process_from: t.Dict[str, int],
trackers_state: t.List,
overwrite: t.Dict[str, bool],
):
with h5py.File(filename, "r") as f:
for k in process_from.keys():
if not overwrite[k]:
process_from[k] = self.legacy_get_last_tp[k](f)
process_from[k] += 1
return process_from, trackers_state, overwrite
# FIXME: Remove this functionality. It used to be for
# older hdf5 file formats.
@staticmethod
def legacy_get_last_tp(step: str) -> t.Callable:
"""Get last time-point in different ways depending
on which step we are using
To support segmentation in aliby < v0.24
TODO Deprecate and replace with State method
"""
switch_case = {
"tiler": lambda f: f["trap_info/drifts"].shape[0] - 1,
"baby": lambda f: f["cell_info/timepoint"][-1],
"extraction": lambda f: f[
"extraction/general/None/area/timepoint"
][-1],
}
return switch_case[step]
def _setup_pipeline(
self, image_id: int
) -> t.Tuple[
......@@ -676,13 +647,12 @@ class Pipeline(ProcessABC):
step: self.step_sequence.index(ow_id) < i
for i, step in enumerate(self.step_sequence, 1)
}
# Set up
# set up
directory = config["general"]["directory"]
trackers_state: t.List[np.ndarray] = []
with dispatch_image(image_id)(image_id, **self.server_info) as image:
filename = Path(f"{directory}/{image.name}.h5")
# load metadata
meta = MetaData(directory, filename)
from_start = True if np.any(ow.values()) else False
# remove existing file if overwriting
......@@ -716,7 +686,7 @@ class Pipeline(ProcessABC):
)
config["tiler"] = steps["tiler"].parameters.to_dict()
except Exception:
self._log(f"Overwriting tiling data")
self._log("Overwriting tiling data")
if config["general"]["use_explog"]:
meta.run()
......
"""
Tiler: Divides images into smaller tiles.
The tasks of the Tiler are selecting regions of interest, or tiles, of images - with one trap per tile, correcting for the drift of the microscope stage over time, and handling errors and bridging between the image data and Aliby’s image-processing steps.
The tasks of the Tiler are selecting regions of interest, or tiles, of
images - with one trap per tile, correcting for the drift of the microscope
stage over time, and handling errors and bridging between the image data
and Aliby’s image-processing steps.
Tiler subclasses deal with either network connections or local files.
To find tiles, we use a two-step process: we analyse the bright-field image to produce the template of a trap, and we fit this template to the image to find the tiles' centres.
To find tiles, we use a two-step process: we analyse the bright-field image
to produce the template of a trap, and we fit this template to the image to
find the tiles' centres.
We use texture-based segmentation (entropy) to split the image into foreground -- cells and traps -- and background, which we then identify with an Otsu filter. Two methods are used to produce a template trap from these regions: pick the trap with the smallest minor axis length and average over all validated traps.
We use texture-based segmentation (entropy) to split the image into
foreground -- cells and traps -- and background, which we then identify with
an Otsu filter. Two methods are used to produce a template trap from these
regions: pick the trap with the smallest minor axis length and average over
all validated traps.
A peak-identifying algorithm recovers the x and y-axis location of traps in the original image, and we choose the approach to template that identifies the most tiles.
A peak-identifying algorithm recovers the x and y-axis location of traps in
the original image, and we choose the approach to template that identifies
the most tiles.
The experiment is stored as an array with a standard indexing order of (Time, Channels, Z-stack, X, Y).
The experiment is stored as an array with a standard indexing order of
(Time, Channels, Z-stack, X, Y).
"""
import logging
import re
......@@ -355,9 +367,9 @@ class Tiler(StepABC):
full: an array of images
"""
full = self.image[t, c]
if hasattr(full, "compute"): # If using dask fetch images here
if hasattr(full, "compute"):
# if using dask fetch images
full = full.compute(scheduler="synchronous")
return full
@property
......@@ -593,7 +605,10 @@ class Tiler(StepABC):
def get_channel_index(self, channel: str or int) -> int or None:
"""
Find index for channel using regex. Returns the first matched string.
Find index for channel using regex.
Return the first matched string.
If self.channels is integers (no image metadata) it returns None.
If channel is integer
......@@ -602,10 +617,8 @@ class Tiler(StepABC):
channel: string or int
The channel or index to be used.
"""
if all(map(lambda x: isinstance(x, int), self.channels)):
channel = channel if isinstance(channel, int) else None
if isinstance(channel, str):
channel = find_channel_index(self.channels, channel)
return channel
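Illustratively, for a Tiler instance (here called tiler) whose channels come from the image metadata:

i = tiler.get_channel_index("GFP")   # index of the first channel matching "GFP"
j = tiler.get_channel_index(2)       # integers are returned unchanged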
......
......@@ -80,7 +80,7 @@ class Extractor(StepABC):
Usually the metric is applied to only a tile's masked area, but some metrics depend on the whole tile.
Extraction follows a three-level tree structure. Channels, such as GFP, are the root level; the reduction algorithm, such as maximum projection, is the second level; the specific metric, or operation, to apply to the masks, such as mean, is the third level.
Extraction follows a three-level tree structure. Channels, such as GFP, are the root level; the reduction algorithm, such as maximum projection, is the second level; the specific metric, or operation, to apply to the masks, such as mean, is the third or leaf level.
"""
# TODO Alan: Move this to a location with the SwainLab defaults
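For reference, an extraction tree of this form (channel -> reduction -> metrics) might look like the sketch below; the exact channel, reduction, and metric names depend on the experiment and on the registered functions:

tree = {
    "general": {"None": ["area", "volume"]},
    "GFP": {"max": ["mean", "median"]},
}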
......@@ -202,7 +202,7 @@ class Extractor(StepABC):
self._custom_funs[k] = tmp(f)
def load_funs(self):
"""Define all functions, including custum ones."""
"""Define all functions, including custom ones."""
self.load_custom_funs()
self._all_cell_funs = set(self._custom_funs.keys()).union(CELL_FUNS)
# merge the two dicts
......@@ -335,7 +335,7 @@ class Extractor(StepABC):
**kwargs,
) -> t.Dict[str, t.Dict[reduction_method, t.Dict[str, pd.Series]]]:
"""
Wrapper to apply reduction and then extraction.
Wrapper to reduce to a 2D image and then extract.
Parameters
----------
......@@ -499,7 +499,6 @@ class Extractor(StepABC):
# calculate metrics with subtracted bg
ch_bs = ch + "_bgsub"
# subtract median background
self.img_bgsub[ch_bs] = np.moveaxis(
np.stack(
list(
......@@ -579,7 +578,9 @@ class Extractor(StepABC):
**kwargs,
) -> dict:
"""
Wrapper to add compatibility with other steps of the pipeline.
Run extraction for one position and for the specified time points.
Save the results to a h5 file.
Parameters
----------
......@@ -597,7 +598,7 @@ class Extractor(StepABC):
Returns
-------
d: dict
A dict of the extracted data with a concatenated string of channel, reduction metric, and cell metric as keys and pd.Series of the extracted data as values.
A dict of the extracted data for one position with a concatenated string of channel, reduction metric, and cell metric as keys and pd.DataFrame of the extracted data for all time points as values.
"""
if tree is None:
tree = self.params.tree
......@@ -633,7 +634,7 @@ class Extractor(StepABC):
def save_to_hdf(self, dict_series, path=None):
"""
Save the extracted data to the h5 file.
Save the extracted data for one position to the h5 file.
Parameters
----------
......
# File with defaults for ease of use
import re
import typing as t
from pathlib import Path
import h5py
# should we move these functions here?
from aliby.tile.tiler import find_channel_name
......@@ -59,6 +58,7 @@ def exparams_from_meta(
for ch in extant_fluorescence_ch:
base["tree"][ch] = default_reduction_metrics
base["sub_bg"] = extant_fluorescence_ch
# additional extraction defaults if the channels are available
if "ph" in extras:
# SWAINLAB specific names
......
......@@ -14,9 +14,10 @@ from postprocessor.core.abc import get_process
class Chainer(Signal):
"""
Extend Signal by applying post-processes and allowing composite signals that combine basic signals.
It "chains" multiple processes upon fetching a dataset to produce the desired datasets.
Instead of reading processes previously applied, it executes
Chainer "chains" multiple processes upon fetching a dataset.
Instead of reading processes previously applied, Chainer executes
them when called.
"""
......@@ -25,6 +26,7 @@ class Chainer(Signal):
}
def __init__(self, *args, **kwargs):
"""Initialise chainer."""
super().__init__(*args, **kwargs)
def replace_path(path: str, bgsub: bool = ""):
......@@ -34,7 +36,7 @@ class Chainer(Signal):
path = re.sub(channel, f"{channel}{suffix}", path)
return path
# Add chain with and without bgsub for composite statistics
# add chain with and without bgsub for composite statistics
self.common_chains = {
alias
+ bgsub: lambda **kwargs: self.get(
......
......@@ -12,19 +12,20 @@ from postprocessor.core.abc import PostProcessABC
class LineageProcessParameters(ParametersABC):
"""
Parameters
"""
"""Parameters - none are necessary."""
_defaults = {}
class LineageProcess(PostProcessABC):
"""
Lineage process that must be passed a (N,3) lineage matrix (where the columns are trap, mother, daughter respectively)
To analyse lineage data.
Currently bare bones, but extracts lineage information from a Signal or Cells object.
"""
def __init__(self, parameters: LineageProcessParameters):
"""Initialise using PostProcessABC."""
super().__init__(parameters)
@abstractmethod
......@@ -34,6 +35,7 @@ class LineageProcess(PostProcessABC):
lineage: np.ndarray,
*args,
):
"""Implement method required by PostProcessABC - undefined."""
pass
@classmethod
......@@ -45,8 +47,9 @@ class LineageProcess(PostProcessABC):
**kwargs,
):
"""
Overrides PostProcess.as_function classmethod.
Lineage functions require lineage information to be passed if run as function.
Override the PostProcessABC.as_function classmethod.
Lineage functions require lineage information to be passed if run as functions.
"""
parameters = cls.default_parameters(**kwargs)
return cls(parameters=parameters).run(
......@@ -54,8 +57,9 @@ class LineageProcess(PostProcessABC):
)
def get_lineage_information(self, signal=None, merged=True):
"""Get lineage as an array with tile IDs, mother labels, and corresponding bud labels."""
if signal is not None and "mother_label" in signal.index.names:
# from kymograph
lineage = get_index_as_np(signal)
elif hasattr(self, "lineage"):
lineage = self.lineage
......@@ -68,5 +72,5 @@ class LineageProcess(PostProcessABC):
elif self.cells is not None:
lineage = self.cells.mothers_daughters
else:
raise Exception("No linage information found")
raise Exception("No lineage information found")
return lineage
# change "prepost" to "preprocess"; change filename to postprocessor_engine.py ??
import typing as t
from itertools import takewhile
......@@ -61,36 +62,24 @@ class PostProcessorParameters(ParametersABC):
kind: list of str
If "ph_batman" included, add targets for experiments using pHlourin.
"""
# each subitem specifies the function to be called and the location
# on the h5 file to be written
# each subitem specifies the function to be called
# and the h5-file location for the results
#: why does merger have a string and picker a list?
targets = {
"prepost": {
"merger": "/extraction/general/None/area",
"picker": ["/extraction/general/None/area"],
},
"processes": [
[
"buddings",
["/extraction/general/None/volume"],
],
[
"dsignal",
[
"/extraction/general/None/volume",
],
],
[
"bud_metric",
[
"/extraction/general/None/volume",
],
],
[
"dsignal",
[
"/postprocessing/bud_metric/extraction_general_None_volume",
],
],
["buddings", ["/extraction/general/None/volume"]],
# ["dsignal", ["/extraction/general/None/volume"]],
["bud_metric", ["/extraction/general/None/volume"]],
# [
# "dsignal",
# [
# "/postprocessing/bud_metric/extraction_general_None_volume"
# ],
# ],
],
}
param_sets = {
......@@ -129,7 +118,7 @@ class PostProcessorParameters(ParametersABC):
class PostProcessor(ProcessABC):
def __init__(self, filename, parameters):
"""
Initialise PostProcessor
Initialise PostProcessor.
Parameters
----------
......@@ -150,7 +139,7 @@ class PostProcessor(ProcessABC):
for k in dicted_params.keys():
if not isinstance(dicted_params[k], dict):
dicted_params[k] = dicted_params[k].to_dict()
# merger and picker
# initialise merger and picker
self.merger = Merger(
MergerParameters.from_dict(dicted_params["merger"])
)
......@@ -158,12 +147,12 @@ class PostProcessor(ProcessABC):
PickerParameters.from_dict(dicted_params["picker"]),
cells=Cells.from_source(filename),
)
# processes, such as buddings
# get processes, such as buddings
self.classfun = {
process: get_process(process)
for process, _ in parameters["targets"]["processes"]
}
# parameters for the process in classfun
# get parameters for the processes in classfun
self.parameters_classfun = {
process: get_parameters(process)
for process, _ in parameters["targets"]["processes"]
......@@ -172,31 +161,32 @@ class PostProcessor(ProcessABC):
self.targets = parameters["targets"]
def run_prepost(self):
"""Using picker, get and write lineages, returning mothers and daughters."""
"""Important processes run before normal post-processing ones"""
"""
Run picker and merger and get lineages.
Necessary before any processes can run.
"""
# run merger
record = self._signal.get_raw(self.targets["prepost"]["merger"])
merges = np.array(self.merger.run(record), dtype=int)
self._writer.write(
"modifiers/merges", data=[np.array(x) for x in merges]
)
# get lineages from picker
lineage = _assoc_indices_to_3d(self.picker.cells.mothers_daughters)
lineage_merged = []
if merges.any(): # Update lineages after merge events
if merges.any():
# update lineages after merge events
merged_indices = merge_association(lineage, merges)
# Remove repeated labels post-merging
# remove repeated labels post-merging
lineage_merged = np.unique(merged_indices, axis=0)
self.lineage = _3d_index_to_2d(
lineage_merged if len(lineage_merged) else lineage
)
self._writer.write(
"modifiers/lineage_merged", _3d_index_to_2d(lineage_merged)
)
# run picker
picked_indices = self.picker.run(
self._signal[self.targets["prepost"]["picker"][0]]
)
......@@ -211,25 +201,15 @@ class PostProcessor(ProcessABC):
overwrite="overwrite",
)
@staticmethod
def pick_mother(a, b):
"""Update the mother id following this priorities:
The mother has a lower id
"""
x = max(a, b)
if min([a, b]):
x = [a, b][np.argmin([a, b])]
return x
def run(self):
"""
Write the results to the h5 file.
Processes include identifying buddings and finding bud metrics.
"""
# run merger, picker, and find lineages
self.run_prepost()
# run processes
# run processes: process is a str; datasets is a list of str
for process, datasets in tqdm(self.targets["processes"]):
if process in self.parameters["param_sets"].get("processes", {}):
# parameters already assigned
......@@ -243,16 +223,14 @@ class PostProcessor(ProcessABC):
loaded_process = self.classfun[process](parameters)
if isinstance(parameters, LineageProcessParameters):
loaded_process.lineage = self.lineage
# apply process to each dataset
for dataset in datasets:
self.run_process(dataset, process, loaded_process)
def run_process(self, dataset, process, loaded_process):
"""Run process on a single dataset and write the result."""
# define signal
"""Run process to obtain a single dataset and write the result."""
# get pre-processed data
if isinstance(dataset, list):
# multisignal process
signal = [self._signal[d] for d in dataset]
elif isinstance(dataset, str):
signal = self._signal[dataset]
......@@ -269,8 +247,9 @@ class PostProcessor(ProcessABC):
[], columns=signal.columns, index=signal.index
)
result.columns.names = ["timepoint"]
# define outpath, where result will be written
# use outpath to write result
if process in self.parameters["outpaths"]:
# outpath already defined
outpath = self.parameters["outpaths"][process]
elif isinstance(dataset, list):
# no outpath is defined
......@@ -318,3 +297,15 @@ class PostProcessor(ProcessABC):
metadata: t.Dict,
):
self._writer.write(path, result, meta=metadata, overwrite="overwrite")
@staticmethod
def pick_mother(a, b):
"""
Update the mother id, assuming the mother is the cell with the lower non-zero id.
"""
x = max(a, b)
if min([a, b]):
x = [a, b][np.argmin([a, b])]
return x
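For example:

PostProcessor.pick_mother(3, 5)   # -> 3, the lower non-zero id
PostProcessor.pick_mother(0, 5)   # -> 5, because zero means no mother was assigned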
import typing as t
from typing import Dict, Tuple
import numpy as np
import pandas as pd
......@@ -12,17 +11,16 @@ from postprocessor.core.lineageprocess import (
class BudMetricParameters(LineageProcessParameters):
"""
Parameters
"""
"""Give default location of lineage information."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class BudMetric(LineageProcess):
"""
Requires mother-bud information to create a new dataframe where the indices are mother ids and
values are the daughters' values for a given signal.
Requires mother-bud information to create a new dataframe where the
indices are mother ids and values are the daughters' values for a
given signal.
"""
def __init__(self, parameters: BudMetricParameters):
......@@ -31,94 +29,147 @@ class BudMetric(LineageProcess):
def run(
self,
signal: pd.DataFrame,
lineage: Dict[pd.Index, Tuple[pd.Index]] = None,
lineage: t.Dict[pd.Index, t.Tuple[pd.Index]] = None,
):
if lineage is None:
# define lineage
if hasattr(self, "lineage"):
lineage = self.lineage
else:
# lineage information in the Signal dataframe
assert "mother_label" in signal.index.names
lineage = signal.index.to_list()
return self.get_bud_metric(signal, mb_array_to_dict(lineage))
@staticmethod
def get_bud_metric(
signal: pd.DataFrame, md: Dict[Tuple, Tuple[Tuple]] = None
signal: pd.DataFrame, md: t.Dict[t.Tuple, t.Tuple[t.Tuple]] = None
):
"""
signal: Daughter-inclusive dataframe
md: Mother-daughters dictionary where key is mother's index and value a list of daugher indices
Get fvi (First Valid Index) for all cells
Create empty matrix
for every mother:
- Get daughters' subdataframe
- sort daughters by cell label
- get series of fvis
- concatenate the values of these ranges from the dataframe
Fill the empty matrix
Convert matrix into dataframe using mother indices
Generate a dataframe of a Signal for buds indexed by their mothers,
concatenating data from all the buds for each mother.
Parameters
---------
signal: pd.Dataframe
A dataframe that includes data for both mothers and daughters.
md: dict
A dict of lineage information with each key a mother's index,
defined as (trap, cell_label), and the corresponding values are a
list of daughter indices, also defined as (trap, cell_label).
"""
mothers_mat = np.zeros((len(md), signal.shape[1]))
cells_were_dropped = 0 # Flag determines if mothers (1), daughters (2) or both were missing (3)
md_index = signal.index
if (
"mother_label" not in md_index.names
): # Generate mother label from md dict if unavailable
d = {v: k for k, values in md.items() for v in values}
# md_index should only comprise (trap, cell_label)
if "mother_label" not in md_index.names:
# dict with daughter indices as keys and mother indices as values
bud_dict = {v: k for k, values in md.items() for v in values}
# generate mother_label in Signal using the mother's cell_label
# cells with no mothers have a mother_label of 0
signal["mother_label"] = list(
map(lambda x: d.get(x, [0])[-1], signal.index)
map(lambda x: bud_dict.get(x, [0])[-1], signal.index)
)
signal.set_index("mother_label", append=True, inplace=True)
related_items = set(
[*md.keys(), *[y for x in md.values() for y in x]]
)
md_index = md_index.intersection(related_items)
elif "mother_label" in md_index.names:
md_index = md_index.droplevel("mother_label")
# combine mothers and daughter indices
mothers_index = md.keys()
daughters_index = [y for x in md.values() for y in x]
relations = set([*mothers_index, *daughters_index])
# keep from md_index only cells that are mother or daughters
md_index = md_index.intersection(relations)
else:
raise ("Unavailable relationship information")
md_index = md_index.droplevel("mother_label")
if len(md_index) < len(signal):
print("Dropped cells before bud_metric") # TODO log
print("Dropped cells before applying bud_metric") # TODO log
# restrict signal to the cells in md_index moving mother_label to do so
signal = (
signal.reset_index("mother_label")
.loc(axis=0)[md_index]
.set_index("mother_label", append=True)
)
names = list(signal.index.names)
del names[-2]
output_df = (
signal.loc[signal.index.get_level_values("mother_label") > 0]
.groupby(names)
.apply(lambda x: _combine_daughter_tracks(x))
# restrict to daughters: cells with a mother
mother_labels = signal.index.get_level_values("mother_label")
daughter_df = signal.loc[mother_labels > 0]
# join data for daughters with the same mother
output_df = daughter_df.groupby(["trap", "mother_label"]).apply(
lambda x: _combine_daughter_tracks(x)
)
output_df.columns = signal.columns
output_df["padding_level"] = 0
output_df.set_index("padding_level", append=True, inplace=True)
# daughter data is indexed by mothers, which themselves have no mothers
output_df["temp_mother_label"] = 0
output_df.set_index("temp_mother_label", append=True, inplace=True)
if len(output_df):
output_df.index.names = signal.index.names
return output_df
def _combine_daughter_tracks(tracks: t.Collection[pd.Series]):
def _combine_daughter_tracks(tracks: pd.DataFrame):
"""
Combine multiple time series of daughter cells into one time series.
Concatenate daughter values into one time series starting with the first
daughter and replacing later values with the values from the next daughter,
and so on.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
no_rows = len(bud_df)
bud_df.index = range(no_rows)
# find time point of first non-NaN data point of each row
init_tps = [
bud_df.iloc[irow].first_valid_index() for irow in range(no_rows)
]
# sort so that earliest daughter is first
sorted_rows = np.argsort(init_tps)
init_tps = np.sort(init_tps)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for j, jrow in enumerate(sorted_rows):
# over-write with next earliest daughter
combined_tracks[bud_df.columns.get_loc(init_tps[j]) :] = (
bud_df.iloc[jrow].loc[init_tps[j] :].values
)
return pd.Series(combined_tracks, index=tracks.columns)
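A worked example of the overwriting behaviour, with made-up values:

# two daughters of one mother; columns are time points 0..4
#   daughter A: [nan, 2, 3, 4, 5]       first valid at t=1
#   daughter B: [nan, nan, nan, 7, 8]   first valid at t=3
# combined:     [nan, 2, 3, 7, 8]
# A fills from t=1 onwards; B then overwrites from t=3 onwards.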
def _combine_daughter_tracks_original(tracks: pd.DataFrame):
"""
Combine multiple time series of cells into one, overwriting values
prioritising the most recent entity.
Combine multiple time series of daughter cells into one time series.
At any one time, a mother cell should have only one daughter.
Two daughters are still sometimes present at the same time point, and we
then choose the daughter that appears first.
TODO We need to fix examples with more than one daughter at a time point.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
sorted_da_ids = tracks.sort_index(level="cell_label")
sorted_da_ids.index = range(len(sorted_da_ids))
tp_fvt = sorted_da_ids.apply(lambda x: x.first_valid_index(), axis=0)
tp_fvt = sorted_da_ids.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt < 0] = len(sorted_da_ids) - 1
_metric = np.choose(tp_fvt, sorted_da_ids.values)
return pd.Series(_metric, index=tracks.columns)
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
bud_df.index = range(len(bud_df))
# find which row of sorted_df has the daughter for each time point
tp_fvt: pd.Series = bud_df.apply(lambda x: x.first_valid_index(), axis=0)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for bud_row in np.unique(tp_fvt.dropna().values).astype(int):
ilocs = np.where(tp_fvt.values == bud_row)[0]
combined_tracks[ilocs] = bud_df.values[bud_row, ilocs]
# TODO delete old version
tp_fvt = bud_df.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt == -1] = len(bud_df) - 1
old = np.choose(tp_fvt, bud_df.values)
assert (
(combined_tracks == old) | (np.isnan(combined_tracks) & np.isnan(old))
).all(), "yikes"
return pd.Series(combined_tracks, index=tracks.columns)
......@@ -13,74 +13,69 @@ from postprocessor.core.lineageprocess import (
class buddingsParameters(LineageProcessParameters):
"""Parameter class to obtain budding events.
Parameters
----------
LineageProcessParameters : lineage_location
Location of lineage matrix to be used for calculations.
Examples
--------
FIXME: Add docs.
"""
"""Give the location of lineage information in the h5 file."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class buddings(LineageProcess):
"""
Calculate buddings in a trap assuming one mother per trap
returns a pandas series with the buddings.
Generate a dataframe of budding events.
We assume one mother per trap.
We define a budding event as the moment in which a bud was identified for
the first time, even if the bud is not considered one until later
in the experiment.
A bud may not be considered a bud until later in the experiment.
"""
def __init__(self, parameters: buddingsParameters):
"""Initialise buddings."""
super().__init__(parameters)
def run(
self, signal: pd.DataFrame, lineage: np.ndarray = None
) -> pd.DataFrame:
lineage = lineage or self.lineage
"""
Generate dataframe of budding events.
# Get time of first appearance for all cells
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
Find daughters for those mothers in a Signal with lineage data.
Create a dataframe indicating the time each daughter first appears.
# Select mother cells in a given dataset
We use the data from Signal only to find when the daughters appear, by
their first non-NaN value.
"""
# lineage is (trap, mother, daughter)
lineage = lineage or self.lineage
# select traps and mothers in the signal that have lineage data
traps_mothers: t.Dict[tuple, list] = {
tuple(mo): [] for mo in lineage[:, :2] if tuple(mo) in signal.index
tuple(trap_mo): []
for trap_mo in lineage[:, :2]
if tuple(trap_mo) in signal.index
}
# add daughters, potentially multiple, for these traps and mothers
for trap, mother, daughter in lineage:
if (trap, mother) in traps_mothers.keys():
traps_mothers[(trap, mother)].append(daughter)
# a new dataframe with dimensions (n_mother_cells * n_tps)
mothers = signal.loc[
set(signal.index).intersection(traps_mothers.keys())
]
# Create a new dataframe with dimensions (n_mother_cells * n_timepoints)
buddings = pd.DataFrame(
np.zeros((mothers.shape[0], signal.shape[1])).astype(bool),
np.zeros(mothers.shape).astype(bool),
index=mothers.index,
columns=signal.columns,
)
buddings.columns.names = ["timepoint"]
# Fill the budding events
for mother_id, daughters in traps_mothers.items():
daughters_idx = set(
fvi.loc[
fvi.index.intersection(
list(product((mother_id[0],), daughters))
)
].values
).difference({0})
buddings.loc[
mother_id,
daughters_idx,
] = True
# get time of first non-NaN value of signal for every cell using Pandas
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
# fill the budding events
for trap_mother_id, daughters in traps_mothers.items():
trap_daughter_ids = [
i for i in product((trap_mother_id[0],), daughters)
]
times_of_bud_appearance = fvi.loc[
fvi.index.intersection(trap_daughter_ids)
].values
# ignore zeros - buds in first image are not budding events
daughters_idx = set(times_of_bud_appearance).difference({0})
buddings.loc[trap_mother_id, daughters_idx] = True
return buddings