Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target projects: swain-lab/aliby/aliby-mirror, swain-lab/aliby/alibylite

Commits on Source (24)
Showing changes with 639 additions and 438 deletions
......@@ -17,16 +17,14 @@ atomic = t.Union[int, float, str, bool]
class ParametersABC(ABC):
"""
Defines parameters as attributes and allows parameters to
Define parameters as attributes and allow parameters to
be converted to either a dictionary or to yaml.
No attribute should be called "parameters"!
"""
def __init__(self, **kwargs):
"""
Defines parameters as attributes
"""
"""Define parameters as attributes."""
assert (
"parameters" not in kwargs
), "No attribute should be named parameters"
......@@ -35,8 +33,9 @@ class ParametersABC(ABC):
def to_dict(self, iterable="null") -> t.Dict:
"""
Recursive function to return a nested dictionary of the
attributes of the class instance.
Return a nested dictionary of the attributes of the class instance.
Uses recursion.
"""
if isinstance(iterable, dict):
if any(
......@@ -62,7 +61,8 @@ class ParametersABC(ABC):
def to_yaml(self, path: Union[Path, str] = None):
"""
Returns a yaml stream of the attributes of the class instance.
Return a yaml stream of the attributes of the class instance.
If path is provided, the yaml stream is saved there.
Parameters
......@@ -81,9 +81,7 @@ class ParametersABC(ABC):
@classmethod
def from_yaml(cls, source: Union[Path, str]):
"""
Returns instance from a yaml filename or stdin
"""
"""Return instance from a yaml filename or stdin."""
is_buffer = True
try:
if Path(source).exists():
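As an aside, a minimal sketch of how a ParametersABC subclass might be used; MyParameters, its keyword arguments, and the file name are hypothetical, not part of aliby:

class MyParameters(ParametersABC):
    pass

p = MyParameters(window=5, channels=["GFP", "mCherry"])
d = p.to_dict()                    # nested dictionary of the attributes
p.to_yaml("my_parameters.yaml")    # save the attributes as YAML
q = MyParameters.from_yaml("my_parameters.yaml")   # rebuild an instance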
......@@ -107,7 +105,8 @@ class ParametersABC(ABC):
def update(self, name: str, new_value):
"""
Update values recursively
Update values recursively.
If name is a dictionary, replace data where it already exists and add data where it does not.
It warns against type changes.
......@@ -179,7 +178,8 @@ def add_to_collection(
class ProcessABC(ABC):
"""
Base class for processes.
Defines parameters as attributes and requires run method to be defined.
Define parameters as attributes and require a run method.
"""
def __init__(self, parameters):
......@@ -243,11 +243,9 @@ class StepABC(ProcessABC):
@timer
def run_tp(self, tp: int, **kwargs):
"""
Time and log the timing of a step.
"""
"""Time and log the timing of a step."""
return self._run_tp(tp, **kwargs)
def run(self):
# Replace run with run_tp
raise Warning("Steps use run_tp instead of run")
raise Warning("Steps use run_tp instead of run.")
This diff is collapsed.
......@@ -6,17 +6,19 @@ import typing as t
from functools import wraps
def _first_arg_str_to_df(
def _first_arg_str_to_raw_df(
fn: t.Callable,
):
"""Enable Signal-like classes to convert strings to data sets."""
@wraps(fn)
def format_input(*args, **kwargs):
cls = args[0]
data = args[1]
if isinstance(data, str):
# get data from h5 file
# get data from h5 file using Signal's get_raw
data = cls.get_raw(data)
# replace path in the undecorated function with data
return fn(cls, data, *args[2:], **kwargs)
return format_input
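For context, a brief sketch of what the decorator enables: a Signal method wrapped with _first_arg_str_to_raw_df accepts either a dataframe or the name of a dataset in the h5 file. The file name below is hypothetical; the dataset path follows aliby's extraction naming.

signal = Signal("experiment.h5")
# equivalent calls: the string is converted to a raw dataframe via get_raw
out1 = signal.apply_prepost("extraction/general/None/volume")
out2 = signal.apply_prepost(signal.get_raw("extraction/general/None/volume"))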
......@@ -66,7 +66,7 @@ class MetaData:
# Needed because HDF5 attributes do not support dictionaries
def flatten_dict(nested_dict, separator="/"):
"""
Flattens nested dictionary. If empty return as-is.
Flatten a nested dictionary. If empty, return it as-is.
"""
flattened = {}
if nested_dict:
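Illustratively, and assuming nested keys are joined with the separator:

flatten_dict({"channels": {"GFP": {"exposure": 100}}})
# -> {"channels/GFP/exposure": 100}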
......@@ -79,9 +79,7 @@ def flatten_dict(nested_dict, separator="/"):
# Needed because HDF5 attributes do not support datetime objects
# Takes care of time zones & daylight saving
def datetime_to_timestamp(time, locale="Europe/London"):
"""
Convert datetime object to UNIX timestamp
"""
"""Convert datetime object to UNIX timestamp."""
return timezone(locale).localize(time).timestamp()
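For example:

from datetime import datetime

# localise a naive datetime to Europe/London (daylight saving handled)
# and convert it to a UNIX timestamp
ts = datetime_to_timestamp(datetime(2023, 6, 1, 12, 0))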
......@@ -189,36 +187,37 @@ def parse_swainlab_metadata(filedir: t.Union[str, Path]):
Dictionary with minimal metadata
"""
filedir = Path(filedir)
filepath = find_file(filedir, "*.log")
if filepath:
# new log files
raw_parse = parse_from_swainlab_grammar(filepath)
minimal_meta = get_meta_swainlab(raw_parse)
else:
# old log files
if filedir.is_file() or str(filedir).endswith(".zarr"):
# log file is in parent directory
filedir = filedir.parent
legacy_parse = parse_logfiles(filedir)
minimal_meta = (
get_meta_from_legacy(legacy_parse) if legacy_parse else {}
)
return minimal_meta
def dispatch_metadata_parser(filepath: t.Union[str, Path]):
"""
Function to dispatch different metadata parsers that convert logfiles into a
basic metadata dictionary. Currently only contains the swainlab log parsers.
Dispatch different metadata parsers that convert logfiles into a dictionary.
Currently only contains the swainlab log parsers.
Input:
--------
filepath: str existing file containing metadata, or folder containing naming conventions
filepath: str existing file containing metadata, or folder containing naming
conventions
"""
parsed_meta = parse_swainlab_metadata(filepath)
if parsed_meta is None:
# call the fallback parser (assuming dir_to_meta accepts the path)
parsed_meta = dir_to_meta(filepath)
return parsed_meta
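A hypothetical call, with the directory path made up for illustration:

meta = dispatch_metadata_parser("/data/2023_06_01_experiment")
# meta is a flat dictionary of minimal metadata parsed from the log files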
......
......@@ -10,7 +10,7 @@ import numpy as np
import pandas as pd
from agora.io.bridge import BridgeH5
from agora.io.decorators import _first_arg_str_to_df
from agora.io.decorators import _first_arg_str_to_raw_df
from agora.utils.indexing import validate_association
from agora.utils.kymograph import add_index_levels
from agora.utils.merge import apply_merges
......@@ -20,11 +20,14 @@ class Signal(BridgeH5):
"""
Fetch data from h5 files for post-processing.
Signal assumes that the metadata and data are accessible to perform time-adjustments and apply previously recorded post-processes.
Signal assumes that the metadata and data are accessible to
perform time-adjustments and apply previously recorded
post-processes.
"""
def __init__(self, file: t.Union[str, Path]):
"""Define index_names for dataframes, candidate fluorescence channels, and composite statistics."""
"""Define index_names for dataframes, candidate fluorescence channels,
and composite statistics."""
super().__init__(file, flag=None)
self.index_names = (
"experiment",
......@@ -46,9 +49,9 @@ class Signal(BridgeH5):
def __getitem__(self, dsets: t.Union[str, t.Collection]):
"""Get and potentially pre-process data from h5 file and return as a dataframe."""
if isinstance(dsets, str): # no pre-processing
if isinstance(dsets, str):
return self.get(dsets)
elif isinstance(dsets, list): # pre-processing
elif isinstance(dsets, list):
is_bgd = [dset.endswith("imBackground") for dset in dsets]
# Check we are not comparing tile-indexed and cell-indexed data
assert sum(is_bgd) == 0 or sum(is_bgd) == len(
......@@ -58,22 +61,23 @@ class Signal(BridgeH5):
else:
raise Exception(f"Invalid type {type(dsets)} to get datasets")
def get(self, dsets: t.Union[str, t.Collection], **kwargs):
"""Get and potentially pre-process data from h5 file and return as a dataframe."""
if isinstance(dsets, str): # no pre-processing
df = self.get_raw(dsets, **kwargs)
def get(self, dset_name: t.Union[str, t.Collection], **kwargs):
"""Return pre-processed data as a dataframe."""
if isinstance(dset_name, str):
dsets = self.get_raw(dset_name, **kwargs)
prepost_applied = self.apply_prepost(dsets, **kwargs)
return self.add_name(prepost_applied, dsets)
return self.add_name(prepost_applied, dset_name)
else:
raise Exception("Error in Signal.get")
@staticmethod
def add_name(df, name):
"""Add column of identical strings to a dataframe."""
"""Add name of the Signal as an attribute to its corresponding dataframe."""
df.name = name
return df
def cols_in_mins(self, df: pd.DataFrame):
# Convert numerical columns in a dataframe to minutes
"""Convert numerical columns in a dataframe to minutes."""
try:
df.columns = (df.columns * self.tinterval // 60).astype(int)
except Exception as e:
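To illustrate typical use of Signal (the file name is hypothetical; the dataset path is one of aliby's standard extraction locations):

s = Signal("experiment.h5")
raw = s.get_raw("extraction/general/None/volume")   # unprocessed dataframe
df = s["extraction/general/None/volume"]            # with recorded merges and picks applied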
......@@ -94,14 +98,15 @@ class Signal(BridgeH5):
if tinterval_location in f.attrs:
return f.attrs[tinterval_location][0]
else:
logging.getlogger("aliby").warn(
logging.getLogger("aliby").warn(
f"{str(self.filename).split('/')[-1]}: using default time interval of 5 minutes"
)
return 5
@staticmethod
def get_retained(df, cutoff):
"""Return a fraction of the df, one without later time points."""
"""Return rows of df with at least cutoff fraction of the total number
of time points."""
return df.loc[bn.nansum(df.notna(), axis=1) > df.shape[1] * cutoff]
@property
......@@ -110,15 +115,15 @@ class Signal(BridgeH5):
with h5py.File(self.filename, "r") as f:
return list(f.attrs["channels"])
@_first_arg_str_to_df
def retained(self, signal, cutoff=0.8):
"""
Load data (via decorator) and reduce the resulting dataframe.
Load data for a signal or a list of signals and reduce the resulting
dataframes to a fraction of their original size, losing late time
points.
dataframes to rows with sufficient numbers of time points.
"""
if isinstance(signal, str):
signal = self.get_raw(signal)
if isinstance(signal, pd.DataFrame):
return self.get_retained(signal, cutoff)
elif isinstance(signal, list):
......@@ -131,17 +136,15 @@ class Signal(BridgeH5):
"""
Get lineage data from a given location in the h5 file.
Returns an array with three columns: the tile id, the mother label, and the daughter label.
Returns an array with three columns: the tile id, the mother label,
and the daughter label.
"""
if lineage_location is None:
lineage_location = "modifiers/lineage_merged"
with h5py.File(self.filename, "r") as f:
# if lineage_location not in f:
# lineage_location = lineage_location.split("_")[0]
if lineage_location not in f:
lineage_location = "postprocessing/lineage"
tile_mo_da = f[lineage_location]
if isinstance(tile_mo_da, h5py.Dataset):
lineage = tile_mo_da[()]
else:
......@@ -154,7 +157,7 @@ class Signal(BridgeH5):
).T
return lineage
@_first_arg_str_to_df
@_first_arg_str_to_raw_df
def apply_prepost(
self,
data: t.Union[str, pd.DataFrame],
......@@ -272,7 +275,7 @@ class Signal(BridgeH5):
Parameters
----------
dataset: str or list of strs
The name of the h5 file or a list of h5 file names
The name of the h5 file or a list of h5 file names.
in_minutes: boolean
If True,
lineage: boolean
......@@ -288,15 +291,17 @@ class Signal(BridgeH5):
self.get_raw(dset, in_minutes=in_minutes, lineage=lineage)
for dset in dataset
]
if lineage: # assume that df is sorted
if lineage:
# assume that df is sorted
mother_label = np.zeros(len(df), dtype=int)
lineage = self.lineage()
a, b = validate_association(
valid_association, valid_indices = validate_association(
lineage,
np.array(df.index.to_list()),
#: are mothers not match_column=0?
match_column=1,
)
mother_label[b] = lineage[a, 1]
mother_label[valid_indices] = lineage[valid_association, 1]
df = add_index_levels(df, {"mother_label": mother_label})
return df
except Exception as e:
......@@ -353,10 +358,7 @@ class Signal(BridgeH5):
fullname: str,
node: t.Union[h5py.Dataset, h5py.Group],
):
"""
Store the name of a signal if it is a leaf node
(a group with no more groups inside) and if it starts with extraction.
"""
"""Store the name of a signal if it is a leaf node and if it starts with extraction."""
if isinstance(node, h5py.Group) and np.all(
[isinstance(x, h5py.Dataset) for x in node.values()]
):
......
#!/usr/bin/env jupyter
"""
Utilities based on association are used to efficiently acquire indices of
tracklets with some kind of relationship.
This can be:
- Cells that are to be merged.
- Cells that have a lineage relationship.
"""
import numpy as np
import typing as t
def validate_association_new(
association: np.ndarray,
indices: np.ndarray,
match_column: t.Optional[int] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
"""
Identify matches between two arrays by comparing rows.
We match the mother-bud pairs in the lineage data against all identified cells to keep only those cells that are part of mother-bud pairs.
We use broadcasting for speed.
Both a mother and bud in association must be in indices.
Parameters
----------
association : np.ndarray
2D array of lineage associations where columns are (trap, mother, daughter)
or
a 3D array, which is an array of 2 X 2 arrays comprising [[trap_id, mother_label], [trap_id, daughter_label]].
indices : np.ndarray
A 2D array where each column is a different level, such as (trap_id, cell_label), which typically is an index of a Signal
dataframe. This array should not include mother_label.
match_column: int
If 0, matches indicate mothers from mother-bud pairs;
If 1, matches indicate daughters from mother-bud pairs;
If None, matches indicate either mothers or daughters in mother-bud pairs.
Returns
-------
valid_association: boolean np.ndarray
1D array indicating elements in association with matches.
valid_indices: boolean np.ndarray
1D array indicating elements in indices with matches.
Examples
--------
>>> import numpy as np
>>> from agora.utils.indexing import validate_association
>>> association = np.array([ [[0, 1], [0, 3]], [[0, 1], [0, 4]], [[0, 1], [0, 6]], [[0, 4], [0, 7]] ])
>>> indices = np.array([ [0, 1], [0, 2], [0, 3]])
>>> print(indices.T)
>>> valid_association, valid_indices = validate_association(association, indices)
>>> print(valid_association)
array([ True, False, False, False])
>>> print(valid_indices)
array([ True, False, True])
and
>>> association = np.array([[[0,3], [0,1]], [[0,2], [0,4]]])
>>> indices = np.array([[0,1], [0,2], [0,3]])
>>> valid_association, valid_indices = validate_association(association, indices)
>>> print(valid_association)
array([ True, False])
>>> print(valid_indices)
array([ True, False, True])
"""
if association.ndim == 2:
# reshape into 3D array for broadcasting
# for each trap, [trap, mother, daughter] becomes
# [[trap, mother], [trap, daughter]]
association = _assoc_indices_to_3d(association)
# use broadcasting to compare association with indices
# swap trap and cell_label axes for correct broadcasting
indicesT = indices.T
# compare each of [[trap, mother], [trap, daughter]] for all traps
# in association with [trap, cell_label] for all traps in indices
valid_ndassociation = (
association[..., np.newaxis] == indicesT[np.newaxis, ...]
)
# find matches in association
###
# make True comparisons have both trap_ids and cell labels matching
valid_cell_ids = valid_ndassociation.all(axis=2)
if match_column is None:
# make True comparisons match at least one row in indices
va_intermediate = valid_cell_ids.any(axis=2)
# make True comparisons have both mother and bud matching rows in indices
valid_association = va_intermediate.all(axis=1)
else:
# match_column selects mothers if 0 and daughters if 1
# make True match at least one row in indices
valid_association = valid_cell_ids[:, match_column].any(axis=1)
# find matches in indices
###
# make True comparisons have a validated association for both the mother and bud
# make True comparisons have both trap_ids and cell labels matching
valid_cell_ids_va = valid_ndassociation[valid_association].all(axis=2)
if match_column is None:
# make True comparisons match either a mother or a bud in association
valid_indices = valid_cell_ids_va.any(axis=1)[0]
else:
valid_indices = valid_cell_ids_va[:, match_column][0]
# Alan's working code
# Compare existing association with available indices
# Swap trap and label axes for the association array to correctly cast
valid_ndassociation_a = association[..., None] == indices.T[None, ...]
# Broadcasting is confusing (but efficient):
# First we check the dimension across trap and cell id, to ensure both match
valid_cell_ids_a = valid_ndassociation_a.all(axis=2)
if match_column is None:
# Then we check the merge tuples to check which cases have both target and source
valid_association_a = valid_cell_ids_a.any(axis=2).all(axis=1)
# Finally we check the dimension that crosses all indices, to ensure the pair
# is present in a valid merge event.
valid_indices_a = (
valid_ndassociation_a[valid_association_a]
.all(axis=2)
.any(axis=(0, 1))
)
else: # We fetch specific indices if we aim for the ones with one present
valid_indices_a = valid_cell_ids_a[:, match_column].any(axis=0)
# Valid association then becomes a boolean array, true means that there is a
# match (match_column) between that cell and the index
valid_association_a = (
valid_cell_ids_a[:, match_column] & valid_indices
).any(axis=1)
# check that the new implementation agrees with the original one
assert np.array_equal(valid_association, valid_association_a), "valid_association error"
assert np.array_equal(valid_indices, valid_indices_a), "valid_indices error"
return valid_association, valid_indices
def _assoc_indices_to_3d(ndarray: np.ndarray):
"""
Reorganise an array of shape (N, 3) into one of shape (N, 2, 2).
Reorganise an array so that the last entry of each row is removed
and generates a new row. This new row retains all other entries of
the original row.
Example:
[ [0, 1, 3], [0, 1, 4] ]
becomes
[ [[0, 1], [0, 3]], [[0, 1], [0, 4]] ]
"""
result = ndarray
if len(ndarray) and ndarray.ndim > 1:
if ndarray.shape[1] == 3:
# faster indexing for single positions
result = np.transpose(
np.hstack((ndarray[:, [0]], ndarray)).reshape(-1, 2, 2),
axes=[0, 2, 1],
)
else:
# 20% slower, but more general indexing
columns = np.arange(ndarray.shape[1])
result = np.stack(
(
ndarray[:, np.delete(columns, -1)],
ndarray[:, np.delete(columns, -2)],
),
axis=1,
)
return result
def _3d_index_to_2d(array: np.ndarray):
"""Revert switch from _assoc_indices_to_3d."""
result = array
if len(array):
result = np.concatenate(
(array[:, 0, :], array[:, 1, 1, np.newaxis]), axis=1
)
return result
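A small round-trip example for these two helpers:

arr = np.array([[0, 1, 3], [0, 1, 4]])
arr3d = _assoc_indices_to_3d(arr)
# array([[[0, 1], [0, 3]],
#        [[0, 1], [0, 4]]])
back = _3d_index_to_2d(arr3d)   # recovers the original (N, 3) array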
def compare_indices(x: np.ndarray, y: np.ndarray) -> np.ndarray:
"""
Compare two 2D arrays using broadcasting.
Return a binary array where a True value links two cells where
all cells are the same.
"""
return (x[..., np.newaxis] == y.T[np.newaxis, ...]).all(axis=1)
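For example:

x = np.array([[0, 1], [0, 2]])
y = np.array([[0, 2], [1, 1]])
compare_indices(x, y)
# array([[False, False],
#        [ True, False]])
# entry [i, j] is True when row i of x equals row j of y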
......@@ -86,16 +86,19 @@ def bidirectional_retainment_filter(
daughters_thresh: int = 7,
) -> pd.DataFrame:
"""
Retrieve families where mothers are present for more than a fraction of the experiment, and daughters for longer than some number of time-points.
Retrieve families where mothers are present for more than a fraction
of the experiment and daughters for longer than some number of
time-points.
Parameters
----------
df: pd.DataFrame
Data
mothers_thresh: float
Minimum fraction of experiment's total duration for which mothers must be present.
Minimum fraction of experiment's total duration for which mothers
must be present.
daughters_thresh: int
Minimum number of time points for which daughters must be observed
Minimum number of time points for which daughters must be observed.
"""
# daughters
all_daughters = df.loc[df.index.get_level_values("mother_label") > 0]
......@@ -170,6 +173,7 @@ def slices_from_spans(spans: t.Tuple[int], df: pd.DataFrame) -> t.List[slice]:
def drop_mother_label(index: pd.MultiIndex) -> np.ndarray:
"""Remove mother_label level from a MultiIndex."""
no_mother_label = index
if "mother_label" in index.names:
no_mother_label = index.droplevel("mother_label")
......
#!/usr/bin/env python3
import re
import typing as t
import numpy as np
import pandas as pd
from agora.io.bridge import groupsort
from itertools import groupby
def mb_array_to_dict(mb_array: np.ndarray):
......@@ -19,4 +15,3 @@ def mb_array_to_dict(mb_array: np.ndarray):
for trap, mo_da in groupsort(mb_array).items()
for mo, daughters in groupsort(mo_da).items()
}
......@@ -13,8 +13,11 @@ from agora.utils.indexing import compare_indices, validate_association
def apply_merges(data: pd.DataFrame, merges: np.ndarray):
"""Split data in two, one subset for rows relevant for merging and one
without them. It uses an array of source tracklets and target tracklets
"""
Split data in two, one subset for rows relevant for merging and one
without them.
Use an array of source tracklets and target tracklets
to efficiently merge them.
Parameters
......@@ -43,7 +46,7 @@ def apply_merges(data: pd.DataFrame, merges: np.ndarray):
# Implement the merges and drop source rows.
# TODO Use matrices to perform merges in batch
# for ecficiency
# for efficiency
if valid_merges.any():
to_merge = data.loc[indices]
targets, sources = zip(*merges[valid_merges])
......
......@@ -54,7 +54,7 @@ class DatasetLocalABC(ABC):
Abstract Base class to find local files, either OME-XML or raw images.
"""
_valid_suffixes = ("tiff", "png", "zarr")
_valid_suffixes = ("tiff", "png", "zarr", "tif")
_valid_meta_suffixes = ("txt", "log")
def __init__(self, dpath: t.Union[str, Path], *args, **kwargs):
......
......@@ -30,14 +30,14 @@ from agora.io.metadata import dir_to_meta, dispatch_metadata_parser
def get_examples_dir():
"""Get examples directory which stores dummy image for tiler"""
"""Get examples directory that stores dummy image for tiler."""
return files("aliby").parent.parent / "examples" / "tiler"
def instantiate_image(
source: t.Union[str, int, t.Dict[str, str], Path], **kwargs
):
"""Wrapper to instatiate the appropiate image
"""Wrapper to instantiate the appropriate image
Parameters
----------
......@@ -55,26 +55,26 @@ def instantiate_image(
def dispatch_image(source: t.Union[str, int, t.Dict[str, str], Path]):
"""
Wrapper to pick the appropiate Image class depending on the source of data.
"""
"""Pick the appropriate Image class depending on the source of data."""
if isinstance(source, (int, np.int64)):
from aliby.io.omero import Image
instatiator = Image
instantiator = Image
elif isinstance(source, dict) or (
isinstance(source, (str, Path)) and Path(source).is_dir()
):
if Path(source).suffix == ".zarr":
instatiator = ImageZarr
instantiator = ImageZarr
else:
instatiator = ImageDir
instantiator = ImageDir
elif isinstance(source, Path) and source.is_file():
# my addition
instantiator = ImageLocalOME
elif isinstance(source, str) and Path(source).is_file():
instatiator = ImageLocalOME
instantiator = ImageLocalOME
else:
raise Exception(f"Invalid data source at {source}")
return instatiator
return instantiator
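A sketch of how dispatch_image might be used; the path is hypothetical and assumed to exist so that the zarr directory is detected:

ImageClass = dispatch_image("/data/position_001.zarr")   # -> ImageZarr
with ImageClass("/data/position_001.zarr") as image:
    data = image.get_data_lazy()   # 5D dask array (default order is tczyx)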
class BaseLocalImage(ABC):
......@@ -82,6 +82,7 @@ class BaseLocalImage(ABC):
Base Image class to set path and provide context management method.
"""
# default image order
_default_dimorder = "tczyx"
def __init__(self, path: t.Union[str, Path]):
......@@ -98,8 +99,7 @@ class BaseLocalImage(ABC):
return False
def rechunk_data(self, img):
# Format image using x and y size from metadata.
"""Format image using x and y size from metadata."""
self._rechunked_img = da.rechunk(
img,
chunks=(
......@@ -145,16 +145,16 @@ class ImageLocalOME(BaseLocalImage):
in which a multidimensional tiff image contains the metadata.
"""
def __init__(self, path: str, dimorder=None):
def __init__(self, path: str, dimorder=None, **kwargs):
super().__init__(path)
self._id = str(path)
self.set_meta(str(path))
def set_meta(self):
def set_meta(self, path):
meta = dict()
try:
with TiffFile(path) as f:
self._meta = xmltodict.parse(f.ome_metadata)["OME"]
for dim in self.dimorder:
meta["size_" + dim.lower()] = int(
self._meta["Image"]["Pixels"]["@Size" + dim]
......@@ -165,21 +165,19 @@ class ImageLocalOME(BaseLocalImage):
]
meta["name"] = self._meta["Image"]["@Name"]
meta["type"] = self._meta["Image"]["Pixels"]["@Type"]
except Exception as e: # Images not in OMEXML
except Exception as e:
# images not in OMEXML
print("Warning:Metadata not found: {}".format(e))
print(
f"Warning: No dimensional info provided. Assuming {self._default_dimorder}"
"Warning: No dimensional info provided. "
f"Assuming {self._default_dimorder}"
)
# Mark non-existent dimensions for padding
# mark non-existent dimensions for padding
self.base = self._default_dimorder
# self.ids = [self.index(i) for i in dimorder]
self._dimorder = base
self._dimorder = self.base
self._meta = meta
# self._meta["name"] = Path(path).name.split(".")[0]
@property
def name(self):
......@@ -246,7 +244,7 @@ class ImageDir(BaseLocalImage):
It inherits from BaseLocalImage so we only override methods that are critical.
Assumptions:
- One folders per position.
- One folder per position.
- Images are flat.
- Channel, Time, z-stack and the others are determined by filenames.
- Provides Dimorder as it is set in the filenames, or expects order during instantiation
......@@ -318,7 +316,7 @@ class ImageZarr(BaseLocalImage):
print(f"Could not add size info to metadata: {e}")
def get_data_lazy(self) -> da.Array:
"""Return 5D dask array. For lazy-loading local multidimensional zarr files"""
"""Return 5D dask array for lazy-loading local multidimensional zarr files."""
return self._img
def add_size_to_meta(self):
......
......@@ -154,6 +154,7 @@ class PipelineParameters(ParametersABC):
defaults["tiler"]["backup_ref_channel"] = backup_ref_channel
defaults["baby"] = BabyParameters.default(**baby).to_dict()
# why are BabyParameters here as an alternative?
defaults["extraction"] = (
exparams_from_meta(meta_d)
or BabyParameters.default(**extraction).to_dict()
......@@ -320,7 +321,7 @@ class Pipeline(ProcessABC):
)
# get log files, either locally or via OMERO
with dispatcher as conn:
image_ids = conn.get_images()
position_ids = conn.get_images()
directory = self.store or root_dir / conn.unique_name
if not directory.exists():
directory.mkdir(parents=True)
......@@ -330,29 +331,29 @@ class Pipeline(ProcessABC):
self.parameters.general["directory"] = str(directory)
config["general"]["directory"] = directory
self.setLogger(directory)
# pick particular images if desired
# pick particular positions if desired
if pos_filter is not None:
if isinstance(pos_filter, list):
image_ids = {
position_ids = {
k: v
for filt in pos_filter
for k, v in self.apply_filter(image_ids, filt).items()
for k, v in self.apply_filter(position_ids, filt).items()
}
else:
image_ids = self.apply_filter(image_ids, pos_filter)
assert len(image_ids), "No images to segment"
position_ids = self.apply_filter(position_ids, pos_filter)
assert len(position_ids), "No images to segment"
# create pipelines
if distributed != 0:
# multiple cores
with Pool(distributed) as p:
results = p.map(
lambda x: self.run_one_position(*x),
[(k, i) for i, k in enumerate(image_ids.items())],
[(k, i) for i, k in enumerate(position_ids.items())],
)
else:
# single core
results = []
for k, v in tqdm(image_ids.items()):
for k, v in tqdm(position_ids.items()):
r = self.run_one_position((k, v), 1)
results.append(r)
return results
......@@ -432,6 +433,7 @@ class Pipeline(ProcessABC):
if process_from["extraction"] < tps:
# TODO Move this parameter validation into Extractor
av_channels = set((*steps["tiler"].channels, "general"))
# overwrite extraction specified by PipelineParameters !!
config["extraction"]["tree"] = {
k: v
for k, v in config["extraction"]["tree"].items()
......@@ -453,13 +455,14 @@ class Pipeline(ProcessABC):
steps["extraction"] = Extractor.from_tiler(
exparams, store=filename, tiler=steps["tiler"]
)
# set up progress meter
# set up progress bar
pbar = tqdm(
range(min_process_from, tps),
desc=image.name,
initial=min_process_from,
total=tps,
)
# run through time points
for i in pbar:
if (
frac_clogged_traps
......@@ -469,9 +472,12 @@ class Pipeline(ProcessABC):
# run through steps
for step in self.pipeline_steps:
if i >= process_from[step]:
# perform step
result = steps[step].run_tp(
i, **run_kwargs.get(step, {})
)
# write to h5 file using writers
# extractor writes to h5 itself
if step in loaded_writers:
loaded_writers[step].write(
data=result,
......@@ -481,7 +487,7 @@ class Pipeline(ProcessABC):
tp=i,
meta={"last_processed": i},
)
# perform step
# clean up
if (
step == "tiler"
and i == min_process_from
......@@ -501,7 +507,7 @@ class Pipeline(ProcessABC):
tp=i,
)
elif step == "extraction":
# remove mask/label after extraction
# remove masks and labels after extraction
for k in ["masks", "labels"]:
run_kwargs[step][k] = None
# check and report clogging
......@@ -586,41 +592,6 @@ class Pipeline(ProcessABC):
)
return (traps_above_nthresh & traps_above_athresh).mean()
# FIXME: Remove this functionality. It used to be for
# older hdf5 file formats.
def _load_config_from_file(
self,
filename: Path,
process_from: t.Dict[str, int],
trackers_state: t.List,
overwrite: t.Dict[str, bool],
):
with h5py.File(filename, "r") as f:
for k in process_from.keys():
if not overwrite[k]:
process_from[k] = self.legacy_get_last_tp[k](f)
process_from[k] += 1
return process_from, trackers_state, overwrite
# FIXME: Remove this functionality. It used to be for
# older hdf5 file formats.
@staticmethod
def legacy_get_last_tp(step: str) -> t.Callable:
"""Get last time-point in different ways depending
on which step we are using
To support segmentation in aliby < v0.24
TODO Deprecate and replace with State method
"""
switch_case = {
"tiler": lambda f: f["trap_info/drifts"].shape[0] - 1,
"baby": lambda f: f["cell_info/timepoint"][-1],
"extraction": lambda f: f[
"extraction/general/None/area/timepoint"
][-1],
}
return switch_case[step]
def _setup_pipeline(
self, image_id: int
) -> t.Tuple[
......@@ -676,13 +647,12 @@ class Pipeline(ProcessABC):
step: self.step_sequence.index(ow_id) < i
for i, step in enumerate(self.step_sequence, 1)
}
# Set up
# set up
directory = config["general"]["directory"]
trackers_state: t.List[np.ndarray] = []
with dispatch_image(image_id)(image_id, **self.server_info) as image:
filename = Path(f"{directory}/{image.name}.h5")
# load metadata
meta = MetaData(directory, filename)
from_start = True if np.any(ow.values()) else False
# remove existing file if overwriting
......@@ -716,7 +686,7 @@ class Pipeline(ProcessABC):
)
config["tiler"] = steps["tiler"].parameters.to_dict()
except Exception:
self._log(f"Overwriting tiling data")
self._log("Overwriting tiling data")
if config["general"]["use_explog"]:
meta.run()
......
"""
Tiler: Divides images into smaller tiles.
The tasks of the Tiler are selecting regions of interest, or tiles, of images - with one trap per tile, correcting for the drift of the microscope stage over time, and handling errors and bridging between the image data and Aliby’s image-processing steps.
The tasks of the Tiler are selecting regions of interest, or tiles, of
images - with one trap per tile, correcting for the drift of the microscope
stage over time, and handling errors and bridging between the image data
and Aliby’s image-processing steps.
Tiler subclasses deal with either network connections or local files.
To find tiles, we use a two-step process: we analyse the bright-field image to produce the template of a trap, and we fit this template to the image to find the tiles' centres.
To find tiles, we use a two-step process: we analyse the bright-field image
to produce the template of a trap, and we fit this template to the image to
find the tiles' centres.
We use texture-based segmentation (entropy) to split the image into foreground -- cells and traps -- and background, which we then identify with an Otsu filter. Two methods are used to produce a template trap from these regions: pick the trap with the smallest minor axis length and average over all validated traps.
We use texture-based segmentation (entropy) to split the image into
foreground -- cells and traps -- and background, which we then identify with
an Otsu filter. Two methods are used to produce a template trap from these
regions: pick the trap with the smallest minor axis length and average over
all validated traps.
A peak-identifying algorithm recovers the x and y-axis location of traps in the original image, and we choose the approach to template that identifies the most tiles.
A peak-identifying algorithm recovers the x and y-axis location of traps in
the original image, and we choose the approach to template that identifies
the most tiles.
The experiment is stored as an array with a standard indexing order of (Time, Channels, Z-stack, X, Y).
The experiment is stored as an array with a standard indexing order of
(Time, Channels, Z-stack, X, Y).
"""
import logging
import re
......@@ -355,9 +367,9 @@ class Tiler(StepABC):
full: an array of images
"""
full = self.image[t, c]
if hasattr(full, "compute"): # If using dask fetch images here
if hasattr(full, "compute"):
# if using dask fetch images
full = full.compute(scheduler="synchronous")
return full
@property
......@@ -593,7 +605,10 @@ class Tiler(StepABC):
def get_channel_index(self, channel: str or int) -> int or None:
"""
Find index for channel using regex. Returns the first matched string.
Find index for channel using regex.
Return the first matched string.
If self.channels is integers (no image metadata) it returns None.
If channel is integer
......@@ -602,10 +617,8 @@ class Tiler(StepABC):
channel: string or int
The channel or index to be used.
"""
if all(map(lambda x: isinstance(x, int), self.channels)):
channel = channel if isinstance(channel, int) else None
if isinstance(channel, str):
channel = find_channel_index(self.channels, channel)
return channel
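Illustratively, for a Tiler instance (here called tiler) whose channels come from the image metadata:

i = tiler.get_channel_index("GFP")   # index of the first channel matching "GFP"
j = tiler.get_channel_index(2)       # integers are returned unchanged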
......
......@@ -80,7 +80,7 @@ class Extractor(StepABC):
Usually the metric is applied to only a tile's masked area, but some metrics depend on the whole tile.
Extraction follows a three-level tree structure. Channels, such as GFP, are the root level; the reduction algorithm, such as maximum projection, is the second level; the specific metric, or operation, to apply to the masks, such as mean, is the third level.
Extraction follows a three-level tree structure. Channels, such as GFP, are the root level; the reduction algorithm, such as maximum projection, is the second level; the specific metric, or operation, to apply to the masks, such as mean, is the third or leaf level.
"""
# TODO Alan: Move this to a location with the SwainLab defaults
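For reference, an extraction tree of this form (channel -> reduction -> metrics) might look like the sketch below; the exact channel, reduction, and metric names depend on the experiment and on the registered functions:

tree = {
    "general": {"None": ["area", "volume"]},
    "GFP": {"max": ["mean", "median"]},
}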
......@@ -202,7 +202,7 @@ class Extractor(StepABC):
self._custom_funs[k] = tmp(f)
def load_funs(self):
"""Define all functions, including custum ones."""
"""Define all functions, including custom ones."""
self.load_custom_funs()
self._all_cell_funs = set(self._custom_funs.keys()).union(CELL_FUNS)
# merge the two dicts
......@@ -335,7 +335,7 @@ class Extractor(StepABC):
**kwargs,
) -> t.Dict[str, t.Dict[reduction_method, t.Dict[str, pd.Series]]]:
"""
Wrapper to apply reduction and then extraction.
Wrapper to reduce to a 2D image and then extract.
Parameters
----------
......@@ -499,7 +499,6 @@ class Extractor(StepABC):
# calculate metrics with subtracted bg
ch_bs = ch + "_bgsub"
# subtract median background
self.img_bgsub[ch_bs] = np.moveaxis(
np.stack(
list(
......@@ -579,7 +578,9 @@ class Extractor(StepABC):
**kwargs,
) -> dict:
"""
Wrapper to add compatibility with other steps of the pipeline.
Run extraction for one position and for the specified time points.
Save the results to a h5 file.
Parameters
----------
......@@ -597,7 +598,7 @@ class Extractor(StepABC):
Returns
-------
d: dict
A dict of the extracted data with a concatenated string of channel, reduction metric, and cell metric as keys and pd.Series of the extracted data as values.
A dict of the extracted data for one position with a concatenated string of channel, reduction metric, and cell metric as keys and pd.DataFrame of the extracted data for all time points as values.
"""
if tree is None:
tree = self.params.tree
......@@ -633,7 +634,7 @@ class Extractor(StepABC):
def save_to_hdf(self, dict_series, path=None):
"""
Save the extracted data to the h5 file.
Save the extracted data for one position to the h5 file.
Parameters
----------
......
# File with defaults for ease of use
import re
import typing as t
from pathlib import Path
import h5py
# should we move these functions here?
from aliby.tile.tiler import find_channel_name
......@@ -59,6 +58,7 @@ def exparams_from_meta(
for ch in extant_fluorescence_ch:
base["tree"][ch] = default_reduction_metrics
base["sub_bg"] = extant_fluorescence_ch
# additional extraction defaults if the channels are available
if "ph" in extras:
# SWAINLAB specific names
......
......@@ -14,9 +14,10 @@ from postprocessor.core.abc import get_process
class Chainer(Signal):
"""
Extend Signal by applying post-processes and allowing composite signals that combine basic signals.
It "chains" multiple processes upon fetching a dataset to produce the desired datasets.
Instead of reading processes previously applied, it executes
Chainer "chains" multiple processes upon fetching a dataset.
Instead of reading processes previously applied, Chainer executes
them when called.
"""
......@@ -25,6 +26,7 @@ class Chainer(Signal):
}
def __init__(self, *args, **kwargs):
"""Initialise chainer."""
super().__init__(*args, **kwargs)
def replace_path(path: str, bgsub: bool = ""):
......@@ -34,7 +36,7 @@ class Chainer(Signal):
path = re.sub(channel, f"{channel}{suffix}", path)
return path
# Add chain with and without bgsub for composite statistics
# add chain with and without bgsub for composite statistics
self.common_chains = {
alias
+ bgsub: lambda **kwargs: self.get(
......
......@@ -12,19 +12,20 @@ from postprocessor.core.abc import PostProcessABC
class LineageProcessParameters(ParametersABC):
"""
Parameters
"""
"""Parameters - none are necessary."""
_defaults = {}
class LineageProcess(PostProcessABC):
"""
Lineage process that must be passed a (N,3) lineage matrix (where the columns are trap, mother, daughter respectively)
To analyse lineage data.
Currently bare bones, but extracts lineage information from a Signal or Cells object.
"""
def __init__(self, parameters: LineageProcessParameters):
"""Initialise using PostProcessABC."""
super().__init__(parameters)
@abstractmethod
......@@ -34,6 +35,7 @@ class LineageProcess(PostProcessABC):
lineage: np.ndarray,
*args,
):
"""Implement method required by PostProcessABC - undefined."""
pass
@classmethod
......@@ -45,8 +47,9 @@ class LineageProcess(PostProcessABC):
**kwargs,
):
"""
Overrides PostProcess.as_function classmethod.
Lineage functions require lineage information to be passed if run as function.
Override the PostProcessABC.as_function classmethod.
Lineage functions require lineage information to be passed if run as functions.
"""
parameters = cls.default_parameters(**kwargs)
return cls(parameters=parameters).run(
......@@ -54,8 +57,9 @@ class LineageProcess(PostProcessABC):
)
def get_lineage_information(self, signal=None, merged=True):
"""Get lineage as an array with tile IDs, mother labels, and corresponding bud labels."""
if signal is not None and "mother_label" in signal.index.names:
# from kymograph
lineage = get_index_as_np(signal)
elif hasattr(self, "lineage"):
lineage = self.lineage
......@@ -68,5 +72,5 @@ class LineageProcess(PostProcessABC):
elif self.cells is not None:
lineage = self.cells.mothers_daughters
else:
raise Exception("No linage information found")
raise Exception("No lineage information found")
return lineage
# change "prepost" to "preprocess"; change filename to postprocessor_engine.py ??
import typing as t
from itertools import takewhile
......@@ -61,36 +62,24 @@ class PostProcessorParameters(ParametersABC):
kind: list of str
If "ph_batman" included, add targets for experiments using pHlourin.
"""
# each subitem specifies the function to be called and the location
# on the h5 file to be written
# each subitem specifies the function to be called
# and the h5-file location for the results
#: why does merger have a string and picker a list?
targets = {
"prepost": {
"merger": "/extraction/general/None/area",
"picker": ["/extraction/general/None/area"],
},
"processes": [
[
"buddings",
["/extraction/general/None/volume"],
],
[
"dsignal",
[
"/extraction/general/None/volume",
],
],
[
"bud_metric",
[
"/extraction/general/None/volume",
],
],
[
"dsignal",
[
"/postprocessing/bud_metric/extraction_general_None_volume",
],
],
["buddings", ["/extraction/general/None/volume"]],
# ["dsignal", ["/extraction/general/None/volume"]],
["bud_metric", ["/extraction/general/None/volume"]],
# [
# "dsignal",
# [
# "/postprocessing/bud_metric/extraction_general_None_volume"
# ],
# ],
],
}
param_sets = {
......@@ -129,7 +118,7 @@ class PostProcessorParameters(ParametersABC):
class PostProcessor(ProcessABC):
def __init__(self, filename, parameters):
"""
Initialise PostProcessor
Initialise PostProcessor.
Parameters
----------
......@@ -150,7 +139,7 @@ class PostProcessor(ProcessABC):
for k in dicted_params.keys():
if not isinstance(dicted_params[k], dict):
dicted_params[k] = dicted_params[k].to_dict()
# merger and picker
# initialise merger and picker
self.merger = Merger(
MergerParameters.from_dict(dicted_params["merger"])
)
......@@ -158,12 +147,12 @@ class PostProcessor(ProcessABC):
PickerParameters.from_dict(dicted_params["picker"]),
cells=Cells.from_source(filename),
)
# processes, such as buddings
# get processes, such as buddings
self.classfun = {
process: get_process(process)
for process, _ in parameters["targets"]["processes"]
}
# parameters for the process in classfun
# get parameters for the processes in classfun
self.parameters_classfun = {
process: get_parameters(process)
for process, _ in parameters["targets"]["processes"]
......@@ -172,31 +161,32 @@ class PostProcessor(ProcessABC):
self.targets = parameters["targets"]
def run_prepost(self):
"""Using picker, get and write lineages, returning mothers and daughters."""
"""Important processes run before normal post-processing ones"""
"""
Run picker and merger and get lineages.
Necessary before any processes can run.
"""
# run merger
record = self._signal.get_raw(self.targets["prepost"]["merger"])
merges = np.array(self.merger.run(record), dtype=int)
self._writer.write(
"modifiers/merges", data=[np.array(x) for x in merges]
)
# get lineages from picker
lineage = _assoc_indices_to_3d(self.picker.cells.mothers_daughters)
lineage_merged = []
if merges.any(): # Update lineages after merge events
if merges.any():
# update lineages after merge events
merged_indices = merge_association(lineage, merges)
# Remove repeated labels post-merging
# remove repeated labels post-merging
lineage_merged = np.unique(merged_indices, axis=0)
self.lineage = _3d_index_to_2d(
lineage_merged if len(lineage_merged) else lineage
)
self._writer.write(
"modifiers/lineage_merged", _3d_index_to_2d(lineage_merged)
)
# run picker
picked_indices = self.picker.run(
self._signal[self.targets["prepost"]["picker"][0]]
)
......@@ -211,25 +201,15 @@ class PostProcessor(ProcessABC):
overwrite="overwrite",
)
@staticmethod
def pick_mother(a, b):
"""Update the mother id following this priorities:
The mother has a lower id
"""
x = max(a, b)
if min([a, b]):
x = [a, b][np.argmin([a, b])]
return x
def run(self):
"""
Write the results to the h5 file.
Processes include identifying buddings and finding bud metrics.
"""
# run merger, picker, and find lineages
self.run_prepost()
# run processes
# run processes: process is a str; datasets is a list of str
for process, datasets in tqdm(self.targets["processes"]):
if process in self.parameters["param_sets"].get("processes", {}):
# parameters already assigned
......@@ -243,16 +223,14 @@ class PostProcessor(ProcessABC):
loaded_process = self.classfun[process](parameters)
if isinstance(parameters, LineageProcessParameters):
loaded_process.lineage = self.lineage
# apply process to each dataset
for dataset in datasets:
self.run_process(dataset, process, loaded_process)
def run_process(self, dataset, process, loaded_process):
"""Run process on a single dataset and write the result."""
# define signal
"""Run process to obtain a single dataset and write the result."""
# get pre-processed data
if isinstance(dataset, list):
# multisignal process
signal = [self._signal[d] for d in dataset]
elif isinstance(dataset, str):
signal = self._signal[dataset]
......@@ -269,8 +247,9 @@ class PostProcessor(ProcessABC):
[], columns=signal.columns, index=signal.index
)
result.columns.names = ["timepoint"]
# define outpath, where result will be written
# use outpath to write result
if process in self.parameters["outpaths"]:
# outpath already defined
outpath = self.parameters["outpaths"][process]
elif isinstance(dataset, list):
# no outpath is defined
......@@ -318,3 +297,15 @@ class PostProcessor(ProcessABC):
metadata: t.Dict,
):
self._writer.write(path, result, meta=metadata, overwrite="overwrite")
@staticmethod
def pick_mother(a, b):
"""
Update the mother id, assuming the mother is the cell with the lower non-zero id.
"""
x = max(a, b)
if min([a, b]):
x = [a, b][np.argmin([a, b])]
return x
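For example:

PostProcessor.pick_mother(3, 5)   # -> 3, the lower non-zero id
PostProcessor.pick_mother(0, 5)   # -> 5, because zero means no mother was assigned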
import typing as t
from typing import Dict, Tuple
import numpy as np
import pandas as pd
......@@ -12,17 +11,16 @@ from postprocessor.core.lineageprocess import (
class BudMetricParameters(LineageProcessParameters):
"""
Parameters
"""
"""Give default location of lineage information."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class BudMetric(LineageProcess):
"""
Requires mother-bud information to create a new dataframe where the indices are mother ids and
values are the daughters' values for a given signal.
Requires mother-bud information to create a new dataframe where the
indices are mother ids and values are the daughters' values for a
given signal.
"""
def __init__(self, parameters: BudMetricParameters):
......@@ -31,94 +29,147 @@ class BudMetric(LineageProcess):
def run(
self,
signal: pd.DataFrame,
lineage: Dict[pd.Index, Tuple[pd.Index]] = None,
lineage: t.Dict[pd.Index, t.Tuple[pd.Index]] = None,
):
if lineage is None:
# define lineage
if hasattr(self, "lineage"):
lineage = self.lineage
else:
# lineage information in the Signal dataframe
assert "mother_label" in signal.index.names
lineage = signal.index.to_list()
return self.get_bud_metric(signal, mb_array_to_dict(lineage))
@staticmethod
def get_bud_metric(
signal: pd.DataFrame, md: Dict[Tuple, Tuple[Tuple]] = None
signal: pd.DataFrame, md: t.Dict[t.Tuple, t.Tuple[t.Tuple]] = None
):
"""
signal: Daughter-inclusive dataframe
md: Mother-daughters dictionary where key is mother's index and value a list of daugher indices
Get fvi (First Valid Index) for all cells
Create empty matrix
for every mother:
- Get daughters' subdataframe
- sort daughters by cell label
- get series of fvis
- concatenate the values of these ranges from the dataframe
Fill the empty matrix
Convert matrix into dataframe using mother indices
Generate a dataframe of a Signal for buds indexed by their mothers,
concatenating data from all the buds for each mother.
Parameters
---------
signal: pd.Dataframe
A dataframe that includes data for both mothers and daughters.
md: dict
A dict of lineage information with each key a mother's index,
defined as (trap, cell_label), and the corresponding values are a
list of daughter indices, also defined as (trap, cell_label).
"""
mothers_mat = np.zeros((len(md), signal.shape[1]))
cells_were_dropped = 0 # Flag determines if mothers (1), daughters (2) or both were missing (3)
md_index = signal.index
if (
"mother_label" not in md_index.names
): # Generate mother label from md dict if unavailable
d = {v: k for k, values in md.items() for v in values}
# md_index should only comprise (trap, cell_label)
if "mother_label" not in md_index.names:
# dict with daughter indices as keys and mother indices as values
bud_dict = {v: k for k, values in md.items() for v in values}
# generate mother_label in Signal using the mother's cell_label
# cells with no mothers have a mother_label of 0
signal["mother_label"] = list(
map(lambda x: d.get(x, [0])[-1], signal.index)
map(lambda x: bud_dict.get(x, [0])[-1], signal.index)
)
signal.set_index("mother_label", append=True, inplace=True)
related_items = set(
[*md.keys(), *[y for x in md.values() for y in x]]
)
md_index = md_index.intersection(related_items)
elif "mother_label" in md_index.names:
md_index = md_index.droplevel("mother_label")
# combine mothers and daughter indices
mothers_index = md.keys()
daughters_index = [y for x in md.values() for y in x]
relations = set([*mothers_index, *daughters_index])
# keep from md_index only cells that are mother or daughters
md_index = md_index.intersection(relations)
else:
raise ("Unavailable relationship information")
md_index = md_index.droplevel("mother_label")
if len(md_index) < len(signal):
print("Dropped cells before bud_metric") # TODO log
print("Dropped cells before applying bud_metric") # TODO log
# restrict signal to the cells in md_index moving mother_label to do so
signal = (
signal.reset_index("mother_label")
.loc(axis=0)[md_index]
.set_index("mother_label", append=True)
)
names = list(signal.index.names)
del names[-2]
output_df = (
signal.loc[signal.index.get_level_values("mother_label") > 0]
.groupby(names)
.apply(lambda x: _combine_daughter_tracks(x))
# restrict to daughters: cells with a mother
mother_labels = signal.index.get_level_values("mother_label")
daughter_df = signal.loc[mother_labels > 0]
# join data for daughters with the same mother
output_df = daughter_df.groupby(["trap", "mother_label"]).apply(
lambda x: _combine_daughter_tracks(x)
)
output_df.columns = signal.columns
output_df["padding_level"] = 0
output_df.set_index("padding_level", append=True, inplace=True)
# daughter data is indexed by mothers, which themselves have no mothers
output_df["temp_mother_label"] = 0
output_df.set_index("temp_mother_label", append=True, inplace=True)
if len(output_df):
output_df.index.names = signal.index.names
return output_df
def _combine_daughter_tracks(tracks: t.Collection[pd.Series]):
def _combine_daughter_tracks(tracks: pd.DataFrame):
"""
Combine multiple time series of daughter cells into one time series.
Concatenate daughter values into one time series starting with the first
daughter and replacing later values with the values from the next daughter,
and so on.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
no_rows = len(bud_df)
bud_df.index = range(no_rows)
# find time point of first non-NaN data point of each row
init_tps = [
bud_df.iloc[irow].first_valid_index() for irow in range(no_rows)
]
# sort so that earliest daughter is first
sorted_rows = np.argsort(init_tps)
init_tps = np.sort(init_tps)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for j, jrow in enumerate(sorted_rows):
# over-write with next earliest daughter
combined_tracks[bud_df.columns.get_loc(init_tps[j]) :] = (
bud_df.iloc[jrow].loc[init_tps[j] :].values
)
return pd.Series(combined_tracks, index=tracks.columns)
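A worked example of the overwriting behaviour, with made-up values:

# two daughters of one mother; columns are time points 0..4
#   daughter A: [nan, 2, 3, 4, 5]       first valid at t=1
#   daughter B: [nan, nan, nan, 7, 8]   first valid at t=3
# combined:     [nan, 2, 3, 7, 8]
# A fills from t=1 onwards; B then overwrites from t=3 onwards.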
def _combine_daughter_tracks_original(tracks: pd.DataFrame):
"""
Combine multiple time series of cells into one, overwriting values
prioritising the most recent entity.
Combine multiple time series of daughter cells into one time series.
At any one time, a mother cell should have only one daughter.
Two daughters are still sometimes present at the same time point, and we
then choose the daughter that appears first.
TODO We need to fix examples with more than one daughter at a time point.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
sorted_da_ids = tracks.sort_index(level="cell_label")
sorted_da_ids.index = range(len(sorted_da_ids))
tp_fvt = sorted_da_ids.apply(lambda x: x.first_valid_index(), axis=0)
tp_fvt = sorted_da_ids.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt < 0] = len(sorted_da_ids) - 1
_metric = np.choose(tp_fvt, sorted_da_ids.values)
return pd.Series(_metric, index=tracks.columns)
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
bud_df.index = range(len(bud_df))
# find which row of sorted_df has the daughter for each time point
tp_fvt: pd.Series = bud_df.apply(lambda x: x.first_valid_index(), axis=0)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for bud_row in np.unique(tp_fvt.dropna().values).astype(int):
ilocs = np.where(tp_fvt.values == bud_row)[0]
combined_tracks[ilocs] = bud_df.values[bud_row, ilocs]
# TODO delete old version
tp_fvt = bud_df.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt == -1] = len(bud_df) - 1
old = np.choose(tp_fvt, bud_df.values)
assert (
(combined_tracks == old) | (np.isnan(combined_tracks) & np.isnan(old))
).all(), "yikes"
return pd.Series(combined_tracks, index=tracks.columns)
......@@ -13,74 +13,69 @@ from postprocessor.core.lineageprocess import (
class buddingsParameters(LineageProcessParameters):
"""Parameter class to obtain budding events.
Parameters
----------
LineageProcessParameters : lineage_location
Location of lineage matrix to be used for calculations.
Examples
--------
FIXME: Add docs.
"""
"""Give the location of lineage information in the h5 file."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class buddings(LineageProcess):
"""
Calculate buddings in a trap assuming one mother per trap
returns a pandas series with the buddings.
Generate a dataframe of budding events.
We assume one mother per trap.
We define a budding event as the moment in which a bud was identified for
the first time, even if the bud is not considered one until later
in the experiment.
A bud may not be considered a bud until later in the experiment.
"""
def __init__(self, parameters: buddingsParameters):
"""Initialise buddings."""
super().__init__(parameters)
def run(
self, signal: pd.DataFrame, lineage: np.ndarray = None
) -> pd.DataFrame:
lineage = lineage or self.lineage
"""
Generate dataframe of budding events.
# Get time of first appearance for all cells
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
Find daughters for those mothers in a Signal with lineage data.
Create a dataframe indicating the time each daughter first appears.
# Select mother cells in a given dataset
We use the data from Signal only to find when the daughters appear, by
their first non-NaN value.
"""
# lineage is (trap, mother, daughter)
lineage = lineage or self.lineage
# select traps and mothers in the signal that have lineage data
traps_mothers: t.Dict[tuple, list] = {
tuple(mo): [] for mo in lineage[:, :2] if tuple(mo) in signal.index
tuple(trap_mo): []
for trap_mo in lineage[:, :2]
if tuple(trap_mo) in signal.index
}
# add daughters, potentially multiple, for these traps and mothers
for trap, mother, daughter in lineage:
if (trap, mother) in traps_mothers.keys():
traps_mothers[(trap, mother)].append(daughter)
# a new dataframe with dimensions (n_mother_cells * n_tps)
mothers = signal.loc[
set(signal.index).intersection(traps_mothers.keys())
]
# Create a new dataframe with dimensions (n_mother_cells * n_timepoints)
buddings = pd.DataFrame(
np.zeros((mothers.shape[0], signal.shape[1])).astype(bool),
np.zeros(mothers.shape).astype(bool),
index=mothers.index,
columns=signal.columns,
)
buddings.columns.names = ["timepoint"]
# Fill the budding events
for mother_id, daughters in traps_mothers.items():
daughters_idx = set(
fvi.loc[
fvi.index.intersection(
list(product((mother_id[0],), daughters))
)
].values
).difference({0})
buddings.loc[
mother_id,
daughters_idx,
] = True
# get time of first non-NaN value of signal for every cell using Pandas
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
# fill the budding events
for trap_mother_id, daughters in traps_mothers.items():
trap_daughter_ids = [
i for i in product((trap_mother_id[0],), daughters)
]
times_of_bud_appearance = fvi.loc[
fvi.index.intersection(trap_daughter_ids)
].values
# ignore zeros - buds in first image are not budding events
daughters_idx = set(times_of_bud_appearance).difference({0})
buddings.loc[trap_mother_id, daughters_idx] = True
return buddings