From 5c9484033954cd9f5d4b5bddffd5f3ce6e17cb22 Mon Sep 17 00:00:00 2001
From: pswain <peter.swain@ed.ac.uk>
Date: Thu, 30 May 2024 14:58:59 +0100
Subject: [PATCH] change: alibylite.org; dataset to get_position_ids

---
 alibylite.org                         | 75 +++++++++++++++++++++++++++
 src/agora/io/cells.py                 |  2 +-
 src/aliby/io/dataset.py               | 32 ++++++------
 src/extraction/core/extractor.py      | 21 ++++----
 src/extraction/core/functions/cell.py |  5 +-
 5 files changed, 107 insertions(+), 28 deletions(-)
 create mode 100644 alibylite.org

diff --git a/alibylite.org b/alibylite.org
new file mode 100644
index 0000000..1e686ec
--- /dev/null
+++ b/alibylite.org
@@ -0,0 +1,75 @@
+#+title: aliby
+
+The microscope visits multiple positions during an experiment. Each position may have a different setup or strain. We denote this strain as a *group*. For every position, we take an image for every time point.
+
+We divide all images into *tiles*, one per trap. Baby determines masks, mother-bud pairs, and tracking for each tile. We obtain data on individual cells first for each tile and then for each position for all time points: *cells* and *signal* provide this information for a position; *grouper* concatenates over all positions.
+
+All global parameters, such as possible fluorescence channels and pixel sizes, are stored in *global_parameters*.
+
+* aliby/pipeline
+Runs the *tiler*, *baby*, and *extraction* steps of the pipeline, and then *postprocessing*.
+The *run* function loops through positions, calling *run_one_position*, which loops through time points.
+For each time point, each step of the pipeline has a *_run_tp* function, which StepABC renames to *run_tp*, to process one time point for a position.
+Extractor does not have an independent writer, but writes to the h5 file in *_run_tp*.
+
+* aliby/tile/tiler
+Tiles image into smaller regions of interest or tiles, one per trap, for faster processing. We ignore tiles without cells.
+
+* aliby/baby_sitter
+Interfaces with Baby through the *BabyRunner* class, which returns a dict of Baby's results.
+
+* extraction/core/extractor
+Extracts areas and volumes and the fluorescence data from the images for the cells in one position, via the image tiles, using the cell masks found by Baby.
+
+We save the cell properties we wish to extract as a nested dictionary, such as
+        {'general': {'None': ['area', 'volume', 'eccentricity']}}.
+*extract_tp* extracts data for one time point.
+
+** extraction/core/functions/cell
+Defines the standard functions, such as area and median, that we apply to pixels from individual cells.
+** extraction/core/functions/trap
+Determines properties of a tile's background.
+** extraction/core/functions/distributors
+Collapses multiple z-sections to a 2D image.
+** extraction/core/functions/defaults
+Defines the standard fluorescence signals and metrics, like median, we extract in *exparams_from_meta*.
+** extraction/core/functions/custom/localisation
+Defines more complex functions to apply to cells, such as *nuc_est_conv*, which estimates nuclear localisation of a fluorescent protein.
+
+* agora/bridge
+Interfaces with h5 files.
+* agora/cells
+Accesses information on cells and masks in tiles from an h5 file.
+* agora/signal
+Gets extracted properties, such as median fluorescence, for all cells and all time points from an h5 file - data for one position.
+
+Signal applies picking and merging of cells using the choices made by *picker* and *merger*. *get_raw* gets the data from the h5 file without any picking and merging.
+
+* postprocessor/core/processor
+For one position, the *run* function performs picking, of appropriate cells, and merging, of tracklets, via *run_prepost* and then runs processes, such as the *buddings* and *bud_metric* functions, on signals, such as *volume*, to get new signals, such as *buddings* and *bud_volume*.
+
+*run_process* writes the results to an h5 file.
+
+The class *PostProcessorParameters* lists the standard processes we perform, such as running *buddings* and *bud_metric* on *area*.
+
+* postprocessor/core/reshapers/picker
+Selects cells from a Signal for which there is lineage information and by how long they remain in the experiment, writing the choices to the h5 file.
+* postprocessor/core/reshapers/merger
+Combines tracks that should be a single track of the same cell, writing the choices to the h5 file.
+* agora/utils/indexing
+Core code needed when *picker* uses Baby's lineage information to select mother-bud pairs in a Signal.
+
+* postprocessor/grouper
+*concat_signal*: Concatenates signals from different h5 files - we have one per position - to generate dataframes for the entire experiment.
+It uses either *concat_signal_ind* for independent signals or *concat_standard*.
+
+* aliby/utils/argo
+Gets information on the data available in an OMERO database.
+
+* aliby/io/omero
+Contains functions to interact with OMERO and extract information on an *Image* corresponding to an OMERO image ID or a *Dataset* corresponding to an OMERO experiment ID.
+
+* Language
+We use *tile* and *trap* interchangeably, but *tile* is preferred.
+We use *bud* and *daughter* interchangeably, but *bud* is preferred.
+We use *record* and *kymograph* interchangeably, but *record* is preferred.
diff --git a/src/agora/io/cells.py b/src/agora/io/cells.py
index a8ea422..29851f0 100644
--- a/src/agora/io/cells.py
+++ b/src/agora/io/cells.py
@@ -16,7 +16,7 @@ class Cells:
     """
     Extract information from an h5 file.
 
-    Use output from BABY to find cells detected, get, and fill, edge masks
+    Use output from BABY to find cells detected, get and fill edge masks,
     and retrieve mother-bud relationships.
 
     This class accesses in the h5 file:
diff --git a/src/aliby/io/dataset.py b/src/aliby/io/dataset.py
index 0ec3185..dabd00e 100644
--- a/src/aliby/io/dataset.py
+++ b/src/aliby/io/dataset.py
@@ -3,29 +3,31 @@
 Dataset is a group of classes to manage multiple types of experiments:
  - Remote experiments on an OMERO server (located in src/aliby/io/omero.py)
  - Local experiments in a multidimensional OME-TIFF image containing the metadata
- - Local experiments in a directory containing multiple positions in independent images with or without metadata
+ - Local experiments in a directory containing multiple positions in independent
+images with or without metadata
 """
 import os
 import shutil
 import time
 import typing as t
-from abc import ABC, abstractproperty, abstractmethod
+from abc import ABC, abstractmethod, abstractproperty
 from pathlib import Path
 
-from agora.io.bridge import BridgeH5
 from aliby.io.image import ImageLocalOME
+from aliby.io.omero import Dataset
 
 
 def dispatch_dataset(expt_id: int or str, **kwargs):
     """
     Find paths to the data.
 
-    Connects to OMERO if data is remotely available.
+    Connect to OMERO if data is remotely available.
 
     Parameters
     ----------
     expt_id: int or str
-        To identify the data, either an OMERO ID or an OME-TIFF file or a local directory.
+        To identify the data, either an OMERO ID or an OME-TIFF file
+        or a local directory.
 
     Returns
     -------
@@ -33,20 +35,18 @@ def dispatch_dataset(expt_id: int or str, **kwargs):
     """
     if isinstance(expt_id, int):
         # data available online
-        from aliby.io.omero import Dataset
-
         return Dataset(expt_id, **kwargs)
     elif isinstance(expt_id, str):
         # data available locally
         expt_path = Path(expt_id)
         if expt_path.is_dir():
-            # data in multiple folders
+            # data in multiple folders, such as zarr
             return DatasetLocalDir(expt_path)
         else:
             # data in one folder as OME-TIFF files
             return DatasetLocalOME(expt_path)
     else:
-        raise Warning(f"{expt_id} is an invalid expt_id")
+        raise Warning(f"{expt_id} is an invalid expt_id.")
 
 
 class DatasetLocalABC(ABC):
@@ -103,7 +103,7 @@ class DatasetLocalABC(ABC):
         pass
 
     @abstractmethod
-    def get_images(self):
+    def get_position_ids(self):
         pass
 
 
@@ -120,12 +120,13 @@ class DatasetLocalDir(DatasetLocalABC):
             "%Y%m%d", time.strptime(time.ctime(os.path.getmtime(self.path)))
         )
 
-    def get_images(self):
-        """Return a dictionary of folder or file names and their paths.
+    def get_position_ids(self):
+        """
+        Return a dict of file paths for each position.
 
         FUTURE 3.12 use pathlib is_junction to pick Dir or File
         """
-        images = {
+        position_ids_dict = {
             item.name: item
             for item in self.path.glob("*/")
             if item.is_dir()
@@ -136,8 +137,7 @@ class DatasetLocalDir(DatasetLocalABC):
             )
             or item.suffix[1:] in self._valid_suffixes
         }
-
-        return images
+        return position_ids_dict
 
 
 class DatasetLocalOME(DatasetLocalABC):
@@ -154,7 +154,7 @@ class DatasetLocalOME(DatasetLocalABC):
         """Get the date from the metadata of the first position."""
         return ImageLocalOME(list(self.get_position_ids().values())[0]).date
 
-    def get_images(self):
+    def get_position_ids(self):
         """Return a dictionary with the names of the image files."""
         return {
             f.name: str(f)
diff --git a/src/extraction/core/extractor.py b/src/extraction/core/extractor.py
index fa379e4..b34bc49 100644
--- a/src/extraction/core/extractor.py
+++ b/src/extraction/core/extractor.py
@@ -122,6 +122,7 @@ class Extractor(StepABC):
     or leaf level.
     """
 
+    # get pixel_size; z_size; spacing
     default_meta = global_parameters.imaging_specifications
 
     def __init__(
@@ -758,7 +759,7 @@ class Extractor(StepABC):
         elif isinstance(tps, int):
             tps = [tps]
         # store results in dict
-        d = {}
+        extract_dict = {}
         for tp in tps:
             # extract for each time point and convert to dict of pd.Series
             new = flatten_nesteddict(
@@ -767,21 +768,23 @@ class Extractor(StepABC):
                 tp=tp,
             )
             # concatenate with data extracted from earlier time points
-            for k in new.keys():
-                d[k] = pd.concat((d.get(k, None), new[k]), axis=1)
+            for key in new.keys():
+                extract_dict[key] = pd.concat(
+                    (extract_dict.get(key, None), new[key]), axis=1
+                )
         # add indices to pd.Series containing the extracted data
-        for k in d.keys():
+        for k in extract_dict.keys():
             indices = ["experiment", "position", "trap", "cell_label"]
             idx = (
-                indices[-d[k].index.nlevels :]
-                if d[k].index.nlevels > 1
+                indices[-extract_dict[k].index.nlevels :]
+                if extract_dict[k].index.nlevels > 1
                 else [indices[-2]]
             )
-            d[k].index.names = idx
+            extract_dict[k].index.names = idx
         # save
         if save:
-            self.save_to_h5(d)
-        return d
+            self.save_to_h5(extract_dict)
+        return extract_dict
 
     def save_to_h5(self, dict_series, path=None):
         """
diff --git a/src/extraction/core/functions/cell.py b/src/extraction/core/functions/cell.py
index c83b17e..7570749 100644
--- a/src/extraction/core/functions/cell.py
+++ b/src/extraction/core/functions/cell.py
@@ -8,9 +8,10 @@ must return only one value.
 They assume that there are no NaNs in the image.
 
 We use the module bottleneck when it performs faster than numpy:
-- Median
-- values containing NaNs (but we make sure this does not happen)
+- median
+- values containing NaNs (but we make sure this never happens).
 """
+
 import math
 import typing as t
 
-- 
GitLab