Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • swain-lab/aliby/aliby-mirror
  • swain-lab/aliby/alibylite
2 results
Show changes
import typing as t
from typing import Dict, Tuple
import numpy as np
import pandas as pd
......@@ -12,17 +11,16 @@ from postprocessor.core.lineageprocess import (
class BudMetricParameters(LineageProcessParameters):
"""
Parameters
"""
"""Give default location of lineage information."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class BudMetric(LineageProcess):
"""
Requires mother-bud information to create a new dataframe where the indices are mother ids and
values are the daughters' values for a given signal.
Requires mother-bud information to create a new dataframe where the
indices are mother ids and values are the daughters' values for a
given signal.
"""
def __init__(self, parameters: BudMetricParameters):
......@@ -31,94 +29,149 @@ class BudMetric(LineageProcess):
def run(
self,
signal: pd.DataFrame,
lineage: Dict[pd.Index, Tuple[pd.Index]] = None,
lineage: t.Dict[pd.Index, t.Tuple[pd.Index]] = None,
):
if lineage is None:
# define lineage
if hasattr(self, "lineage"):
lineage = self.lineage
else:
# lineage information in the Signal dataframe
assert "mother_label" in signal.index.names
lineage = signal.index.to_list()
return self.get_bud_metric(signal, mb_array_to_dict(lineage))
@staticmethod
def get_bud_metric(
signal: pd.DataFrame, md: Dict[Tuple, Tuple[Tuple]] = None
signal: pd.DataFrame, md: t.Dict[t.Tuple, t.Tuple[t.Tuple]] = None
):
"""
signal: Daughter-inclusive dataframe
md: Mother-daughters dictionary where key is mother's index and value a list of daugher indices
Get fvi (First Valid Index) for all cells
Create empty matrix
for every mother:
- Get daughters' subdataframe
- sort daughters by cell label
- get series of fvis
- concatenate the values of these ranges from the dataframe
Fill the empty matrix
Convert matrix into dataframe using mother indices
Generate a dataframe of a Signal for buds indexed by their mothers,
concatenating data from all the buds for each mother.
Parameters
---------
signal: pd.Dataframe
A dataframe that includes data for both mothers and daughters.
md: dict
A dict of lineage information with each key a mother's index,
defined as (trap, cell_label), and the corresponding values are a
list of daughter indices, also defined as (trap, cell_label).
"""
mothers_mat = np.zeros((len(md), signal.shape[1]))
cells_were_dropped = 0 # Flag determines if mothers (1), daughters (2) or both were missing (3)
md_index = signal.index
if (
"mother_label" not in md_index.names
): # Generate mother label from md dict if unavailable
d = {v: k for k, values in md.items() for v in values}
# md_index should only comprise (trap, cell_label)
if "mother_label" not in md_index.names:
# dict with daughter indices as keys and mother indices as values
bud_dict = {v: k for k, values in md.items() for v in values}
# generate mother_label in Signal using the mother's cell_label
# cells with no mothers have a mother_label of 0
signal["mother_label"] = list(
map(lambda x: d.get(x, [0])[-1], signal.index)
map(lambda x: bud_dict.get(x, [0])[-1], signal.index)
)
signal.set_index("mother_label", append=True, inplace=True)
related_items = set(
[*md.keys(), *[y for x in md.values() for y in x]]
)
md_index = md_index.intersection(related_items)
elif "mother_label" in md_index.names:
md_index = md_index.droplevel("mother_label")
# combine mothers and daughter indices
mothers_index = md.keys()
daughters_index = [y for x in md.values() for y in x]
relations = set([*mothers_index, *daughters_index])
# keep from md_index only cells that are mother or daughters
md_index = md_index.intersection(relations)
else:
raise ("Unavailable relationship information")
md_index = md_index.droplevel("mother_label")
if len(md_index) < len(signal):
print("Dropped cells before bud_metric") # TODO log
print(
f"Dropped {len(signal) - len(md_index)} cells before applying bud_metric"
) # TODO log
# restrict signal to the cells in md_index moving mother_label to do so
signal = (
signal.reset_index("mother_label")
.loc(axis=0)[md_index]
.set_index("mother_label", append=True)
)
names = list(signal.index.names)
del names[-2]
output_df = (
signal.loc[signal.index.get_level_values("mother_label") > 0]
.groupby(names)
.apply(lambda x: _combine_daughter_tracks(x))
# restrict to daughters: cells with a mother
mother_labels = signal.index.get_level_values("mother_label")
daughter_df = signal.loc[mother_labels > 0]
# join data for daughters with the same mother
output_df = daughter_df.groupby(["trap", "mother_label"]).apply(
combine_daughter_tracks
)
output_df.columns = signal.columns
output_df["padding_level"] = 0
output_df.set_index("padding_level", append=True, inplace=True)
# daughter data is indexed by mothers, which themselves have no mothers
output_df["temp_mother_label"] = 0
output_df.set_index("temp_mother_label", append=True, inplace=True)
if len(output_df):
output_df.index.names = signal.index.names
return output_df
def _combine_daughter_tracks(tracks: t.Collection[pd.Series]):
def combine_daughter_tracks(tracks: pd.DataFrame):
"""
Combine multiple time series of daughter cells into one time series.
Concatenate daughter values into one time series starting with the first
daughter and replacing later values with the values from the next daughter,
and so on.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
no_rows = len(bud_df)
bud_df.index = range(no_rows)
# find time point of first non-NaN data point of each row
init_tps = [
bud_df.iloc[irow].first_valid_index() for irow in range(no_rows)
]
# sort so that earliest daughter is first
sorted_rows = np.argsort(init_tps)
init_tps = np.sort(init_tps)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for j, jrow in enumerate(sorted_rows):
# over-write with next earliest daughter
combined_tracks[bud_df.columns.get_loc(init_tps[j]) :] = (
bud_df.iloc[jrow].loc[init_tps[j] :].values
)
return pd.Series(combined_tracks, index=tracks.columns)
def _combine_daughter_tracks_original(tracks: pd.DataFrame):
"""
Combine multiple time series of cells into one, overwriting values
prioritising the most recent entity.
Combine multiple time series of daughter cells into one time series.
At any one time, a mother cell should have only one daughter.
Two daughters are still sometimes present at the same time point, and we
then choose the daughter that appears first.
TODO We need to fix examples with more than one daughter at a time point.
Parameters
----------
tracks: a Signal
Data for all daughters, which are distinguished by different cell_labels,
for a particular trap and mother_label.
"""
sorted_da_ids = tracks.sort_index(level="cell_label")
sorted_da_ids.index = range(len(sorted_da_ids))
tp_fvt = sorted_da_ids.apply(lambda x: x.first_valid_index(), axis=0)
tp_fvt = sorted_da_ids.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt < 0] = len(sorted_da_ids) - 1
_metric = np.choose(tp_fvt, sorted_da_ids.values)
return pd.Series(_metric, index=tracks.columns)
# sort by daughter IDs
bud_df = tracks.sort_index(level="cell_label")
# remove multi-index
bud_df.index = range(len(bud_df))
# find which row of sorted_df has the daughter for each time point
tp_fvt: pd.Series = bud_df.apply(lambda x: x.first_valid_index(), axis=0)
# combine data for all daughters
combined_tracks = np.nan * np.ones(tracks.columns.size)
for bud_row in np.unique(tp_fvt.dropna().values).astype(int):
ilocs = np.where(tp_fvt.values == bud_row)[0]
combined_tracks[ilocs] = bud_df.values[bud_row, ilocs]
# TODO delete old version
tp_fvt = bud_df.columns.get_indexer(tp_fvt)
tp_fvt[tp_fvt == -1] = len(bud_df) - 1
old = np.choose(tp_fvt, bud_df.values)
assert (
(combined_tracks == old) | (np.isnan(combined_tracks) & np.isnan(old))
).all(), "yikes"
return pd.Series(combined_tracks, index=tracks.columns)
......@@ -13,74 +13,69 @@ from postprocessor.core.lineageprocess import (
class buddingsParameters(LineageProcessParameters):
"""Parameter class to obtain budding events.
Parameters
----------
LineageProcessParameters : lineage_location
Location of lineage matrix to be used for calculations.
Examples
--------
FIXME: Add docs.
"""
"""Give the location of lineage information in the h5 file."""
_defaults = {"lineage_location": "postprocessing/lineage_merged"}
class buddings(LineageProcess):
"""
Calculate buddings in a trap assuming one mother per trap
returns a pandas series with the buddings.
Generate a dataframe of budding events.
We assume one mother per trap.
We define a budding event as the moment in which a bud was identified for
the first time, even if the bud is not considered one until later
in the experiment.
A bud may not be considered a bud until later in the experiment.
"""
def __init__(self, parameters: buddingsParameters):
"""Initialise buddings."""
super().__init__(parameters)
def run(
self, signal: pd.DataFrame, lineage: np.ndarray = None
) -> pd.DataFrame:
lineage = lineage or self.lineage
"""
Generate dataframe of budding events.
# Get time of first appearance for all cells
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
Find daughters for those mothers in a Signal with lineage data.
Create a dataframe indicating the time each daughter first appears.
# Select mother cells in a given dataset
We use the data from Signal only to find when the daughters appear, by
their first non-NaN value.
"""
# lineage is (trap, mother, daughter)
lineage = lineage or self.lineage
# select traps and mothers in the signal that have lineage data
traps_mothers: t.Dict[tuple, list] = {
tuple(mo): [] for mo in lineage[:, :2] if tuple(mo) in signal.index
tuple(trap_mo): []
for trap_mo in lineage[:, :2]
if tuple(trap_mo) in signal.index
}
# add daughters, potentially multiple, for these traps and mothers
for trap, mother, daughter in lineage:
if (trap, mother) in traps_mothers.keys():
traps_mothers[(trap, mother)].append(daughter)
# a new dataframe with dimensions (n_mother_cells * n_tps)
mothers = signal.loc[
set(signal.index).intersection(traps_mothers.keys())
]
# Create a new dataframe with dimensions (n_mother_cells * n_timepoints)
buddings = pd.DataFrame(
np.zeros((mothers.shape[0], signal.shape[1])).astype(bool),
np.zeros(mothers.shape).astype(bool),
index=mothers.index,
columns=signal.columns,
)
buddings.columns.names = ["timepoint"]
# Fill the budding events
for mother_id, daughters in traps_mothers.items():
daughters_idx = set(
fvi.loc[
fvi.index.intersection(
list(product((mother_id[0],), daughters))
)
].values
).difference({0})
buddings.loc[
mother_id,
daughters_idx,
] = True
# get time of first non-NaN value of signal for every cell using Pandas
fvi = signal.apply(lambda x: x.first_valid_index(), axis=1)
# fill the budding events
for trap_mother_id, daughters in traps_mothers.items():
trap_daughter_ids = [
i for i in product((trap_mother_id[0],), daughters)
]
times_of_bud_appearance = fvi.loc[
fvi.index.intersection(trap_daughter_ids)
].values
# ignore zeros - buds in first image are not budding events
daughters_idx = set(times_of_bud_appearance).difference({0})
buddings.loc[trap_mother_id, daughters_idx] = True
return buddings
......@@ -10,11 +10,11 @@ class MergerParameters(ParametersABC):
There are five parameters expected in the dict:
smooth, boolean
smooth: boolean
Whether or not to smooth with a savgol_filter.
tol: float or int
tol: float or int
The threshold of average prediction error/std necessary to
consider two tracks the same.
consider two tracks to be the same.
If float, the threshold is the fraction of the first track;
if int, the threshold is in absolute units.
window: int
......
......@@ -5,13 +5,24 @@ import pandas as pd
from agora.abc import ParametersABC
from agora.io.cells import Cells
from agora.utils.indexing import validate_association
from agora.utils.indexing import validate_lineage
from agora.utils.cast import _str_to_int
from agora.utils.kymograph import drop_mother_label
from postprocessor.core.lineageprocess import LineageProcess
class PickerParameters(ParametersABC):
"""
A dictionary specifying the sequence of picks in order.
"lineage" is further specified by "mothers", "daughters", and
"families" (mother-bud pairs).
"condition" is further specified by "present", "any_present", or
"growing" and a threshold, either a number of time points or a
fraction of the total duration of the experiment.
"""
_defaults = {
"sequence": [
["lineage", "families"],
......@@ -22,11 +33,8 @@ class PickerParameters(ParametersABC):
class Picker(LineageProcess):
"""
:cells: Cell object passed to the constructor
:condition: Tuple with condition and associated parameter(s), conditions can be
"present", "nonstoply_present" or "quantile".
Determine the thresholds or fractions of signals to use.
:lineage: str {"mothers", "daughters", "families" (mothers AND daughters), "orphans"}. Mothers/daughters picks cells with those tags, families pick the union of both and orphans the difference between the total and families.
Picker selects cells using lineage information and by
how and for how long they are retained in the data set.
"""
def __init__(
......@@ -34,6 +42,7 @@ class Picker(LineageProcess):
parameters: PickerParameters,
cells: Cells or None = None,
):
"""Initialise picker."""
super().__init__(parameters=parameters)
self.cells = cells
......@@ -43,93 +52,100 @@ class Picker(LineageProcess):
how: str,
mothers_daughters: t.Optional[np.ndarray] = None,
) -> pd.MultiIndex:
"""
Return rows of a signal corresponding to either mothers, daughters,
or mother-daughter pairs using lineage information.
"""
cells_present = drop_mother_label(signal.index)
mothers_daughters = self.get_lineage_information(signal)
valid_indices = slice(None)
if how == "mothers":
_, valid_indices = validate_association(
mothers_daughters, cells_present, match_column=0
)
elif how == "daughters":
_, valid_indices = validate_association(
mothers_daughters, cells_present, match_column=1
)
elif how == "families": # Mothers and daughters that are still present
_, valid_indices = validate_association(
mothers_daughters, cells_present
)
_, valid_indices = validate_lineage(
mothers_daughters, cells_present, how
)
return signal.index[valid_indices]
def pick_by_condition(self, signal, condition, thresh):
idx = self.switch_case(signal, condition, thresh)
return idx
def run(self, signal):
"""
Pick indices from the index of a signal's dataframe and return
as an array.
Typically, we first pick by lineage, then by condition.
"""
self.orig_signal = signal
indices = set(signal.index)
lineage = self.get_lineage_information(signal)
if len(lineage):
self.mothers = lineage[:, :2]
self.mothers = lineage[:, [0, 1]]
self.daughters = lineage[:, [0, 2]]
for alg, *params in self.sequence:
new_indices = tuple()
if indices:
if alg == "lineage":
# pick by lineage
param1 = params[0]
new_indices = getattr(self, "pick_by_" + alg)(
new_indices = self.pick_by_lineage(
signal.loc[list(indices)], param1
)
else:
# pick by condition
param1, *param2 = params
new_indices = getattr(self, "pick_by_" + alg)(
new_indices = self.pick_by_condition(
signal.loc[list(indices)], param1, param2
)
new_indices = [tuple(x) for x in new_indices]
else:
new_indices = tuple()
# number of indices reduces for each iteration of the loop
indices = indices.intersection(new_indices)
else:
self._log(f"No lineage assignment")
self._log("No lineage assignment")
indices = np.array([])
return np.array([tuple(map(_str_to_int, x)) for x in indices])
# convert to array
indices_arr = np.array([tuple(map(_str_to_int, x)) for x in indices])
return indices_arr
def switch_case(
def pick_by_condition(
self,
signal: pd.DataFrame,
condition: str,
threshold: t.Union[float, int, list],
):
"""Pick indices from signal by any_present, present, and growing."""
if len(threshold) == 1:
threshold = [_as_int(*threshold, signal.shape[1])]
#: is this correct for "growing"?
case_mgr = {
"any_present": lambda s, thresh: any_present(s, thresh),
"present": lambda s, thresh: s.notna().sum(axis=1) > thresh,
"nonstoply_present": lambda s, thresh: s.apply(thresh, axis=1)
> thresh,
"growing": lambda s, thresh: s.diff(axis=1).sum(axis=1) > thresh,
"any_present": lambda s, threshold: any_present(s, threshold),
"present": lambda s, threshold: s.notna().sum(axis=1) > threshold,
"growing": lambda s, threshold: s.diff(axis=1).sum(axis=1)
> threshold,
}
return set(signal.index[case_mgr[condition](signal, *threshold)])
# apply condition
idx = set(signal.index[case_mgr[condition](signal, *threshold)])
new_indices = [tuple(x) for x in idx]
return new_indices
def _as_int(threshold: t.Union[float, int], ntps: int):
"""Convert a fraction of the total experiment duration into a number of time points."""
if type(threshold) is float:
threshold = ntps * threshold
return threshold
def any_present(signal, threshold):
"""
Return a mask for cells, True if there is a cell in that trap that was present for more than :threshold: timepoints.
"""
"""Find traps where at least one cell stays for more than threshold time points."""
# all_traps contains repeated traps, which have more than one cell
all_traps = [x[0] for x in signal.index]
# full_traps contains only traps that have at least one cell
full_traps = (signal.notna().sum(axis=1) > threshold).groupby("trap")
# expand full_traps to size of signal.index
# rows highlight traps in signal_index for each full trap
trap_array = np.array(
[
np.isin(all_traps, trap_id) & full
for trap_id, full in full_traps.any().items()
]
)
# convert to pd.Series
any_present = pd.Series(
np.sum(
[
np.isin([x[0] for x in signal.index], i) & v
for i, v in (signal.notna().sum(axis=1) > threshold)
.groupby("trap")
.any()
.items()
],
axis=0,
).astype(bool),
index=signal.index,
np.sum(trap_array, axis=0).astype(bool), index=signal.index
)
return any_present
......@@ -86,15 +86,15 @@ class Grouper(ABC):
**kwargs,
):
"""
Concatenate data for one signal from different h5 files, with
one h5 file per position, into a dataframe.
Concatenate data for one signal from different h5 files, one for
each position, into a dataframe.
Parameters
----------
path : str
Signal location within h5py file
Signal location within h5 file.
pool : int
Number of threads used; if 0 or None only one core is used
Number of threads used; if 0 or None only one core is used.
mode: str
standard: boolean
**kwargs : key, value pairings
......@@ -110,7 +110,7 @@ class Grouper(ABC):
if standard:
fn_pos = concat_standard
else:
fn_pos = concat_signal_ind
fn_pos = concat_one_signal
kwargs["mode"] = mode
records = self.pool_function(
path=path,
......@@ -163,9 +163,8 @@ class Grouper(ABC):
chainers: t.Dict[str, Chainer] = None,
**kwargs,
):
"""Enable different threads for independent chains, particularly useful when aggregating multiple elements."""
if pool is None:
pass
"""Enable different threads for independent chains, particularly
useful when aggregating multiple elements."""
chainers = chainers or self.chainers
if pool:
with Pool(pool) as p:
......@@ -267,17 +266,17 @@ class Grouper(ABC):
@property
def stages_span(self):
# FAILS on my example
# TODO: fails on my example
return self.fsignal.stages_span
@property
def max_span(self):
# FAILS on my example
# TODO: fails on my example
return self.fsignal.max_span
@property
def stages(self):
# FAILS on my example
# TODO: fails on my example
return self.fsignal.stages
@property
......@@ -370,7 +369,7 @@ def concat_standard(
return combined
def concat_signal_ind(
def concat_one_signal(
path: str,
chainer: Chainer,
group: str,
......