diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a0d9b4834ec8f46d6e0d1256c6dcaad2e460fe --- /dev/null +++ b/core/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/core/functions/__init__.py b/core/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a0d9b4834ec8f46d6e0d1256c6dcaad2e460fe --- /dev/null +++ b/core/functions/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/core/functions/test_hdf.py b/core/functions/test_hdf.py new file mode 100644 index 0000000000000000000000000000000000000000..5bd3b1a12e9b08743b8058d17ef4bb23719ac6a4 --- /dev/null +++ b/core/functions/test_hdf.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# c=CellsHDF.from_source("/home/alan/Documents/sync_docs/PhD/tmp/DO6MS2_003store.h5") +import h5py +from core.cells import Cells +import pandas as pd + +# f = h5py.File("/home/alan/Documents/sync_docs/PhD/tmp/DO6MS2_003store.h5") +fname = "/shared_libs/pipeline-core/scripts/data/20191026_ss_experiments_01/DO6MS2_003store.h5" +f = h5py.File(fname) +tracks = f["/extraction/general/None/area"][()] +cell = Cells.from_source(fname) +from postprocessor.core.processes.picker import Picker, PickerParameters + +picker = Picker(cells=cell, parameters=PickerParameters.default()) + +from postprocessor.core.processor import PostProcessor, PostProParameters + +pp = PostProcessor(filename=fname, parameters=PostProParameters.default()) +pp.run() diff --git a/core/functions/tracks.py b/core/functions/tracks.py new file mode 100644 index 0000000000000000000000000000000000000000..e47e60231da31e2c23acf5e97f5b721edbffdfee --- /dev/null +++ b/core/functions/tracks.py @@ -0,0 +1,429 @@ +""" +Functions to process, filter and merge tracks. +""" + +# from collections import Counter + +from copy import copy +from typing import Union, List + +import numpy as np +import pandas as pd +from utils_find_1st import find_1st, cmp_larger + +import more_itertools as mit +from scipy.signal import savgol_filter + +# from scipy.optimize import linear_sum_assignment +# from scipy.optimize import curve_fit + +from matplotlib import pyplot as plt + + +def load_test_dset(): + # Load development dataset to test functions + return pd.DataFrame( + { + ("a", 1, 1): [2, 5, np.nan, 6, 8] + [np.nan] * 5, + ("a", 1, 2): list(range(2, 12)), + ("a", 1, 3): [np.nan] * 8 + [6, 7], + ("a", 1, 4): [np.nan] * 5 + [9, 12, 10, 14, 18], + }, + index=range(1, 11), + ).T + + +def max_ntps(track: pd.Series) -> int: + # Get number of timepoints + indices = np.where(track.notna()) + return np.max(indices) - np.min(indices) + + +def max_nonstop_ntps(track: pd.Series) -> int: + nona_tracks = track.notna() + consecutive_nonas_grouped = [ + len(list(x)) for x in mit.consecutive_groups(np.flatnonzero(nona_tracks)) + ] + return max(consecutive_nonas_grouped) + + +def get_tracks_ntps(tracks: pd.DataFrame) -> pd.Series: + return tracks.apply(max_ntps, axis=1) + + +def get_avg_gr(track: pd.Series) -> int: + """ + Get average growth rate for a track. 
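    Illustrative example (values are made up): the rate is the total volume
    change divided by the track's timepoint span, so

        track = pd.Series([2.0, np.nan, 4.0, 6.0], index=[1, 2, 3, 4])
        get_avg_gr(track)  # (6.0 - 2.0) / 3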
+ + :param tracks: Series with volume and timepoints as indices + """ + ntps = max_ntps(track) + vals = track.dropna().values + gr = (vals[-1] - vals[0]) / ntps + return gr + + +def get_avg_grs(tracks: pd.DataFrame) -> pd.DataFrame: + """ + Get average growth rate for a group of tracks + + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + """ + return tracks.apply(get_avg_gr, axis=1) + + +def clean_tracks(tracks, min_len: int = 15, min_gr: float = 1.0) -> pd.DataFrame: + """ + Clean small non-growing tracks and return the reduced dataframe + + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + :param min_len: int number of timepoints cells must have not to be removed + :param min_gr: float Minimum mean growth rate to assume an outline is growing + """ + ntps = get_tracks_ntps(tracks) + grs = get_avg_grs(tracks) + + growing_long_tracks = tracks.loc[(ntps >= min_len) & (grs > min_gr)] + + return growing_long_tracks + + +def merge_tracks(tracks, drop=False, **kwargs) -> pd.DataFrame: + """ + Join tracks that are contiguous and within a volume threshold of each other + + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + :param kwargs: args passed to get_joinable + + returns + + :joint_tracks: (m x n) Dataframe where rows are cell tracks and + columns are timepoints. Merged tracks are still present but filled + with np.nans. + """ + + # calculate tracks that can be merged until no more traps can be merged + joinable_pairs = get_joinable(tracks, **kwargs) + if joinable_pairs: + tracks = join_tracks(tracks, joinable_pairs, drop=drop) + joint_ids = get_joint_ids(joinable_pairs) + + return (tracks, joinable_pairs) + + +def get_joint_ids(merging_seqs) -> dict: + """ + Convert a series of merges into a dictionary where + the key is the cell_id of destination and the value a list + of the other track ids that were merged into the key + + :param merging_seqs: list of tuples of indices indicating the + sequence of merging events. It is important for this to be in sequential order + + How it works: + + The order of merging matters for naming, always the leftmost track will keep the id + + For example, having tracks (a, b, c, d) and the iterations of merge events: + + 0 a b c d + 1 a b cd + 2 ab cd + 3 abcd + + We shold get: + + output {a:a, b:a, c:a, d:a} + + """ + if not merging_seqs: + return {} + + targets, origins = list(zip(*merging_seqs)) + static_tracks = set(targets).difference(origins) + + joint = {track_id: track_id for track_id in static_tracks} + for target, origin in merging_seqs: + joint[origin] = target + + moved_target = [ + k for k, v in joint.items() if joint[v] != v and v in joint.values() + ] + + for orig in moved_target: + joint[orig] = rec_bottom(joint, orig) + + return { + k: v for k, v in joint.items() if k != v + } # remove ids that point to themselves + + +def rec_bottom(d, k): + if d[k] == k: + return k + else: + return rec_bottom(d, d[k]) + + +def join_tracks(tracks, joinable_pairs, drop=True) -> pd.DataFrame: + """ + Join pairs of tracks from later tps towards the start. + + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + + returns (copy) + + :param joint_tracks: (m x n) Dataframe where rows are cell tracks and + columns are timepoints. Merged tracks are still present but filled + with np.nans. 
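    Sketch of the expected pair format (index labels are hypothetical):

        joinable_pairs = [(("a", 1, 2), ("a", 1, 4))]  # (target, source)
        joined = join_tracks(tracks, joinable_pairs, drop=True)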
+ :param drop: bool indicating whether or not to drop moved rows + + """ + + tmp = copy(tracks) + for target, source in joinable_pairs: + tmp.loc[target] = join_track_pair(tmp.loc[target], tmp.loc[source]) + + if drop: + tmp = tmp.drop(source) + + return tmp + + +def join_track_pair(target, source): + tgt_copy = copy(target) + end = find_1st(target.values[::-1], 0, cmp_larger) + tgt_copy.iloc[-end:] = source.iloc[-end:].values + return tgt_copy + + +def get_joinable(tracks, smooth=False, tol=0.1, window=5, degree=3) -> dict: + """ + Get the pair of track (without repeats) that have a smaller error than the + tolerance. If there is a track that can be assigned to two or more other + ones, it chooses the one with a lowest error. + + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + :param tol: float or int threshold of average (prediction error/std) necessary + to consider two tracks the same. If float is fraction of first track, + if int it is absolute units. + :param window: int value of window used for savgol_filter + :param degree: int value of polynomial degree passed to savgol_filter + + """ + + # Commented because we are not smoothing in this step yet + # candict = {k:v for d in contig.values for k,v in d.items()} + + # smooth all relevant tracks + + if smooth: # Apply savgol filter TODO fix nans affecting edge placing + clean = clean_tracks( + tracks, min_len=window + 1, min_gr=0.9 + ) # get useful tracks + savgol_on_srs = lambda x: non_uniform_savgol(x.index, x.values, window, degree) + contig = clean.groupby(["trap"]).apply(get_contiguous_pairs) + contig = contig.loc[contig.apply(len) > 0] + flat = set([k for v in contig.values for i in v for j in i for k in j]) + smoothed_tracks = clean.loc[flat].apply(savgol_on_srs, 1) + else: + contig = tracks.groupby(["trap"]).apply(get_contiguous_pairs) + contig = contig.loc[contig.apply(len) > 0] + flat = set([k for v in contig.values for i in v for j in i for k in j]) + smoothed_tracks = tracks.loc[flat].apply(lambda x: np.array(x.values), axis=1) + + # fetch edges from ids TODO (IF necessary, here we can compare growth rates) + idx_to_edge = lambda preposts: [ + ( + [get_val(smoothed_tracks.loc[pre], -1) for pre in pres], + [get_val(smoothed_tracks.loc[post], 0) for post in posts], + ) + for pres, posts in preposts + ] + edges = contig.apply(idx_to_edge) + + closest_pairs = edges.apply(get_vec_closest_pairs, tol=tol) + + # match local with global ids + joinable_ids = [ + localid_to_idx(closest_pairs.loc[i], contig.loc[i]) for i in closest_pairs.index + ] + + return [pair for pairset in joinable_ids for pair in pairset] + + +get_val = lambda x, n: x[~np.isnan(x)][n] if len(x[~np.isnan(x)]) else np.nan + + +def localid_to_idx(local_ids, contig_trap): + """Fetch then original ids from a nested list with joinable local_ids + + input + :param local_ids: list of list of pairs with cell ids to be joint + :param local_ids: list of list of pairs with corresponding cell ids + + return + list of pairs with (experiment-level) ids to be joint + """ + lin_pairs = [] + for i, pairs in enumerate(local_ids): + if len(pairs): + for left, right in pairs: + lin_pairs.append((contig_trap[i][0][left], contig_trap[i][1][right])) + return lin_pairs + + +def get_vec_closest_pairs(lst: List, **kwags): + return [get_closest_pairs(*l, **kwags) for l in lst] + + +def get_closest_pairs(pre: List[float], post: List[float], tol: Union[float, int] = 1): + """Calculate a cost matrix the Hungarian algorithm to pick the best set of + 
options + + input + :param pre: list of floats with edges on left + :param post: list of floats with edges on right + :param tol: int or float if int metrics of tolerance, if float fraction + + returns + :: list of indices corresponding to the best solutions for matrices + + """ + if len(pre) > len(post): + dMetric = np.abs(np.subtract.outer(post, pre)) + else: + dMetric = np.abs(np.subtract.outer(pre, post)) + # dMetric[np.isnan(dMetric)] = tol + 1 + np.nanmax(dMetric) # nans will be filtered + # ids = linear_sum_assignment(dMetric) + dMetric[np.isnan(dMetric)] = tol + 1 + np.nanmax(dMetric) # nans will be filtered + + ids = solve_matrix(dMetric) + if not len(ids[0]): + return [] + + norm = ( + np.array(pre)[ids[len(pre) > len(post)]] if tol < 1 else 1 + ) # relative or absolute tol + result = dMetric[ids] / norm + ids = ids if len(pre) < len(post) else ids[::-1] + + return [idx for idx, res in zip(zip(*ids), result) if res <= tol] + + +def solve_matrix(dMetric): + """ + Solve cost matrix focusing on getting the smallest cost at each iteration. + + input + :param dMetric: np.array cost matrix + + returns + tuple of np.arrays indicating picks with lowest individual value + """ + glob_is = [] + glob_js = [] + if (~np.isnan(dMetric)).any(): + tmp = copy(dMetric) + std = sorted(tmp[~np.isnan(tmp)]) + while (~np.isnan(std)).any(): + v = std[0] + i_s, j_s = np.where(tmp == v) + i = i_s[0] + j = j_s[0] + tmp[i, :] += np.nan + tmp[:, j] += np.nan + glob_is.append(i) + glob_js.append(j) + + std = sorted(tmp[~np.isnan(tmp)]) + + return (np.array(glob_is), np.array(glob_js)) + + +def plot_joinable(tracks, joinable_pairs, max=64): + """ + Convenience plotting function for debugging and data vis + """ + + nx = 8 + ny = 8 + _, axes = plt.subplots(nx, ny) + for i in range(nx): + for j in range(ny): + if i * ny + j < len(joinable_pairs): + ax = axes[i, j] + pre, post = joinable_pairs[i * ny + j] + pre_srs = tracks.loc[pre].dropna() + post_srs = tracks.loc[post].dropna() + ax.plot(pre_srs.index, pre_srs.values, "b") + # try: + # totrange = np.arange(pre_srs.index[0],post_srs.index[-1]) + # ax.plot(totrange, interpolate(pre_srs, totrange), 'r-') + # except: + # pass + ax.plot(post_srs.index, post_srs.values, "g") + + plt.show() + + +def get_contiguous_pairs(tracks: pd.DataFrame) -> list: + """ + Get all pair of contiguous track ids from a tracks dataframe. 
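    For the development dataset returned by load_test_dset above, track
    ('a', 1, 1) ends exactly where ('a', 1, 4) begins, so the expected output
    is a single pair of id lists (a sketch, not a strict doctest):

        get_contiguous_pairs(load_test_dset())
        # -> [([('a', 1, 1)], [('a', 1, 4)])]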
+ + :param tracks: (m x n) dataframe where rows are cell tracks and + columns are timepoints + :param min_dgr: float minimum difference in growth rate from the interpolation + """ + # indices = np.where(tracks.notna()) + + mins, maxes = [ + tracks.notna().apply(np.where, axis=1).apply(fn) for fn in (np.min, np.max) + ] + + mins_d = mins.groupby(mins).apply(lambda x: x.index.tolist()) + mins_d.index = mins_d.index - 1 # make indices equal + # TODO add support for skipping time points + maxes_d = maxes.groupby(maxes).apply(lambda x: x.index.tolist()) + + common = sorted(set(mins_d.index).intersection(maxes_d.index), reverse=True) + + return [(maxes_d[t], mins_d[t]) for t in common] + + +# def fit_track(track: pd.Series, obj=None): +# if obj is None: +# obj = objective + +# x = track.dropna().index +# y = track.dropna().values +# popt, _ = curve_fit(obj, x, y) + +# return popt + +# def interpolate(track, xs) -> list: +# ''' +# Interpolate next timepoint from a track + +# :param track: pd.Series of volume growth over a time period +# :param t: int timepoint to interpolate +# ''' +# popt = fit_track(track) +# # perr = np.sqrt(np.diag(pcov)) +# return objective(np.array(xs), *popt) + + +# def objective(x,a,b,c,d) -> float: +# # return (a)/(1+b*np.exp(c*x))+d +# return (((x+d)*a)/((x+d)+b))+c + +# def cand_pairs_to_dict(candidates): +# d={x:[] for x,_ in candidates} +# for x,y in candidates: +# d[x].append(y) +# return d diff --git a/core/group.py b/core/group.py new file mode 100644 index 0000000000000000000000000000000000000000..99cf00cf775f034d69be6f29fd7d9bb77dba5f54 --- /dev/null +++ b/core/group.py @@ -0,0 +1,130 @@ +""" +Class to group multiple positions into one using one of several criteria. +""" + +from pathlib import Path +import re + +import h5py +import pandas as pd + +from postprocessor.core.io.base import groupsort +from postprocessor.core.io.signal import Signal + +from postprocessor.core.processes.base import ParametersABC, ProcessABC + + +class GroupParameters(ParametersABC): + def __init__(self, by="name", processes=[], signals=[]): + self.by = by + self.signals = signals + self.processes = processes + + @classmethod + def default(cls): + return cls.from_dict({"by": "name", "signals": [], "processes": []}) + + +class Group(ProcessABC): + def __init__(self, parameters): + super().__init__(parameters) + + def get_position_filenames(self, exp_root, poses): + """ + Get filenames as a dictionary where the key is the position and value the filename. 
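        Sketch of the expected return value (paths are hypothetical):

            {"pos001": Path(".../pos001store.h5"),
             "pos002": Path(".../pos002store.h5")}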
+ """ + central_store = Path(exp_root) / "store.h5" + if central_store.exists(): + hdf = h5py.File(central_store, "r") + self.filenames = [pos.attrs["filename"] for pos in hdf["/positions/"]] + hdf.close() + else: # If no central store just list position files in expt root folder + fullfiles = [x for x in Path(exp_root).glob("*store.h5")] + files = [x.name for x in Path(exp_root).glob("*store.h5")] + filenames = [False for _ in poses] + for i, pos in enumerate(poses): + matches = [ + True if re.match(pos + ".*.h5", fname) else False for fname in files + ] + if any(matches): + assert sum(matches) == 1, "More than one match" + filenames[i] = (pos, fullfiles[matches.index(True)]) + + self.filenames = {fname[0]: fname[1] for fname in filenames if fname} + + self.positions = list(self.filenames.keys()) + return self.filenames + + def get_signals(self): + # hdf = h5py.File(central_store, "r") + # keys_d = groupsort(keys) + self.signals = {pos: {} for pos in self.filenames.keys()} + for pos, fname in self.filenames.items(): + for signal in self.parameters.signals: + self.signals[pos][signal] = pd.read_hdf(fname, signal) + + return self.signals + + def gen_groups(self): + if self.by == "group": # Use group names in metadata + pass + elif self.by == "name": # Infer groups from signal concatenation + # Remove last four characters and find commonalities larger than 4 + # characters between posnames and group them that way. + groupnames = list(set([x[:-3] for x in self.positions])) + self.group_signal_tree = {group: [] for group in groupnames} + self.poses_grouped = {group: [] for group in groupnames} + for pos in self.positions: + group = groupnames[groupnames.index(pos[:-3])] + self.group_signal_tree[group].append(self.signals[pos]) + self.poses_grouped[group].append(pos) + + elif ( + type(self.by) == tuple + ): # Manually give groups as tuple or list of positions + pass + + def concat_signals(self): + self.concated_signals = {group: {} for group in self.group_signal_tree} + for k, group in self.group_signal_tree.items(): + for signal in self.parameters.signals: + self.concated_signals[k][signal] = pd.concat( + [g[signal] for g in group], keys=self.poses_grouped[k] + ) + + return self.concated_signals + + def process_signals(self, grouped_signals): + pass + + def run(self, central_store, poses): + + self.get_position_filenames(central_store, poses) + self.get_signals() + self.gen_groups() + self.concat_signals() + # processed_signals = self.process_signals(grouped_signals) + + return self.concated_signals + # return processed_signals + + +poses = [ + x.name.split("store")[0] + for x in Path( + "/shared_libs/pipeline-core/scripts/data/ph_calibration_dual_phl_ura8_5_04_5_83_7_69_7_13_6_59__01" + ).rglob("*") + if x.name != "images.h5" +] +gr = Group( + GroupParameters( + signals=["/extraction/general/None/area", "/extraction/mCherry/np_max/median"] + ) +) +gr.run( + central_store="/shared_libs/pipeline-core/scripts/data/ph_calibration_dual_phl_ura8_5_04_5_83_7_69_7_13_6_59__01", + poses=poses, +) +signal = Signal( + "/shared_libs/pipeline-core/scripts/data/ph_calibration_dual_phl_ura8_5_04_5_83_7_69_7_13_6_59__01/ph_5_04_001store.h5" +) diff --git a/core/old/cell.py b/core/old/cell.py new file mode 100644 index 0000000000000000000000000000000000000000..f96f30d53e8bc462044eb40bb6417d1454c3f742 --- /dev/null +++ b/core/old/cell.py @@ -0,0 +1,23 @@ +""" +Main data processing functions. The inputs for this functions must be data series. 
+""" + +import numpy as np +from pandas import Series, DataFrames + +def bg_sub(data:Series, background:float): + return data - background + +def ratio(data1:Series, data2:Series): + return data1 / data2 + +def norm_tps(data:Series, tps:list, fun=np.nanmean): + return data / fun(data.loc(axis=1)[tps]) + +def growth_rate(data:Series, alg=None, filt = 'savgol'): + if alg is None: + alg='standard' + + + + diff --git a/core/old/ph.py b/core/old/ph.py new file mode 100644 index 0000000000000000000000000000000000000000..a28a8055c8c64fe1ff84f9a6b5692f5d722bb260 --- /dev/null +++ b/core/old/ph.py @@ -0,0 +1,364 @@ +from typing import Dict, List, Union +import re + +import numpy as np +import pandas as pd +from pandas import Series, DataFrame +from sklearn.cluster import KMeans +from matplotlib import pyplot as plt +import seaborn as sns + +import compress_pickle + +from postprocessor.core.postprocessor import PostProcessor +from postprocessor.core.tracks import get_avg_grs, non_uniform_savgol +from postprocessor.core.ph import * + + +def filter_by_gfp(dfs): + gfps = pd.concat([t[("GFPFast_bgsub", np.maximum, "median")] for t in dfs]) + avgs_gfp = gfps.mean(axis=1) + high_gfp = get_high_k2(avgs_gfp) + # high_gfp = avgs_gfp[avgs_gfp > 200] + + return high_gfp + + +def filter_by_area(dfs, min=50): + areas = pd.concat([t[("general", None, "area")] for t in dfs]) + avgs_areas = areas[(areas.notna().sum(axis=1) > areas.shape[1] // (1.25))].mean( + axis=1 + ) + avgs_areas = avgs_areas[(avgs_areas > min)] + + return avgs_areas + + +def get_high_k2(df): + kmeans = KMeans(n_clusters=2) + vals = df.values.reshape(-1, 1) + kmeans.fit(vals) + high_clust_id = kmeans.cluster_centers_.argmax() + + return df.loc[kmeans.predict(vals) == high_clust_id] + + +def get_concats(dfs, keys): + return pd.concat([t.get((keys), pd.DataFrame()) for t in dfs]) + + +def get_dfs(pp): + dfs = [pp.extraction[pp.expt.positions[i]] for i in range(len(pp.expt.positions))] + return dfs + + +combine_dfs = lambda dfs: {k: get_concats(dfs, k) for k in dfs[0].keys()} + + +def merge_channels(pp, min_area=50): + dfs = get_dfs(pp) + # rats = get_concats(dfs, ("em_ratio_bgsub", np.maximum, "median")) + # gfps = filter_by_gfp(dfs) + + avgs_area = filter_by_area(dfs, min=50) + # ids = [x for x in set(gfps.index).intersection(avgs_area.index)] + ids = avgs_area.index + + new_dfs = combine_dfs(dfs) + + h = pd.DataFrame( + { + k[0] + "_" + k[2]: v.loc[ids].mean(axis=1) + for k, v in new_dfs.items() + if k[-1] != "imBackground" + } + ) + return h + + +def process_phs(pp, min_area=200): + h = merge_channels(pp, min_area) + h.index.names = ["pos", "trap", "cell"] + ids = h.index + h = h.reset_index() + + h["ph"] = h["pos"].apply(lambda x: float(x[3:7].replace("_", "."))) + h["max5_d_med"] = h["mCherry_max2p5pc"] / h["mCherry_median"] + + h = h.set_index(ids) + h = h.drop(["pos", "trap", "cell"], axis=1) + return h + + +def growth_rate( + data: Series, alg=None, filt={"kind": "savgol", "window": 5, "degree": 3} +): + window = filt["window"] + degree = filt["degree"] + if alg is None: + alg = "standard" + + if filt: # TODO add support for multiple algorithms + data = Series( + non_uniform_savgol( + data.dropna().index, data.dropna().values, window, degree + ), + index=data.dropna().index, + ) + + return Series(np.convolve(data, diff_kernel, "same"), index=data.dropna().index) + + +# import numpy as np + +# diff_kernel = np.array([1, -1]) +# gr = clean.apply(growth_rate, axis=1) +# from postprocessor.core.tracks import non_uniform_savgol, clean_tracks + 
+ +def sort_df(df, by="first", rev=True): + nona = df.notna() + if by == "len": + idx = nona.sum(axis=1) + elif by == "first": + idx = nona.idxmax(axis=1) + idx = idx.sort_values().index + + if rev: + idx = idx[::-1] + + return df.loc[idx] + + +# test = tmp[("GFPFast", np.maximum, "median")] +# test2 = tmp[("pHluorin405", np.maximum, "median")] +# ph = test / test2 +# ph = ph.stack().reset_index(1) +# ph.columns = ["tp", "fl"] + + +def m2p5_med(ext, ch, red=np.maximum): + m2p5pc = ext[(ch, red, "max2p5pc")] + med = ext[(ch, red, "median")] + + result = m2p5pc / med + + return result + + +def plot_avg(df): + df = df.stack().reset_index(1) + df.columns = ["tp", "val"] + + sns.relplot(x=df["tp"], y=df["val"], kind="line") + plt.show() + + +def split_data(df: DataFrame, splits: List[int]): + dfs = [df.iloc[:, i:j] for i, j in zip((0,) + splits, splits + (df.shape[1],))] + return dfs + + +def growth_rate( + data: Series, alg=None, filt={"kind": "savgol", "window": 7, "degree": 3} +): + if alg is None: + alg = "standard" + + if filt: # TODO add support for multiple algorithms + window = filt["window"] + degree = filt["degree"] + data = Series( + non_uniform_savgol( + data.dropna().index, data.dropna().values, window, degree + ), + index=data.dropna().index, + ) + + diff_kernel = np.array([1, -1]) + + return Series(np.convolve(data, diff_kernel, "same"), index=data.dropna().index) + + +# pp = PostProcessor(source=19831) +# pp.load_tiler_cells() +# f = "/home/alan/Documents/sync_docs/libs/postproc/gluStarv_2_0_x2_dual_phl_ura8_00/extraction" +# pp.load_extraction( +# "/home/alan/Documents/sync_docs/libs/postproc/postprocessor/" +# + pp.expt.name +# + "/extraction/" +# ) +# tmp = pp.extraction["phl_ura8_002"] + + +def _check_bg(data): + for k in list(pp.extraction.values())[0].keys(): + for p in pp.expt.positions: + if k not in pp.extraction[p]: + print(p, k) + + +# data = { +# k: pd.concat([pp.extraction[pos][k] for pos in pp.expt.positions[:-3]]) +# for k in list(pp.extraction.values())[0].keys() +# } + + +def hmap(df, **kwargs): + g = sns.heatmap(sort_df(df), robust=True, cmap="mako_r", **kwargs) + plt.xlabel("") + return g + + +# from random import randint +# x = randint(0, len(smooth)) +# plt.plot(clean.iloc[x], 'b') +# plt.plot(smooth.iloc[x], 'r') +# plt.show() + + +# data = tmp +# df = data[("general", None, "area")] +# clean = clean_tracks(df, min_len=160) +# clean = clean.loc[clean.notna().sum(axis=1) > 9] +# gr = clean.apply(growth_rate, axis=1) +# splits = (72, 108, 180) +# gr_sp = split_data(gr, splits) + +# idx = gr.index + +# bg = get_bg(data) +# test = data[("GFPFast", np.maximum, "median")] +# test2 = data[("pHluorin405", np.maximum, "median")] +# ph = (test / test2).loc[idx] +# c = pd.concat((ph.mean(1), gr.max(1)), axis=1) +# c.columns = ["ph", "gr_max"] +# # ph = ph.stack().reset_index(1) +# # ph.columns = ['tp', 'fl'] + +# ph_sp = split_data(gr, splits) + + +def get_bg(data): + bg = {} + fl_subkeys = [ + x + for x in data.keys() + if x[0] in ["GFP", "GFPFast", "mCherry", "pHluorin405"] + and x[-1] != "imBackground" + ] + for k in fl_subkeys: + nk = list(k) + bk = tuple(nk[:-1] + ["imBackground"]) + nk = tuple(nk[:-1] + [nk[-1] + "_BgSub"]) + tmp = [] + for i, v in data[bk].iterrows(): + if i in data[k].index: + newdf = data[k].loc[i] / v + newdf.index = pd.MultiIndex.from_tuples([(*i, c) for c in newdf.index]) + tmp.append(newdf) + bg[nk] = pd.concat(tmp) + + return bg + + +def calc_ph(bg): + fl_subkeys = [x for x in bg.keys() if x[0] in ["GFP", "GFPFast", "pHluorin405"]] + chs 
= list(set([x[0] for x in fl_subkeys])) + assert len(chs) == 2, "Too many channels" + ch1 = [x[1:] for x in fl_subkeys if x[0] == chs[0]] + ch2 = [x[1:] for x in fl_subkeys if x[0] == chs[1]] + inter = list(set(ch1).intersection(ch2)) + ph = {} + for red_fld in inter: + ph[tuple(("ph",) + red_fld)] = ( + bg[tuple((chs[0],) + red_fld)] / bg[tuple((chs[1],) + red_fld)] + ) + + +def get_traps(pp): + t0 = {} + for pos in pp.tiler.positions: + pp.tiler.current_position = pos + t0[pos] = pp.tiler.get_traps_timepoint( + 0, channels=[0, pp.tiler.channels.index("mCherry")], z=[0, 1, 2, 3, 4] + ) + + return t0 + + +def get_pos_ph(pp): + pat = re.compile(r"ph_([0-9]_[0-9][0-9])") + return { + pos: float(pat.findall(pos)[0].replace("_", ".")) for pos in pp.tiler.positions + } + + +def plot_sample_bf_mch(pp): + bf_mch = get_traps(pp) + ts = [{i: v[:, j, ...] for i, v in bf_mch.items()} for j in [0, 1]] + tsbf = {i: v[:, 0, ...] for i, v in bf_mch.items()} + + posdict = {k: v for k, v in get_pos_ph(pp).items()} + posdict = {v: k for k, v in posdict.items()} + posdict = {v: k for k, v in posdict.items()} + ph = np.unique(list(posdict.values())).tolist() + counters = {ph: 0 for ph in ph} + n = [np.random.randint(ts[0][k].shape[0]) for k in posdict.keys()] + + fig, axes = plt.subplots(2, 5) + for k, (t, name) in enumerate(zip(ts, ["Bright field", "mCherry"])): + for i, (pos, ph) in enumerate(posdict.items()): + # i = ph.index(posdict[pos]) + axes[k, i].grid(False) + axes[k, i].set( + xticklabels=[], + yticklabels=[], + ) + axes[k, i].set_xlabel(posdict[pos] if k else None, fontsize=28) + axes[k, i].imshow( + np.maximum.reduce(t[pos][n[i], 0], axis=2), + cmap="gist_gray" if not k else None, + ) + # counters[posdict[pos]] += 1 + plt.tick_params( + axis="x", # changes apply to the x-axis + which="both", # both major and minor ticks are affected + bottom=False, # ticks along the bottom edge are off + top=False, # ticks along the top edge are off + labelbottom=False, + ) # labels along the bottom edge are off + axes[k, 0].set_ylabel(name, fontsize=28) + plt.tight_layout() + plt.show() + + +# Plotting calibration curve +from scipy.optimize import curve_fit + + +def fit_calibration(h): + ycols = [x for x in h.columns if "em_ratio" in x] + xcol = "ph" + + def objective(x, a, b): + return a * x + b + + # fig, axes = plt.subplots(1, len(ycols)) + # for i, ycol in enumerate(ycols): + # d = h[[xcol, ycol]] + # params, _ = curve_fit(objective, *[d[col].values for col in d.columns]) + # sns.lineplot(x=xcol, y=ycol, data=h, alpha=0.5, err_style="bars", ax=axes[i]) + # # sns.lineplot(d[xcol], objective(d[xcol].values, *params), ax=axes[i]) + # plt.show() + + ycol = "em_ratio_mean" + d = h[[xcol, *ycols]] + tmp = d.groupby("ph").mean() + calibs = {ycol: curve_fit(objective, tmp.index, tmp[ycol])[0] for ycol in ycols} + # sns.lineplot(x=xcol, y=ycol, data=d, alpha=0.5, err_style="bars") + # plt.xlabel("pH") + # plt.ylabel("pHluorin emission ratio") + # sns.lineplot(d[xcol], objective(d[xcol], *params)) + + return calibs diff --git a/core/old/postprocessor.py b/core/old/postprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..7e024d2656d5885025cfe0bb4314cdc7499a3da5 --- /dev/null +++ b/core/old/postprocessor.py @@ -0,0 +1,106 @@ +import pkg_resources +from pathlib import Path, PosixPath +from typing import Union, List, Dict +from datetime import datetime + +from compress_pickle import load, dump + +from core.experiment import Experiment +from core.cells import Cells +from core.segment import 
Tiler +from core.io.matlab import matObject +from core.experiment import Experiment + + +class PostProcessor: + ''' + Base class to perform feature extraction. + :param parameters: Parameters class with channels, reduction functions and + extraction functions to use. + :param source: Origin of experiment, if int it is assumed from Omero, if str + or Path + ''' + def __init__(self, parameters=None, source: Union[int, str, Path] = None): + # self.params = parameters + if source is not None: + if type(source) is int: + self.expt_id = source + self.load_expt(source, omero=True) + elif type(source) is str or PosixPath: + self.load_expt(source, omero=False) + + @property + def channels(self): + if not hasattr(self, '_channels'): + if type(self.params.tree) is dict: + self._channels = tuple(self.params.tree.keys()) + + return self._channels + + @property + def current_position(self): + assert hasattr(self, 'expt'), 'No experiment loaded.' + + if hasattr(self, 'tiler'): + assert self.expt.current_position.name == self.tiler.current_position + + return self.expt.current_position + + def load_expt(self, source: Union[int, str], omero: bool = False) -> None: + if omero: + self.expt = Experiment.from_source( + self.expt_id, #Experiment ID on OMERO + 'upload', #OMERO Username + '***REMOVED***', #OMERO Password + 'islay.bio.ed.ac.uk', #OMERO host + port=4064 #This is default + ) + else: + self.expt = Experiment.from_source(source) + self.expt_id = self.expt.exptID + + def load_tiler_cells(self) -> None: + self.tiler = Tiler(self.expt) + self.cells = Cells() + self.cells = self.cells.from_source( + self.expt.current_position.annotation) + + def get_pos_mo_bud(self): + annot = self.expt._get_position_annotation( + self.expt.current_position.name) + matob = matObject(annot) + m = matob["timelapseTrapsOmero"].get("cellMothers", None) + if m is not None: + ids = np.nonzero(m.todense()) + d = {(self.expt.current_position.name, i, int(m[i, j])): [] + for i, j in zip(*ids)} + for i, j, k in zip(*ids, d.keys()): + d[k].append((self.expt.current_position.name, i, j + 1)) + else: + print("Pos {} has no mother matrix".format( + self.expt.current_position.name)) + d = {} + + return d + + def get_exp_mo_bud(self): + d = {} + for pos in self.expt.positions: + self.expt.current_position = pos + d = {**d, **self.get_pos_mo_bud()} + + self.expt.current_position = self.expt.positions[0] + + return d + + def load_extraction(self, folder=None) -> None: + if folder is None: + folder = Path(self.expt.name + '/extraction') + + self.extraction = {} + for pos in self.expt.positions: + try: + self.extraction[pos] = load(folder / Path(pos + '.gz')) + + except: + print(pos, ' not found') diff --git a/core/processes/__init__.py b/core/processes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a0d9b4834ec8f46d6e0d1256c6dcaad2e460fe --- /dev/null +++ b/core/processes/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/core/processes/aggregate.py b/core/processes/aggregate.py new file mode 100644 index 0000000000000000000000000000000000000000..cfe8e8f03ad5a5cb8ee4c1a58f1dcb837eaedc4f --- /dev/null +++ b/core/processes/aggregate.py @@ -0,0 +1,74 @@ +from itertools import cycle + +import numpy as np +import pandas as pd + +from agora.base import ParametersABC, ProcessABC + + +class aggregateParameters(ParametersABC): + """ + Parameters + reduction: str to be passed to a dataframe for collapsing across columns + """ + + def __init__(self, reductions): + super().__init__() + self.reductions = 
reductions + + @classmethod + def default(cls): + return cls.from_dict({"reductions": ["mean", "median", "max"]}) + + +class aggregate(ProcessABC): + """ + aggregate multiple datasets + """ + + def __init__(self, parameters: aggregateParameters): + super().__init__(parameters) + + def run(self, signals): + names = np.array([signal.index.names for signal in signals]) + index = signals[0].index + for s in signals[0:]: + index = index.intersection(s.index) + + tmp_signals = [s.loc[index] for s in signals] + for i, s in enumerate(signals): + tmp_signals[i].name = s.name + signals = tmp_signals + + assert len(signals), "Signals is empty" + + bad_words = { + "postprocessing", + "extraction", + "None", + "np_max", + "", + } + get_keywords = lambda df: [ + ind + for item in df.name.split("/") + for ind in item.split("/") + if ind not in bad_words + ] + colnames = [ + "_".join(get_keywords(s) + [red]) + for s in signals + for red in self.parameters.reductions + ] + concat = pd.concat( + [ + getattr(signal, red)(axis=1) + for signal in signals + for red in self.parameters.reductions + ], + names=signals[0].index.names, + axis=1, + ) + concat.columns = colnames + + return concat diff --git a/core/processes/base.py b/core/processes/base.py new file mode 100644 index 0000000000000000000000000000000000000000..142bd6c3fa2c5ef0fc578d3d09bfa47530482619 --- /dev/null +++ b/core/processes/base.py @@ -0,0 +1 @@ +from agora.base import ParametersABC, ProcessABC \ No newline at end of file diff --git a/core/processes/births.py b/core/processes/births.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a0d9b4834ec8f46d6e0d1256c6dcaad2e460fe --- /dev/null +++ b/core/processes/births.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/core/processes/bud_metric.py b/core/processes/bud_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..518ddf5231c96458763fca35f52762def97e8b73 --- /dev/null +++ b/core/processes/bud_metric.py @@ -0,0 +1,65 @@ +import numpy as np +import pandas as pd +from agora.base import ParametersABC, ProcessABC + + +class bud_metricParameters(ParametersABC): + """ + Parameters + """ + + def __init__(self, mode="longest"): + super().__init__() + self.mode = mode + + @classmethod + def default(cls): + return cls.from_dict({"mode": "longest"}) + + +class bud_metric(ProcessABC): + """ + Obtain the volume of daughter cells + if 'longest' assumes a single mother per trap. 
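    Usage sketch (the signal is assumed to be indexed by (trap, cell_label)
    with timepoints as columns, as produced by Signal):

        buds = bud_metric(bud_metricParameters.default()).run(signal)
        # one row per (trap, mother): bud values stitched across consecutive timepoint ranges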
+ """ + + def __init__(self, parameters: bud_metricParameters): + super().__init__(parameters) + + def run(self, signal: pd.DataFrame): + if self.parameters.mode is "longest": + result = self.get_bud_metric_wrap(signal) + + return result + + @staticmethod + def get_bud_metric(signal): + mother_id = signal.index[signal.notna().sum(axis=1).argmax()] + + nomother = signal.drop(mother_id) + + starts = nomother.apply(pd.Series.first_valid_index, axis=1).sort_values() + + ranges = [np.arange(i, j) for i, j in zip(starts[:-1], starts[1:])] + ranges.append(np.arange(starts.iloc[-1], signal.columns[-1])) + + bud_metric = pd.concat( + [signal.loc[i, rng] for i, rng in zip(starts.index, ranges)] + ) + srs = pd.Series(bud_metric, index=signal.columns, name=mother_id) + + return srs + + def get_bud_metric_wrap(self, signals): + srs = [ + self.get_bud_metric(signals.loc[trap]) + for trap in signals.index.unique(level="trap") + ] + index = [ + (trap, mother.name) + for trap, mother in zip(signals.index.unique(level="trap"), srs) + ] + + concatenated = pd.concat(srs, keys=index, axis=1, sort=True).T.sort_index() + concatenated.index.names = ["trap", "cell_label"] + return concatenated diff --git a/core/processes/dsignal.py b/core/processes/dsignal.py new file mode 100644 index 0000000000000000000000000000000000000000..484ad59944c12cd12005c02b9470b84d8f2f19d7 --- /dev/null +++ b/core/processes/dsignal.py @@ -0,0 +1,28 @@ +import pandas as pd + +from postprocessor.core.processes.base import ParametersABC, ProcessABC + + +class dsignalParameters(ParametersABC): + """ + :window: Number of timepoints to consider for signal. + """ + + def __init__(self, window: int): + self.window = window + + @classmethod + def default(cls): + return cls.from_dict({"window": 3}) + + +class dsignal(ProcessABC): + """ + Calculate the change in a signal depending on a window + """ + + def __init__(self, parameters: dsignalParameters): + super().__init__(parameters) + + def run(self, signal: pd.DataFrame): + return signal.rolling(window=self.parameters.window, axis=1).mean().diff(axis=1) diff --git a/core/processes/gpsignal.py b/core/processes/gpsignal.py new file mode 100644 index 0000000000000000000000000000000000000000..f6cdb45fbd1416630382246fb3152c9d99184076 --- /dev/null +++ b/core/processes/gpsignal.py @@ -0,0 +1,97 @@ +"""Gaussian process fit of a Signal.""" +import logging + +from postprocessor.core.processes.base import ParametersABC, ProcessABC +import numpy as np +import pandas as pd + +import gaussian_processes.gaussianprocess as gp + +def estimate_gr(volume, dt, noruns, bounds, verbose): + """ + Parameters + ---------- + + volume : pd.Series + The volume series of a given cell + dt : float + The time interval in hours + noruns : int + The number of runs for optimisation + bounds : dict + The hyperparameter bounds used for optimisation + verbose : bool + If True, prints results + + Returns + ------- + """ + volume = volume.values + n = len(volume) + idx = np.arange(n) + t = idx * dt + y = volume[volume > 0] + x = t[volume > 0] + idx = idx[volume > 0] + # Fit the GP + mg = gp.maternGP(bounds, x, y) + mg.findhyperparameters(noruns=noruns) + if verbose: + mg.results() # Prints out the hyperparameters + mg.predict(x, derivs=2) # Saves the predictions to object + # Save the predictions to a csv file so they can be reused + results = dict(time=mg.x, volume=mg.y, fit_time=mg.xnew, fit_volume=mg.f, + growth_rate=mg.df, d_growth_rate=mg.ddf, + volume_var=mg.fvar, growth_rate_var=mg.dfvar, + d_growth_rate_var=mg.ddfvar) + for 
name, value in results.items(): + results[name] = np.full((n, ), np.nan) + results[name][idx] = value + return results + +# Give that to a writer: NOTE the writer needs to learn how to write the +# output of a process that returns multiple signals like this one does. + +class gpParameters(ParametersABC): + default_dict = dict(dt=5, + noruns=5, + bounds={0: (-2, 3), + 1: (-2, 1), + 2: (-4, -1)}, + verbose=False) + def __init__(self, dt, noruns, bounds, verbose): + """ + Parameters + ---------- + dt : float + The time step between time points, in minutes + noruns : int + The number of times the optimisation is tried + bounds : dict + Hyperparameter bounds for the Matern Kernel + verbose : bool + Determines whether to print hyperparameter results + """ + self.dt = dt + self.noruns = noruns + self.bounds = bounds + self.verbose = verbose + + @classmethod + def default(cls): + return cls.from_dict(cls.default_dict) + + +class GPSignal(ProcessABC): + """Gaussian process fit of a Signal. + """ + def __init__(self, parameters: gpParameters): + super().__init__(parameters) + + def run(self, signal: pd.DataFrame): + results = signal.apply(lambda x: estimate_gr(x, + **self.parameters.to_dict()), + result_type='expand').T + multi_signal = {name: pd.DataFrame(np.vstack(results[name])) + for name in results.columns} + return multi_signal diff --git a/core/processes/merger.py b/core/processes/merger.py new file mode 100644 index 0000000000000000000000000000000000000000..a79227fac3e9522b6c9ac2a218751c179f1b7477 --- /dev/null +++ b/core/processes/merger.py @@ -0,0 +1,60 @@ +from agora.base import ParametersABC, ProcessABC +from postprocessor.core.functions.tracks import get_joinable + + +class mergerParameters(ParametersABC): + """ + :param tol: float or int threshold of average (prediction error/std) necessary + to consider two tracks the same. If float is fraction of first track, + if int it is absolute units. + :param window: int value of window used for savgol_filter + :param degree: int value of polynomial degree passed to savgol_filter + """ + + def __init__( + self, + tolerance: float, + smooth: bool = False, + window: int = 5, + degree: int = 3, + min_avg_delta: float = 0.9, + ): + + self.tolerance = tolerance + + self.smooth = smooth + + self.window = window + + self.degree = degree + + self.min_avg_delta = min_avg_delta + + @classmethod + def default(cls): + return cls.from_dict( + { + "smooth": False, + "tolerance": 0.2, + "window": 5, + "degree": 3, + "min_avg_delta": 0.9, + } + ) + + +class merger(ProcessABC): + """ + TODO check why it needs to be run a few times to complete the merging + """ + + def __init__(self, parameters): + super().__init__(parameters) + + def run(self, signal): + joinable = get_joinable(signal, tol=self.parameters.tolerance) + # merged, _ = merge_tracks(signal) # , min_len=self.window + 1) + # indices = (*zip(*merged.index.tolist()),) + # names = merged.index.names + # return {name: ids for name, ids in zip(names, indices)} + return joinable diff --git a/core/processes/peaks.py b/core/processes/peaks.py new file mode 100644 index 0000000000000000000000000000000000000000..15ddf950ea1ebb16a6a46177474dbec14489e1f3 --- /dev/null +++ b/core/processes/peaks.py @@ -0,0 +1,44 @@ +from scipy.signal import argrelmax, argrelmin +from postprocessor.core.processes.base import ParametersABC, ProcessABC + + +class PeaksParameters(ParametersABC): + """ + Parameters + type : str {minima, maxima, "all"}. 
Determines which type of peaks to identify + order : int Parameter to pass to scipy.signal.argrelextrema indicating + how many points to use for comparison. + """ + + def __init__(self, type): + self.type = type + + @classmethod + def default(cls): + return cls.from_dict({"type": "minima", "order": 3}) + + +class Peaks(ProcessABC): + """ + Identifies a signal sharply dropping. + """ + + def __init__(self, parameters: PeaksParameters): + super().__init__(parameters) + + def run(self, signal: pd.DataFrame): + """ + Returns a boolean dataframe with the same shape as the + original signal but with peaks as true values. + """ + peaks_mat = np.zeros_like(signal, dtype=bool) + + comparator = np.less if self.parameters.type is "minima" else np.greater + peaks_ids = argrelextrema(new_df, comparator=comparator, order=order) + peaks_mat[peak_ids] = True + + return pd.DataFrame( + peaks_mat, + index=signal.index, + columns=signal.columns, + ) diff --git a/core/processes/picker.py b/core/processes/picker.py new file mode 100644 index 0000000000000000000000000000000000000000..a7221e6f8146609a59921e3752826aedac6c3802 --- /dev/null +++ b/core/processes/picker.py @@ -0,0 +1,339 @@ +import seaborn as sns +from matplotlib import pyplot as plt # TODO DELETE THIS + +from typing import Tuple, Union, List +from abc import ABC, abstractmethod + +from itertools import groupby + +from utils_find_1st import find_1st, cmp_equal +import numpy as np +import pandas as pd + +from pcore.cells import CellsHDF + +from agora.base import ParametersABC, ProcessABC +from postprocessor.core.functions.tracks import max_ntps, max_nonstop_ntps + + +class pickerParameters(ParametersABC): + def __init__( + self, + sequence: List[str] = ["lineage", "condition"], + ): + self.sequence = sequence + + @classmethod + def default(cls): + return cls.from_dict( + { + "sequence": [ + # ("lineage", "intersection", "families"), + ("condition", "intersection", "any_present", 0.8), + ("condition", "intersection", "growing", 40), + ("condition", "intersection", "present", 8), + ("condition", "intersection", "mother_buds", 5, 0.8), + # ("lineage", "full_families", "intersection"), + ], + } + ) + + +class picker(ProcessABC): + """ + :cells: Cell object passed to the constructor + :condition: Tuple with condition and associated parameter(s), conditions can be + "present", "nonstoply_present" or "quantile". + Determines the thersholds or fractions of signals/signals to use. + :lineage: str {"mothers", "daughters", "families" (mothers AND daughters), "orphans"}. Mothers/daughters picks cells with those tags, families pick the union of both and orphans the difference between the total and families. 
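    Usage sketch (filename hypothetical; processor.py shows the real wiring via
    Cells.from_source):

        pick = picker(pickerParameters.default(), cells=Cells.from_source("store.h5"))
        kept = pick.run(signal)  # np.array of (trap, cell_label) pairs passing every condition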
+ """ + + def __init__( + self, + parameters: pickerParameters, + cells: CellsHDF, + ): + super().__init__(parameters=parameters) + + self._cells = cells + + @staticmethod + def mother_assign_to_mb_matrix(ma: List[np.array]): + # Convert from list of lists to mother_bud sparse matrix + ncells = sum([len(t) for t in ma]) + mb_matrix = np.zeros((ncells, ncells), dtype=bool) + c = 0 + for cells in ma: + for d, m in enumerate(cells): + if m: + mb_matrix[c + d, c + m - 1] = True + + c += len(cells) + + return mb_matrix + + @staticmethod + def mother_assign_from_dynamic(ma, label, trap, ntraps: int): + """ + Interpolate the list of lists containing the associated mothers from the mother_assign_dynamic feature + """ + idlist = list(zip(trap, label)) + cell_gid = np.unique(idlist, axis=0) + + last_lin_preds = [ + find_1st(((label[::-1] == lbl) & (trap[::-1] == tr)), True, cmp_equal) + for tr, lbl in cell_gid + ] + mother_assign_sorted = ma[last_lin_preds] + + traps = cell_gid[:, 0] + iterator = groupby(zip(traps, mother_assign_sorted), lambda x: x[0]) + d = {key: [x[1] for x in group] for key, group in iterator} + nested_massign = [d.get(i, []) for i in range(ntraps)] + + return nested_massign + + def pick_by_lineage(self, signals, how): + + idx = signals.index + + if how: + mothers = set(self.mothers) + daughters = set(self.daughters) + # daughters, mothers = np.where(mother_bud_mat) + + search = lambda a, b: np.where( + np.in1d( + np.ravel_multi_index(np.array(a).T, np.array(a).max(0) + 1), + np.ravel_multi_index(np.array(b).T, np.array(a).max(0) + 1), + ) + ) + if how == "mothers": + idx = mothers + elif how == "daughters": + idx = daughters + elif how == "daughters_w_mothers": + present_mothers = idx.intersection(mothers) + idx = set( + [ + tuple(x) + for m in present_mothers + for x in np.array(self.daughters)[search(self.mothers, m)] + ] + ) + + print("associated daughters: ", idx) + elif how == "mothers_w_daughters": + present_daughters = idx.intersection(daughters) + idx = set( + [ + tuple(x) + for d in present_daughters + for x in np.array(self.mothers)[search(self.daughters, d)] + ] + ) + elif how == "full_families": + present_mothers = idx.intersection(mothers) + dwm_idx = set( + [ + tuple(x) + for m in present_mothers + for x in np.array(self.daughters)[ + search(np.array(self.mothers), m) + ] + ] + ) + present_daughters = idx.intersection(daughters) + mwd_idx = set( + [ + tuple(x) + for d in present_daughters + for x in np.array(self.mothers)[ + search(np.array(self.daughters), d) + ] + ] + ) + idx = mwd_idx.union(dwm_idx) + elif how == "families" or how == "orphans": + families = mothers.union(daughters) + if how == "families": + idx = families + elif how == "orphans": + idx = idx.diference(families) + + idx = idx.intersection(signals.index) + + return idx + + def pick_by_condition(self, signals, condition, thresh): + idx = self.switch_case(signals, condition, thresh) + return idx + + def get_mothers_daughters(self): + ma = self._cells["mother_assign_dynamic"] + trap = self._cells["trap"] + label = self._cells["cell_label"] + nested_massign = self.mother_assign_from_dynamic( + ma, label, trap, self._cells.ntraps + ) + # mother_bud_mat = self.mother_assign_to_mb_matrix(nested_massign) + + idx = set( + [ + (tid, i + 1) + for tid, x in enumerate(nested_massign) + for i in range(len(x)) + ] + ) + mothers, daughters = zip( + *[ + ((tid, m), (tid, d)) + for tid, trapcells in enumerate(nested_massign) + for d, m in enumerate(trapcells, 1) + if m + ] + ) + return mothers, daughters + + def 
run(self, signals): + indices = set(signals.index) + self.mothers, self.daughters = self.get_mothers_daughters() + for alg, op, *params in self.sequence: + if alg is "lineage": + param1 = params[0] + new_indices = getattr(self, "pick_by_" + alg)( + signals.loc[list(indices)], param1 + ) + else: + param1, *param2 = params + new_indices = getattr(self, "pick_by_" + alg)( + signals.loc[list(indices)], param1, param2 + ) + + if op is "union": + # new_indices = new_indices.intersection(set(signals.index)) + new_indices = indices.union(new_indices) + + indices = indices.intersection(new_indices) + + return np.array(list(indices)) + + @staticmethod + def switch_case( + signals: pd.DataFrame, + condition: str, + threshold: Union[float, int, list], + ): + if len(threshold) == 1: + threshold = [_as_int(*threshold, signals.shape[1])] + case_mgr = { + "any_present": lambda s, thresh: any_present(s, thresh), + "present": lambda s, thresh: s.notna().sum(axis=1) > thresh, + "nonstoply_present": lambda s, thresh: s.apply(thresh, axis=1) > thresh, + "growing": lambda s, thresh: s.diff(axis=1).sum(axis=1) > thresh, + "mother_buds": lambda s, p1, p2: mother_buds_wrap(s, p1, p2) + # "quantile": [np.quantile(signals.values[signals.notna()], threshold)], + } + return set(signals.index[case_mgr[condition](signals, *threshold)]) + + +def any_present(signals, threshold): + """ + Returns a mask for cells, True if there is a cell in that trap that was present for more than :threshold: timepoints. + """ + any_present = pd.Series( + np.sum( + [ + np.isin([x[0] for x in signals.index], i) & v + for i, v in (signals.notna().sum(axis=1) > threshold) + .groupby("trap") + .any() + .items() + ], + axis=0, + ).astype(bool), + index=signals.index, + ) + return any_present + + +from copy import copy + + +def mother_buds(df, min_budgrowth_t, min_mobud_ratio): + """ + Parameters + ---------- + signals : pd.DataFrame + min_budgrowth_t: Minimal number of timepoints we lock reassignment after assigning bud + min_initial_size: Minimal mother-bud ratio at the assignment + #TODO incorporate bud-assignment data? 
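    Returns (sketch): a list of cell labels with the mother id first,
    e.g. [1, 3, 5] (hypothetical labels), or [] if no bud passes the filters.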
+ + # If more than one bud start in the same time point pick the smallest one + """ + + ntps = df.notna().sum(axis=1) + mother_id = df.index[ntps.argmax()] + nomother = df.drop(mother_id) + if not len(nomother): + return [] + nomother = ( # Clean short-lived cells outside our mother cell's timepoints + nomother.loc[ + nomother.apply( + lambda x: x.first_valid_index() >= df.loc[mother_id].first_valid_index() + and x.last_valid_index() <= df.loc[mother_id].last_valid_index(), + axis=1, + ) + ] + ) + + start = nomother.apply(pd.Series.first_valid_index, axis=1) + + # clean duplicates + duplicates = start.duplicated(False) + if duplicates.any(): + dup_tps = np.unique(start[duplicates]) + idx, tps = zip( + *[(nomother.loc[start == tp, tp].idxmin(), tp) for tp in dup_tps] + ) + start = start[~duplicates] + start = pd.concat( + (start, pd.Series(tps, index=idx, dtype="int", name="cell_label")) + ) + nomother = nomother.loc[start.index] + nomother.index = nomother.index.astype("int") + + d_to_mother = nomother[start] - df.loc[mother_id, start] * min_mobud_ratio + size_filter = d_to_mother[ + d_to_mother.apply(lambda x: x.dropna().iloc[0], axis=1) < 0 + ] + cols_sorted = ( + size_filter.sort_index(axis=1) + .apply(pd.Series.first_valid_index, axis=1) + .sort_values() + ) + if not len(cols_sorted): + return [] + bud_candidates = cols_sorted.loc[ + [True, *(np.diff(cols_sorted.values) > min_budgrowth_t)] + ] + + return [mother_id] + [int(i) for i in bud_candidates.index.tolist()] + + +def mother_buds_wrap(signals, *args): + ids = [] + for trap in signals.index.unique(level="trap"): + df = signals.loc[trap] + selected_ids = mother_buds(df, *args) + ids += [(trap, i) for i in selected_ids] + + idx_srs = pd.Series(False, signals.index).astype(bool) + idx_srs.loc[ids] = True + return idx_srs + + +def _as_int(threshold: Union[float, int], ntps: int): + if type(threshold) is float: + threshold = ntps * threshold + return threshold diff --git a/core/processes/savgol.py b/core/processes/savgol.py new file mode 100644 index 0000000000000000000000000000000000000000..d0256736bb83a26e253302017babe122545d8260 --- /dev/null +++ b/core/processes/savgol.py @@ -0,0 +1,152 @@ +import numpy as np +import pandas as pd + +from postprocessor.core.processes.base import ParametersABC, ProcessABC + + +class savgolParameters(ParametersABC): + """ + Parameters + + window : int (odd) + Window length of datapoints. Must be odd and smaller than x + polynom : int + The order of polynom used. Must be smaller than the window size + """ + + def __init__(self, window, polynom): + self.window = window + self.polynom = polynom + + @classmethod + def default(cls): + return cls.from_dict({"window": 3, "polynom": 2}) + + +class savgol(ProcessABC): + """ + Apply Savitzky-Golay filter (works with NaNs, but it might return + NaN regions). 
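    Usage sketch:

        smoothed = savgol(savgolParameters.default()).run(signal)  # window=3, polynom=2 by default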
+ """ + + def __init__(self, parameters: savgolParameters): + super().__init__(parameters) + + def run(self, signal: pd.DataFrame): + savgol_on_srs = lambda x: self.non_uniform_savgol( + x.index, x.values, self.parameters.window, self.parameters.polynom + ) + return signal.apply(savgol_on_srs, 1) + + @staticmethod + def non_uniform_savgol(x, y, window: int, polynom: int): + """ + Applies a Savitzky-Golay filter to y with non-uniform spacing + as defined in x + + This is based on https://dsp.stackexchange.com/questions/1676/savitzky-golay-smoothing-filter-for-not-equally-spaced-data + The borders are interpolated like scipy.signal.savgol_filter would do + + source: https://dsp.stackexchange.com/a/64313 + + Parameters + ---------- + x : array_like + List of floats representing the x values of the data + y : array_like + List of floats representing the y values. Must have same length + as x + window : int (odd) + Window length of datapoints. Must be odd and smaller than x + polynom : int + The order of polynom used. Must be smaller than the window size + + Returns + ------- + np.array of float + The smoothed y values + """ + if len(x) != len(y): + raise ValueError('"x" and "y" must be of the same size') + + if len(x) < window: + raise ValueError("The data size must be larger than the window size") + + if type(window) is not int: + raise TypeError('"window" must be an integer') + + if window % 2 == 0: + raise ValueError('The "window" must be an odd integer') + + if type(polynom) is not int: + raise TypeError('"polynom" must be an integer') + + if polynom >= window: + raise ValueError('"polynom" must be less than "window"') + + half_window = window // 2 + polynom += 1 + + # Initialize variables + A = np.empty((window, polynom)) # Matrix + tA = np.empty((polynom, window)) # Transposed matrix + t = np.empty(window) # Local x variables + y_smoothed = np.full(len(y), np.nan) + + # Start smoothing + for i in range(half_window, len(x) - half_window, 1): + # Center a window of x values on x[i] + for j in range(0, window, 1): + t[j] = x[i + j - half_window] - x[i] + + # Create the initial matrix A and its transposed form tA + for j in range(0, window, 1): + r = 1.0 + for k in range(0, polynom, 1): + A[j, k] = r + tA[k, j] = r + r *= t[j] + + # Multiply the two matrices + tAA = np.matmul(tA, A) + + # Invert the product of the matrices + tAA = np.linalg.inv(tAA) + + # Calculate the pseudoinverse of the design matrix + coeffs = np.matmul(tAA, tA) + + # Calculate c0 which is also the y value for y[i] + y_smoothed[i] = 0 + for j in range(0, window, 1): + y_smoothed[i] += coeffs[0, j] * y[i + j - half_window] + + # If at the end or beginning, store all coefficients for the polynom + if i == half_window: + first_coeffs = np.zeros(polynom) + for j in range(0, window, 1): + for k in range(polynom): + first_coeffs[k] += coeffs[k, j] * y[j] + elif i == len(x) - half_window - 1: + last_coeffs = np.zeros(polynom) + for j in range(0, window, 1): + for k in range(polynom): + last_coeffs[k] += coeffs[k, j] * y[len(y) - window + j] + + # Interpolate the result at the left border + for i in range(0, half_window, 1): + y_smoothed[i] = 0 + x_i = 1 + for j in range(0, polynom, 1): + y_smoothed[i] += first_coeffs[j] * x_i + x_i *= x[i] - x[half_window] + + # Interpolate the result at the right border + for i in range(len(x) - half_window, len(x), 1): + y_smoothed[i] = 0 + x_i = 1 + for j in range(0, polynom, 1): + y_smoothed[i] += last_coeffs[j] * x_i + x_i *= x[i] - x[-half_window - 1] + + return y_smoothed diff --git 
a/core/processes/template.py b/core/processes/template.py new file mode 100644 index 0000000000000000000000000000000000000000..6fad758f7be89e37d40d3ba15e7d6674e25ba56b --- /dev/null +++ b/core/processes/template.py @@ -0,0 +1,31 @@ +import numpy as np +import pandas as pd + +from agora.base import ParametersABC, ProcessABC + + +class TemplateParameters(ParametersABC): + """ + Parameters + """ + + def __init__( + self, + ): + super().__init__() + + @classmethod + def default(cls): + return cls.from_dict({}) + + +class Template(ProcessABC): + """ + Template for process class. + """ + + def __init__(self, parameters: TemplateParameters): + super().__init__(parameters) + + def run(self): + pass diff --git a/core/processor.py b/core/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..81c54b6f19f60df1316223df87ad8dbb883c1ef3 --- /dev/null +++ b/core/processor.py @@ -0,0 +1,316 @@ +import h5py +from typing import List, Dict, Union +from pydoc import locate + +import numpy as np +import pandas as pd + +from tqdm import tqdm + +from agora.base import ParametersABC +from pcore.io.writer import Writer +from pcore.io.signal import Signal + +from pcore.cells import Cells +from postprocessor.core.processes.merger import mergerParameters, merger +from postprocessor.core.processes.picker import pickerParameters, picker + + +class PostProcessorParameters(ParametersABC): + """ + Anthology of parameters used for postprocessing + :merger: + :picker: parameters for picker + :processes: Dict processes:[objectives], 'processes' are defined in ./processes/ + while objectives are relative or absolute paths to datasets. If relative paths the + post-processed addresses are used. + + """ + + def __init__( + self, + targets={}, + parameters={}, + outpaths={}, + ): + self.targets: Dict = targets + self.parameters: Dict = parameters + self.outpaths: Dict = outpaths + + def __getitem__(self, item): + return getattr(self, item) + + @classmethod + def default(cls, kind=None): + if kind == "defaults" or kind == None: + return cls( + targets={ + "prepost": { + "merger": "/extraction/general/None/area", + "picker": ["/extraction/general/None/area"], + }, + "processes": ( + ( + "bud_metric", + [ + "/extraction/general/None/volume", + "/extraction/em_ratio/np_max/mean", + "/extraction/em_ratio/np_max/median", + ], + ), + ( + "dsignal", + [ + "/extraction/general/None/volume", + "/extraction/em_ratio/np_max/mean", + "/extraction/em_ratio/np_max/median", + "/extraction/em_ratio_bgsub/np_max/mean", + "/extraction/em_ratio_bgsub/np_max/median", + "/postprocessing/bud_metric/extraction_general_None_volume", + "/postprocessing/bud_metric/extraction_em_ratio_np_max_mean", + "/postprocessing/bud_metric/extraction_em_ratio_np_max_median", + ], + ), + ( + "aggregate", + [ + [ + "/extraction/general/None/volume", + "/extraction/em_ratio/np_max/mean", + "/extraction/em_ratio/np_max/median", + "/extraction/em_ratio_bgsub/np_max/mean", + "/extraction/em_ratio_bgsub/np_max/median", + "/extraction/gsum/np_max/median", + "/extraction/gsum/np_max/mean", + "postprocessing/bud_metric/extraction_general_None_volume", + "postprocessing/bud_metric/extraction_em_ratio_np_max_mean", + "postprocessing/bud_metric/extraction_em_ratio_np_max_median", + "postprocessing/dsignal/extraction_general_None_volume", + "postprocessing/dsignal/postprocessing_bud_metric_extraction_general_None_volume", + "postprocessing/dsignal/postprocessing_bud_metric_extraction_em_ratio_np_max_median", + 
"postprocessing/dsignal/postprocessing_bud_metric_extraction_em_ratio_np_max_mean", + ] + ], + ), + # "savgol": ["/extraction/general/None/area"], + ), + }, + parameters={ + "prepost": { + "merger": mergerParameters.default(), + "picker": pickerParameters.default(), + } + }, + outpaths={"aggregate": "/postprocessing/experiment_wide/aggregated/"}, + ) + + def to_dict(self): + return {k: _if_dict(v) for k, v in self.__dict__.items()} + + +class PostProcessor: + def __init__(self, filename, parameters): + self.parameters = parameters + self._filename = filename + self._signal = Signal(filename) + self._writer = Writer(filename) + + # self.outpaths = parameters["outpaths"] + self.merger = merger(parameters["parameters"]["prepost"]["merger"]) + + self.picker = picker( + parameters=parameters["parameters"]["prepost"]["picker"], + cells=Cells.from_source(filename), + ) + self.classfun = { + process: self.get_process(process) + for process, _ in parameters["targets"]["processes"] + } + self.parameters_classfun = { + process: self.get_parameters(process) + for process, _ in parameters["targets"]["processes"] + } + self.targets = parameters["targets"] + + @staticmethod + def get_process(process): + """ + Dynamically import a process class from the 'processes' folder. + Assumes process filename and class name are the same + """ + return locate("postprocessor.core.processes." + process + "." + process) + + @staticmethod + def get_parameters(process): + """ + Dynamically import a process class from the 'processes' folder. + Assumes process filename and class name are the same + """ + return locate( + "postprocessor.core.processes." + process + "." + process + "Parameters" + ) + + def run_prepost(self): + """Important processes run before normal post-processing ones""" + + merge_events = self.merger.run(self._signal[self.targets["prepost"]["merger"]]) + + with h5py.File(self._filename, "r") as f: + prev_idchanges = self._signal.get_merges() + + changes_history = list(prev_idchanges) + [np.array(x) for x in merge_events] + self._writer.write("modifiers/merges", data=changes_history) + + with h5py.File(self._filename, "a") as f: # TODO Remove this once done tweaking + if "modifiers/picks" in f: + del f["modifiers/picks"] + + indices = self.picker.run(self._signal[self.targets["prepost"]["picker"][0]]) + from collections import Counter + + mothers, daughters = np.array(self.picker.mothers), np.array( + self.picker.daughters + ) + self.tmp = [y for y in Counter([tuple(x) for x in mothers]).items() if y[1] > 2] + self._writer.write( + "postprocessing/lineage", + data=pd.MultiIndex.from_arrays( + np.append(mothers, daughters[:, 1].reshape(-1, 1), axis=1).T, + names=["trap", "mother_label", "daughter_label"], + ), + overwrite="overwrite", + ) + + # apply merge to mother-daughter + moset = set([tuple(x) for x in mothers]) + daset = set([tuple(x) for x in daughters]) + picked_set = set([tuple(x) for x in indices]) + with h5py.File(self._filename, "a") as f: + merge_events = f["modifiers/merges"][()] + merged_moda = set([tuple(x) for x in merge_events[:, 0, :]]).intersection( + set([*moset, *daset, *picked_set]) + ) + search = lambda a, b: np.where( + np.in1d( + np.ravel_multi_index(a.T, a.max(0) + 1), + np.ravel_multi_index(b.T, a.max(0) + 1), + ) + ) + + for target, source in merge_events: + if ( + tuple(source) in moset + ): # update mother to lowest positive index among the two + mother_ids = search(mothers, source) + mothers[mother_ids] = ( + target[0], + self.pick_mother(mothers[mother_ids][0][1], 
target[1]), + ) + if tuple(source) in daset: + daughters[search(daughters, source)] = target + if tuple(source) in picked_set: + indices[search(indices, source)] = target + + self._writer.write( + "postprocessing/lineage_merged", + data=pd.MultiIndex.from_arrays( + np.append(mothers, daughters[:, 1].reshape(-1, 1), axis=1).T, + names=["trap", "mother_label", "daughter_label"], + ), + overwrite="overwrite", + ) + + self._writer.write( + "modifiers/picks", + data=pd.MultiIndex.from_arrays( + indices.T, + names=["trap", "cell_label"], + ), + overwrite="overwrite", + ) + + @staticmethod + def pick_mother(a, b): + """Update the mother id following these priorities: + + Keep the lower id if both are positive; otherwise keep the nonzero one. + """ + x = max(a, b) + if min([a, b]): + x = [a, b][np.argmin([a, b])] + return x + + def run(self): + from itertools import takewhile + + self.run_prepost() + + for process, datasets in tqdm(self.targets["processes"]): + if process in self.parameters["parameters"].get( + "processes", {} + ): # If we assigned parameters + parameters = self.parameters_classfun[process](self.parameters[process]) + + else: + parameters = self.parameters_classfun[process].default() + + loaded_process = self.classfun[process](parameters) + for dataset in datasets: + # print("Processing", process, "for", dataset) + + if isinstance(dataset, list): # multisignal process + signal = [self._signal[d] for d in dataset] + elif isinstance(dataset, str): + signal = self._signal[dataset] + else: + raise TypeError("Incorrect dataset") + + result = loaded_process.run(signal) + + if process in self.parameters.to_dict()["outpaths"]: + outpath = self.parameters.to_dict()["outpaths"][process] + elif isinstance(dataset, list): + # If no outpath is defined, place the result under the longest + # common prefix of all signals used + prefix = "".join( + c[0] + for c in takewhile( + lambda x: all(x[0] == y for y in x), zip(*dataset) + ) + ) + outpath = ( + prefix + + "_".join( # TODO check that it always finishes in '/' + [d[len(prefix) :].replace("/", "_") for d in dataset] + ) + ) + elif isinstance(dataset, str): + outpath = dataset[1:].replace("/", "_") + else: + raise ValueError(f"Outpath not defined for {type(dataset)}") + + if process not in self.parameters.to_dict()["outpaths"]: + outpath = "/postprocessing/" + process + "/" + outpath + + if isinstance(result, dict): # Multiple signals as output + for k, v in result.items(): + self.write_result( + outpath + f"/{k}", + v, + metadata={}, + ) + else: + self.write_result( + outpath, + result, + metadata={}, + ) + + def write_result( + self, path: str, result: Union[List, pd.DataFrame, np.ndarray], metadata: Dict + ): + self._writer.write(path, result, meta=metadata) + + +def _if_dict(item): + if hasattr(item, "to_dict"): + item = item.to_dict() + return item diff --git a/examples/basic_processes.py b/examples/basic_processes.py new file mode 100644 index 0000000000000000000000000000000000000000..c322fd4b68b2dd4f310a44bcd2a4ccbebb366f48 --- /dev/null +++ b/examples/basic_processes.py @@ -0,0 +1,15 @@ +from postprocessor.core.processor import PostProcessor, PostProcessorParameters + +params = PostProcessorParameters.default() +pp = PostProcessor( + "/shared_libs/pipeline-core/scripts/pH_calibration_dual_phl__ura8__by4741__01/ph_5_29_025store.h5", + params, +) +tmp = pp.run() + +import h5py + +# f = h5py.File( +# "/shared_libs/pipeline-core/scripts/pH_calibration_dual_phl__ura8__by4741__01/ph_5_29_025store.h5", +# "a", +# ) diff --git a/examples/group.py b/examples/group.py new file mode 100644 index 
0000000000000000000000000000000000000000..1e59978acca5cd34c7796cb65d93d95d1657773c --- /dev/null +++ b/examples/group.py @@ -0,0 +1,21 @@ +from pathlib import Path + +from postprocessor.core.group import GroupParameters, Group + +poses = [ + x.name.split("store")[0] + for x in Path( + "/shared_libs/pipeline-core/scripts/data/ph_calibration_dual_phl_ura8_5_04_5_83_7_69_7_13_6_59__01" + ).rglob("*") + if x.name != "images.h5" +] + +gr = Group( + GroupParameters( + signals=["/extraction/general/None/area", "/extraction/mCherry/np_max/median"] + ) +) +gr.run( + central_store="/shared_libs/pipeline-core/scripts/data/ph_calibration_dual_phl_ura8_5_04_5_83_7_69_7_13_6_59__01", + poses=poses, +) diff --git a/examples/testing.py b/examples/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..d5420d8337710e6a41f47a7cec436ed2a765b5f0 --- /dev/null +++ b/examples/testing.py @@ -0,0 +1,220 @@ +from typing import Dict, List, Union +import re + +import numpy as np +import pandas as pd +from pandas import Series, DataFrame +from sklearn.cluster import KMeans +from matplotlib import pyplot as plt +import seaborn as sns + +import compress_pickle + +from postprocessor.core.postprocessor import PostProcessor +from postprocessor.core.tracks import get_avg_grs, non_uniform_savgol +from postprocessor.core.tracks import clean_tracks, merge_tracks, join_tracks +from postprocessor.core.ph import * + +sns.set_context("talk", font_scale=1.8) +sns.set_theme(style="whitegrid") + +# pp_c = PostProcessor(source=19920) # 19916 +# pp_c.load_tiler_cells() +# # f = "/home/alan/Documents/libs/extraction/extraction/examples/gluStarv_2_0_x2_dual_phl_ura8_00/extraction" +# # f = "/home/alan/Documents/tmp/pH_calibration_dual_phl__ura8__by4741__01/extraction/" +# f = "/home/alan/Documents/tmp/pH_calibration_dual_phl__ura8__by4741_Alan4_00/extraction/" +# pp_c.load_extraction(f) + +# calib = process_phs(pp_c) +# # c = calib.loc[(5.4 < calib["ph"]) & (calib["ph"] < 8)].groupby("ph").mean() +# sns.violinplot(x="ph", y="em_ratio_mean", data=calib) +# plt.show() + + +# bring timelapse data and convert it to pH + +pp = PostProcessor(source=19831) # 19831 +pp.load_tiler_cells() +f = "/home/alan/Documents/tmp/gluStarv_2_0_x2_dual_phl_ura8_00/extraction/" +# f = "/home/alan/Documents/tmp/downUpshift_2_0_2_glu_dual_phluorin__glt1_psa1_ura7__thrice_00/extraction/" +pp.load_extraction(f) + +import compress_pickle + +compress_pickle.dump(pp.extraction, "/home/alan/extraction_example.pkl") + +if True: # Load extracted data or pkld version + new_dfs = compress_pickle.load("/home/alan/Documents/tmp/new_dfs.gz") +# Combine dataframes +else: + new_dfs = combine_dfs(get_dfs(pp)) + # t = [x.index for x in new_dfs.values()] + # i = set(t[0]) + # for j in t: + # i = i.intersection(j) + new_dfs = { + k: v + for k, v in new_dfs.items() + if k[2] != "imBackground" + and k[2] != "median" + and ~(((k[0] == "GFPFast") | (k[0] == "pHluorin405")) and k[2] == "max2p5pc") + } + +del pp +compress_pickle.dump(new_dfs, "/home/alan/Documents/tmp/new_dfs.gz") + + +def get_clean_dfs(dfs=None): + if dfs is None: + clean_dfs = compress_pickle.load("/home/alan/Documents/tmp/clean_dfs.gz") + else: + + # Clean timelapse + clean = clean_tracks(new_dfs[("general", None, "area")]) + tra, joint = merge_tracks(clean) + clean_dfs = new_dfs + i_ids = set(clean.index).intersection( + clean_dfs[("general", None, "area")].index + ) + clean_dfs = {k: v.loc[i_ids] for k, v in clean_dfs.items()} + clean_dfs = {k: join_tracks(v, joint, drop=True) for k, v in 
clean_dfs.items()} + + del new_dfs + compress_pickle.dump(clean_dfs, "/home/alan/Documents/tmp/clean_dfs.gz") + + +def plot_ph_hmap(clean_dfs): + GFPFast = clean_dfs[("GFPFast", np.maximum, "mean")] + phluorin = clean_dfs[("pHluorin405", np.maximum, "mean")] + ph = GFPFast / phluorin + ph = ph.loc[ph.notna().sum(axis=1) > 0.7 * ph.shape[1]] + ph = 1 / ph + + fig, ax = plt.subplots() + hmap(ph, cbar_kws={"label": r"emission ratio $\propto$ pH"}) + plt.xlabel("Time (hours)") + plt.ylabel("Cells") + xticks = plt.xticks(fontsize=15)[0] + ax.set(yticklabels=[], xticklabels=[str(round(i * 5 / 60, 1)) for i in xticks]) + # plt.setp(ax.get_xticklabels(), Rotation=90) + plt.show() + + +def fit_calibs(c, h): + h = process_phs(pp_c) + h["ratio"] = h["GFPFast_bgsub_median"] / h["pHluorin405_bgsub_median"] + sns.lineplot(x="ph", y="ratio", data=h, err_style="bars") + plt.show() + + calibs = fit_calibration(c) + for k, params in calibs.items(): + i, j = ("_".join(k.split("_")[:-1]), k.split("_")[-1]) + if j == "mean" and "k2" not in k: + clean_dfs[k] = objective(clean_dfs[i, np.maximum, j], *params) + + +# max 2.5% / med +def plot_ratio_vs_max2p5(h): + fig, ax = plt.subplots() + sns.regplot( + x="em_ratio_median", + y="mCherry_max2p5pc", + data=h, + scatter=False, + ax=ax, + color="teal", + ) + sns.scatterplot(x="em_ratio_median", y="max5_d_med", data=h, hue="ph", ax=ax) + plt.xlabel(r"Fluorescence ratio $R \propto (1/pH)$") + plt.ylabel("Max 2.5% px / median") + plt.show() + + +em = clean_dfs[("em_ratio", np.maximum, "mean")] +area = clean_dfs[("general", None, "area")] + + +def get_grs(clean_dfs): + area = clean_dfs[("general", None, "area")] + area = area.loc[area.notna().sum(axis=1) > 10] + return area.apply(growth_rate, axis=1) + + +def get_agg(dfs, rng): + # df dict of DataFrames containing an area/vol one TODO generalise this beyond area + # rng tuple of section to use + grs = get_grs(dfs) + smooth = grs.loc(axis=1)[list(range(rng[0], rng[1]))].dropna(how="all") + + aggregate_mean = lambda dfs, rng: pd.concat( + { + k[0] + "_" + k[2]: dfs[k].loc[smooth.index, rng[0] : rng[1]].mean(axis=1) + for k in clean_dfs.keys() + }, + axis=1, + ) + # f_comp_df = comp_df.loc[(comp_df["gr"] > 0) & (area.notna().sum(axis=1) > 50)] + + agg = aggregate_mean(dfs, rng) + agg["max2_med"] = agg["mCherry_max2p5pc"] / agg["mCherry_mean"] + + for c in agg.columns: + agg[c + "_log"] = np.log(agg[c]) + + agg["gr_mean"] = smooth.loc[set(agg.index).intersection(smooth.index)].mean(axis=1) + agg["gr_max"] = smooth.loc[set(agg.index).intersection(smooth.index)].max(axis=1) + + return agg + + +def plot_scatter_fit(x, y, data, hue=None, xlabel=None, ylabel=None, ylim=None): + fig, ax = plt.subplots() + sns.regplot(x=x, y=y, data=data, scatter=False, ax=ax) + sns.scatterplot(x=x, y=y, data=data, ax=ax, alpha=0.1, hue=hue) + # plt.show() + if xlabel is not None: + plt.xlabel(xlabel) + if ylabel is not None: + plt.ylabel(ylabel) + if ylim is not None: + plt.ylim(ylim) + + fig.savefig( + "/home/alan/Documents/sync_docs/drafts/third_year_pres/figs/" + + str(len(data)) + + "_" + + x + + "_vs_" + + y + + ".png", + dpi=200, + ) + + +from extraction.core.argo import Argo, annot_from_dset + + +def additional_feats(aggs): + aggs["gr_mean_norm"] = aggs["gr_mean"] * 12 + aggs["log_ratio_r"] = np.log(1 / aggs["em_ratio_mean"]) + return aggs + + +def compare_methods_ph_calculation(dfs): + GFPFast = dfs[("GFPFast", np.maximum, "mean")] + phluorin = dfs[("pHluorin405", np.maximum, "mean")] + ph = GFPFast / phluorin + + sns.scatterplot( + 
dfs["em_ratio", np.maximum, "mean"].values.flatten(), + ph.values.flatten(), + alpha=0.1, + ) + plt.xlabel("ratio_median") + plt.ylabel("median_ratio") + plt.title("Comparison of pH calculation") + plt.show() + + +# Count the number of value changes in a boolean list +nchanges = lambda x: sum(i != j for i, j in zip(x[:-1], x[1:]))
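+ + +# Quick illustrative check of the change-counting helper above; the input list +# is made-up data rather than anything taken from an experiment. +assert nchanges([True, True, False, False, True]) == 2  # two transitions: T->F and F->T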