diff --git a/correct_buds.py b/correct_buds.py index 53ffa1f472976919d16dfe859802f702da4fc919..16aa6b6772d51637945b022fb2d7a04c84e53a5f 100644 --- a/correct_buds.py +++ b/correct_buds.py @@ -7,7 +7,8 @@ min_cell_cycle_in_tps = 3 def drop_label(df, label="mother_label"): """Drop label from index of data frame.""" - df = df.reset_index(label).drop(columns=[label]) + if label in df.index.names: + df = df.reset_index(label).drop(columns=[label]) return df @@ -65,10 +66,14 @@ def interp_bud_volume(ind, buddings, bud_volume, figs=True): # find last non-NaN data point ib_real = np.max(np.where(~np.isnan(bud_vol))[0]) t_bv = bud_t[: ib_real + 1][~np.isnan(bud_vol[: ib_real + 1])] - v_bv = bud_vol[: ib_real + 1][~np.isnan(bud_vol[: ib_real + 1])] + v_bv = bud_vol[: ib_real + 1][ + ~np.isnan(bud_vol[: ib_real + 1]) + ] int_bud = interp1d(t_bv, v_bv) # interpolate missing NaN within bud time series - new_bv[ib : ib + ib_real + 1] = int_bud(t[ib : ib + ib_real + 1]) + new_bv[ib : ib + ib_real + 1] = int_bud( + t[ib : ib + ib_real + 1] + ) # plot if figs: plot_buds(ind, buddings, bud_volume, new_bv) diff --git a/dataloader.py b/dataloader.py index e666538e60625792ba18da25a78639697c4d521e..caff02186f51588eed6044cb07c4665adb2b31d6 100644 --- a/dataloader.py +++ b/dataloader.py @@ -4,6 +4,8 @@ from pathlib import Path import numpy as np import pandas as pd +from wela.correct_buds import correct_buds + try: from postprocessor.grouper import NameGrouper except ModuleNotFoundError: @@ -14,6 +16,8 @@ class dataloader: """ Compile aliby data sets into a single extensible data frame. + Buddings and bud_volume are always loaded first. + Parameters ---------- h5dir: string @@ -66,7 +70,7 @@ class dataloader: """ def __init__(self, h5dir=None, wdir=".", ls=True): - # from grouper.siglist to abbrevations + # from grouper.signals to abbrevations self.g2a_dict = { "extraction/cy5/max/median": "cy5", "extraction/GFP/max/median": "median_GFP", @@ -79,11 +83,15 @@ class dataloader: "extraction/mCherry/max/max5px_med": "max5px_med_mCherry", "extraction/general/None/area": "area", "extraction/general/None/volume": "volume", - "postprocessing/buddings/extraction_general_None_volume": "buddings", - "postprocessing/bud_metric/extraction_general_None_volume": "bud_volume", "extraction/Flavin_bgsub/max/mean": "flavin", "extraction/GFP/max/nuc_est_conv": "GFP_nuc_est_conv", } + self.buddings_path = ( + "postprocessing/buddings/extraction_general_None_volume" + ) + self.bud_volume_path = ( + "postprocessing/bud_metric/extraction_general_None_volume" + ) # from abbreviations to grouper.siglist self.a2g_dict = {v: k for (k, v) in self.g2a_dict.items()} # establish paths @@ -144,6 +152,8 @@ class dataloader: def load( self, dataname, + key_index="buddings", + cutoff=0.8, extra_g2a_dict=None, pxsize=0.182, use_tsv=False, @@ -162,15 +172,20 @@ class dataloader: ---------- dataname: string Either the name of the directory for an experiment or a - file name for a dataset + file name for a dataset. + key_index: string + Short name for key record that will be used to select cells. + cut_off: float + Select cells for key record that remain for at least cutoff + fraction of the experiment's total duration. g2a_dict: dictionary (optional) A dictionary of extra parameters to extract, which relates the aliby name to an abbreviation - pxsize: float (default 0.182) + pxsize: float Pixel size in microns, which is used to convert volumes. - use_tsv: boolean (default: False) + use_tsv: boolean If True, always load the data from a tsv file. - over_write: boolean (default: False) + over_write: boolean If True, overwrite the internal dictionary with extra_g2a_dict. Returns @@ -181,14 +196,13 @@ class dataloader: Example ------- >>> dl.load(dataname, {"extraction/cy5/max/median" : "cy5"}) + >>> dl.load(dataname, key_index="flavin", cutoff= 0.9) """ if dataname[-4:] == ".tsv": dataname = dataname[:-4] use_tsv = True # root name self.dataname = dataname - - # load if use_tsv: self.load_tsv(dataname) else: @@ -201,6 +215,12 @@ class dataloader: grouper = self.get_grouper(dataname) # find time interval between images self.dt = grouper.tintervals + # get key index for choosing cells + if key_index == "buddings": + key_index_path = self.buddings_path + else: + key_index_path = self.a2g_dict[key_index] + key_index = self.get_key_index(grouper, key_index_path, cutoff) # load data from h5 files print(dataname + "\n---") print("signals available:") @@ -208,20 +228,22 @@ class dataloader: for signal in signals: print(" ", signal) print("\nloading...") - first = True + # load and correct buddings and bud_volume + r_df = self.load_buddings_bud_volume( + grouper, cutoff, figs=False, key_index=key_index + ) + # load other signals for i, char in enumerate(self.g2a_dict): if char in signals: print(" " + char) - record = grouper.concat_signal(char) - # convert to long dataframe - tdf = self._long_df_with_id(record, self.g2a_dict[char]) - # drop redundant columns - tdf = tdf.drop(["position", "trap", "cell_label"], axis=1) - if first: - r_df = tdf.copy() - first = False - else: - r_df = pd.merge(r_df, tdf, how="left") + # load all cells + record = grouper.concat_signal(char, cutoff=0) + # keep cells only in key_index + new_record = self.get_key_cells(record, key_index) + # convert to long data frame + tdf = self.long_df_with_id(new_record, self.g2a_dict[char]) + # merge into one data set + r_df = pd.merge(r_df, tdf, how="left") else: print(" Warning: " + char, "not found") if "r_df" in locals(): @@ -244,8 +266,46 @@ class dataloader: # return grouper return grouper + def get_key_index(self, grouper, key_index_path, cutoff): + """Find index of cells that have sufficient measurements of the + key record.""" + record = grouper.concat_signal(key_index_path, cutoff=cutoff) + return record.index + + def get_key_cells(self, df, key_index): + """Find a smaller data frame with only cells present in the key record.""" + sdf = df.loc[df.index.intersection(key_index)] + return sdf + + def load_buddings_bud_volume(self, grouper, cutoff, figs, key_index): + """Load buddings and bud_volume, interpolate over NaNs, and + drop adjacent buddings.""" + # load buddings and all bud_volumes + buddings = grouper.concat_signal(self.buddings_path, cutoff=cutoff) + bud_volume = grouper.concat_signal(self.bud_volume_path, cutoff=0) + # perform correction + new_buddings, new_bud_volume = correct_buds(buddings, bud_volume, figs) + # keep cells only in key_index + new_buddings = self.get_key_cells(new_buddings, key_index) + new_bud_volume = self.get_key_cells(new_bud_volume, key_index) + # convert to long df and merge + df = self.long_df_with_id(new_buddings, "buddings") + df = pd.merge( + df, + self.long_df_with_id(new_bud_volume, "bud_volume"), + how="left", + ) + return df + def load_tsv(self, dataname): - # load data from tsv files + """ + Load data from tsv files. + + Parameters + ---------- + dataname: string + Name of the file + """ r_df = pd.read_csv(str(self.wdirpath / (dataname + ".tsv")), sep="\t") print("\nloaded", dataname) if hasattr(self, "r"): @@ -272,7 +332,7 @@ class dataloader: ) print("Saved", dataname) - def _long_df_with_id(self, df, char_name): + def long_df_with_id(self, df, char_name): """Convert an aliby multi-index dataframe into a long dataframe.""" df = self.long_df(df, char_name) # add unique id for each cell @@ -290,12 +350,14 @@ class dataloader: + ";" + df["trap"].astype(str), ) + # drop redundant columns + df = df.drop(["position", "trap", "cell_label"], axis=1) return df @property def revert_df(self): """ - Convert the 'id' column into the standard three columns used by aliby. + Convert the 'id' column into the standard columns used by aliby. These columns are 'position', 'trap', and 'cell_label' - and vice versa, either adding three columns to the .df dataframe or removing @@ -318,7 +380,7 @@ class dataloader: # drop columns redundant with "id" self.df = self.df.drop(["position", "trap", "cell_label"], axis=1) else: - print("Cannot revert") + print("Cannot revert.") def wide_df(self, signal, x="time", y="id"): """ @@ -392,7 +454,7 @@ class dataloader: # create a dataframe using the structure of a pivot df = pd.DataFrame(values, index=wdf.index, columns=wdf.columns) # convert to long dataframe - tdf = self._long_df_with_id(df, signal) + tdf = self.long_df_with_id(df, signal) # add to r self.df = pd.merge(self.df, tdf, how="left") @@ -423,13 +485,13 @@ class dataloader: Examples -------- - To sort URA7 cells by the sum of their number of births after 10 hours + To sort URA7 cells by the sum of their number of births after 10 hours: >>> df = dl.get_sorted_df("births", tmax=10, strain="URA7") - To sort cells by their median fluorescence + To sort cells by their median fluorescence: >>> df = dl.get_sorted_df("median_GFP") - To convert "id" back from a categorical data type + To convert "id" back from a categorical data type: >>> df.id = df.id.astype("object") """ if strain: @@ -459,9 +521,7 @@ class dataloader: @property def strains(self): - """ - List all strains in df.group - """ + """List all strains in df.group.""" for strain in list(self.df.group.unique()): print(" " + strain) @@ -486,7 +546,7 @@ class dataloader: @staticmethod def long_df(df, char_name, var_name="time"): - """Convert an aliby multi-index dataframe into a long dataframe.""" + """Convert a multi-index dataframe into a long dataframe.""" # flatten multi-index dataframe df = df.copy() if isinstance(df, pd.Series):