diff --git a/autocrosscorr.py b/autocrosscorr.py index 779589ec3f4709e54bef3d184bcac3239fd440e3..490556c081bd79d4191708ed0de486775977f52f 100644 --- a/autocrosscorr.py +++ b/autocrosscorr.py @@ -28,7 +28,7 @@ def autocrosscorr( yB: array (required for cross-correlation only) An array of signal values, with each row a replicate measurement and each column a time point. - stationary: booelan + stationary: boolean If True, the underlying dynamic process is assumed to be stationary with the mean a constant, estimated from all data points. diff --git a/dataloader.py b/dataloader.py index 16d621872773655e11329f19552e2a7f9294033f..7127279e4f0ca3b4ec326dfe8cf968f9f0e9c647 100644 --- a/dataloader.py +++ b/dataloader.py @@ -1,4 +1,3 @@ -# pip install --upgrade aliby-post import numpy as np import pandas as pd from pathlib import Path @@ -7,16 +6,18 @@ import pprint try: from postprocessor.grouper import NameGrouper except ModuleNotFoundError: - import sys + try: + import sys - sys.path.append("/Users/pswain/wip/postprocessor") - sys.path.append("/Users/pswain/wip/agora") - from postprocessor.grouper import NameGrouper + sys.path.append("/Users/pswain/wip/aliby/src") + from postprocessor.grouper import NameGrouper + except ModuleNotFoundError: + print("Can only load tsv files - cannot find postprocessor") class dataloader: """ - Compiles aliby datasets into a single, extensible data frame + Compile aliby data sets into a single, extensible data frame. Parameters ---------- @@ -25,13 +26,15 @@ class dataloader: directory per experiment outdir: string The working directory where .tsv files will be stored + ls: boolean + If True, list files and directories available. Example ------- >>> from dataloader import dataloader >>> dl = dataloader(indir="experiments", outdir=".") - The names of experiments and datasets are stored in dictionaries + The names of experiments and data sets are stored in dictionaries to avoid cutting and pasting of names. To load an experiment from the data directory, first use, @@ -45,7 +48,7 @@ class dataloader: which will load the data from h5 files. - Once the loading has been complete, the data will be saved in a + Once the loading has been complete, the data is saved in a text file - a .tsv file - in the working directory. To load this file in future use, for example @@ -60,39 +63,38 @@ class dataloader: >>> dl.df - which can be plotted directly with seaborn + which can be plotted directly with seaborn. 
>>> import seaborn as sns >>> sns.relplot(x="time", y="median_GFP", hue="group", kind="line", data=dl.df) """ - def __init__(self, indir=".", outdir="."): + def __init__(self, indir=".", outdir=".", ls=True): # from grouper.siglist to abbrevations self.g2a_dict = { - "extraction/GFP/np_max/median": "median_GFP", - "extraction/GFP/np_max/mean": "mean_GFP", + "extraction/GFP/max/median": "median_GFP", + "extraction/GFP/max/mean": "mean_GFP", "extraction/general/None/area": "area", "extraction/general/None/volume": "volume", - "extraction/GFP/np_max/max5px_med": "max5px_med", - "postprocessing/births/extraction_general_None_volume": "births", - "postprocessing/bud_metric/extraction_general_None_volume": - "bud_volume", - "extraction/Flavin_bgsub/np_max/mean": "flavin", + "extraction/GFP/max/max5px_med": "max5px_med", + "postprocessing/buddings/extraction_general_None_volume": "buddings", + "postprocessing/bud_metric/extraction_general_None_volume": "bud_volume", + "extraction/Flavin_bgsub/max/mean": "flavin", + "extraction/GFP/max/nuc_est_conv": "GFP_nuc_est_conv", } # from abbreviations to grouper.siglist self.a2g_dict = {v: k for (k, v) in self.g2a_dict.items()} # establish paths self.outdirpath = Path(outdir) self.indirpath = Path(indir) - self.ls - - ### + if ls: + self.ls @property def ls(self): """ - Lists the experiments available in indir and the datasets + List the experiments available in indir and the datasets available in outdir. The names of all directories in indir are stored in the @@ -117,11 +119,9 @@ class dataloader: self.datasets = {i: name for i, name in enumerate(files)} pp.pprint(self.datasets) - ### - def get_grouper(self, dataname): """ - Returns an instance of NameGrouper. + Return an instance of NameGrouper. Arguments --------- @@ -137,8 +137,6 @@ class dataloader: grouper = NameGrouper(self.indirpath / dataname) return grouper - ### - def load( self, dataname, @@ -148,8 +146,9 @@ class dataloader: use_tsv=False, ): """ - Loads either an experiment or a dataset into a long data - frame - with 'time' as a column. + Load either an experiment or a data set into a long data frame. + + The 'time' variable becomes a column. New data is added to the existing dataframe. 
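+
+        Example
+        -------
+        For a data set already saved as a tsv file in the working directory,
+
+        >>> dl.load(dl.datasets[0], use_tsv=True)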
@@ -186,10 +185,10 @@ class dataloader: # load if use_tsv: # load data from tsv files - print("loading", dataname) r_df = pd.read_csv( str(self.outdirpath / (dataname + ".tsv")), sep="\t" ) + print("loaded", dataname) if hasattr(self, "r"): # merge new data to current dataframe self.df = pd.merge(self.df, r_df, how="left") @@ -207,17 +206,17 @@ class dataloader: # load data from h5 files print(dataname + "\n---") print("signals available:") - signals = grouper.siglist + signals = grouper.available for signal in signals: - print(" ", signal[1:]) + print(" ", signal) print("\nloading...") first = True for i, char in enumerate(g2a_dict): - if "/" + char in signals: + if char in signals: print(" " + char) - hdf = grouper.concat_signal(char) + record = grouper.concat_signal(char) # convert to long dataframe - tdf = self._long_df(hdf, g2a_dict[char]) + tdf = self._long_df_with_id(record, g2a_dict[char]) # drop redundant columns tdf = tdf.drop(["position", "trap", "cell_label"], axis=1) if first: @@ -227,31 +226,32 @@ class dataloader: r_df = pd.merge(r_df, tdf, how="left") else: print(" Warning: " + char, "not found") - if pxsize: - # volumes to micron^3 - for signal in r_df.columns: - if "volume" in signal: - r_df[signal] *= pxsize ** 3 - if hasattr(self, "r"): - # merge new data to current dataframe - self.df = pd.merge(self.df, r_df, how="left") + if "r_df" in locals(): + if pxsize: + # volumes to micron^3 + for signal in r_df.columns: + if "volume" in signal: + r_df[signal] *= pxsize**3 + if hasattr(self, "df"): + # merge new data to current dataframe + self.df = pd.merge(self.df, r_df, how="left") + else: + # create new dataframe + self.df = r_df + # save dataframe + if save: + self.save(dataname) else: - # create new dataframe - self.df = r_df - # save dataframe - if save: - self.save(dataname) + raise NameError("Dataloader: No data loaded.") # define ids self.ids = list(self.df.id.unique()) if not use_tsv: # return grouper return grouper - ### - def save(self, dataname=None): """ - Saves the .df dataframe to a tab separated value (tsv) file + Save the .df dataframe to a tab separated value (tsv) file. Parameters ---------- @@ -265,20 +265,9 @@ class dataloader: ) print("Saved", dataname) - ### - - def _long_df(self, df, char_name): - """ - Internal function: converts an aliby multi-index dataframe into a - long dataframe - """ - # flatten multi-index dataframe - df = df.copy() - hnames = df.index.names - df.index = df.index.set_names(hnames) - df.reset_index(inplace=True) - # melt to create time column - df = df.melt(id_vars=hnames, var_name="time", value_name=char_name) + def _long_df_with_id(self, df, char_name): + """Convert an aliby multi-index dataframe into a long dataframe.""" + df = self.long_df(df, char_name) # add unique id for each cell if ( "position" in df.columns @@ -296,15 +285,14 @@ class dataloader: ) return df - ### - @property def revert_df(self): """ - Converts the 'id' column into the standard three columns used by - aliby - 'position', 'trap', and 'cell_label' - and vice versa, - either adding three columns to the .df dataframe or removing - three columns + Convert the 'id' column into the standard three columns used by aliby. + + These columns are 'position', 'trap', and 'cell_label' - and vice + versa, either adding three columns to the .df dataframe or removing + three columns. 
""" if ( "position" not in self.df.columns @@ -325,11 +313,9 @@ class dataloader: else: print("Cannot revert") - ### - def wide_df(self, signal, x="time", y="id"): """ - Pivots the .df dataframe to return a standard aliby dataframe + Pivot the .df dataframe to return a standard aliby dataframe. Parameters ---------- @@ -346,12 +332,10 @@ class dataloader: """ return self.df.pivot(y, x, signal) - ### - def get_time_series(self, signal, group=None): """ Extract all the data for a particular signal as - a 2D array with each row a time series + a 2D array with each row a time series. Parameters ---------- @@ -383,7 +367,7 @@ class dataloader: def put_time_series(self, values, signal): """ Insert a 2D array of data with each column a time series - into the dataframe + into the dataframe. Parameters ---------- @@ -401,17 +385,15 @@ class dataloader: # create a dataframe using the structure of a pivot df = pd.DataFrame(values, index=wdf.index, columns=wdf.columns) # convert to long dataframe - tdf = self._long_df(df, signal) + tdf = self._long_df_with_id(df, signal) # add to r self.df = pd.merge(self.df, tdf, how="left") - ### - def sort_df( self, signal, functionstr="sum", tmin=None, tmax=None, strain=None ): """ - Returns a sorted dataframe. + Return a sorted dataframe. Parameters ---------- @@ -462,7 +444,6 @@ class dataloader: # return sorted dataframe return df - #### @property def strains(self): """ @@ -471,11 +452,9 @@ class dataloader: for strain in list(self.df.group.unique()): print(" " + strain) - #### - def get_lineage_data(self, idx, signals): """ - Returns signals for a single lineage specified by idx, which is + Return signals for a single lineage specified by idx, which is specified by an element of self.ids. Arguments @@ -491,3 +470,17 @@ class dataloader: self.df[self.df.id == idx][signal].to_numpy() for signal in signals ] return res + + @staticmethod + def long_df(df, char_name, var_name="time"): + """Convert an aliby multi-index dataframe into a long dataframe.""" + # flatten multi-index dataframe + df = df.copy() + if isinstance(df, pd.Series): + df = df.unstack() + hnames = df.index.names + df.index = df.index.set_names(hnames) + df.reset_index(inplace=True) + # melt to create time column + df = df.melt(id_vars=hnames, var_name=var_name, value_name=char_name) + return df diff --git a/entropy.py b/entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..f1a4b251fc0f1f0e36ac9c3ce76acaf9f788c644 --- /dev/null +++ b/entropy.py @@ -0,0 +1,136 @@ +import numpy as np + + +def old_entropy_time_series(data): + """ + Finds the entropy of a time-series assuming that it is a multi-variate + Gaussian with one variable for each time point - a Gaussian process. + + Note that errors cannot be estimated using bootstrapping because having + repeated samples always reduces the entropy. 
+ + Parameters + ---------- + data: array + A 2D array of the data with different time points as rows + and rows as columns + + Returns + ------- + gaussianentropy: float + The estimate of the entropy using the formula for a multi-variate + Gaussian distribution + cov: array + The covariance matrix estimated from the data and used to calculate + the entropy + + Example + ------- + >>> gaussianentropy, cov = entropy_time_series(data) + """ + # number of time points + n = data.shape[1] + # first find correlation matrix (the normalised covariance) because + # it is more robust numerically + corr = np.zeros((n, n)) + # deviation from mean over replicates at each time point for each replicate + ddev = data - np.nanmean(data, axis=0).reshape((1, n)) + for i in np.arange(n): + for j in np.arange(i + 1, n): + dsi = ddev[:, i] / np.sqrt(np.nansum(ddev[:, i] ** 2)) + dsj = ddev[:, j] / np.sqrt(np.nansum(ddev[:, j] ** 2)) + corr[i, j] = np.nansum(dsi * dsj) + corr = corr + corr.T + np.diag(np.ones(data.shape[1])) + # # fix if poorly defined + # if np.min(np.linalg.eig(corr)[0]) < 0: + # # use Higham's method + # # https://nhigham.com/2013/02/13/the-nearest-correlation-matrix/ + # # implemented by Mike Croucher + # print("Numerical problems - using the nearest correlation matrix") + # corr = nearcorr( + # corr, max_iterations=1000, except_on_too_many_iterations=False + # ) + # find covariance matrix + smat = np.diag(np.nanstd(data, axis=0)) + cov = np.matmul(smat, np.matmul(corr, smat)) + if True: + cov = np.zeros((n, n)) + for i in np.arange(n): + for j in np.arange(i + 1, n): + cov[i, j] = np.nanmean(ddev[:, i] * ddev[:, j]) + cov = cov + cov.T + np.diag(np.nanvar(data, axis=0)) + # find determinant using cholesky decomposition + if not isPD(cov): + print("Using nearestPD") + cov = nearestPD(cov) + try: + ell = np.linalg.cholesky(2 * np.pi * np.e * cov) + gaussianentropy = np.sum(np.log(np.diag(ell))) + return gaussianentropy, cov + except np.linalg.LinAlgError: + print("Error: the covariance matrix is not positive definite") + return np.nan, np.nan + + +def mynearestPD(a): + """ + From N Higham + https://nhigham.com/2021/01/26/what-is-the-nearest-positive-semidefinite-matrix/ + """ + # a = [[0, 1, 0, 0, 0], + # [0, 0, 1, 0, 0], + # [0, 0, 0, 1, 0], + # [0, 0, 0, 0, 1], + # [0, 0, 0, 0, 0]] + a = np.asarray(a) + b = (a + a.T) / 2 + d, q = np.linalg.eig(b) + d[d < 0] = 0 + x = q.dot(d.reshape((d.size, 1)) * q.T) + return (x + x.T) / 2 + + +def nearestPD(A): + """Find the nearest positive-definite matrix to input + + A Python/Numpy port of John D'Errico's `nearestSPD` MATLAB code [1], which + credits [2]. + + [1] https://www.mathworks.com/matlabcentral/fileexchange/42885-nearestspd + + [2] N.J. Higham, "Computing a nearest symmetric positive semidefinite + matrix" (1988): https://doi.org/10.1016/0024-3795(88)90223-6 + """ + B = (A + A.T) / 2 + _, s, V = np.linalg.svd(B) + H = np.dot(V.T, np.dot(np.diag(s), V)) + A2 = (B + H) / 2 + A3 = (A2 + A2.T) / 2 + if isPD(A3): + return A3 + spacing = np.spacing(np.linalg.norm(A)) + # The above is different from [1]. It appears that MATLAB's `chol` Cholesky + # decomposition will accept matrixes with exactly 0-eigenvalue, whereas + # Numpy's will not. So where [1] uses `eps(mineig)` (where `eps` is Matlab + # for `np.spacing`), we use the above definition. 
CAVEAT: our `spacing` + # will be much larger than [1]'s `eps(mineig)`, since `mineig` is usually on + # the order of 1e-16, and `eps(1e-16)` is on the order of 1e-34, whereas + # `spacing` will, for Gaussian random matrixes of small dimension, be on + # othe order of 1e-16. In practice, both ways converge, as the unit test + # below suggests. + Imatrix = np.eye(A.shape[0]) + k = 1 + while not isPD(A3): + mineig = np.min(np.real(np.linalg.eigvals(A3))) + A3 += Imatrix * (-mineig * k**2 + spacing) + k += 1 + return A3 + + +def isPD(B): + """Returns true when input is positive-definite, via Cholesky""" + try: + _ = np.linalg.cholesky(B) + return True + except np.linalg.LinAlgError: + return False diff --git a/find_cytokinesis_diane.py b/find_cytokinesis_diane.py index 069fbe09f9add9a6506803a8d26a452fb530382d..8866f5aa7a13c334390eea7cb9bdb13445f7b004 100644 --- a/find_cytokinesis_diane.py +++ b/find_cytokinesis_diane.py @@ -25,9 +25,7 @@ def by_thresh(grdiff, bgr, start, thresh, max_thresh): return np.nan -def growth_rate_crossing_mean( - d_gr, m_gr, start, end, ofs=2, lb=2, ub=15 -): +def growth_rate_crossing_mean(d_gr, m_gr, start, end, ofs=2, lb=2, ub=15): """ Find cytokinesis by growth rate crossing within a range of thresholds. diff --git a/growth_rate.py b/growth_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..cae9ee110b1c3da7b96008ee94ac45275d34ee4a --- /dev/null +++ b/growth_rate.py @@ -0,0 +1,211 @@ +import pickle +from pathlib import Path + +import matplotlib.pylab as plt +import numpy as np +import pandas as pd +import gaussianprocessderivatives as gp + + +def find_gr( + volume_df, + buddings_df=None, + bounds=None, + noruns=10, + verbose=True, + max_no_cells=None, +): + """ + Use a Gaussian process to estimate the single-cell growth rates. 
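+
+    Parameters
+    ----------
+    volume_df: dataframe
+        A wide dataframe of volumes, with cells as rows and time points,
+        assumed to be in minutes, as columns.
+    buddings_df: dataframe, optional
+        A wide dataframe marking budding events. If given, growth rates
+        are estimated for buds; if None, for mothers.
+    bounds: dict, optional
+        Bounds on the hyperparameters of the Matern Gaussian process,
+        which update the default bounds.
+    noruns: integer, optional
+        The number of runs used when fitting the hyperparameters.
+    verbose: boolean, optional
+        If True, print the fitted hyperparameters and plot the fit for
+        each time series.
+    max_no_cells: integer, optional
+        If given, the maximum number of cells to analyse.
+
+    Returns
+    -------
+    res_df: dict of dataframes
+        The smoothed volumes, growth rates, time derivatives of the growth
+        rates, and their variances, with one dataframe, indexed like
+        volume_df, for each signal.
+
+    Example
+    -------
+    With dl an instance of dataloader, for example,
+
+    >>> res_b = find_gr(dl.wide_df("bud_volume"), dl.wide_df("births"))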
+ """ + # default bounds on hyperparameters + # Julian_bounds = {0: (-2, 3), 1: (-2, 0), 2: (0, 3)} + if buddings_df is None: + # mother params + def_bounds = {0: (2, 10), 1: (2, 5), 2: (-4, 8)} + else: + # bud params + def_bounds = {0: (2, 10), 1: (-1, 5), 2: (-4, 8)} + if bounds: + # merge with default bounds + bounds = {**def_bounds, **bounds} + else: + # use default bounds + bounds = def_bounds + + # get data from dataframe + volume = volume_df.to_numpy() + t = volume_df.columns.to_numpy() + # convert to hours so that growth rate has the correct units + t = t / 60 + + # signals to be found + if buddings_df is None: + signals = [ + "smoothed_volume", + "mother_growth_rate", + "d_mother_growth_rate", + ] + else: + buddings = buddings_df.to_numpy() + signals = [ + "smoothed_bud_volume", + "bud_growth_rate", + "d_bud_growth_rate", + ] + signals += [s + "_var" for s in signals] + + # store results in a dictionary + res = {s: np.nan * np.ones(volume.shape) for s in signals} + if max_no_cells: + no_cells = np.min([volume.shape[0], max_no_cells]) + else: + no_cells = volume.shape[0] + # loop over cells + for idx in range(no_cells): + ivol = volume[idx, :] + if buddings_df is None: + # data is a single time series + ires = runGP( + t, + ivol, + bounds, + noruns, + signals, + verbose, + f"time series {idx:d} / {no_cells-1:d}", + ) + if ires: + # store results as arrays + for s in signals: + res[s][idx, :] = ires[s] + else: + # time series comprises bud trajectories + bs = np.concatenate(([0], np.where(buddings[idx, :])[0], [t.size])) + for j, (start, end) in enumerate(zip(bs, bs[1:])): + # data for one bud + x = t[start:end] + y = ivol[start:end] + # run GP + ires = runGP( + x, + y, + bounds, + noruns, + signals, + verbose, + f"time series {idx:d} / {no_cells-1:d}: bud {j:d} /" + f" {bs.size-2:d}", + ) + if ires: + # store results as arrays + for s in signals: + res[s][idx, start:end] = ires[s] + + # convert results into dataframes + res_df = { + s: pd.DataFrame( + res[s], index=volume_df.index, columns=volume_df.columns + ) + for s in signals + } + return res_df + + +### + + +def runGP(x, y, bounds, noruns, signals, verbose, title, maxnofigs=70): + """ + Run GP on a single time series. + Results returned as a dictionary. + """ + # drop NaNs + i_nan = np.isnan(y) + x = x[~i_nan] + y = y[~i_nan] + # run GP + if np.any(y): + if verbose: + print(title) + # initiate GP + mg = gp.maternGP(bounds, x, y) + mg.findhyperparameters(noruns=noruns) + if verbose: + # print hyperparameters + mg.results() + print() + # predict smoothed data + mg.predict(x, derivs=2) + if verbose and len(plt.get_fignums()) < maxnofigs: + plt.figure() + plt.suptitle(title) + plt.subplot(2, 1, 1) + mg.sketch(".") + plt.ylabel("volume") + plt.subplot(2, 1, 2) + mg.sketch(".", derivs=1) + plt.ylabel("d/dt volume") + plt.xlabel("time") + plt.show() + res = dict( + zip( + signals, + reNaN( + [mg.f, mg.df, mg.ddf, mg.fvar, mg.dfvar, mg.ddfvar], i_nan + ), + ) + ) + else: + # no data + if verbose: + print(title + "\n" + "data all NaN") + res = None + return res + + +### + + +def reNaN(oldlist, i_nan): + """ + Add back NaN dropped from an array. + Takes a list of arrays and puts back NaN for each array + at indices i_nan. + """ + if np.any(i_nan): + newlist = [] + for x in oldlist: + y = np.nan * np.ones(i_nan.size) + y[~i_nan] = x + newlist.append(y) + return newlist + else: + return oldlist + + +### + + +def addgrfrompkl(dl, pkldir): + """ + Load growth rate from pickle files and add to and save dataframe + in a dataloader instance. 
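+
+    The pickle files are assumed to be named dataname_res_m.pkl and
+    dataname_res_b.pkl, as generated by run_gr.py, and each to contain a
+    dictionary of results - one for mothers and one for buds - from find_gr.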
+ + Arguments + --------- + dl : dataloader instance + pkldir: string + The directory storing the pkl files. + """ + res_m = pickle.load( + open(str(Path(pkldir) / (dl.dataname + "_res_m.pkl")), "rb") + ) + res_b = pickle.load( + open(str(Path(pkldir) / (dl.dataname + "_res_b.pkl")), "rb") + ) + # add to dataframe + for res in [res_b, res_m]: + for signal in res: + tdf = dl._long_df(res[signal], signal) + dl.df = pd.merge(dl.df, tdf, on=["id", "time"], how="left") diff --git a/plotting.py b/plotting.py index 15b07ce596f29ac014ae8d3090a7b7b425c8245a..c4f616ee64ae12f8650df796d2348912e9606349 100644 --- a/plotting.py +++ b/plotting.py @@ -17,7 +17,6 @@ def kymograph( figsize=(6, 10), title=None, returnfig=False, - ): if hue == "births": cmap = "Greys" @@ -107,8 +106,8 @@ def plot_lineage( signals=["volume", "growth_rate"], show=True, figsize=(10, 5), - plot_cyto_pts=True, - plot_birth_pts=True, + cyto_pts_signal=None, + plot_budding_pts=True, plot_G1=False, ): """ @@ -128,7 +127,7 @@ def plot_lineage( If True, display figure. figsize: tuple of two floats The size of figure, eg (10, 5) - plot_birth_pts: boolean + plot_budding_pts: boolean If True, plot births as black dots. plot_cyto_pts: boolean If True, plot estimated points of cytokinesis as purple dots. @@ -139,13 +138,20 @@ def plot_lineage( raise Exception("idx not part of dataframe") signals = gu.makelist(signals) nosubplots = len(signals) - # show births if possible + # show buddingss if possible + if "buddings" in df.columns: + buddings = df[df.id == idx]["buddings"].to_numpy() + b_pts = np.where(buddings)[0] if "births" in df.columns: - births = df[df.id == idx]["births"].to_numpy() - b_pts = np.where(births)[0] + buddings = df[df.id == idx]["births"].to_numpy() + b_pts = np.where(buddings)[0] + if len(b_pts) == 1: + nb_pts = np.concatenate((b_pts, [len(buddings) - 1])) + else: + nb_pts = b_pts # show cytokinesis point if possible - if "cyto_pts" in df.columns: - cyto = df[df.id == idx]["cyto_pts"].to_numpy() + if cyto_pts_signal and cyto_pts_signal in df.columns: + cyto = df[df.id == idx][cyto_pts_signal].to_numpy() cyto_pts = np.where(cyto)[0] # show G1 points if possible if "G1" in df.columns: @@ -158,15 +164,16 @@ def plot_lineage( t = t / 60 # generate figure fig = plt.figure(figsize=figsize) + # index for subplot splt = 1 plt.suptitle(idx) # plot signals for signal in signals: - if signal == "volume": + if "volume" in signal: s = df[df.id == idx]["bud_volume"].to_numpy() mvol = df[df.id == idx]["volume"].to_numpy() plt.subplot(nosubplots, 1, splt) - for start, end in zip(b_pts, b_pts[1:]): + for start, end in zip(nb_pts, nb_pts[1:]): # plot bud volume plt.plot(t[start:end], s[start:end], ".-") # plot mother volume @@ -174,13 +181,14 @@ def plot_lineage( plt.ylabel("volume") plt.grid() splt += 1 + # for showing G1 g1s = mvol g1col = "k" elif signal == "growth_rate": s = df[df.id == idx]["bud_growth_rate"].to_numpy() mgr = df[df.id == idx]["mother_growth_rate"].to_numpy() plt.subplot(nosubplots, 1, splt) - for start, end in zip(b_pts, b_pts[1:]): + for start, end in zip(nb_pts, nb_pts[1:]): # plot bud growth rate plt.plot(t[start:end], s[start:end], ".-") # plot mother growth rate @@ -188,6 +196,7 @@ def plot_lineage( plt.ylabel("growth rate") plt.grid() splt += 1 + # for showing G1 g1s = mgr g1col = "k" else: @@ -197,18 +206,19 @@ def plot_lineage( plt.ylabel(signal) plt.grid() splt += 1 + # for showing G1 g1s = s g1col = p[0].get_color() - # plot births + # plot buddings if ( - plot_birth_pts - and "births" in df.columns + 
plot_budding_pts + and ("births" in df.columns or "buddings" in df.columns) and signal not in ["volume", "growth_rate"] ): for bpt in b_pts: plt.plot(t[bpt], s[bpt], "k.") # plot point of cytokinesis - if plot_cyto_pts and "cyto_pts" in df.columns: + if cyto_pts_signal and cyto_pts_signal in df.columns: for cpt in cyto_pts: plt.plot(t[cpt], s[cpt], "mo") # plot G1 diff --git a/pyproject.toml b/pyproject.toml index 8c9ec3a8203fb9cae89b9635614f0033aafffa6c..f561940d9f1f53bc3f5446e50e50371aa32833ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ matplotlib = "^3.5.2" [tool.poetry.dev-dependencies] black = "^22.3.0" +[tool.black] +line-length = 79 [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/routines/heatmap.py b/routines/heatmap.py index e0f7b629cbd97d8b94758a7b5f640703e65823d2..316db958e188ae1860f46eb54906891ca82d7c4f 100644 --- a/routines/heatmap.py +++ b/routines/heatmap.py @@ -96,7 +96,9 @@ class _HeatmapPlotter(BasePlotter): interpolation="none", ) # Draw colour bar - ax.figure.colorbar(mappable=trace_heatmap, cax=cax, ax=ax, label=self.cbarlabel) + ax.figure.colorbar( + mappable=trace_heatmap, cax=cax, ax=ax, label=self.cbarlabel + ) def heatmap( diff --git a/run_gr.py b/run_gr.py new file mode 100644 index 0000000000000000000000000000000000000000..3d76025c20ae33051b7bf49afdbf81e53bc2d66f --- /dev/null +++ b/run_gr.py @@ -0,0 +1,49 @@ +import matplotlib.pylab as plt +import pickle +import genutils as gu +from mywela.dataloader import dataloader +from mywela.growth_rate import find_gr, addgrfrompkl + + +datadir = "/Users/pswain/ECDF/Swainlab/aliby_datasets/Arin" +dataname = "26643_2022_05_23_flavin_htb2_glucose_20gpL_01_00" +# dataname = "25681_2022_04_30_flavin_htb2_glucose_10mgpL_01_00" + +max_no_cells = None +use_tsv = True + +pxsize = 0.182 +if max_no_cells is None: + save = True +else: + save = False + +# get data +dl = dataloader( + datadir, outdir="/Users/pswain/Dropbox/wip/uscope_analyses/tsv_data" +) +dl.load(dataname, use_tsv=use_tsv) + +# data +b_vol = dl.wide_df("bud_volume") +m_vol = dl.wide_df("volume") +births = dl.wide_df("births") + +# bud growth rate +res_b = find_gr(b_vol, births, max_no_cells=max_no_cells, bounds={0: (0, 8)}) +if save: + pickle.dump(res_b, open(dataname + "_res_b.pkl", "wb")) + gu.figs2pdf(dataname + "_b.pdf") + plt.close("all") + +# mother growth rate +res_m = find_gr(m_vol, max_no_cells=max_no_cells) +if save: + pickle.dump(res_m, open(dataname + "_res_m.pkl", "wb")) + gu.figs2pdf(dataname + "_m.pdf") + plt.close("all") + +# add to dataframe +if save: + addgrfrompkl(dl, ".") + dl.save(dataname) diff --git a/search_OMERO.py b/search_OMERO.py new file mode 100644 index 0000000000000000000000000000000000000000..59f382bd0cbc238bbadb04315acb3ebea2cc1779 --- /dev/null +++ b/search_OMERO.py @@ -0,0 +1,43 @@ +from aliby.utils.argo import Argo +import pandas as pd + + +def search_OMERO(startdate, tags=None): + """ + Search OMERO database for experiments. + + Arguments + --------- + startdate : tuple + Earliest date from which to search for experiments, + such as (2019, 10, 30) + tags: string + List of strings to select experiments. 
+ + Returns + ------- + resdf: Pandas dataframe + """ + argo = Argo( + host="staffa.bio.ed.ac.uk", + user="upload", + password="gothamc1ty", + min_date=startdate, + ) + argo.load() + if tags: + argo.tags(tags) + + tups = [ + ( + d.getId(), + d.getName(), + d.getDate().strftime("%x"), + d.getDescription(), + ) + for d in argo.dsets + ] + resdf = pd.DataFrame( + tups[::-1], columns=["ID", "Name", "Date", "Description"] + ) + return resdf diff --git a/tests/autocrosscorr/test_autocrosscorr_arin.py b/tests/autocrosscorr/test_autocrosscorr_arin.py index b9ac637b87991933fe921f33fd93325b92675cb4..8a823154e535e8fb8f3c6f4e31252eb914c1734d 100644 --- a/tests/autocrosscorr/test_autocrosscorr_arin.py +++ b/tests/autocrosscorr/test_autocrosscorr_arin.py @@ -5,4 +5,4 @@ import pandas as pd df = pd.read_csv("by4741_omero20016.csv") v = df.drop("cellID", axis=1).to_numpy() ac, lags = autocrosscorr(v) -plot_replicate_array(ac, t=lags*5/60, xlabel="lag", ylabel="correlation") +plot_replicate_array(ac, t=lags * 5 / 60, xlabel="lag", ylabel="correlation") diff --git a/tests/autocrosscorr/test_autocrosscorr_birthdeath.py b/tests/autocrosscorr/test_autocrosscorr_birthdeath.py index 6fd236e1479624d81d2327f7eeb10e90bd13fa17..0b929e1f034e91c6b8d9fc2482d530f7b9f1a3ff 100644 --- a/tests/autocrosscorr/test_autocrosscorr_birthdeath.py +++ b/tests/autocrosscorr/test_autocrosscorr_birthdeath.py @@ -1,10 +1,9 @@ import numpy as np import matplotlib.pyplot as plt import stochpy -from mywela.autocrosscorr import autocrosscorr +from wela.autocrosscorr import autocrosscorr - -smod = stochpy.SSA(model_file='birth_death.psc', dir='.') +smod = stochpy.SSA(model_file="birth_death.psc", dir=".") # birth rate k0 = 5 @@ -14,19 +13,19 @@ d0 = 0.05 tf = 1500 # load the stochastic birth-death model -smod.ChangeParameter('k', k0) -smod.ChangeParameter('d', d0) +smod.ChangeParameter("k", k0) +smod.ChangeParameter("d", d0) if False: # simulate stochastically - smod.DoStochSim(end=tf, mode='time', trajectories=3, quiet=False) + smod.DoStochSim(end=tf, mode="time", trajectories=3, quiet=False) # plot the results smod.PlotSpeciesTimeSeries() plt.xlim([0, tf]) plt.show() # run many simulations -smod.DoStochSim(end=tf, mode='time', trajectories=500, quiet=False) +smod.DoStochSim(end=tf, mode="time", trajectories=500, quiet=False) # put the trajectories on a grid - a matrix - with regularly spaced time points ns = 1000 @@ -37,10 +36,10 @@ smod.GetRegularGrid(n_samples=ns) data = np.array(smod.data_stochsim_grid.species[0]).astype("float") # autocorrelation -ac, lags = autocrosscorr(data[:, int(ns/2):]) +ac, lags = autocrosscorr(data[:, int(ns / 2) :]) plt.figure() -t = dt*np.arange(int(ns/2)) -plt.plot(dt*lags, np.mean(ac, axis=0), "b-") -plt.plot(t, np.exp(-t*d0), 'k--') +t = dt * np.arange(int(ns / 2)) +plt.plot(dt * lags, np.mean(ac, axis=0), "b-") +plt.plot(t, np.exp(-t * d0), "k--") plt.grid() plt.show() diff --git a/tests/autocrosscorr/test_autocrosscorr_sawtooth.py b/tests/autocrosscorr/test_autocrosscorr_sawtooth.py index 98d737ec1ac79fc46821c68401ff13fe0be1b68b..573aad3e44cba9b04e4d9b84538e0d3eb15d9010 100644 --- a/tests/autocrosscorr/test_autocrosscorr_sawtooth.py +++ b/tests/autocrosscorr/test_autocrosscorr_sawtooth.py @@ -1,17 +1,21 @@ import numpy as np from scipy import signal -from mywela.autocrosscorr import autocrosscorr -from mywela.plotting import plot_replicate_array +from wela.autocrosscorr import autocrosscorr +from wela.plotting import plot_replicate_array nr = 200 nt = 500 period = 8 -t = np.linspace(0, period*4, nt) +t 
= np.linspace(0, period * 4, nt) ts = np.tile(t, nr).reshape((nr, nt)) -y = signal.sawtooth(t * 2 * np.pi / period + 2*np.pi*np.random.rand(nr, 1)) +y = signal.sawtooth(t * 2 * np.pi / period + 2 * np.pi * np.random.rand(nr, 1)) data = 10 + y + np.random.normal(0, 0.3, y.shape) # autocorrelation -ac, lags = autocrosscorr(data, normalised=True) -plot_replicate_array(ac, t=lags*np.median(np.diff(t))/period, xlabel="lag in periods", - ylabel="correlation") +ac, lags = autocrosscorr(data, stationary=False) +plot_replicate_array( + ac, + t=lags * np.median(np.diff(t)) / period, + xlabel="lag in periods", + ylabel="correlation", +) diff --git a/tests/autocrosscorr/test_autocrosscorr_sine.py b/tests/autocrosscorr/test_autocrosscorr_sine.py index 3616df4b1cb3ad859289c47424021c403091e891..6996a481d178fde6df1ed1a4383ff9403a0007c6 100644 --- a/tests/autocrosscorr/test_autocrosscorr_sine.py +++ b/tests/autocrosscorr/test_autocrosscorr_sine.py @@ -1,16 +1,24 @@ import numpy as np -from mywela.autocrosscorr import autocrosscorr -from mywela.plotting import plot_replicate_array +from wela.autocrosscorr import autocrosscorr +from wela.plotting import plot_replicate_array nr = 200 nt = 500 +addrandomphase = False period = 8 -t = np.linspace(0, period*4, nt) +t = np.linspace(0, period * 4, nt) ts = np.tile(t, nr).reshape((nr, nt)) -y = 3*np.sin(2*np.pi*ts/period + 2*np.pi*np.random.rand(nr, 1)) +if addrandomphase: + y = 3 * np.sin(2 * np.pi * ts / period + 2 * np.pi * np.random.rand(nr, 1)) +else: + y = 3 * np.sin(2 * np.pi * ts / period) data = 10 + y + np.random.normal(0, 0.3, y.shape) # autocorrelation -ac, lags = autocrosscorr(data, normalised=True) -plot_replicate_array(ac, t=lags*np.median(np.diff(t))/period, xlabel="lag in periods", - ylabel="correlation") +ac, lags = autocrosscorr(data, stationary=True) +plot_replicate_array( + ac, + t=lags * np.median(np.diff(t)) / period, + xlabel="lag in periods", + ylabel="correlation", +) diff --git a/tests/autocrosscorr/test_autocrosscorr_x.py b/tests/autocrosscorr/test_autocrosscorr_x.py index 2cdbc576ce826301e2b21958943db049e9a3c61c..826bfb03c83e07e4dc29805a8e3c223ce12d2c8f 100644 --- a/tests/autocrosscorr/test_autocrosscorr_x.py +++ b/tests/autocrosscorr/test_autocrosscorr_x.py @@ -7,13 +7,13 @@ nr = 1000 nt = 500 period = 8 noise_sig = 0.01 -t = np.linspace(0, period*4, nt) +t = np.linspace(0, period * 4, nt) ts = np.tile(t, nr).reshape((nr, nt)) -final_t = 2*np.pi*ts/period + np.pi/4*np.random.rand(nr, 1) -y = 3*np.sin(final_t) +final_t = 2 * np.pi * ts / period + np.pi / 4 * np.random.rand(nr, 1) +y = 3 * np.sin(final_t) s_sin = 10 + y + np.random.normal(0, noise_sig, y.shape) -z = 3*np.cos(final_t) +z = 3 * np.cos(final_t) s_cos = 10 + z + np.random.normal(0, noise_sig, z.shape) # correlation @@ -23,22 +23,31 @@ sc, lags = autocrosscorr(s_sin, s_cos) # cosine is delayed by period/4 = pi/2 compared to sine plt.figure() plt.subplot(3, 1, 1) -plt.plot(t/period, np.sin(2*np.pi*t/period), label="sine") -plt.plot(t/period, np.cos(2*np.pi*t/period), label="cosine") +plt.plot(t / period, np.sin(2 * np.pi * t / period), label="sine") +plt.plot(t / period, np.cos(2 * np.pi * t / period), label="cosine") plt.legend() plt.grid() plt.xlim([0, 1]) plt.subplot(3, 1, 2) # peaks at 0.25 -plot_replicate_array(cs, t=lags*np.median(np.diff(t))/period, - ylabel="correlation", title="<cos * sin>", - show=False) +plot_replicate_array( + cs, + t=lags * np.median(np.diff(t)) / period, + ylabel="correlation", + title="<cos * sin>", + show=False, +) plt.xlim([-1, 1]) 
plt.subplot(3, 1, 3) # peaks at -0.25 -plot_replicate_array(sc, t=lags*np.median(np.diff(t))/period, - xlabel="lag in periods", ylabel="correlation", - title="<sin * cos>", show=False) +plot_replicate_array( + sc, + t=lags * np.median(np.diff(t)) / period, + xlabel="lag in periods", + ylabel="correlation", + title="<sin * cos>", + show=False, +) plt.xlim([-1, 1]) plt.tight_layout() plt.show() diff --git a/tests/cytokinesis/test_cytokinesis.py b/tests/cytokinesis/test_cytokinesis.py index 49d990aa7e00d088b592292fc1d2f9a7e25cf76e..789d7ff5f96990bf74a1d491a6cb7ed822590bcc 100644 --- a/tests/cytokinesis/test_cytokinesis.py +++ b/tests/cytokinesis/test_cytokinesis.py @@ -20,7 +20,9 @@ from mywela.plotting import plot_lineage dl = dataloader( - "/Users/pswain/ecdf/Swainlab/aliby_datasets/Alan", "../../tsv_data", ls=False + "/Users/pswain/ecdf/Swainlab/aliby_datasets/Alan", + "../../tsv_data", + ls=False, ) # BY4741 Myo1-GFP Whi5-mCherry ; BY4741 Sfp1-GFP Nhp6A-mCherry # switches every six hours @@ -66,9 +68,9 @@ def corr(res): # local definitions pred_cyt = res[:, 1][withinprior] true_cyt = res[:, 3][withinprior] - corr = np.corrcoef(true_cyt[~np.isnan(pred_cyt)], pred_cyt[~np.isnan(pred_cyt)])[ - 0, 1 - ] + corr = np.corrcoef( + true_cyt[~np.isnan(pred_cyt)], pred_cyt[~np.isnan(pred_cyt)] + )[0, 1] return corr, pred_cyt, true_cyt @@ -82,7 +84,16 @@ for thresh in [2]: res = [] # predictions for idx in dl.ids: - (myo1, t, bgr, bgrvar, mgr, mgrvar, buddings, true_cyto,) = dl.get_lineage_data( + ( + myo1, + t, + bgr, + bgrvar, + mgr, + mgrvar, + buddings, + true_cyto, + ) = dl.get_lineage_data( idx, [ "max5px_med", @@ -121,7 +132,9 @@ for thresh in [2]: lres.append([np.nan, np.nan]) else: # absolute time and time wrt budding event - lres.append([t[start + pred], t[start + pred] - t[start]]) + lres.append( + [t[start + pred], t[start + pred] - t[start]] + ) # store true cytokinesis point if tpk < start + offset or tpk > end - offset: # without prior on time of cytokinesis @@ -144,14 +157,18 @@ for thresh in [2]: ) plt.fill_between( lt, - bgr[start:end] / norm - np.sqrt(bgrvar[start:end]) / norm, - bgr[start:end] / norm + np.sqrt(bgrvar[start:end]) / norm, + bgr[start:end] / norm + - np.sqrt(bgrvar[start:end]) / norm, + bgr[start:end] / norm + + np.sqrt(bgrvar[start:end]) / norm, alpha=0.3, ) plt.fill_between( lt, - mgr[start:end] / norm - np.sqrt(mgrvar[start:end]) / norm, - mgr[start:end] / norm + np.sqrt(mgrvar[start:end]) / norm, + mgr[start:end] / norm + - np.sqrt(mgrvar[start:end]) / norm, + mgr[start:end] / norm + + np.sqrt(mgrvar[start:end]) / norm, alpha=0.3, ) plt.plot( diff --git a/tsa_inprogress.py b/tsa_inprogress.py new file mode 100644 index 0000000000000000000000000000000000000000..2a844f9c5c56884377027028e839ba6d839f75ab --- /dev/null +++ b/tsa_inprogress.py @@ -0,0 +1,132 @@ +import numpy as np +import pandas as pd +from collections import namedtuple + + +### + + +def toarray(y): + if type(y) == pd.core.frame.DataFrame: + return y.to_numpy() + else: + return y + + +def todf(y, exampledf): + if type(y) == pd.core.frame.DataFrame or exampledf is None: + return y + else: + return pd.DataFrame( + y, index=exampledf.index, columns=exampledf.columns + ) + + +### + + +def lbootstrap(data, statistic, noresamples=100, *args, **kwargs): + """ + Uses statistical bootstrapping to estimate errors in a statistic. 
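+
+    The rows of data - the replicates - are resampled with replacement,
+    the statistic is recalculated for each of the noresamples resampled
+    data sets, and the spread of these values estimates the error in the
+    statistic found from the original data.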
+ + Parameters + ---------- + data: array + A design matrix of data, with features as columns and replicates + as rows + statistic: function returning a float + Returns the desired statistic + noresamples: integer + The number of bootstrapped samples generated (by resampling with + replacement) + args, kwargs: + Any additional arguments required by the statistic function + + Returns + ------- + res: a named tuple with fields + median: float + The median of the statistic calculated from the resampled data sets + mean: float + The mean of the statistic + std: float + The standard deviation of the statistic + cf: list of two floats + The 95% confidence interval + iqr: float + The interquartile range + """ + if data.ndim == 1: + data = data.reshape(data.size, -1) + nosamples = data.shape[0] + rsams = np.random.choice(nosamples, size=(nosamples, noresamples)) + stats = [ + statistic(data[rsams[:, i], :], *args, **kwargs) + for i in np.arange(noresamples) + ] + Res = namedtuple("Res", ["cf", "std", "iqr", "median", "mean", "orig"]) + return Res( + [np.nanquantile(stats, 0.025), np.nanquantile(stats, 0.975)], + np.nanstd(stats), + np.nanquantile(stats, 0.75) - np.nanquantile(stats, 0.25), + np.nanmedian(stats), + np.nanmean(stats), + statistic(data), + ) + + +### + + +def entropy_time_series(data): + """ + Estimates the noise in a collection of time series by calculating + the entropy assuming that the data obey a multivariate normal + distribution. + + Uses the Ledoit-Wolf estimator of the covariance if the empirical + covariance is not positive definite. + + Arguments + --------- + data: array + An array of time series with replicates as rows and time points + as columns. + + Returns + ------- + gaussianentropy: float + An estimate of the entropy. + cov: array + An estimate of the covariance matrix. + """ + # number of time points + n = data.shape[1] + # deviation from mean over replicates at each time point for each replicate + ddev = data - np.nanmean(data, axis=0).reshape((1, n)) + # find covariance matrix + cov = np.zeros((n, n)) + for i in np.arange(n): + for j in np.arange(i + 1, n): + cov[i, j] = np.nanmean(ddev[:, i] * ddev[:, j]) + cov = cov + cov.T + np.diag(np.nanvar(data, axis=0)) + # find determinant using cholesky decomposition + if np.any(np.linalg.eigvals(cov) < 0): + print("Using Ledoit-Wolf estimator") + from sklearn.covariance import LedoitWolf + + cleandata = data[~np.isnan(data).any(axis=1), :] + print( + " dropping", + data.shape[0] - cleandata.shape[0], + "time series with NaNs", + ) + print(" keeping", cleandata.shape[0], "time series") + cov = LedoitWolf().fit(cleandata).covariance_ + try: + ell = np.linalg.cholesky(2 * np.pi * np.e * cov) + gaussianentropy = np.sum(np.log(np.diag(ell))) + return gaussianentropy, cov + except np.linalg.LinAlgError: + print("Error: the covariance matrix is not positive definite") + return np.nan, np.nan