From 4ad4e2a85565355989cb44205148aa5c856d5f3d Mon Sep 17 00:00:00 2001
From: Peter Swain <peter.swain@ed.ac.uk>
Date: Fri, 1 Jul 2022 18:10:53 +0100
Subject: [PATCH] feat: added negative lags to autocrosscorr to enable
 cross-correlation

---
 autocrosscorr.py        | 151 ++++++++++++
 dataloader.py           | 493 ++++++++++++++++++++++++++++++++++++++++
 time_series_analysis.py |  63 -----
 3 files changed, 644 insertions(+), 63 deletions(-)
 create mode 100644 autocrosscorr.py
 create mode 100644 dataloader.py
 delete mode 100644 time_series_analysis.py

diff --git a/autocrosscorr.py b/autocrosscorr.py
new file mode 100644
index 0000000..80687f2
--- /dev/null
+++ b/autocrosscorr.py
@@ -0,0 +1,151 @@
+import numpy as np
+
+
+def autocrosscorr(
+    yA,
+    yB=None,
+    normalised=True,
+    only_pos=False,
+):
+    """
+    Calculates normalised auto- or cross-correlations as a function of lag.
+
+    Lags are given in multiples of the time interval between data points.
+
+    Normalisation is by the product of the standard deviation over time for
+    each replicate of each variable.
+
+    For the cross-correlation between yA and yB, the peak closest to zero
+    lag should be at negative lags if yA is delayed relative to yB and at
+    positive lags if yA is advanced relative to yB.
+
+    Parameters
+    ----------
+    yA: array
+        An array of signal values, with each row a replicate measurement
+        and each column a time point.
+    yB: array (required for cross-correlation only)
+        An array of signal values, with each row a replicate measurement
+        and each column a time point.
+    normalised: boolean (optional)
+        If True, normalise the result for each replicate by the standard
+        deviation over time for that replicate.
+    only_pos: boolean (optional)
+        If True, return results only for positive lags.
+
+    Returns
+    -------
+    corr: array
+        An array of the correlations with each row the result for the
+        corresponding replicate and each column a lag.
+    lags: array
+        A 1D array of the lags in multiples of the time interval between
+        data points.
+
+    Example
+    -------
+    >>> import matplotlib.pyplot as plt
+    >>> import numpy as np
+
+    Define a sine signal with 200 time points and 333 replicates
+
+    >>> t = np.linspace(0, 4, 200)
+    >>> ts = np.tile(t, 333).reshape((333, 200))
+    >>> s = 3*np.sin(2*np.pi*ts + 2*np.pi*np.random.rand(333, 1))
+
+    Find and plot the autocorrelation
+
+    >>> ac, lags = autocrosscorr(s)
+    >>> plt.figure()
+    >>> plt.plot(lags, np.nanmean(ac, axis=0))
+    >>> plt.show()
+    """
+    # number of replicates
+    nr = yA.shape[0]
+    # number of time points
+    nt = yA.shape[1]
+    # deviation from the mean, where the mean is calculated over replicates
+    # at each time point, which allows for non-stationary behaviour
+    dyA = yA - np.nanmean(yA, axis=0).reshape((1, nt))
+    # standard deviation over time for each replicate
+    stdA = np.sqrt(np.nanmean(dyA**2, axis=1).reshape((nr, 1)))
+    if yB is not None:
+        # cross correlation
+        dyB = yB - np.nanmean(yB, axis=0).reshape((1, nt))
+        stdB = np.sqrt(np.nanmean(dyB**2, axis=1).reshape((nr, 1)))
+    else:
+        # auto correlation
+        dyB = dyA
+        stdB = stdA
+    # calculate correlation
+    # lag r runs over positive lags
+    pos_corr = np.nan * np.ones(yA.shape)
+    for r in range(nt):
+        prods = [dyA[:, t] * dyB[:, t + r] for t in range(nt - r)]
+        pos_corr[:, r] = np.nanmean(prods, axis=0)
+    # lag r runs over negative lags
+    # use corr_AB(-r) = corr_BA(r)
+    neg_corr = np.nan * np.ones(yA.shape)
+    for r in range(nt):
+        prods = [dyB[:, t] * dyA[:, t + r] for t in range(nt - r)]
+        neg_corr[:, r] = np.nanmean(prods, axis=0)
+    if normalised:
+        # normalise by standard deviation
+        pos_corr = pos_corr / stdA / stdB
+        neg_corr = neg_corr / stdA / stdB
+    # combine negative and positive lags
+    lags = np.arange(-nt + 1, nt)
+    corr = np.hstack((np.flip(neg_corr[:, 1:], axis=1), pos_corr))
+    if only_pos:
+        return corr[:, int(lags.size / 2):], lags[int(lags.size / 2):]
+    else:
+        return corr, lags
+
+
+###
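The sign convention for the new negative lags can be checked by cross-correlating a signal against a delayed copy of itself. A minimal sketch, assuming autocrosscorr.py is importable (the sine signals and the delay of 8 points are arbitrary choices):

    import numpy as np

    from autocrosscorr import autocrosscorr

    rng = np.random.default_rng(1)
    nreps, nt, delay = 100, 200, 8
    t = np.arange(nt + delay)
    # sine waves with a random phase for each replicate
    phases = 2 * np.pi * rng.random((nreps, 1))
    s = np.sin(2 * np.pi * t / 50 + phases)
    yB = s[:, :nt]      # reference signal
    yA = s[:, delay:]   # the same signal advanced by `delay` points
    cc, lags = autocrosscorr(yA, yB)
    mean_cc = np.nanmean(cc, axis=0)
    # find the peak closest to zero lag
    near_zero = np.abs(lags) <= 25
    print(lags[near_zero][np.argmax(mean_cc[near_zero])])  # expect +delay

Because yA here shows each feature before yB does, the peak closest to zero lag falls at a positive lag, consistent with the docstring.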
+ """ + # locals min + lmin = (np.diff(np.sign(np.diff(s))) > 0).nonzero()[0] + 1 + # locals max + lmax = (np.diff(np.sign(np.diff(s))) < 0).nonzero()[0] + 1 + if split: + s_mid = np.mean(s) + # pre-sorting of locals min based on relative position + lmin = lmin[s[lmin] < s_mid] + # pre-sorting of local max based on relative position + lmax = lmax[s[lmax] > s_mid] + # global max of dmax-chunks of locals max + lmin = lmin[ + [ + i + np.argmin(s[lmin[i: i + dmin]]) + for i in range(0, len(lmin), dmin) + ] + ] + # global min of dmin-chunks of locals min + lmax = lmax[ + [ + i + np.argmax(s[lmax[i: i + dmax]]) + for i in range(0, len(lmax), dmax) + ] + ] + return lmin, lmax diff --git a/dataloader.py b/dataloader.py new file mode 100644 index 0000000..16d6218 --- /dev/null +++ b/dataloader.py @@ -0,0 +1,493 @@ +# pip install --upgrade aliby-post +import numpy as np +import pandas as pd +from pathlib import Path +import pprint + +try: + from postprocessor.grouper import NameGrouper +except ModuleNotFoundError: + import sys + + sys.path.append("/Users/pswain/wip/postprocessor") + sys.path.append("/Users/pswain/wip/agora") + from postprocessor.grouper import NameGrouper + + +class dataloader: + """ + Compiles aliby datasets into a single, extensible data frame + + Parameters + ---------- + indir: string + The directory where data as stored as h5 files in one + directory per experiment + outdir: string + The working directory where .tsv files will be stored + + Example + ------- + >>> from dataloader import dataloader + >>> dl = dataloader(indir="experiments", outdir=".") + + The names of experiments and datasets are stored in dictionaries + to avoid cutting and pasting of names. + + To load an experiment from the data directory, first use, + for example, + + >>> dl.load(dl.experiments[0]) + + or give the name of the directory explicitly + + >>> dl.load("experiment_directory") + + which will load the data from h5 files. + + Once the loading has been complete, the data will be saved in a + text file - a .tsv file - in the working directory. + + To load this file in future use, for example + + >>> dl.load(dl.datasets[0]) + + or give the name of the file explicitly + + >>> dl.load("data_set.tsv") + + To see the data, use + + >>> dl.df + + which can be plotted directly with seaborn + + >>> import seaborn as sns + >>> sns.relplot(x="time", y="median_GFP", hue="group", + kind="line", data=dl.df) + """ + + def __init__(self, indir=".", outdir="."): + # from grouper.siglist to abbrevations + self.g2a_dict = { + "extraction/GFP/np_max/median": "median_GFP", + "extraction/GFP/np_max/mean": "mean_GFP", + "extraction/general/None/area": "area", + "extraction/general/None/volume": "volume", + "extraction/GFP/np_max/max5px_med": "max5px_med", + "postprocessing/births/extraction_general_None_volume": "births", + "postprocessing/bud_metric/extraction_general_None_volume": + "bud_volume", + "extraction/Flavin_bgsub/np_max/mean": "flavin", + } + # from abbreviations to grouper.siglist + self.a2g_dict = {v: k for (k, v) in self.g2a_dict.items()} + # establish paths + self.outdirpath = Path(outdir) + self.indirpath = Path(indir) + self.ls + + ### + + @property + def ls(self): + """ + Lists the experiments available in indir and the datasets + available in outdir. + + The names of all directories in indir are stored in the + .experiments dictionary. + + The names of all datasets in outdir are stored in the + .datasets dictionary. 
+ """ + pp = pprint.PrettyPrinter() + # find raw data + print("\nData directory is", str(self.indirpath.resolve())) + print("Experiments available:") + dirs = [f.name for f in self.indirpath.glob("*") if f.is_dir()] + # directories of data are stored in experiments + self.experiments = {i: name for i, name in enumerate(dirs)} + pp.pprint(self.experiments) + # find processed data + print("\nWorking directory is", str(self.outdirpath.resolve())) + print("Datasets available:") + files = [f.name for f in self.outdirpath.glob("*.tsv") if f.is_file()] + # tsv files are stored in datasets + self.datasets = {i: name for i, name in enumerate(files)} + pp.pprint(self.datasets) + + ### + + def get_grouper(self, dataname): + """ + Returns an instance of NameGrouper. + + Arguments + --------- + dataname: string + The name of the dataset + + Returns + ------- + grouper: instance of NameGrouper + """ + if dataname[-4:] == ".tsv": + dataname = dataname[:-4] + grouper = NameGrouper(self.indirpath / dataname) + return grouper + + ### + + def load( + self, + dataname, + g2a_dict=None, + pxsize=0.182, + save=False, + use_tsv=False, + ): + """ + Loads either an experiment or a dataset into a long data + frame - with 'time' as a column. + + New data is added to the existing dataframe. + + Data is stored in the .df attribute. + + Parameters + ---------- + dataname: string + Either the name of the directory for an experiment or a + file name for a dataset + g2a_dict: dictionary (optional) + A dictionary relating the aliby name to an abbreviation + eg {"extraction/general/None/volume": "volume"} + pxsize: float (default 0.182) + Pixel size in microns, which is used to convert volumes. + save: boolean (default: True) + If True, save dataframe generated as a tsv file + use_tsv: boolean (default: False) + If True, always load the data from a tsv file. 
+
+    ###
+
+    def load(
+        self,
+        dataname,
+        g2a_dict=None,
+        pxsize=0.182,
+        save=False,
+        use_tsv=False,
+    ):
+        """
+        Loads either an experiment or a dataset into a long data frame -
+        with 'time' as a column.
+
+        New data is added to any existing dataframe.
+
+        Data is stored in the .df attribute.
+
+        Parameters
+        ----------
+        dataname: string
+            Either the name of the directory for an experiment or a
+            file name for a dataset.
+        g2a_dict: dictionary (optional)
+            A dictionary relating an aliby signal name to an abbreviation,
+            e.g. {"extraction/general/None/volume": "volume"}
+        pxsize: float (default: 0.182)
+            Pixel size in microns, which is used to convert volumes.
+        save: boolean (default: False)
+            If True, save the generated dataframe as a tsv file.
+        use_tsv: boolean (default: False)
+            If True, always load the data from a tsv file.
+
+        Returns
+        -------
+        grouper: aliby grouper object
+            A grouper for the experiment, returned only if an experiment
+            is loaded.
+        """
+        if dataname.endswith(".tsv"):
+            dataname = dataname[:-4]
+            use_tsv = True
+        # root name
+        self.dataname = dataname
+        # conversion dictionary
+        if not g2a_dict:
+            g2a_dict = self.g2a_dict
+        # load
+        if use_tsv:
+            # load data from tsv files
+            print("loading", dataname)
+            r_df = pd.read_csv(
+                str(self.outdirpath / (dataname + ".tsv")), sep="\t"
+            )
+            if hasattr(self, "df"):
+                # merge the new data into the current dataframe
+                self.df = pd.merge(self.df, r_df, how="left")
+                if save:
+                    self.save(dataname)
+            else:
+                self.df = r_df
+            # store time interval
+            self.dt = np.min(np.diff(np.sort(self.df.time.unique())))
+        else:
+            # create instance of grouper
+            grouper = self.get_grouper(dataname)
+            # find time interval between images
+            self.dt = grouper.tintervals
+            # load data from h5 files
+            print(dataname + "\n---")
+            print("signals available:")
+            signals = grouper.siglist
+            for signal in signals:
+                print(" ", signal[1:])
+            print("\nloading...")
+            first = True
+            for signame in g2a_dict:
+                if "/" + signame in signals:
+                    print(" " + signame)
+                    hdf = grouper.concat_signal(signame)
+                    # convert to long dataframe
+                    tdf = self._long_df(hdf, g2a_dict[signame])
+                    # drop columns redundant with "id"
+                    tdf = tdf.drop(
+                        ["position", "trap", "cell_label"], axis=1
+                    )
+                    if first:
+                        r_df = tdf.copy()
+                        first = False
+                    else:
+                        r_df = pd.merge(r_df, tdf, how="left")
+                else:
+                    print(" Warning: " + signame, "not found")
+            if pxsize:
+                # convert volumes to micron^3
+                for signal in r_df.columns:
+                    if "volume" in signal:
+                        r_df[signal] *= pxsize ** 3
+            if hasattr(self, "df"):
+                # merge the new data into the current dataframe
+                self.df = pd.merge(self.df, r_df, how="left")
+            else:
+                # create new dataframe
+                self.df = r_df
+            # save dataframe
+            if save:
+                self.save(dataname)
+        # define ids
+        self.ids = list(self.df.id.unique())
+        if not use_tsv:
+            # return grouper
+            return grouper
+
+    ###
+
+    def save(self, dataname=None):
+        """
+        Saves the .df dataframe to a tab-separated values (tsv) file.
+
+        Parameters
+        ----------
+        dataname: string (optional)
+            Name of the file.
+        """
+        if dataname is None:
+            dataname = self.dataname
+        self.df.to_csv(
+            str(self.outdirpath / (dataname + ".tsv")), sep="\t", index=False
+        )
+        print("Saved", dataname)
+
+    ###
+
+    def _long_df(self, df, char_name):
+        """
+        Internal function: converts an aliby multi-index dataframe into
+        a long dataframe.
+        """
+        # flatten multi-index dataframe
+        df = df.copy()
+        hnames = df.index.names
+        df.reset_index(inplace=True)
+        # melt to create a time column
+        df = df.melt(id_vars=hnames, var_name="time", value_name=char_name)
+        # add a unique id for each cell: position;cell_label;trap
+        if (
+            "position" in df.columns
+            and "cell_label" in df.columns
+            and "trap" in df.columns
+        ):
+            df.insert(
+                0,
+                "id",
+                df["position"]
+                + ";"
+                + df["cell_label"].astype(str)
+                + ";"
+                + df["trap"].astype(str),
+            )
+        return df
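To illustrate what _long_df does, consider a toy dataframe in aliby's wide layout - a multi-index row per cell and a column per time point. The sketch below calls the private helper directly, purely for illustration, with dl an existing dataloader instance:

    import pandas as pd

    index = pd.MultiIndex.from_tuples(
        [("pos001", 1, 1), ("pos001", 1, 2)],
        names=["position", "trap", "cell_label"],
    )
    wide = pd.DataFrame(
        [[2.0, 2.5], [1.0, 1.2]], index=index, columns=[0, 5]
    )
    long = dl._long_df(wide, "volume")
    print(long.columns.tolist())
    # ['id', 'position', 'trap', 'cell_label', 'time', 'volume']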
+
+    ###
+
+    @property
+    def revert_df(self):
+        """
+        Converts the 'id' column into the standard three columns used by
+        aliby - 'position', 'trap', and 'cell_label' - and vice versa,
+        either adding these three columns to the .df dataframe or
+        removing them.
+        """
+        if (
+            "position" not in self.df.columns
+            and "trap" not in self.df.columns
+            and "cell_label" not in self.df.columns
+        ):
+            # add back the dropped columns by expanding the id column,
+            # which is built as position;cell_label;trap
+            self.df[
+                ["position", "cell_label", "trap"]
+            ] = self.df.id.str.split(";", expand=True)
+        elif (
+            "position" in self.df.columns
+            and "trap" in self.df.columns
+            and "cell_label" in self.df.columns
+        ):
+            # drop columns redundant with "id"
+            self.df = self.df.drop(
+                ["position", "trap", "cell_label"], axis=1
+            )
+        else:
+            print("Cannot revert")
+
+    ###
+
+    def wide_df(self, signal, x="time", y="id"):
+        """
+        Pivots the .df dataframe to return a standard aliby dataframe.
+
+        Parameters
+        ----------
+        signal: string
+            The signal of interest.
+        x: string (default: 'time')
+            The x value of interest.
+        y: string (default: 'id')
+            The y value of interest.
+
+        Example
+        -------
+        >>> dl.wide_df("median_GFP")
+        """
+        return self.df.pivot(y, x, signal)
+
+    ###
+
+    def get_time_series(self, signal, group=None):
+        """
+        Extracts all the data for a particular signal as a 2D array
+        with each row a time series.
+
+        Parameters
+        ----------
+        signal: string
+            The signal to be extracted.
+        group: string (optional)
+            If given, select data only for this group.
+
+        Returns
+        -------
+        t: 1D array
+            Time values.
+        values: 2D array
+            The values of the signal with each row a separate time
+            series.
+        """
+        if group is None:
+            wdf = self.wide_df(signal, x="time", y="id")
+        elif group in self.df.group.values:
+            wdf = self.df[self.df.group == group].pivot("id", "time", signal)
+        else:
+            print(group, "not recognised")
+            return None, None
+        time = wdf.columns.to_numpy()
+        # each row is a time series
+        values = wdf.to_numpy()
+        return time, values
+
+    def put_time_series(self, values, signal):
+        """
+        Inserts a 2D array of data, with each row a time series, into
+        the dataframe.
+
+        Parameters
+        ----------
+        values: array
+            The data to be inserted.
+        signal: string
+            The name of the signal.
+        """
+        # find a suitable column in .df to generate a pivot
+        cols = list(self.df.columns)
+        for col in ["id", "time"]:
+            cols.remove(col)
+        # use cols[0] to find the index and columns of a pivot
+        wdf = self.wide_df(cols[0])
+        # create a dataframe using the structure of a pivot
+        df = pd.DataFrame(values, index=wdf.index, columns=wdf.columns)
+        # convert to long dataframe
+        tdf = self._long_df(df, signal)
+        # add to the .df dataframe
+        self.df = pd.merge(self.df, tdf, how="left")
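Together, get_time_series and autocrosscorr give a short route from a loaded dataset to an averaged autocorrelation. A sketch, assuming a dataset with a "flavin" signal has already been loaded into dl:

    import matplotlib.pyplot as plt
    import numpy as np

    from autocrosscorr import autocrosscorr

    t, values = dl.get_time_series("flavin")
    ac, lags = autocrosscorr(values, only_pos=True)
    # dl.dt converts lags from numbers of time points to time units
    plt.plot(lags * dl.dt, np.nanmean(ac, axis=0))
    plt.xlabel("lag")
    plt.ylabel("mean autocorrelation of flavin")
    plt.show()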
+ functionstr + "()") + # sort + sdf = sdf.sort_values(by=signal) + sort_order = list(sdf.index) + # sort original dataframe in the same way + df.id = pd.Categorical(df.id, categories=sort_order) + df = df.sort_values(by="id") + # return sorted dataframe + return df + + #### + @property + def strains(self): + """ + List all strains in df.group + """ + for strain in list(self.df.group.unique()): + print(" " + strain) + + #### + + def get_lineage_data(self, idx, signals): + """ + Returns signals for a single lineage specified by idx, which is + specified by an element of self.ids. + + Arguments + --------- + idx: integer + Element of self.ids + signals: list of strings + Signals to be returned + """ + if isinstance(signals, str): + signals = [signals] + res = [ + self.df[self.df.id == idx][signal].to_numpy() for signal in signals + ] + return res diff --git a/time_series_analysis.py b/time_series_analysis.py deleted file mode 100644 index 5bf9da6..0000000 --- a/time_series_analysis.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np - - -def autocrosscorr( - yA, - yB=None, - normalised=True, -): - """ - Calculates normalised auto- or cross-correlations as a function of time. - Normalisation is by the product of the standard deviation for each - variable calculated across replicates at each time point. - With zero lag, the normalised correlation should be one. - - Parameters - ---------- - yA: array - An array of signal values, with each row a replicate measurement - and each column a time point. - yB: array (required for cross-correlation only) - An array of signal values, with each row a replicate measurement - and each column a time point. - normalised: boolean - If True normalise each time point by the standard deviation across - the replicates. - - Returns - ------- - corr: array - An array of the correlations with each row the result for the - corresponding replicate and each column a time point - """ - # number of replicates - nr = yA.shape[0] - # number of time points - nt = yA.shape[1] - # deviation from the mean, where the mean is calculated over replicates - # at each time point, which allows for non-stationary behaviour - dyA = yA - np.nanmean(yA, axis=0).reshape((1, nt)) - # standard deviation over time for each replicate - stdA = np.sqrt(np.nanmean(dyA**2, axis=1).reshape((nr, 1))) - if np.any(yB): - # cross correlation - dyB = yB - np.nanmean(yB, axis=1).reshape((1, nt)) - stdB = np.sqrt(np.nanmean(dyB**2, axis=1).reshape((nr, 1))) - else: - # auto correlation - yB = yA - dyB = dyA - stdB = stdA - # calculate correlation - corr = np.nan * np.ones(yA.shape) - # lag r runs over time points - for r in np.arange(0, nt): - prods = [dyA[:, t] * dyB[:, t + r] for t in range(nt - r)] - corr[:, r] = np.nansum(prods, axis=0) / (nt - r) - if normalised: - # normalise by standard deviation - corr = corr / stdA / stdB - return corr - - -### -- GitLab