Updated plothist to work for any two variables, not necessarily time

09b10703 · pswain · fb8a240f · 09b10703 · 09b10703
Commit 09b10703 authored 1 year ago by pswain
--- a/dataloader.py
+++ b/dataloader.py
@@ -163,6 +163,7 @@ class dataloader:
        pxsize=0.182,
        use_tsv=False,
        over_write_dict=False,
+        hours=True,
    ):
        """
        Load either an experiment or a data set into a long data frame.
@@ -193,6 +194,8 @@ class dataloader:
            If True, always load the data from a tsv file.
        over_write_dict: boolean
            If True, overwrite the internal dictionary with extra_g2a_dict.
+        hours: boolean
+            If True, convert times to hours (dividing by 60).

        Returns
        -------
@@ -238,6 +241,8 @@ class dataloader:
                for signal in r_df.columns:
                    if "volume" in signal:
                        r_df[signal] *= pxsize**3
+            if hours:
+                r_df.time /= 60
            # create new attribute or merge with existing one
            if hasattr(self, "df"):
                self.df = pd.merge(self.df, r_df, how="left")

--- a/plotting.py
+++ b/plotting.py
@@ -3,6 +3,7 @@ from copy import copy
 import matplotlib.cm
 import matplotlib.pylab as plt
 import numpy as np
+import numpy.matlib


 def kymograph(
@@ -10,7 +11,7 @@ def kymograph(
    hue="median_GFP",
    x="time",
    y="id",
-    xtick_step=4,
+    xtick_step_in_hours=5,
    vmax=None,
    vmin=None,
    cmap=matplotlib.cm.Greens,
@@ -23,6 +24,8 @@ def kymograph(

    Typically each row is a single cell and the x-axis shows time.

+    Time is assumed to be in hours.
+
    Examples
    --------
    >>> from wela.plotting import kymograph
@@ -38,10 +41,10 @@ def kymograph(
    data = wdf.to_numpy()
    # define horizontal axis ticks and labels
    xtick_min = 0
-    xtick_max = dt / 60 * data.shape[1]
-    xticklabels = np.arange(xtick_min, xtick_max, xtick_step)
+    xtick_max = dt * data.shape[1]
+    xticklabels = np.arange(xtick_min, xtick_max, xtick_step_in_hours)
    xticks = [
-        int(np.where((dt / 60 * np.arange(data.shape[1])) == label)[0].item())
+        np.argmin((dt * np.arange(data.shape[1]) - label) ** 2)
        for label in xticklabels
    ]
    xticklabels = list(map(str, xticklabels.tolist()))
@@ -295,44 +298,51 @@ def plot_replicate_array(


 def plothist(
-    t,
-    da,
-    num_fine=200,
-    vmax_factor=1,
-    norm="log",
-    edges=None,
+    x,
+    y,
+    bins=[20, 20],
    title=None,
    figsize=None,
+    xlabel="time",
+    ylabel=None,
+    ymax=None,
+    **kwargs,
 ):
    """
-    Plot the distribution of fluorescence as a function of time.
+    Plot two dimensional histograms.

-    Time is on the x-axis; the support of the distribution is on the y-axis;
-    and shading represents the height of the distribution at each y value.
+    Typically, time is on the x-axis; the support of the distribution is on
+    the y-axis; and shading represents the height of the distribution at each
+    y value.

    Parameters
    ----------
-    t: 1D array
-        An array of time points.
-    da: 2D array
-        An array of the fluorescence data, with each row containing data from
-        a single cell.
-    num_fine: integer (optional)
-        The number of data points to include in a finer spacing of time points.
-        Linear interpolation is used to find the extra data values.
-    vmax_factor: float (optional)
-        Used to rescale the maximal data value, which is then passed to the vmax
-        argument in plt.pcolormesh to change the range of the colour bar.
-    norm: str (optional)
-        Passed to the norm argument in plt.pcolormesh for mapping the data
-        values to the colour map. Default is "log".
-    edges: list of arrays (optional)
-        Specifies the bins in time and in fluorescence values and is used to plot
-        different data sets on axes with the same range.
+    x: 1D or 2D array
+        If 1D, we assume an array of time points.
+    y: 2D array
+        Each row contains time-series data from a single cell.
+    bins: list of arrays (optional)
+        Specifies the bins, either explicitly or as a number of bins, and can
+        be used to plot different data sets on axes with the same range.
    title: str (optional)
        Title for the plot.
    figsize: tuple (optional)
        Sets the width and height of the figure.
+    xlabel: str (optional)
+        Label for the x-axis.
+    ylabel: str (optional)
+        Label for the y-axis.
+    ymax: float (optional)
+        The maximal value on the y-axis.
+    **kwargs:
+        Passed to plt.pcolormesh.
+
+    Returns
+    -------
+    edges: tuple of 1D arrays
+        The x and y bin edges.
+    h: 2D array
+        The number of data points in each bin.

    Examples
    --------
@@ -349,32 +359,23 @@ def plothist(
    >>> tc, dc = dlc.get_time_series("median_GFP")

    Plot both data sets using axes with the same range:
-    >>> edges = plothist(t, dc, title="2% Gal", figsize=(4, 3))
-    >>> plothist(t, d, title="2% Gal and 0.1% Glu", edges=edges, figsize=(4, 3))
+    >>> bins = plothist(t, dc, title="2% Gal", figsize=(4, 3))[0]
+    >>> plothist(t, d, title="2% Gal and 0.1% Glu", bins=bins, figsize=(4, 3))
    """
-    # interp to a new grid and replace NaN
-    t_fine = np.linspace(t.min(), t.max(), num_fine)
-    s_fine = np.empty((da.shape[0], num_fine), dtype=float)
-    for i in range(da.shape[0]):
-        s = da[i, :]
-        s_fine[i, :] = np.interp(t_fine, t[~np.isnan(s)], s[~np.isnan(s)])
-    t_fine = np.matlib.repmat(t_fine, da.shape[0], 1)
+    if x.ndim == 1:
+        # make into a 2D array
+        xa = np.matlib.repmat(x, y.shape[0], 1)
+    else:
+        xa = x
+    # find real data
+    select = ~np.isnan(xa) & ~np.isnan(y)
+    xn = xa[select].flatten()
+    yn = y[select].flatten()
+    # bin
+    h, xedges, yedges = np.histogram2d(xn, yn, bins=bins)
    # make histogram
    cmap = copy(plt.cm.viridis)
    cmap.set_bad(cmap(0))
-    if edges is not None:
-        h, xedges, yedges = np.histogram2d(
-            t_fine.flatten(),
-            s_fine.flatten(),
-            bins=edges,
-        )
-    else:
-        h, xedges, yedges = np.histogram2d(
-            t_fine.flatten(),
-            s_fine.flatten(),
-            bins=[int(num_fine / 2), 50],
-        )
-    vmax = np.nanmax(da) / vmax_factor
    # plot using pcolormesh
    if figsize is not None:
        plt.figure(figsize=figsize)
@@ -386,17 +387,18 @@ def plothist(
        h.T,
        cmap=cmap,
        rasterized=True,
-        vmax=vmax,
-        norm=norm,
+        **kwargs,
    )
    plt.colorbar(pcm, label="number of cells", pad=0.02)
-    plt.xlabel("time (h)")
-    plt.ylabel("fluorescence")
-    plt.ylim(top=1200)
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    if ymax is not None:
+        plt.ylim(top=ymax)
    if title:
        plt.title(title)
+    plt.tight_layout()
    plt.show(block=False)
-    return [xedges, yedges]
+    return (xedges, yedges), h


 def plot_cuml_divisions_per_cell(t, buddings, nboots=30, col="b", label=None):
@@ -421,9 +423,11 @@ def plot_cuml_divisions_per_cell(t, buddings, nboots=30, col="b", label=None):

    Example
    -------
+    >>> import matplotlib.pylab as plt
    >>> from wela.plotting import plot_cuml_divisions_per_cell
    >>> plt.figure()
    >>> plot_cuml_divisions_per_cell(t, b, label="Gal Glu")
+    >>> plt.legend()
    >>> plt.show(block=False)
    """

@@ -440,7 +444,7 @@ def plot_cuml_divisions_per_cell(t, buddings, nboots=30, col="b", label=None):

    cuml = find_cuml(buddings)
    err_cuml = 2 * np.std(
-        [find_cuml(sample(buddings)) for i in range(nboots)], 0
+        [find_cuml(sample(buddings)) for i in range(nboots)], axis=0
    )
    plt.plot(t, cuml, color=col, label=label)
    plt.fill_between(t, cuml - err_cuml, cuml + err_cuml, color=col, alpha=0.2)