From 95848a069e9b710444e7a54dc95dfad5d88717ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Al=C3=A1n=20Mu=C3=B1oz?= <amuoz@ed.ac.uk>
Date: Tue, 23 Nov 2021 17:34:03 +0000
Subject: [PATCH] add changes for 3rd year report

---
 pcore/grouper.py                    |  62 ++++++++++-------
 scripts/check_aggregated_metrics.py |  53 +++++++++++++++
 scripts/explore_h5.py               |  43 ++++++++++++
 scripts/group.py                    |  11 +++
 scripts/group2.py                   | 101 ++++++++++++++++++++++++++++
 scripts/postprocess.py              |  18 +++++
 scripts/postprocess_folder.py       |  91 ++-----------------------
 7 files changed, 270 insertions(+), 109 deletions(-)
 create mode 100644 scripts/check_aggregated_metrics.py
 create mode 100644 scripts/explore_h5.py
 create mode 100644 scripts/group.py
 create mode 100644 scripts/group2.py
 create mode 100644 scripts/postprocess.py

diff --git a/pcore/grouper.py b/pcore/grouper.py
index 0fa5e619..9f61fc6f 100644
--- a/pcore/grouper.py
+++ b/pcore/grouper.py
@@ -37,15 +37,22 @@ class Grouper(ABC):
     def group_names():
         pass
 
-    def concat_signal(self, path, reduce_cols=None, axis=0):
+    def concat_signal(self, path, reduce_cols=None, axis=0, pool=8):
         group_names = self.group_names
         sitems = self.signals.items()
-        with Pool(8) as p:
-            signals = p.map(
-                lambda x: concat_signal_ind(path, group_names, x[0], x[1]),
-                sitems,
-            )
-
+        if pool:
+            with Pool(pool) as p:
+                signals = p.map(
+                    lambda x: concat_signal_ind(path, group_names, x[0], x[1]),
+                    sitems,
+                )
+        else:
+            signals = [
+                concat_signal_ind(path, group_names, name, signal)
+                for name, signal in sitems
+            ]
+
+        signals = [s for s in signals if s is not None]
         sorted = pd.concat(signals, axis=axis).sort_index()
         if reduce_cols:
             sorted = sorted.apply(np.nanmean, axis=1)
@@ -95,22 +102,26 @@ class NameGrouper(Grouper):
 
         return self._group_names
 
-    def aggregate_multisignals(self, paths=None):
+    def aggregate_multisignals(self, paths=None, **kwargs):
 
         aggregated = pd.concat(
-            [self.concat_signal(path, reduce_cols=np.nanmean) for path in paths], axis=1
-        )
-        ph = pd.Series(
             [
-                self.ph_from_group(x[list(aggregated.index.names).index("group")])
-                for x in aggregated.index
+                self.concat_signal(path, reduce_cols=np.nanmean, **kwargs)
+                for path in paths
             ],
-            index=aggregated.index,
-            name="media_pH",
+            axis=1,
         )
-        self.aggregated = pd.concat((aggregated, ph), axis=1)
+        # ph = pd.Series(
+        #     [
+        #         self.ph_from_group(x[list(aggregated.index.names).index("group")])
+        #         for x in aggregated.index
+        #     ],
+        #     index=aggregated.index,
+        #     name="media_pH",
+        # )
+        # self.aggregated = pd.concat((aggregated, ph), axis=1)
 
-        return self.aggregated
+        return aggregated
 
 
 class phGrouper(NameGrouper):
@@ -152,10 +163,13 @@ class phGrouper(NameGrouper):
 
 def concat_signal_ind(path, group_names, group, signal):
     print("Looking at ", group)
-    combined = signal[path]
-    combined["position"] = group
-    combined["group"] = group_names[group]
-    combined.set_index(["group", "position"], inplace=True, append=True)
-    combined.index = combined.index.swaplevel(-2, 0).swaplevel(-1, 1)
-
-    return combined
+    try:
+        combined = signal[path]
+        combined["position"] = group
+        combined["group"] = group_names[group]
+        combined.set_index(["group", "position"], inplace=True, append=True)
+        combined.index = combined.index.swaplevel(-2, 0).swaplevel(-1, 1)
+
+        return combined
+    except Exception:
+        return None
diff --git a/scripts/check_aggregated_metrics.py b/scripts/check_aggregated_metrics.py
new file mode 100644
index 00000000..c2ea39d3
--- /dev/null
+++ b/scripts/check_aggregated_metrics.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+
+folder = Path(
+    "/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/"
+)
+g = NameGrouper(folder)
+# g.signals = {k: v for k, v in g.signals.items() if str(v) != filename}
+signame = "extraction/general/None/volume"
+# signame = "extraction/em_ratio_bgsub/np_max/median"
+# signame = (
+#     "postprocessing/dsignal/postprocessing_bud_metric_extraction_general_None_volume"
+# )
+
+c = g.concat_signal(signame)
+d = c.loc[c.notna().sum(axis=1) > 50]
+sns.lineplot(
+    data=d.melt(ignore_index=False).reset_index(), x="variable", y="value", hue="group"
+)
+plt.show()
+
+# Aggregated metrics
+signame = "postprocessing/experiment_wide/aggregated"
+grid = sns.pairplot(
+    c[[i for i in c.columns if i.endswith("mean")]].reset_index(level="group"),
+    plot_kws=dict(alpha=0.5),
+    hue="group",
+)
+for ax in grid.axes.flat[:2]:
+    ax.tick_params(axis="x", labelrotation=90)
+    ax.tick_params(axis="y", labelrotation=90)
+
+plt.savefig("pairplot.png", dpi=400)
+plt.show()
+
+for s in g.signals.values():
+    with h5py.File(s.filename, "r") as f:
+        if "postprocessing/experiment_wide" not in f:
+            print(s.filename)
+
+filepath = "/shared_libs/pydask/pipeline-core/data/bak_kcl/YST_1510_009.h5"
+with h5py.File(filepath, "a") as f:
+    print(f["postprocessing"].keys())
+    # if "postprocessing" in f:
+    #     del f["/postprocessing"]
+    # if "modifiers" in f:
+    #     del f["/modifiers"]
+
+    params = PostProcessorParameters.default()
+    pp = PostProcessor(filepath, params)
+    pp.run()
+    s = Signal(filepath)
+    s.datasets
diff --git a/scripts/explore_h5.py b/scripts/explore_h5.py
new file mode 100644
index 00000000..22d09e39
--- /dev/null
+++ b/scripts/explore_h5.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+from pcore.io.signal import Signal
+
+fpath = "/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/raf_15_s17.h5"
+
+s = Signal(fpath)
+
+import numpy as np
+import seaborn as sns
+from matplotlib import pyplot as plt
+
+# df = s["/extraction/general/None/area"]
+df = 1 / s["extraction/GFPFast/np_max/median"]
+df = df.div(df.mean(axis=1), axis=0)
+
+vol = s["postprocessing/bud_metric/extraction_general_None_volume"]
+vol = vol.div(vol.mean(axis=1), axis=0)
+# sns.heatmap(df.sort_index(), robust=True)
+# plt.show()
+
+
+import pandas as pd
+
+n = 3
+fig, axes = plt.subplots(n, 1)
+trap_ids = df.index.get_level_values("trap")
+trapn = trap_ids[np.random.randint(len(trap_ids), size=n)]
+for i, t in enumerate(trapn):
+    subdf = df.loc[t].melt(ignore_index=False).reset_index()
+    subdf["ch"] = "ratio"
+    voldf = vol.loc[t].melt(ignore_index=False).reset_index()
+    voldf["ch"] = "vol"
+    combined = pd.concat((voldf, subdf), axis=0)
+    sns.scatterplot(
+        data=combined,
+        x="variable",
+        y="value",
+        ax=axes[i],
+        hue="ch",
+        palette="Set2",
+        style="cell_label",
+    )
+plt.show()
diff --git a/scripts/group.py b/scripts/group.py
new file mode 100644
index 00000000..d445ff27
--- /dev/null
+++ b/scripts/group.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+from pathlib import Path
+
+from pcore.grouper import NameGrouper
+
+path = Path(
+    "/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00"
+)
+ng = NameGrouper(path)
+# agg = ng.aggregate_multisignals(["extraction/general/None/area"], pool=0)
+agg = ng.concat_signal("extraction/general/None/area")
diff --git a/scripts/group2.py b/scripts/group2.py
new file mode 100644
index 00000000..21632d6e
--- /dev/null
+++ b/scripts/group2.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+
+
+from pcore.grouper import NameGrouper
+
+folder = Path(
+    "/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/"
+)
+g = NameGrouper(folder)
+# g.signals = {k: v for k, v in g.signals.items() if str(v) != filename}
+# signame = "postprocessing/experiment_wide/aggregated"
+# signame = "extraction/general/None/volume"
+
+
+def get_df(signame):
+    c = g.concat_signal(signame)
+    d = c.loc[c.notna().sum(axis=1) > 60]
+    # for i in c.columns:
+    #     if "ratio" in i:
+    #         c[i] = 1 / c[i]
+
+    # splits = [x.split("_") for x in c.columns]
+    # new = [
+    #     [
+    #         x
+    #         for x in s
+    #         if x
+    #         not in [
+    #             "general",
+    #             "metric",
+    #             "postprocessing",
+    #             "extraction",
+    #             "None",
+    #             "np",
+    #             "max",
+    #             "em",
+    #         ]
+    #     ]
+    #     for s in splits
+    # ]
+    # joint = ["_".join(n) for n in new]
+    # c.columns = joint
+
+    return d
+
+
+signame = (
+    "postprocessing/dsignal/postprocessing_bud_metric_extraction_general_None_volume"
+)
+signame = "extraction/mCherry/np_max/max2p5pc"
+window = 15
+whi5 = get_df(signame)
+whi5 = whi5.div(whi5.mean(axis=1), axis=0)
+whi5_movavg = whi5.apply(lambda x: pd.Series(moving_average(x.values, window)), axis=1)
+ph = 1 / get_df("extraction/em_ratio/np_max/median")
+ph = ph.div(ph.mean(axis=1), axis=0)
+ph_movavg = ph.apply(lambda x: pd.Series(moving_average(x.values, window)), axis=1)
+ph_norm = ph.iloc(axis=1)[window // 2 : -window // 2] / ph_movavg
+whi5_norm = whi5.iloc(axis=1)[window // 2 : -window // 2] / whi5_movavg
+rand = np.random.randint(len(whi5), size=4)
+
+melted = ph_norm.iloc[rand].melt(ignore_index=False).reset_index()
+melted["signal"] = "ph"
+combined = whi5_norm.iloc[rand].melt(ignore_index=False).reset_index()
+combined["signal"] = "whi5"
+combined = pd.concat((melted, combined))
+combined["t_id"] = [str(x) + y for x, y in zip(combined["trap"], combined["position"])]
+h = sns.FacetGrid(combined, col="t_id", col_wrap=2)
+h.map_dataframe(sns.scatterplot, x="variable", y="value", hue="signal")
+h.add_legend()
+plt.show()
+
+
+def moving_average(a, n=3):
+    ret = np.cumsum(a, dtype=float)
+    ret[n:] = ret[n:] - ret[:-n]
+    return ret[n - 1 :] / n
+
+
+import numpy as np
+
+import seaborn as sns
+from matplotlib import pyplot as plt
+
+sns.set_style("darkgrid")
+sns.lineplot(
+    data=d.melt(ignore_index=False).reset_index("group"),
+    x="variable",
+    y="value",
+    # y="gsum_median_mean",
+    # y="dsignal_postprocessing_bud_metric_extraction_general_None_volume_max",
+    # y="dsignal_extraction_general_None_volume_max",
+    # hue="position",
+    hue="group",
+    # size="general_volume",
+    # alpha=0.5,
+    palette="muted",
+    # ci=None,
+)
+# plt.xlim((0, 5))
+plt.show()
diff --git a/scripts/postprocess.py b/scripts/postprocess.py
new file mode 100644
index 00000000..5b983551
--- /dev/null
+++ b/scripts/postprocess.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import h5py
+from postprocessor.core.processor import PostProcessorParameters, PostProcessor
+
+from pathlib import Path
+dpath = Path("/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/")
+
+for fpath in dpath.glob('*.h5'):
+    with h5py.File(fpath, "a") as f:
+        if "postprocessing" in f:
+            del f["/postprocessing"]
+        if "modifiers" in f:
+            del f["/modifiers"]
+
+    params = PostProcessorParameters.default().to_dict()
+    pp = PostProcessor(fpath, params)
+    pp.run()
diff --git a/scripts/postprocess_folder.py b/scripts/postprocess_folder.py
index 9c6b4f5a..b91e8c58 100644
--- a/scripts/postprocess_folder.py
+++ b/scripts/postprocess_folder.py
@@ -3,11 +3,13 @@
 from pathlib import Path
 import h5py
 from postprocessor.core.processor import PostProcessorParameters, PostProcessor
-from core.io.signal import Signal
+from pcore.io.signal import Signal
 
 from pathos.multiprocessing import Pool
 
-folder = "/shared_libs/pydask/pipeline-core/data/bak_kcl/"
+folder = Path(
+    "/home/alan/Documents/dev/stoa_libs/pipeline-core/data/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/2021_11_04_doseResponse_raf_1_15_2_glu_01_2_dual_phluorin_whi5_constantMedia_00/"
+)
 
 
 def process_file(filepath):
@@ -25,92 +27,11 @@ def process_file(filepath):
             s = Signal(filepath)
             # s.datasets
         # df = s["/extraction/general/None/area"]
+
     except Exception as e:
         print(filepath, " failed")
+        print(e)
 
 
 with Pool(10) as p:
     results = p.map(lambda x: process_file(x), Path(folder).rglob("*.h5"))
-
-
-from core.grouper import NameGrouper
-
-g = NameGrouper(folder)
-# g.signals = {k: v for k, v in g.signals.items() if str(v) != filename}
-signame = "postprocessing/experiment_wide/aggregated"
-shortname = "aggregated"
-c = g.concat_signal(signame)
-for i in c.columns:
-    if "ratio" in i:
-        c[i] = 1 / c[i]
-
-splits = [x.split("_") for x in c.columns]
-new = [
-    [
-        x
-        for x in s
-        if x
-        not in [
-            "general",
-            "metric",
-            "postprocessing",
-            "extraction",
-            "None",
-            "np",
-            "max",
-            "em",
-        ]
-    ]
-    for s in splits
-]
-joint = ["_".join(n) for n in new]
-c.columns = joint
-# cf = c.loc[
-#     c["dsignal_postprocessing_bud_metric_extraction_general_None_volume"].notna()
-# ]
-import seaborn as sns
-from matplotlib import pyplot as plt
-
-sns.set_style("darkgrid")
-sns.lmplot(
-    data=c.reset_index(),
-    x="em_ratio_median_mean",
-    # y="gsum_median_mean",
-    # y="dsignal_postprocessing_bud_metric_extraction_general_None_volume_max",
-    # y="dsignal_extraction_general_None_volume_max",
-    # hue="position",
-    hue="group",
-    # size="general_volume",
-    # alpha=0.5,
-    palette="muted",
-    ci=None,
-)
-# plt.xlim((0, 5))
-plt.show()
-
-grid = sns.pairplot(c[[i for i in c.columns if i.endswith("max")]])
-for ax in grid.axes.flat[:2]:
-    ax.tick_params(axis="x", labelrotation=90)
-    ax.tick_params(axis="y", labelrotation=90)
-
-plt.savefig("pairplot.png", dpi=400)
-plt.show()
-
-for s in g.signals.values():
-    with h5py.File(s.filename, "r") as f:
-        if "postprocessing/experiment_wide" not in f:
-            print(s.filename)
-
-filepath = "/shared_libs/pydask/pipeline-core/data/bak_kcl/YST_1510_009.h5"
-with h5py.File(filepath, "a") as f:
-    print(f["postprocessing"].keys())
-    # if "postprocessing" in f:
-    #     del f["/postprocessing"]
-    # if "modifiers" in f:
-    #     del f["/modifiers"]
-
-    params = PostProcessorParameters.default()
-    pp = PostProcessor(filepath, params)
-    pp.run()
-    s = Signal(filepath)
-    s.datasets
-- 
GitLab