From 787ef6d8399c48db53008f9f72803817dc8d72e2 Mon Sep 17 00:00:00 2001
From: pswain <peter.swain@ed.ac.uk>
Date: Tue, 30 Apr 2024 18:13:36 +0100
Subject: [PATCH] change: removed many optional functions

Removed aliby.utils, postprocessor.core/functions and multisignal and processes.
---
 src/agora/utils/association.py                |   1 -
 src/agora/utils/cast.py                       |  16 -
 src/agora/utils/kymograph.py                  |   1 -
 src/aliby/utils/__init__.py                   |   2 -
 src/aliby/utils/argo.py                       | 466 ------------------
 src/aliby/utils/imageViewer.py                | 314 ------------
 src/aliby/utils/plot.py                       |  92 ----
 src/aliby/utils/vis_tools.py                  | 246 ---------
 src/postprocessor/core/functions/__init__.py  |   5 -
 .../core/multisignal/__init__.py              |   3 -
 src/postprocessor/core/multisignal/align.py   | 164 ------
 .../core/multisignal/crosscorr.py             | 167 -------
 src/postprocessor/core/multisignal/mi.py      | 227 ---------
 src/postprocessor/core/processes/__init__.py  |   5 -
 src/postprocessor/core/processes/autoreg.py   | 327 ------------
 src/postprocessor/core/processes/butter.py    |  93 ----
 src/postprocessor/core/processes/catch22.py   |  50 --
 src/postprocessor/core/processes/detrend.py   |  59 ---
 src/postprocessor/core/processes/dsignal.py   |  41 --
 src/postprocessor/core/processes/fft.py       | 153 ------
 src/postprocessor/core/processes/findpeaks.py | 124 -----
 src/postprocessor/core/processes/gpsignal.py  | 108 ----
 .../core/processes/interpolate.py             |  28 --
 src/postprocessor/core/processes/knngraph.py  |  78 ---
 src/postprocessor/core/processes/leiden.py    |  48 --
 .../core/processes/standardscaler.py          |  50 --
 src/postprocessor/core/processes/template.py  |  25 -
 .../core/processes/umapembedding.py           |  50 --
 src/postprocessor/core/report.py              |  30 --
 src/postprocessor/core/reshapers/merger.py    |   2 +-
 .../savgol.py => reshapers/nusavgol.py}       |  82 +--
 .../core/{functions => reshapers}/tracks.py   |   2 +-
 32 files changed, 10 insertions(+), 3049 deletions(-)
 delete mode 100644 src/agora/utils/association.py
 delete mode 100644 src/agora/utils/cast.py
 delete mode 100644 src/aliby/utils/__init__.py
 delete mode 100644 src/aliby/utils/argo.py
 delete mode 100644 src/aliby/utils/imageViewer.py
 delete mode 100644 src/aliby/utils/plot.py
 delete mode 100644 src/aliby/utils/vis_tools.py
 delete mode 100644 src/postprocessor/core/functions/__init__.py
 delete mode 100644 src/postprocessor/core/multisignal/__init__.py
 delete mode 100644 src/postprocessor/core/multisignal/align.py
 delete mode 100644 src/postprocessor/core/multisignal/crosscorr.py
 delete mode 100644 src/postprocessor/core/multisignal/mi.py
 delete mode 100644 src/postprocessor/core/processes/__init__.py
 delete mode 100644 src/postprocessor/core/processes/autoreg.py
 delete mode 100644 src/postprocessor/core/processes/butter.py
 delete mode 100644 src/postprocessor/core/processes/catch22.py
 delete mode 100644 src/postprocessor/core/processes/detrend.py
 delete mode 100644 src/postprocessor/core/processes/dsignal.py
 delete mode 100644 src/postprocessor/core/processes/fft.py
 delete mode 100644 src/postprocessor/core/processes/findpeaks.py
 delete mode 100644 src/postprocessor/core/processes/gpsignal.py
 delete mode 100644 src/postprocessor/core/processes/interpolate.py
 delete mode 100644 src/postprocessor/core/processes/knngraph.py
 delete mode 100644 src/postprocessor/core/processes/leiden.py
 delete mode 100644 src/postprocessor/core/processes/standardscaler.py
 delete mode 100644 src/postprocessor/core/processes/template.py
 delete mode 100644 src/postprocessor/core/processes/umapembedding.py
 delete mode 100644 src/postprocessor/core/report.py
 rename
src/postprocessor/core/{processes/savgol.py => reshapers/nusavgol.py} (68%) rename src/postprocessor/core/{functions => reshapers}/tracks.py (99%) diff --git a/src/agora/utils/association.py b/src/agora/utils/association.py deleted file mode 100644 index a051bb08..00000000 --- a/src/agora/utils/association.py +++ /dev/null @@ -1 +0,0 @@ -#!/usr/bin/env jupyter diff --git a/src/agora/utils/cast.py b/src/agora/utils/cast.py deleted file mode 100644 index 46789959..00000000 --- a/src/agora/utils/cast.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env jupyter - -""" -Convert some types to others -""" - - -def _str_to_int(x: str or None): - """ - Cast string as int if possible. If Nonetype return None. - """ - if x is not None: - try: - return int(x) - except: - return x diff --git a/src/agora/utils/kymograph.py b/src/agora/utils/kymograph.py index 5bceb147..7570f99c 100644 --- a/src/agora/utils/kymograph.py +++ b/src/agora/utils/kymograph.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd from sklearn.cluster import KMeans -# from agora.utils.indexing import validate_association index_row = t.Tuple[str, str, int, int] diff --git a/src/aliby/utils/__init__.py b/src/aliby/utils/__init__.py deleted file mode 100644 index a85dd4ef..00000000 --- a/src/aliby/utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Additional tools to fetch and handle datasets programatically. -""" diff --git a/src/aliby/utils/argo.py b/src/aliby/utils/argo.py deleted file mode 100644 index 7ea868da..00000000 --- a/src/aliby/utils/argo.py +++ /dev/null @@ -1,466 +0,0 @@ -import csv -import io -import operator -import re -import typing as t -from collections import Counter -from datetime import datetime -from pathlib import Path - -import numpy as np -from logfile_parser import Parser -from omero.gateway import BlitzGateway, TagAnnotationWrapper, _DatasetWrapper -from tqdm import tqdm - - -class OmeroExplorer: - def __init__( - self, - host: str, - user: str, - password: str, - min_date: t.Tuple[int, int, int] = (2020, 6, 1), - ): - self.conn = BlitzGateway(user, password, host=host) - self.conn.connect() - - self.min_date = min_date - self.backups = {} - self.removed = [] - - @property - def cache(self): - if not hasattr(self, "_cache"): - self._cache = {v.id: get_annotsets(v) for v in self.dsets} - return self._cache - - @property - def raw_log(self): - return {k: v["log"] for k, v in self.cache.items()} - - @property - def raw_log_end(self): - if not hasattr(self, "_raw_log_end"): - self._raw_log_end = {d.id: get_logfile(d) for d in self.dsets} - return self._raw_log_end - - @property - def log(self): - return {k: parse_annot(v, "log") for k, v in self.raw_log.items()} - - @property - def raw_acq(self): - return {k: v["acq"] for k, v in self.cache.items()} - - @property - def acq(self): - return {k: parse_annot(v, "acq") for k, v in self.raw_acq.items()} - - def load(self, min_id=0, min_date=None): - """ - :min_id: int - :min_date: tuple - """ - if min_date is None: - min_date = self.min_date - self._dsets_bak = [ - d for d in self.conn.getObjects("Dataset") if d.getId() > min_id - ] - - if min_date: - if len(min_date) < 3: - min_date = min_date + tuple( - [1 for _ in range(3 - len(min_date))] - ) - min_date = datetime(*min_date) - - # sort by dates - dates = [d.getDate() for d in self._dsets_bak] - self._dsets_bak[:] = [ - a - for _, a in sorted( - zip(dates, self._dsets_bak), key=lambda x: x[0] - ) - ] - - self._dsets_bak = [ - d for d in self._dsets_bak if d.getDate() >= min_date - ] - - self.dsets = 
self._dsets_bak - self.n_dsets - - def image_ids(self): - return { - dset.getId(): [im.getId() for im in dset.listChildren()] - for dset in self.dsets - } - - def dset(self, n): - try: - return [x for x in self.dsets if x.id == n][0] - except Exception as e: - print(f"Could not fetch all data xsets: {e}") - - def channels(self, setkey, present=True): - """ - :setkey: str indicating a set of channels - :present: bool indicating whether the search should or shold not be in the dset - """ - self.dsets = [ - v for v in self.acqs.values() if present == has_channels(v, setkey) - ] - self.n_dsets - - def update_cache(self): - if not hasattr(self, "acq") or not hasattr(self, "log"): - for attr in ["acq", "log"]: - print("Updating raw ", attr) - setattr( - self, - "raw_" + attr, - {v.id: get_annotsets(v)[attr] for v in self.dsets}, - ) - setattr( - self, - attr, - { - v.id: parse_annot( - getattr(self, "raw_" + attr)[v.id], attr - ) - for v in self.dsets - }, - ) - else: - - for attr in ["acq", "log", "raw_acq", "raw_log"]: - setattr( - self, - attr, - {i.id: getattr(self, attr)[i.id] for i in self.dsets}, - ) - - @property - def dsets(self): - if not hasattr(self, "_dsets"): - self._dsets = self.load() - - return self._dsets - - @dsets.setter - def dsets(self, dsets): - if hasattr(self, "_dsets"): - if self._dsets is None: - self._dsets = [] - self.removed += [ - x for x in self._dsets if x.id not in [y.id for y in dsets] - ] - - self._dsets = dsets - - def tags(self, tags, present=True): - """ - :setkey str tags to filter - """ - if type(tags) is not list: - tags = [str(tags)] - - self.dsets = [ - v for v in self.dsets if present == self.has_tags(v, tags) - ] - self.n_dsets - - @property - def all_tags(self): - if not hasattr(self, "_tags"): - self._tags = { - d.id: [ - x.getValue() - for x in d.listAnnotations() - if isinstance(x, TagAnnotationWrapper) - ] - for d in self.dsets - } - return self._tags - - def get_timepoints(self): - self.image_wrappers = { - d.id: list(d.listChildren())[0] for d in self.dsets - } - - return {k: i.getSizeT() for k, i in self.image_wrappers.items()} - - def timepoints(self, n, op="greater"): - "Filter experiments using the number of timepoints" - op = operator.gt if op == "greater" else operator.le - self._timepoints = self.get_timepoints() - - self.dsets = [ - v for v in tqdm(self.dsets) if op(self._timepoints[v.id], n) - ] - - def microscope(self, microscope): - self.microscopes = { - dset.id: self.get_microscope(self.log[dset.id]) - for dset in self.dsets - } - - self.n_dsets - - def get_microscope(self, parsed_log): - return parsed_log["microscope"] - - def reset(self, backup_id=None): - self.dsets = self.backups.get(backup_id, self._dsets_bak) - self.n_dsets - - def backup(self, name): - self.backups[name] = self.dsets - - def reset_backup(self, name): - self.dsets = self.backups[name] - - @staticmethod - def is_complete(logfile: str): - return logfile.endswith("Experiment completed\r\r\n") - - @staticmethod - def count_errors(logfile: str): - return re.findall("ERROR CAUGHT", logfile) - - @staticmethod - def count_drift_alert(logfile: str): - return re.findall("High drift alert!", logfile) - - @staticmethod - def is_interrupted(logfile: str): - return "Experiment stopped by user" in logfile - - @property - def n_dsets(self): - print("{} datasets.".format(len(self.dsets))) - - @property - def desc(self): - for d in self.dsets: - print( - "{}\t{}\t{}\t{}".format( - d.getDate().strftime("%x"), - d.getId(), - d.getName(), - d.getDescription(), - ) - ) - - 
@property - def ids(self): - return [d.getId() for d in self.dsets] - - def get_ph_params(self): - t = [ - { - ch: [exp, v] - for ch, exp, v in zip( - j["channel"], j["exposure"], j["voltage"] - ) - if ch in {"GFPFast", "pHluorin405"} - } - for j in [i["channels"] for i in self.acqs] - ] - - ph_param_pairs = [ - (tuple(x.values())) for x in t if np.all(list(x.values())) - ] - - return Counter([str(x) for x in ph_param_pairs]) - - def find_duplicate_candidates(self, days_tol=2): - # Find experiments with the same name or Aim and from similar upload dates - # and group them for cleaning - pass - - def return_completed(self, kind="complete"): - return { - k: getattr(self, f"is_{kind}")(v.get("log", "")) - for k, v in self.cache.items() - } - - def dset_count( - self, - dset: t.Union[int, _DatasetWrapper], - kind: str = "errors", - norm: bool = True, - ): - if isinstance(dset, int): - dset = self.conn.getObject("Dataset", dset) - - total_images_tps = sum([im.getSizeT() for im in dset.listChildren()]) - - return len( - getattr(self, f"count_{kind}")( - self.cache[dset.getId()].get("log", ""), norm=norm - ) - ) / (norm * total_images_tps) - - def count_in_log(self, kind="errors", norm: bool = True): - return { - k: self.dset_count(k, kind=kind, norm=norm) - for k, v in self.cache.items() - } - - @property - def complete(self): - self.completed = { - k: self.is_complete(v.get("log", "")) - for k, v in self.cache.items() - } - self.dsets = [dset for dset in self.dsets if self.completed[dset.id]] - return self.n_dsets - - def save(self, fname): - with open(fname + ".tsv", "w") as f: - writer = csv.writer(f, delimiter="\t") - for d in self.dsets: - writer.writerow( - [ - d.getDate().strftime("%x"), - d.getId(), - d.getName(), - d.getDescription(), - ] - ) - - @property - def positions(self): - return {x.id: len(list(x.listChildren())) for x in self.dsets} - - def has_tags(self, d, tags): - if set(tags).intersection(self.all_tags[d.id]): - return True - - -def convert_to_hours(delta): - total_seconds = delta.total_seconds() - hours = int(total_seconds // 3600) - return hours - - -class Argo(OmeroExplorer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -def annot_from_dset(dset, kind): - v = [x for x in dset.listAnnotations() if hasattr(x, "getFileName")] - infname = kind if kind == "log" else kind.title() - try: - acqfile = [x for x in v if x.getFileName().endswith(infname + ".txt")][ - 0 - ] - decoded = list(acqfile.getFileInChunks())[0].decode("utf-8") - acq = parse_annot(decoded, kind) - except Exception as e: - print(f"Conversion from acquisition file failed: {e}") - return {} - return acq - - -def check_channels(acq, channels, _all=True): - shared_channels = set(acq["channels"]["channel"]).intersection(channels) - - condition = False - if _all: - if len(shared_channels) == len(channels): - condition = True - else: - if len(shared_channels): - condition = True - - return condition - - -def get_chs(exptype): - # TODO Documentation - exptypes = { - "dual_ph": ("GFP", "pHluorin405", "mCherry"), - "ph": ("GFP", "pHluorin405"), - "dual": ("GFP", "mCherry"), - "mCherry": ("mCherry"), - } - return exptypes[exptype] - - -def load_annot_from_cache(exp_id, cache_dir="cache/"): - # TODO Documentation - if type(cache_dir) is not Path: - cache_dir = Path(cache_dir) - - annot_sets = {} - for fname in cache_dir.joinpath(exp_id).rglob("*.txt"): - fmt = fname.name[:3] - with open(fname, "r") as f: - annot_sets[fmt] = f.read() - - return annot_sets - - -def get_annot(annot_sets, 
fmt): - """ - Get parsed annotation file - """ - str_io = annot_sets.get(fmt, None) - return parse_annot(str_io, fmt) - - -def parse_annot(str_io, fmt): - parser = Parser("multiDGUI_" + fmt + "_format") - return parser.parse(io.StringIO(str_io)) - - -def get_annotsets(dset): - annot_files = [ - annot.getFile() - for annot in dset.listAnnotations() - if hasattr(annot, "getFile") - ] - annot_sets = { - ftype[:-4].lower(): annot - for ftype in ("log.txt", "Acq.txt", "Pos.txt") - for annot in annot_files - if ftype in annot.getName() - } - annot_sets = { - key: list(a.getFileInChunks())[0].decode("utf-8") - for key, a in annot_sets.items() - } - return annot_sets - - -def load_acq(dset): - # TODO Documentation - try: - acq = annot_from_dset(dset, kind="acq") - return acq - except Exception as e: - print(f"dset{dset.getId()}failed acq loading: {e}") - return False - - -def has_channels(dset, exptype): - # TODO Documentation - acq = load_acq(dset) - if acq: - return check_channels(acq, get_chs(exptype)) - else: - return - - -def get_logfile(dset): - # TODO Documentation - annot_file = [ - annot.getFile() - for annot in dset.listAnnotations() - if hasattr(annot, "getFile") - and annot.getFileName().endswith("log.txt") - ][0] - return list(annot_file.getFileInChunks())[-1].decode("utf-8") diff --git a/src/aliby/utils/imageViewer.py b/src/aliby/utils/imageViewer.py deleted file mode 100644 index b8712237..00000000 --- a/src/aliby/utils/imageViewer.py +++ /dev/null @@ -1,314 +0,0 @@ -import re -import typing as t -from abc import ABC -from pathlib import Path -import h5py -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns -from skimage.morphology import dilation - -from agora.io.cells import Cells -from agora.io.metadata import dispatch_metadata_parser -from aliby.io.image import dispatch_image - -from aliby.tile.tiler import Tiler -from aliby.utils.plot import stretch_clip - -default_colours = { - "Brightfield": "Greys_r", - "GFP": "Greens_r", - "mCherry": "Reds_r", - "cell_label": sns.color_palette("Paired", as_cmap=True), -} - - -def custom_imshow(a, norm=None, cmap=None, *args, **kwargs): - """Wrap plt.imshow.""" - if cmap is None: - cmap = "Greys_r" - return plt.imshow( - a, - *args, - cmap=cmap, - interpolation=None, - interpolation_stage="rgba", - **kwargs, - ) - - -class BaseImageViewer(ABC): - """Base class with routines common to all ImageViewers.""" - - def __init__(self, h5file_path): - """Initialise from a Path to a h5 file.""" - self.h5file_path = h5file_path - self.logfiles_meta = dispatch_metadata_parser(h5file_path.parent) - self.image_id = self.logfiles_meta.get("image_id") - if self.image_id is None: - with h5py.File(h5file_path, "r") as f: - self.image_id = f.attrs.get("image_id") - if self.image_id is None: - raise ("No valid image_id found in metadata.") - self.full = {} - - @property - def shape(self): - """Return shape of image array.""" - return self.tiler.image.shape - - @property - def ntraps(self): - """Find the number of traps available.""" - return self.cells.ntraps - - @property - def max_labels(self): - """Find maximum cell label in whole experiment.""" - return [max(x) for x in self.cells.labels] - - def labels_at_time(self, tp: int): - """Find cell label at a given time point.""" - return self.cells.labels_at_time(tp) - - def find_channel_indices( - self, channels: t.Union[str, t.Collection[str]], guess=True - ): - """Find index for particular channels.""" - channels = channels or self.tiler.ref_channel - if isinstance(channels, (int, str)): 
- channels = [channels] - if isinstance(channels[0], str): - if guess: - indices = [self.tiler.channels.index(ch) for ch in channels] - else: - indices = [ - re.search(ch, tiler_channels) - for ch in channels - for tiler_channels in self.tiler.channels - ] - return indices - else: - return channels - - def get_outlines_tiles_dict(self, tile_id, trange, channels): - """Get outlines and dict of tiles with channel indices as keys.""" - outlines = None - tile_dict = {} - for ch in self.find_channel_indices(channels): - outlines, tile_dict[ch] = self.get_outlines_tiles( - tile_id, trange, channels=[ch] - ) - return outlines, tile_dict - - def get_outlines_tiles( - self, - tile_id: int, - tps: t.Union[range, t.Collection[int]], - channels=None, - concatenate=True, - **kwargs, - ) -> t.Tuple[np.array]: - """ - Get masks uniquely labelled for each cell with the corresponding tiles. - - Returns a list of masks, each an array with distinct masks for each cell, - and an array of tiles for the given channel. - """ - tile_dict = self.get_tiles(tps, channels=channels, **kwargs) - # get tiles of interest - tiles = [x[tile_id] for x in tile_dict.values()] - # get outlines for each time point - outlines = [ - self.cells.at_time(tp, kind="edgemask").get(tile_id, []) for tp in tps - ] - # get cell labels for each time point - cell_labels = [self.cells.labels_at_time(tp).get(tile_id, []) for tp in tps] - # generate one image with all cell outlines uniquely labelled per tile - labelled_outlines = [ - np.stack( - [outline * label for outline, label in zip(outlines_tp, labels_tp)] - ).max(axis=0) - if len(labels_tp) - else np.zeros_like(tiles[0]).astype(bool) - for outlines_tp, labels_tp in zip(outlines, cell_labels) - ] - if concatenate: - # concatenate to allow potential image processing - labelled_outlines = np.concatenate(labelled_outlines, axis=1) - tiles = np.concatenate(tiles, axis=1) - return labelled_outlines, tiles - - def get_tiles( - self, - tps: t.Union[int, t.Collection[int]], - channels: None, - z: int = None, - ): - """Get dict with time points as keys and all available tiles as values.""" - if tps and not isinstance(tps, t.Collection): - tps = range(tps) - if z is None: - z = 0 - z = z or self.tiler.ref_z - channel_indices = self.find_channel_indices(channels) - ch_tps = [(channel_indices[0], tp) for tp in tps] - for ch, tp in ch_tps: - if (ch, tp) not in self.full: - self.full[(ch, tp)] = self.tiler.get_tiles_timepoint( - tp, channels=[ch], z=[z] - )[:, 0, 0, z, ...] - tile_dict = {tp: self.full[(ch, tp)] for ch, tp in ch_tps} - return tile_dict - - def plot_labelled_trap( - self, - tile_id, - channels, - trange: t.Union[range, t.Collection[int]], - remove_axis: bool = False, - skip_outlines: bool = False, - norm=True, - ncols: int = None, - local_colours: bool = True, - img_plot_kwargs: dict = {}, - lbl_plot_kwargs: dict = {"alpha": 0.8}, - **kwargs, - ): - """ - Plot time-lapses of individual tiles. - - Use Cells and Tiler to generate images of cells with their resulting - outlines. - - Parameters - ---------- - tile_id : int - Identifier of trap - channel : Union[str, int] - Channels to use - trange : t.Union[range, t.Collection[int]] - Range or collection indicating the time-points to use. - remove_axis : bool - None, "off", or "x". Determines whether to remove the x-axis, both - axes or none. - skip_outlines : bool - Do not add overlay with outlines - norm : str - Normalise signals - ncols : int - Number of columns to plot. 
- local_colours : bool - Bypass label indicators to guarantee that colours are not repeated - (TODO implement) - img_plot_kwargs : dict - Arguments to pass to plt.imshow used for images. - lbl_plot_kwargs : dict - Keyword arguments to pass to label plots. - """ - # set up for plotting - if ncols is None: - ncols = len(trange) - nrows = int(np.ceil(len(trange) / ncols)) - width = self.tiler.tile_size * ncols - outlines, tiles_dict = self.get_outlines_tiles_dict(tile_id, trange, channels) - channel_labels = [ - self.tiler.channels[ch] if isinstance(ch, int) else ch for ch in channels - ] - # dilate to make outlines easier to see - outlines = dilation(outlines).astype(float) - outlines[outlines == 0] = np.nan - # split concatenated tiles into one tile per time point in a row - tiles = [ - into_image_time_series(tile, width, nrows) for tile in tiles_dict.values() - ] - # TODO convert to RGB to draw fluorescence with colour - res = {} - # concatenate different channels vertically for display - res["tiles"] = np.concatenate(tiles, axis=0) - res["cell_labels"] = np.concatenate( - [into_image_time_series(outlines, width, nrows) for _ in tiles], axis=0 - ) - custom_imshow(res["tiles"], **img_plot_kwargs) - custom_imshow( - res["cell_labels"], cmap=default_colours["cell_label"], **lbl_plot_kwargs - ) - if remove_axis is True: - plt.axis("off") - elif remove_axis == "x": - plt.tick_params( - axis="x", which="both", bottom=False, top=False, labelbottom=False - ) - if remove_axis != "True": - plt.yticks( - ticks=[ - (i * self.tiler.tile_size * nrows) - + self.tiler.tile_size * nrows / 2 - for i in range(len(channels)) - ], - labels=channel_labels, - ) - if not remove_axis: - xlabels = ( - ["+ {} ".format(i) for i in range(ncols)] if nrows > 1 else list(trange) - ) - plt.xlabel("Time-point") - plt.xticks( - ticks=[self.tiler.tile_size * (i + 0.5) for i in range(ncols)], - labels=xlabels, - ) - if not np.any(outlines): - print("ImageViewer:Warning: No cell outlines found.") - plt.tight_layout() - plt.show(block=False) - - -class LocalImageViewer(BaseImageViewer): - """ - View images from local files. - - File are either zarr or organised in directories. 
- """ - - def __init__(self, h5file: str, image_direc: str): - """Initialise using a h5file and a local directory of images.""" - h5file_path = Path(h5file) - image_direc_path = Path(image_direc) - super().__init__(h5file_path) - with dispatch_image(image_direc_path)(image_direc_path) as image: - self.tiler = Tiler.from_h5(image, h5file_path) - self.cells = Cells.from_source(h5file_path) - - -class RemoteImageViewer(BaseImageViewer): - """Fetching remote images with tiling and outline display.""" - - credentials = ("host", "username", "password") - - def __init__(self, h5file: str, server_info: t.Dict[str, str]): - """Initialise using a h5file and importing aliby.io.omero.""" - from aliby.io.omero import UnsafeImage as OImage - - h5file_path = Path(h5file) - super().__init__(h5file_path) - self._server_info = server_info or { - k: self.attrs["parameters"]["general"][k] for k in self.credentials - } - image = OImage(self.image_id, **self._server_info) - self.tiler = Tiler.from_h5(image, h5file_path) - self.cells = Cells.from_source(h5file_path) - - -def into_image_time_series(a: np.array, width, nrows): - """Split into sub-arrays and then concatenate into one.""" - return np.concatenate( - np.array_split( - np.pad( - a, - ((0, 0), (0, a.shape[1] % width)), - constant_values=np.nan, - ), - nrows, - axis=1, - ) - ) diff --git a/src/aliby/utils/plot.py b/src/aliby/utils/plot.py deleted file mode 100644 index 8a34c3b7..00000000 --- a/src/aliby/utils/plot.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env jupyter -""" -Basic plotting functions for cell visualisation -""" - -import typing as t - -import numpy as np -from grid_strategy import strategies -from matplotlib import pyplot as plt - - -def plot_overlay( - bg: np.ndarray, fg: np.ndarray, alpha: float = 1.0, ax=plt -) -> None: - """ - Plot two images, one on top of the other. - """ - - ax1 = ax.imshow( - bg, cmap=plt.cm.gray, interpolation="none", interpolation_stage="rgba" - ) - ax2 = ax.imshow( - stretch(fg), - alpha=alpha, - interpolation="none", - interpolation_stage="rgba", - ) - plt.axis("off") - return ax1, ax2 - - -def plot_overlay_in_square(data: t.Tuple[np.ndarray, np.ndarray]): - """ - Plot images in an automatically-arranged grid. - """ - specs = strategies.SquareStrategy("center").get_grid(len(data)) - for i, (gs, (tile, mask)) in enumerate(zip(specs, data)): - ax = plt.subplot(gs) - plot_overlay(tile, mask, ax=ax) - - -def plot_in_square(data: t.Iterable): - """ - Plot images in an automatically-arranged grid. Only takes one mask - """ - specs = strategies.SquareStrategy("center").get_grid(len(data)) - for i, (gs, datum) in enumerate(zip(specs, data)): - ax = plt.subplot(gs) - ax.imshow(datum) - - -def stretch_clip(image, clip=True): - """ - Perform contrast stretching on an input image. - - This function takes an array-like input image and enhances its contrast by adjusting - the dynamic range of pixel values. It first scales the pixel values between 0 and 255, - then clips the values that are below the 2nd percentile or above the 98th percentile. - Finally, the pixel values are scaled to the range between 0 and 1. - - Parameters - ---------- - image : array-like - Input image. - - Returns - ------- - stretched : ndarray - Contrast-stretched version of the input image. - - Examples - -------- - FIXME: Add docs. 
- """ - from copy import copy - - image = image[~np.isnan(image)] - image = ((image - image.min()) / (image.max() - image.min())) * 255 - if clip: - minval = np.percentile(image, 2) - maxval = np.percentile(image, 98) - image = np.clip(image, minval, maxval) - image = (image - minval) / (maxval - minval) - return image - - -def stretch(image): - - nona = image[~np.isnan(image)] - - return (image - nona.min()) / (nona.max() - nona.min()) diff --git a/src/aliby/utils/vis_tools.py b/src/aliby/utils/vis_tools.py deleted file mode 100644 index 2215f212..00000000 --- a/src/aliby/utils/vis_tools.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env jupyter -""" -Visualisation tools useful to generate figures cell pictures and figures from scripts. - -These do not depend on matplotlib to work, they focus on array processing. -To check plot-related functions look at plots.py in this folder. -""" -import typing as t -from copy import copy - -import numpy as np - -from agora.io.cells import Cells -from aliby.io.image import instantiate_image -from aliby.tile.tiler import Tiler, TilerParameters - - -def fetch_tc( - image_path: str, results_path: str, t: int = 0, c: int = 0 -) -> np.ndarray: - """ - Return 3D ndarray with (Z,Y,X) for a given pair of time point and channel. - """ - with instantiate_image(image_path) as img: - tiler = Tiler.from_h5(img, results_path, TilerParameters.default()) - tc = tiler.get_tp_data(t, c) - return tc - - -def get_tiles_at_times( - image_path: str, - results_path: str, - timepoints: t.List[int] = [0], - tile_reduction: t.Union[ - int, t.List[int], str, t.Callable - ] = lambda x: concatenate_dims(x, 1, -1), - channel: int = 1, -) -> np.ndarray: - """Use Image and tiler to get tiled position for specific time points. - - Parameters - ---------- - image_path : str - hdf5 index - timepoints : t.List[int] - list of timepoints to fetch - tile_reduction : t.Union[int, t.List[int], str, t.Callable] - Reduce dimensionality. Generally used to collapse z-stacks into one - - Examples - -------- - FIXME: Add docs. - - - """ - - # Get the correct tile in space and time - with instantiate_image(image_path) as image: - tiler = Tiler.from_h5(image, results_path, TilerParameters.default()) - tp_channel_stack = [ - _dispatch_tile_reduction(tile_reduction)( - tiler.get_tp_data(tp, channel) - ) - for tp in timepoints - ] - return tp_channel_stack - - -def get_cellmasks_at_times( - results_path: str, timepoints: t.List[int] = [0] -) -> t.List[t.List[np.ndarray]]: - return Cells(results_path).at_times(timepoints) - - -def concatenate_dims(ndarray, axis1: int, axis2: int) -> np.ndarray: - axis2 = len(ndarray.shape) + axis2 if axis2 < 0 else axis2 - return np.concatenate(np.moveaxis(ndarray, axis1, 0), axis=axis2 - 1) - - -def get_tile_mask_pairs( - image_path: str, - results_path: str, - timepoints: t.List[int] = [0], - tile_reduction=lambda x: concatenate_dims(x, 1, -1), -) -> t.Tuple[np.ndarray, t.List[t.List[np.ndarray]]]: - - return ( - get_tiles_at_times( - image_path, results_path, timepoints, tile_reduction - ), - get_cellmasks_at_times(results_path, timepoints), - ) - - -def _dispatch_tile_reduction(how: t.Union[int, str, t.List[int]], axis=1): - """ - Return an appropriate dimensional reduction based on the input on a specified axis. - If "how" is a string, it operates in dimension 1 (to match tile dimension standard Tile, Z, Y, X) - how: int, str or list of int - if int or list of int those numbers are indexed; - if str it assumes it is a numpy function such as np max. 
- if it is a callable it applies that operation to the array. - if None it returns the result as-is - axis: Only used when "how" is string. Determines the dimension to which the - standard operation is applied. - """ - # FUTURE use match case when migrating to python 3.10 - - if how is None: - return lambda x: x - elif isinstance(how, (int, list)): - return lambda x: x.take(how, axis=axis) - elif isinstance(how, str): - return lambda x: getattr(x, how)(axis=axis) - elif isinstance(how, t.Callable): - return lambda x: how(x) - else: - raise Exception(f"Invalid reduction {how}") - - -def tile_like(arr1: np.ndarray, arr2: np.ndarray): - """ - Tile the first two dimensions of arr1 (ND) to match arr2 (2D) - """ - - result = arr1 - ratio = np.divide(arr2.shape, arr1.shape[-2:]).astype(int) - if reps := max(ratio - 1): - tile_ = ( - lambda x, n: np.tile(x, n) - if ratio.argmax() - else np.tile(x.T, n + 1).T - ) - result = np.stack([tile_(mask, reps + 1) for mask in arr1]) - return result - - -def centre_mask(image: np.ndarray, mask: np.ndarray): - """Roll image to the centre of the image based on a mask of equal size""" - - cell_centroid = ( - np.max(np.where(mask), axis=1) + np.min(np.where(mask), axis=1) - ) // 2 - tile_centre = np.array(image.shape) // 2 - return np.roll(image, (tile_centre - cell_centroid), (0, 1)) - - -def long_side_vertical(arr): - result = arr - if np.subtract(*arr.shape): - result = arr.T - return result - - -def rescale(arr): - arr_min = arr.min() - arr_max = arr.max() - return (arr - arr_min) / (arr_max - arr_min) - - -def crop_mask(img: np.ndarray, mask: np.ndarray): - img = copy(img).astype(float) - img[~mask] = np.nan - return img - - -def _sample_n_tiles_masks( - image_path: str, - results_path: str, - n: int, - seed: int = 0, - interval=None, -) -> t.Tuple[t.Tuple, t.Tuple[np.ndarray, np.ndarray]]: - - cells = Cells(results_path) - indices, masks = cells._sample_masks(n, seed=seed, interval=interval) - - # zipped_indices = zip(*indices) - for index, (background, cropped_fg) in zip( - zip(*indices), - _gen_overlay_masks_tiles( - image_path, - results_path, - masks, - [indices[i] for i in (0, 2)], - ), - ): - yield index, (background, cropped_fg) - - -def _overlay_mask_tile( - image_path: str, - results_path: str, - mask: np.ndarray, - index: t.Tuple[int, int, int], - bg_channel: int = 0, - fg_channel: int = 1, - reduce_z: t.Union[None, t.Callable] = np.max, -) -> t.Tuple[np.ndarray, np.ndarray]: - """ - Return a tuplw with two channels - """ - - tc = np.stack( - [ - fetch_tc(image_path, results_path, index[1], i) - for i in (bg_channel, fg_channel) - ] - ) # Returns C(tile)ZYX - - tiles = tc[:, index[0]].astype(float) - - reduced_z = ( - reduce_z(tiles, axis=1) if reduce_z else concatenate_dims(tiles, 1, -2) - ) - - repeated_mask = tile_like(mask, reduced_z[0]) - - cropped_fg = crop_mask(reduced_z[1], repeated_mask) - - return reduced_z[0], cropped_fg - - -def _gen_overlay_masks_tiles( - image_path: str, - results_path: str, - masks: np.ndarray, - indices: t.Tuple[t.Tuple[int], t.Tuple[int], t.Tuple[int]], - bg_channel: int = 0, - fg_channel: int = 1, - reduce_z: t.Union[None, t.Callable] = np.max, -) -> t.Generator[t.Tuple[np.ndarray, np.ndarray], None, None]: - """ - Generator of mask tiles - """ - - for mask, index in zip(masks, zip(*indices)): - yield _overlay_mask_tile( - image_path, - results_path, - mask, - index, - bg_channel, - fg_channel, - reduce_z, - ) diff --git a/src/postprocessor/core/functions/__init__.py 
b/src/postprocessor/core/functions/__init__.py deleted file mode 100644 index a23e56a7..00000000 --- a/src/postprocessor/core/functions/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python3 - -""" -Collection of non-process functions. These are usually called by processes but not fetched automatically.. -""" diff --git a/src/postprocessor/core/multisignal/__init__.py b/src/postprocessor/core/multisignal/__init__.py deleted file mode 100644 index a2aa9b57..00000000 --- a/src/postprocessor/core/multisignal/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Collection of processes that work on multiple signals. -""" diff --git a/src/postprocessor/core/multisignal/align.py b/src/postprocessor/core/multisignal/align.py deleted file mode 100644 index 99f44819..00000000 --- a/src/postprocessor/core/multisignal/align.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 - -import bottleneck as bn -import numpy as np -import pandas as pd - -from agora.abc import ParametersABC -from postprocessor.core.abc import PostProcessABC - - -def df_extend_nan(df, width): - """Extend a DataFrame to the left by a number of columns and fill with NaNs - - Assumes column names are sequential integers from 0 - """ - num_rows, _ = df.shape - nan_df = pd.DataFrame( - np.full([num_rows, width], np.nan), - index=df.index, - ) - out_df = pd.concat([nan_df, df], axis=1) - _, out_num_cols = out_df.shape - out_df.columns = list(range(out_num_cols)) - return out_df - - -def df_shift(df, shift_list): - """Shifts each row of each DataFrame by a list of shift intervals - - Assumes all DataFrames have the same indices (and therefore the same number of rows) - """ - # Convert to numpy to increase performance - array = df.to_numpy() - - # Sort by shift interval to increase performance - argsort_shift_list = np.argsort(shift_list) - array_sorted = array[argsort_shift_list] - - # List of matrices, one for each unique shift interval - matrix_list = [] - shift_list_unique = np.unique(shift_list) - for shift_value in shift_list_unique: - # Select the rows of 'array_sorted' that correspond to shift_value - shift_value_matrix = array_sorted[ - np.array(shift_list)[argsort_shift_list] == shift_value, : - ] - if shift_value != 0: - shift_value_matrix = np.roll(shift_value_matrix, -shift_value) - shift_value_matrix[:, -shift_value:] = np.nan - matrix_list.append(shift_value_matrix) - - # Reassemble based on argsort - matrix_list_concat = np.concatenate(matrix_list) - array_shifted = matrix_list_concat[np.argsort(argsort_shift_list)] - return pd.DataFrame(array_shifted, index=df.index, columns=df.columns) - - -class alignParameters(ParametersABC): - """ - Parameters for the 'align' process. - - Attributes - ---------- - slice_before_first_event: bool - Whether to discard the parts of signals that occur before the first - event being aligned. For example, whether to discard flavin - fluorescence before the first birth event, after aligning by the first - birth event. - events_at_least: int - Specifies the number of events required for each cell. For example, if - events_at_least is 2, then it will discard time series (from the DataFrame) - that have less than 2 events. As a more pratical example: discarding - flavin time series that derive from cells with less than 2 buddings - identified. - """ - - _defaults = { - "slice_before_first_event": True, - "events_at_least": 1, - } - - -class align(PostProcessABC): - """ - Process to align a signal by corresponding events. 
- - For example, aligning flavin fluorescence time series by the first birth - event of the cell each time series is derived from. - - Methods - ------- - run(trace_df: pd.DataFrame, mask_df: pd.DataFrame) - Align signals by events. - """ - - def __init__(self, parameters: alignParameters): - super().__init__(parameters) - - # Not sure if having two DataFrame inputs fits the paradigm, but having the - # mask_df be a parameter is a bit odd as it doesn't set the behaviour of the - # process. - def run(self, trace_df: pd.DataFrame, mask_df: pd.DataFrame): - """Align signals by events. - - Parameters - ---------- - trace_df : pd.DataFrame - Signal time series, with rows indicating individual time series - (e.g. from each cell), and columns indicating time points. - mask_df : pd.DataFrame - Event time series/mask, with rows indicating individual cells and - columns indicating time points. The values of each element are - either 0 or 1 -- 0 indicating the absence of the event, and 1 - indicating the presence of the event. Effectively, this DataFrame is - like a mask. For example, this DataFrame can indicate when birth - events are identified for each cell in a dataset. - """ - # Converts mask_df to float if it hasn't been already - # This is so that df_shift() can add np.nans - mask_df += 0.0 - - # Remove cells that have less than or equal to events_at_least events, - # i.e. if events_at_least = 1, then cells that have no birth events are - # deleted. - event_mask = ( - bn.nansum(mask_df.to_numpy(), axis=1) >= self.events_at_least - ) - mask_df = mask_df.iloc[event_mask.tolist()] - - # Match trace and event signals by index, e.g. cellID - # and discard the cells they don't have in common - common_index = trace_df.index.intersection(mask_df.index) - trace_aligned = trace_df.loc[common_index] - mask_aligned = mask_df.loc[common_index] - - # Identify first event and define shift - shift_list = [] - for index in common_index: - event_locs = np.where(mask_df.loc[index].to_numpy() == 1)[0] - if event_locs.any(): - shift = event_locs[0] - else: - shift = 0 - shift_list.append(shift) - shift_list = np.array(shift_list) - - # Shifting - - # Remove bits of traces before first event - if self.slice_before_first_event: - # minus sign in front of shift_list to shift to the left - mask_aligned = df_shift(mask_aligned, shift_list) - trace_aligned = df_shift(trace_aligned, shift_list) - # Do not remove bits of traces before first event - else: - # Add columns to left, filled with NaNs - max_shift = bn.nanmax(shift_list) - mask_aligned = df_extend_nan(mask_aligned, max_shift) - trace_aligned = df_extend_nan(trace_aligned, max_shift) - # shift each - mask_aligned = df_shift(mask_aligned, shift_list) - trace_aligned = df_shift(trace_aligned, shift_list) - - return trace_aligned, mask_aligned diff --git a/src/postprocessor/core/multisignal/crosscorr.py b/src/postprocessor/core/multisignal/crosscorr.py deleted file mode 100644 index a4069829..00000000 --- a/src/postprocessor/core/multisignal/crosscorr.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python3 - -import numpy as np -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class crosscorrParameters(ParametersABC): - """ - Parameters for the 'crosscorr' process. - - Attributes - ---------- - stationary: boolean - If True, the underlying dynamic process is assumed to be - stationary with the mean a constant, estimated from all - data points. 
- normalised: boolean (optional) - If True, normalise the result for each replicate by the standard - deviation over time for that replicate. - only_pos: boolean (optional) - If True, return results only for positive lags. - """ - - _defaults = {"stationary": False, "normalised": True, "only_pos": False} - - -class crosscorr(PostProcessABC): - """ """ - - def __init__(self, parameters: crosscorrParameters): - super().__init__(parameters) - - def run(self, trace_dfA: pd.DataFrame, trace_dfB: pd.DataFrame = None): - """Calculates normalised cross-correlations as a function of lag. - - Calculates normalised auto- or cross-correlations as a function of lag. - Lag is given in multiples of the unknown time interval between data points. - - Normalisation is by the product of the standard deviation over time for - each replicate for each variable. - - For the cross-correlation between sA and sB, the closest peak to zero lag - should in the positive lags if sA is delayed compared to signal B and in the - negative lags if sA is advanced compared to signal B. - - Parameters - ---------- - trace_dfA: dataframe - An array of signal values, with each row a replicate measurement - and each column a time point. - trace_dfB: dataframe (required for cross-correlation only) - An array of signal values, with each row a replicate measurement - and each column a time point. - stationary: boolean - If True, the underlying dynamic process is assumed to be - stationary with the mean a constant, estimated from all - data points. - normalised: boolean (optional) - If True, normalise the result for each replicate by the standard - deviation over time for that replicate. - only_pos: boolean (optional) - If True, return results only for positive lags. - - Returns - ------- - corr: dataframe - An array of the correlations with each row the result for the - corresponding replicate and each column a time point - lags: array - A 1D array of the lags in multiples of the unknown time interval - - Example - ------- - >>> import matplotlib.pyplot as plt - >>> import numpy as np - >>> import pandas as pd - >>> from postprocessor.core.multisignal.crosscorr import crosscorr - - Define a sine signal with 200 time points and 333 replicates - - >>> t = np.linspace(0, 4, 200) - >>> ts = np.tile(t, 333).reshape((333, 200)) - >>> s = 3*np.sin(2*np.pi*ts + 2*np.pi*np.random.rand(333, 1)) - >>> s_df = pd.DataFrame(s) - - Find and plot the autocorrelaton - - >>> ac = crosscorr.as_function(s_df) - >>> plt.figure() - >>> plt.plot(ac.columns, ac.mean(axis=0, skipna=True)) - >>> plt.show() - - Reference - --------- - Dunlop MJ, Cox RS, Levine JH, Murray RM, Elowitz MB (2008). Regulatory - activity revealed by dynamic correlations in gene expression noise. - Nat Genet, 40, 1493-1498. 
- """ - - df = ( - trace_dfA.copy() - if type(trace_dfA) == pd.core.frame.DataFrame - else None - ) - # convert from aliby dataframe to arrays - trace_A = trace_dfA.to_numpy() - # number of replicates - n_replicates = trace_A.shape[0] - # number of time points - n_tps = trace_A.shape[1] - # autocorrelation if 2nd dataframe is not supplied - if trace_dfB is None: - trace_dfB = trace_dfA - trace_B = trace_A - else: - trace_B = trace_dfB.to_numpy() - # find deviation from the mean - dmean_A, stdA = _dev(trace_A, n_replicates, n_tps, self.stationary) - dmean_B, stdB = _dev(trace_B, n_replicates, n_tps, self.stationary) - # lag r runs over positive lags - pos_corr = np.nan * np.ones(trace_A.shape) - for r in range(n_tps): - prods = [ - dmean_A[:, lagtime] * dmean_B[:, lagtime + r] - for lagtime in range(n_tps - r) - ] - pos_corr[:, r] = np.nanmean(prods, axis=0) - # lag r runs over negative lags - # use corr_AB(-k) = corr_BA(k) - neg_corr = np.nan * np.ones(trace_A.shape) - for r in range(n_tps): - prods = [ - dmean_B[:, lagtime] * dmean_A[:, lagtime + r] - for lagtime in range(n_tps - r) - ] - neg_corr[:, r] = np.nanmean(prods, axis=0) - if self.normalised: - # normalise by standard deviation - pos_corr = pos_corr / stdA / stdB - neg_corr = neg_corr / stdA / stdB - # combine lags - lags = np.arange(-n_tps + 1, n_tps) - corr = np.hstack((np.flip(neg_corr[:, 1:], axis=1), pos_corr)) - if self.only_pos: - return pd.DataFrame( - corr[:, int(lags.size / 2) :], - index=df.index, - columns=lags[int(lags.size / 2) :], - ) - else: - return pd.DataFrame(corr, index=df.index, columns=lags) - - -def _dev(y, nr, nt, stationary=False): - # calculate deviation from the mean - if stationary: - # mean calculated over time and over replicates - dy = y - np.nanmean(y) - else: - # mean calculated over replicates at each time point - dy = y - np.nanmean(y, axis=0).reshape((1, nt)) - # standard deviation calculated for each replicate - stdy = np.sqrt(np.nanmean(dy**2, axis=1).reshape((nr, 1))) - return dy, stdy diff --git a/src/postprocessor/core/multisignal/mi.py b/src/postprocessor/core/multisignal/mi.py deleted file mode 100644 index 0157bea1..00000000 --- a/src/postprocessor/core/multisignal/mi.py +++ /dev/null @@ -1,227 +0,0 @@ -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from sklearn import svm -from sklearn.decomposition import PCA -from sklearn.metrics import confusion_matrix -from sklearn.model_selection import GridSearchCV, train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler - -from postprocessor.core.abc import PostProcessABC - - -class miParameters(ParametersABC): - """Parameters for the 'mi' process - - Parameters for the 'mi' process. - - Attributes - ---------- - overtime: boolean (default: True) - - If True, calculate the mutual information as a function of the duration of - the time series, by finding the mutuation information for all possible - sub-time series that start from t= 0. - - - n_bootstraps: int, optional (default: 100) - - The number of bootstraps used to estimate errors. - - - ci: 1x2 array or list, optional (default: [0.25, 0.75]) - - The lower and upper confidence intervals. - - E.g. [0.25, 0.75] for the interquartile range - - - Crange: array, optional - - An array of potential values for the C parameter of the support vector machine - and from which the optimal value of C will be chosen. - - If None, np.logspace(-3, 3, 10) is used. 
This range should be increased if - the optimal C is one of the boundary values. - - See https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html - - - gammarange: array, optional - - An array of potential values for the gamma parameter for the radial basis - function kernel of the support vector machine and from which the optimal - value of gamma will be chosen. - - If None, np.logspace(-3, 3, 10) is used. This range should be increased if - the optimal gamma is one of the boundary values. - - See https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html - - - train_test_split_seeding: boolean, optional (default: False) - - If True, force a random state for the train-test split in each bootstrap. This is - useful in case the user requires reproducibility e.g. code testing. - - See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html - """ - - _defaults = { - "overtime": True, - "n_bootstraps": 100, - "ci": [0.25, 0.75], - "Crange": None, - "gammarange": None, - "train_test_split_seeding": False, - } - - -class mi(PostProcessABC): - """ - Process to estimate power spectral density (autoregressive model). - - Methods - ------- - run(signal: pd.DataFrame) - Estimates the mutual information between classes of time series. - """ - - def __init__( - self, - parameters: miParameters, - ): - super().__init__(parameters) - - def run(self, signals): - """ - Estimates the mutual information between classes of time series. - - Uses sklean to optimise a pipeline for classifying the individual time series, - choosing the number of PCA components (3-7), the classifier - a support vector - machine with either a linear or a radial basis function kernel - and its C and - gamma parameters. - - Errors are found using bootstrapped datasets. - - Parameters: signals: list of pandas.DataFrames - - A list of DataFrames. Each DataFrame stores a set of time series, with rows - indicating individual time series (e.g. from each cell), and columns - indicating time points. - - Returns: res: array - - Summary statistics from the bootstrapped datasets -- the median mutual - information and the 10% and 90% confidence limits. - - If overtime is True, each row corresponds to a different duration of the time - series with the shortest duration, just the first time point, in the first row - and the longest duration, the entire time series, in the last row. 
- """ - - # default values - if not np.any(self.Crange): - self.Crange = np.logspace(-3, 3, 10) - if not np.any(self.gammarange): - self.gammarange = np.logspace(-3, 3, 10) - - # data is a list with one array of time series for different class - data = [signal.to_numpy() for signal in signals] - n_classes = len(data) - Xo = np.vstack([timeseries for timeseries in data]) - y = np.hstack( - [ - i * np.ones(timeseries.shape[0]) - for i, timeseries in enumerate(data) - ] - ) - - if self.overtime: - # loop over time series iteratively - durations = np.arange(1, Xo.shape[1] + 1) - else: - # full time series only - durations = [Xo.shape[1]] - # array for results - res = np.zeros((len(durations), 3)) - - for j, duration in enumerate(durations): - # slice of of time series - # if verbose: - # print("duration of time series is", duration) - X = Xo[:, :duration] - - # initialise sself.cikit-learn routines - nPCArange = ( - range(1, X.shape[1] + 1) if X.shape[1] < 7 else [3, 4, 5, 6, 7] - ) - params = [ - {"project__n_components": nPCArange}, - { - "classifier__kernel": ["linear"], - "classifier__C": self.Crange, - }, - { - "classifier__kernel": ["rbf"], - "classifier__C": self.Crange, - "classifier__gamma": self.gammarange, - }, - ] - pipe = Pipeline( - [ - ("project", PCA()), - ("rescale", StandardScaler()), - ("classifier", svm.SVC()), - ] - ) - - # find best params for pipeline - grid_pipeline = GridSearchCV(pipe, params, n_jobs=-1, cv=5) - grid_pipeline.fit(X, y) - # if verbose: - # print(grid_pipeline.best_estimator_) - pipe.set_params(**grid_pipeline.best_params_) - - # find mutual information for each bootstrapped dataset - mi = np.empty(self.n_bootstraps) - for i in range(self.n_bootstraps): - # force random state, useful for code testing/reproducibility - if self.train_test_split_seeding: - random_state = i - else: - random_state = None - X_train, X_test, y_train, y_test = train_test_split( - X, - y, - test_size=0.25, - random_state=random_state, - ) - # run classifier use optimal params - pipe.fit(X_train, y_train) - y_predict = pipe.predict(X_test) - # estimate mutual information - p_y = 1 / n_classes - p_yhat_given_y = confusion_matrix( - y_test, y_predict, normalize="true" - ) - p_yhat = np.sum(p_y * p_yhat_given_y, 0) - h_yhat = -np.sum( - p_yhat[p_yhat > 0] * np.log2(p_yhat[p_yhat > 0]) - ) - log2_p_yhat_given_y = np.ma.log2(p_yhat_given_y).filled(0) - h_yhat_given_y = -np.sum( - p_y * np.sum(p_yhat_given_y * log2_p_yhat_given_y, 1) - ) - mi[i] = h_yhat - h_yhat_given_y - - # summary statistics - median and confidence intervals - res[j, :] = [ - np.median(mi), - np.sort(mi)[int(np.min(self.ci) * self.n_bootstraps)], - np.sort(mi)[int(np.max(self.ci) * self.n_bootstraps)], - ] - # if verbose: - # print(f"median MI= {res[j,0]:.2f} [{res[j,1]:.2f}, {res[j,2]:.2f}]") - return res diff --git a/src/postprocessor/core/processes/__init__.py b/src/postprocessor/core/processes/__init__.py deleted file mode 100644 index bb39f5bd..00000000 --- a/src/postprocessor/core/processes/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python3 - -""" -All Processes in this folder must return the same shape they take as an input. 
-""" diff --git a/src/postprocessor/core/processes/autoreg.py b/src/postprocessor/core/processes/autoreg.py deleted file mode 100644 index 3e40a4f1..00000000 --- a/src/postprocessor/core/processes/autoreg.py +++ /dev/null @@ -1,327 +0,0 @@ -from collections import namedtuple - -import numpy as np -import pandas as pd -import scipy.linalg as linalg -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - -# TODO: Provide the option of whether to optimise AR order -- see issue #1 -# TODO: Provide the functionality of 'smoothing' a time series with AR -- see -# issue #1 - - -class autoregParameters(ParametersABC): - """Parameters for the 'autoreg' process - - Parameters for the 'autoreg' process. - - Attributes - ---------- - sampling_period : float - Sampling period of measurement values, in unit time. - - freq_npoints : int - Number of points for the frequency axis of the closed-form solution of - the estimated periodogram. Defines the resolution for use in, for - example, plots. - - # Sampling period should be listed in the HDF5 metadata (i.e. the - # 'time_settings/timeinterval' attribute, unit seconds). When using - # this processor in a typical routine, use the sampling period from - # there rather than relying on this default value of 5 minutes. - """ - - _defaults = { - "sampling_period": 5, - "freq_npoints": 100, - } - - -class autoreg(PostProcessABC): - """ - Process to estimate power spectral density (autoregressive model). - - Methods - ------- - fit_autoreg(timeseries: array_like, ar_order: int) - Computes linear algebra parameters used to fit an autoregressive model - to a time series - optimise_ar_order(timeseries: array_like, ar_order_upper_limit: int) - Optimise autoregressive model order to fit a time series - autoreg_periodogram(timeseries: array_like, sampling_period: float, - freq_npoints: int, ar_order: int) - Estimates the closed-form solution of the sample power spectrum of a - timeseries, based on fitting an autoregressive model. - run(signal: pd.DataFrame) - Estimates the power spectral densities of a dataframe of time series, - using autoregressive model - """ - - def __init__( - self, - parameters: autoregParameters, - ): - super().__init__(parameters) - - # Should this be a static method? - def fit_autoreg(self, timeseries, ar_order) -> dict: - """Computes linear algebra parameters to fit an AR model to a timeseries - - Computes linear algebra parameters used to fit an autoregressive model - of a specified order to a time series. Ultimately computes the Akaike - information criterion of the model with the specified order. - - This model-fitting implementation is based on the supplementary - information of: - * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] - - Parameters - ---------- - timeseries : array_like - Time series of measurement values. - ar_order : int - Order of the autoregressive model. The order defines the complexity - of the model; if the order is P, then each time point is modelled as - a linear combination of P time points preceding it. - - Returns - ------- - This function returns a dictionary, whose values are defined by these - keys: - - sample_acfs : 1d array of floats - Sample autocorrelation function; element indices go from 0 to - ar_order. - - ar_coeffs : 1d array of floats - Coefficients for the autoregressive model; defines the linear - combination that defines the model. - - noise_param : float - Noise parameter. 
- - aic : float - Akaike information criterion of the autoregressive model. - - """ - # Estimates sample autocorrelation function (R). - # sample_acfs: 1D array of R values - sample_acfs = np.zeros(ar_order + 1) - for ii in range(ar_order + 1): - sample_acfs[ii] = (1 / len(timeseries)) * np.sum( - [ - (timeseries[k] - np.mean(timeseries)) - * (timeseries[k + ii] - np.mean(timeseries)) - for k in range(len(timeseries) - ii) - ] - ) - - # Estimates AR coefficients (phi) by solving Yule-Walker equation. - # ar_coeffs: 1D array of coefficients (i.e. phi values) - sample_acfs_toeplitz = linalg.toeplitz(sample_acfs[0:ar_order]) - # phi vector goes from 1 to P in Jia & Grima (2020) - ar_coeffs = linalg.inv(sample_acfs_toeplitz).dot( - sample_acfs[1 : ar_order + 1] - ) - # defines a dummy phi_0 as 1. This is so that the indices I use in - # get_noise_param are consistent with Jia & Grima (2020) - ar_coeffs = np.insert(ar_coeffs, 0, 1.0, axis=0) - - # Estimates noise parameter (noise_param) - noise_param = sample_acfs[0] - np.sum( - [ar_coeffs[k] * sample_acfs[k] for k in range(1, ar_order + 1)] - ) - - # Calculates AIC (aic) - aic = np.log(noise_param) + (ar_order) / len(timeseries) - - return { - "sample_acfs": sample_acfs, - "ar_coeffs": ar_coeffs, - "noise_param": noise_param, - "aic": aic, - } - - # Should this be a static method? - def optimise_ar_order( - self, - timeseries, - ar_order_upper_limit, - ) -> int: - """Optimise autoregressive model order to fit a time series - - Optimise the autoregressive model order to fit a time series. Model - selection relies on minimising the Akaike information criterion, - sweeping over a range of possible orders. - - This implementation is based on the supplementary - information of: - * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] - - Parameters - ---------- - timeseries : array_like - Time series of measurement values. - ar_order_upper_limit : int - Upper bound for autoregressive model order; recommended to be the - square root of the length of the time series. - - Returns - ------- - int - Optimal order for an autoregressive model that fits the time series. - - """ - # Bug: artificial dip at order 1 if time series is a smooth sinusoid. - # Will probably need to fix it so that it checks if the minimum also - # corresponds to a zero derivative. - ar_orders = np.arange(1, ar_order_upper_limit) - aics = np.zeros(len(ar_orders)) - for ii, ar_order in enumerate(ar_orders): - model = self.fit_autoreg(timeseries, ar_order) - aics[ii] = model["aic"] - return ar_orders[np.argmin(aics)] - - def autoreg_periodogram( - self, - timeseries, - sampling_period, - freq_npoints, - ar_order, - ): - """Estimates the power spectrum of a timeseries, based on an AR model - - Estimates the closed-form solution of the sample power spectrum of a - time series. This estimation is based on fitting an autoregressive model - of an optimised order to the time series, and using computed linear - algebra parameters to compute the closed-form solution. - - This implementation is based on the supplementary - information of: - * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] - - Parameters - --------- - timeseries : array_like - Time series of measurement values. - sampling_period : float - Sampling period of measurement values, in unit time. - freq_npoints : int - Number of points for the frequency axis of the closed-form solution of - the estimated periodogram. Defines the resolution for use in, for - example, plots. 
- ar_order : int - Order of the autoregressive model. The order defines the complexity - of the model; if the order is P, then each time point is modelled as - a linear combination of P time points preceding it. - - Returns - ------- - freqs: ndarray - Array of sample frequencies, unit time-1. - - power: ndarray - Power spectral density or power spectrum of the time series, - arbitrary units. - - Examples - -------- - FIXME: Add docs. - - """ - ar_model = self.fit_autoreg(timeseries, ar_order) - freqs = np.linspace(0, 1 / (2 * sampling_period), freq_npoints) - power = np.zeros(len(freqs)) - # Sweeps the frequencies; corresponds to the 'xi' variable in - # Jia & Grima (2020) - for ii, freq in enumerate(freqs): - # multiplied 2pi into the exponential to get the frequency rather - # than angular frequency - summation = [ - ar_model["ar_coeffs"][k] * np.exp(-1j * k * (2 * np.pi) * freq) - for k in range(ar_order + 1) - ] - summation[0] = 1 # minus sign error??? - divisor = np.sum(summation) - power[ii] = (ar_model["noise_param"] / (2 * np.pi)) / np.power( - np.abs(divisor), 2 - ) - # Normalise by first element of power axis. This is consistent with - # the MATLAB code from Ramon Grima. - power = power / power[0] - return freqs, power - - def run(self, signal: pd.DataFrame): - # TODO: Return an additional DataFrame that contains the orders, - # optimised or not - """Estimates the power spectra of a dataframe of time series, using AR - - Estimates the power spectral densities of a dataframe of time series. - This estimation is based on fitting an autoregressive model - of an optimised order to the time series, and using computed linear - algebra parameters to compute the closed-form solution. - - This implementation is based on the supplementary - information of: - * Jia & Grima (2020). BioRxiv [https://doi.org/10.1101/2020.09.23.309724] - - Parameters - ---------- - signal: pandas.DataFrame - Time series, with rows indicating individiual time series (e.g. from - each cell), and columns indicating time points. - - Returns - ------- - freqs_df: pandas.DataFrame - Sample frequencies from each time series, with labels preserved from - 'signal'. - - power_df: pandas.DataFrame - Power spectrum from each time series, with labels preserved from - 'signal'. - - order_df: pandas.DataFrame - Optimised order of the autoregressive model fitted to each time - series, with labels preserved from 'signal'. - - Examples - -------- - FIXME: Add docs. - - """ - AutoregAxes = namedtuple("AutoregAxes", ["freqs", "power", "order"]) - # Each element in this list is a named tuple: (freqs, power, order) - axes = [] - # Use enumerate instead? 
- for row_index in range(len(signal)): - timeseries = signal.iloc[row_index, :].dropna().values - optimal_ar_order = self.optimise_ar_order( - timeseries, int(3 * np.sqrt(len(timeseries))) - ) - axes.insert( - row_index, - AutoregAxes( - *self.autoreg_periodogram( - timeseries=timeseries, - sampling_period=self.sampling_period, - freq_npoints=self.freq_npoints, - ar_order=optimal_ar_order, - ), - optimal_ar_order, - ), - ) - - freqs_df = pd.DataFrame( - [element.freqs for element in axes], index=signal.index - ) - power_df = pd.DataFrame( - [element.power for element in axes], index=signal.index - ) - order_df = pd.DataFrame( - [element.order for element in axes], index=signal.index - ) - - return freqs_df, power_df, order_df diff --git a/src/postprocessor/core/processes/butter.py b/src/postprocessor/core/processes/butter.py deleted file mode 100644 index c66406b0..00000000 --- a/src/postprocessor/core/processes/butter.py +++ /dev/null @@ -1,93 +0,0 @@ -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from scipy import signal - -from postprocessor.core.abc import PostProcessABC - - -class butterParameters(ParametersABC): - """Parameters for the 'butter' process. - - Parameters for the 'butter' process. - - Attributes - ---------- - order : int - The order of the filter. - critical_freqs : array_like - The critical frequency or frequencies. For lowpass and highpass - filters, Wn is a scalar; for bandpass and bandstop filters, Wn is a - length-2 sequence. For a Butterworth filter, this is the point at which - the gain drops to 1/sqrt(2) that of the passband (the “-3 dB pointâ€). - For digital filters, if fs is not specified, Wn units are normalized - from 0 to 1, where 1 is the Nyquist frequency (Wn is thus in - half cycles / sample and defined as 2*critical frequencies / fs). - If fs is specified, Wn is in the same units as fs. For analog filters, - Wn is an angular frequency (e.g. rad/s). - filter_type : {‘lowpass’, ‘highpass’, ‘bandpass’, ‘bandstop’} - The type of filter. Default is ‘lowpass’. - sampling_freq : float - The sampling frequency of the digital system. - """ - - _defaults = { - "order": 2, - "critical_freqs": 1 / 350, - "filter_type": "highpass", - "sampling_freq": 1 / 5, - } - - -class butter(PostProcessABC): - """Process to apply Butterworth filter - based on scipy.signal.butter - - Methods - ------- - run(signal: pd.DataFrame) - Apply Butterworth filter constructed according to user parameters - to each time series in a DataFrame - """ - - def __init__(self, parameters: butterParameters): - super().__init__(parameters) - - def butterfilter(self, timeseries): - """Apply Butterworth filter to one time series""" - # second-order-sections output - # by default, using a digital filter - sos = signal.butter( - N=self.order, - Wn=self.critical_freqs, - btype=self.filter_type, - fs=self.sampling_freq, - output="sos", - ) - # subtract time series by mean - # otherwise the first couple time series will look like the acf, - # which is not what we want - # filter time series - timeseries_norm = timeseries - np.mean(timeseries) - return signal.sosfiltfilt(sos, timeseries_norm) - - def run(self, signal_df: pd.DataFrame): - """Apply Butterworth filter - - Parameters - ---------- - signal : pd.DataFrame - Time series, with rows indicating individual time series (e.g. from - each cell), and columns indicating time points. - - Returns - ------- - signal_filtered : pd.DataFrame - Filtered time series. 
- - """ - signal_filtered = signal_df.apply( - self.butterfilter, axis=1, result_type="expand" - ) - signal_filtered.columns = signal_df.columns - return signal_filtered diff --git a/src/postprocessor/core/processes/catch22.py b/src/postprocessor/core/processes/catch22.py deleted file mode 100644 index 679cecd4..00000000 --- a/src/postprocessor/core/processes/catch22.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from pycatch22 import catch22_all -from sklearn import decomposition - -from postprocessor.core.abc import PostProcessABC - - -class catch22Parameters(ParametersABC): - """ - Parameters - :min_len: Prefilter to account only for long-signal cells - """ - - _defaults = { - "min_len": 0.8, - "n_components": None, - } - - -class catch22(PostProcessABC): - """ - catch22 class. It produces 22 normalised features for each time lapse in the signal (using the catch22 library.) - """ - - def __init__(self, parameters: catch22Parameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - thresh = ( - self.min_len - if isinstance(self.min_len, int) - else signal.shape[1] * self.min_len - ) - adf = signal.loc[signal.notna().sum(axis=1) > thresh] - catches = [ - catch22_all(adf.iloc[i, :].dropna().values) - for i in range(len(adf)) - ] - - norm = pd.DataFrame( - [catches[j]["values"] for j in range(len(catches))], - index=adf.index, - columns=catches[0]["names"], - ) - - return norm diff --git a/src/postprocessor/core/processes/detrend.py b/src/postprocessor/core/processes/detrend.py deleted file mode 100644 index 7bbe744c..00000000 --- a/src/postprocessor/core/processes/detrend.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class detrendParameters(ParametersABC): - """Parameters for the 'detrend' process. - - Parameters for the 'detrend' process. - - Attributes - ---------- - window : int - Size of sliding window. - """ - - _defaults = {"window": 16} - - -class detrend(PostProcessABC): - """Process to detrend using sliding window - - Methods - ------- - run(signal: pd.DataFrame) - Detrend each time series in a dataframe using a specified sliding window - """ - - def __init__(self, parameters: detrendParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - """Detrend using sliding window - - Detrend each time series in a dataframe using a specified sliding window - - Parameters - ---------- - signal : pd.DataFrame - Time series, with rows indicating individual time series (e.g. from - each cell), and columns indicating time points. - - Returns - ------- - signal_norm : pd.DataFrame - Detrended time series. - - """ - # Compute moving average - signal_movavg = signal.rolling( - window=self.window, center=True, axis=1 - ).mean() - # Detrend: subtract normalised time series by moving average - signal_detrend = signal.subtract(signal_movavg) - # Rolling window operations create columns that are all NaNs at the left - # and right edges because of the maths. This line removes these columns. 
- return signal_detrend.dropna(axis=1, how="all") diff --git a/src/postprocessor/core/processes/dsignal.py b/src/postprocessor/core/processes/dsignal.py deleted file mode 100644 index 50bc25e4..00000000 --- a/src/postprocessor/core/processes/dsignal.py +++ /dev/null @@ -1,41 +0,0 @@ -import bottleneck as bn -import numpy as np -import pandas as pd - -from agora.abc import ParametersABC -from postprocessor.core.abc import PostProcessABC - - -class dsignalParameters(ParametersABC): - """ - :window: Number of timepoints to consider for signal. - """ - - _defaults = {"window": 10, "min_count": 5} - - -class dsignal(PostProcessABC): - """ - Calculate the change in a signal using the mean of a moving window. - """ - - def __init__(self, parameters: dsignalParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - if signal.shape[1] > self.parameters.window: - matrix = np.diff( - bn.move_mean( - signal, - window=self.parameters.window, - min_count=self.parameters.min_count, - axis=1, - ), - axis=1, - ) - # Pad values to keep the same signal shape - matrix = np.pad(matrix, ((0, 0), (0, 1)), constant_values=np.nan) - else: - matrix = np.full_like(signal, np.nan) - - return pd.DataFrame(matrix, index=signal.index, columns=signal.columns) diff --git a/src/postprocessor/core/processes/fft.py b/src/postprocessor/core/processes/fft.py deleted file mode 100644 index 754dc6ea..00000000 --- a/src/postprocessor/core/processes/fft.py +++ /dev/null @@ -1,153 +0,0 @@ -from collections import namedtuple - -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from scipy.signal import periodogram - -from postprocessor.core.abc import PostProcessABC - - -class fftParameters(ParametersABC): - """ - Parameters for the 'fft' process. - - Attributes - ---------- - sampling_period: float - Sampling period of measurement values, in unit time. - - oversampling_factor: float - Oversampling factor, which indicates how many times a signal is - sampled over the Nyquist rate. For example, if the oversampling - factor is 2, the signal is sampled at 2 times the Nyquist rate. - """ - - _defaults = { - "sampling_period": 5, - "oversampling_factor": 1, - } - - -class fft(PostProcessABC): - """ - Process to estimate power spectral density (classical/Fourier). - - Methods - ------- - classical_periodogram(timeseries: array_like, sampling_period: float, - oversampling_factor: float) - Estimates the power spectral density using a periodogram. - run(signal: pd.DataFrame) - Estimates the power spectral density of a dataframe of time series. - """ - - def __init__(self, parameters: fftParameters): - super().__init__(parameters) - - def classical_periodogram( - self, - timeseries, - sampling_period, - oversampling_factor, - ): - """ - Estimates the power spectral density using a periodogram. - - This is based on a fast Fourier transform. - - The power spectrum is normalised so that the area under the power - spectrum is constant across different time series, thus allowing users - to easily compare spectra across time series. See: - * Scargle (1982). Astrophysical Journal 263 p. 835-853 - * Glynn et al (2006). Bioinformatics 22(3) 310-316 - - Parameters - --------- - timeseries: array_like - Time series of measurement values. - - sampling_period: float - Sampling period of measurement values, in unit time. - - oversampling_factor: float - Oversampling factor, which indicates how many times a signal is - sampled over the Nyquist rate. 
For example, if the oversampling - factor is 2, the signal is sampled at 2 times the Nyquist rate. - - Returns - ------- - freqs: ndarray - Array of sample frequencies, unit time-1. - - power: ndarray - Power spectral density or power spectrum of the time series, - arbitrary units. - """ - freqs, power = periodogram( - timeseries, - fs=1 / (oversampling_factor * sampling_period), - nfft=len(timeseries) * oversampling_factor, - detrend="constant", - return_onesided=True, - scaling="spectrum", - ) - # Required to correct units; units will be expressed in min-1 (or any other - # unit) - freqs = oversampling_factor * freqs - # Normalisation (Scargle, 1982; Glynn et al., 2006) - power = power * (0.5 * len(timeseries)) - # Normalisation by variance to allow comparing different time series - # (Scargle, 1982) - power = power / np.var(timeseries, ddof=1) - return freqs, power - - def run(self, signal: pd.DataFrame): - """ - Estimates the power spectral density of a dataframe of time series. - - This uses the classical periodogram. - - All NaNs are dropped as the Fourier transform used does not afford - missing time points or time points with uneven spacing in the time - dimension. For time series with missing values, the Lomb-Scargle - periodogram is suggested (processes/lsp.py) - - Parameters - ---------- - signal: pandas.DataFrame - Time series, with rows indicating individiual time series (e.g. from - each cell), and columns indicating time points. - - Returns - ------- - freqs_df: pandas.DataFrame - Sample frequencies from each time series, with labels preserved from - 'signal'. - - power_df: pandas.DataFrame - Power spectrum from each time series, with labels preserved from - 'signal'. - """ - FftAxes = namedtuple("FftAxes", ["freqs", "power"]) - # Each element in this list is a named tuple: (freqs, power) - axes = [ - FftAxes( - *self.classical_periodogram( - timeseries=signal.iloc[row_index, :].dropna().values, - sampling_period=self.sampling_period, - oversampling_factor=self.oversampling_factor, - ) - ) - for row_index in range(len(signal)) - ] - - freqs_df = pd.DataFrame( - [element.freqs for element in axes], index=signal.index - ) - - power_df = pd.DataFrame( - [element.power for element in axes], index=signal.index - ) - - return freqs_df, power_df diff --git a/src/postprocessor/core/processes/findpeaks.py b/src/postprocessor/core/processes/findpeaks.py deleted file mode 100644 index 0d2f6a9a..00000000 --- a/src/postprocessor/core/processes/findpeaks.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python3 - -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from scipy.signal import find_peaks - -from postprocessor.core.abc import PostProcessABC - - -class findpeaksParameters(ParametersABC): - """ - Parameters for the 'findpeaks' process. - - Attributes - ---------- - distance: number - Required minimal horizontal distance (>= 1) in samples between - neighbouring peaks. Smaller peaks are removed first until the condition - is fulfilled for all remaining peaks. - - prominence: number or ndarray or sequence - Required prominence of peaks. Either a number, None, an array matching x - or a 2-element sequence of the former. The first element is always - interpreted as the minimal and the second, if supplied, as the maximal - required prominence. 
- """ - - _defaults = { - "height": None, - "threshold": None, - "distance": 10, - "prominence": 0.035, - "width": None, - "wlen": None, - "rel_height": 0.5, - "plateau_size": None, - } - - -class findpeaks(PostProcessABC): - """ - Process to find peaks inside a signal. - - Methods - ------- - _find_peaks_mask(timeseries: sequence, distance: number, prominence: number - or ndarray or sequence) - Find peaks of a time series and returns a binary mask locating these peaks - run(signal: pd.DataFrame) - Find peaks in a dataframe of time series. - """ - - def __init__(self, parameters: findpeaksParameters): - super().__init__(parameters) - - def _find_peaks_mask( - self, - timeseries, - height, - threshold, - distance, - prominence, - width, - wlen, - rel_height, - plateau_size, - ): - """Find peaks of a time series and returns a binary mask locating these peaks - - Parameters - ---------- - timeseries : sequence - Time series with peaks. - distance : number - Required minimal horizontal distance in samples between neighbouring - peaks. - prominence : number or ndarray or sequence - Required prominence of peaks. - """ - peak_indices = find_peaks( - timeseries, - height=height, - threshold=threshold, - distance=distance, - prominence=prominence, - width=width, - wlen=wlen, - rel_height=rel_height, - plateau_size=plateau_size, - )[0] - mask = np.zeros(len(timeseries), dtype=int) - mask[peak_indices] = 1 - return mask - - def run(self, signal: pd.DataFrame): - """Find peaks in a dataframe of time series. - - Find peaks of a dataframe of time series. This function is effectively a - wrapper for scipy.signal.find_peaks. - - Parameters - ---------- - signal : pd.DataFrame - Time series, with rows indicating individual time series (e.g. from - each cell), and columns indicating time points. 
- """ - mask_df = signal.apply( - lambda x: self._find_peaks_mask( - x, - height=self.height, - threshold=self.threshold, - distance=self.distance, - prominence=self.prominence, - width=self.width, - wlen=self.wlen, - rel_height=self.rel_height, - plateau_size=self.plateau_size, - ), - axis=1, - result_type="expand", - ) - mask_df.columns = signal.columns - return mask_df diff --git a/src/postprocessor/core/processes/gpsignal.py b/src/postprocessor/core/processes/gpsignal.py deleted file mode 100644 index 27cfabd6..00000000 --- a/src/postprocessor/core/processes/gpsignal.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Gaussian process fit of a Signal.""" -import logging - -import gaussianprocessderivatives as gp -import numpy as np -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -def estimate_gr(volume, dt, noruns, bounds, verbose): - """ - Parameters - ---------- - - volume : pd.Series - The volume series of a given cell - dt : float - The time interval in hours - noruns : int - The number of runs for optimisation - bounds : dict - The hyperparameter bounds used for optimisation - verbose : bool - If True, prints results - - Returns - ------- - """ - volume = volume.values - n = len(volume) - idx = np.arange(n) - t = idx * dt - y = volume[volume > 0] - x = t[volume > 0] - idx = idx[volume > 0] - # Fit the GP - mg = gp.maternGP(bounds, x, y) - mg.findhyperparameters(noruns=noruns) - if verbose: - mg.results() # Prints out the hyperparameters - mg.predict(x, derivs=2) # Saves the predictions to object - # Save the predictions to a csv file so they can be reused - results = dict( - time=mg.x, - volume=mg.y, - fit_time=mg.xnew, - fit_volume=mg.f, - growth_rate=mg.df, - d_growth_rate=mg.ddf, - volume_var=mg.fvar, - growth_rate_var=mg.dfvar, - d_growth_rate_var=mg.ddfvar, - ) - for name, value in results.items(): - results[name] = np.full((n,), np.nan) - results[name][idx] = value - return results - - -# Give that to a writer: NOTE the writer needs to learn how to write the -# output of a process that returns multiple signals like this one does. 
- - -class gpsignalParameters(ParametersABC): - """ - Parameters - ---------- - dt : float - The time step between time points, in minutes - noruns : int - The number of times the optimisation is tried - bounds : dict - Hyperparameter bounds for the Matern Kernel - verbose : bool - Determines whether to print hyperparameter results - """ - - _defaults = dict( - dt=5, - noruns=5, - bounds={0: (-2, 3), 1: (-2, 1), 2: (-4, -1)}, - verbose=False, - ) - - -class gpsignal(PostProcessABC): - """Gaussian process fit of a Signal.""" - - def __init__(self, parameters: gpsignalParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - results = signal.apply( - lambda x: estimate_gr(x, **self.parameters.to_dict()), - result_type="expand", - axis=1, - ) - multi_signal = { - name: pd.DataFrame( - np.vstack(results[name]), - index=signal.index, - columns=signal.columns, - ) - for name in results.columns - } - return multi_signal diff --git a/src/postprocessor/core/processes/interpolate.py b/src/postprocessor/core/processes/interpolate.py deleted file mode 100644 index 1217e1ef..00000000 --- a/src/postprocessor/core/processes/interpolate.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env jupyter - -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class interpolateParameters(ParametersABC): - """ - Parameters - """ - - _defaults = {"limit_area": "inside"} - - -class interpolate(PostProcessABC): - """ - Interpolate process. - """ - - def __init__(self, parameters: interpolateParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - if len(signal): - signal = signal.interpolate(limit_area="inside", axis=1) - return signal diff --git a/src/postprocessor/core/processes/knngraph.py b/src/postprocessor/core/processes/knngraph.py deleted file mode 100644 index 90298d75..00000000 --- a/src/postprocessor/core/processes/knngraph.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 - -import igraph as ig -import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from sklearn.metrics.pairwise import euclidean_distances - -from postprocessor.core.abc import PostProcessABC - - -class knngraphParameters(ParametersABC): - """ - Parameters for the 'knngraph' process. - """ - - _defaults = { - "n_neighbors": 10, - } - - -class knngraph(PostProcessABC): - """ - Process to get geometric graph from data/features, based on k-nearest - neighbours. - - Methods - ------- - run(signal: pd.DataFrame) - Get graph. - """ - - def __init__(self, parameters: knngraphParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - """Get graph. - - Parameters - ---------- - signal : pd.DataFrame - Feature matrix. - """ - distance_matrix = euclidean_distances(signal) - distance_matrix_pruned = graph_prune( - distance_matrix, self.n_neighbours - ) - graph = ig.Graph.Weighted_Adjacency( - distance_matrix_pruned.tolist(), mode="undirected" - ) - graph.vs["strain"] = signal.index.get_level_values("strain") - return graph - - -def graph_prune(distanceMatrix, neighbours): - """ - Prunes a complete graph (input distance matrix), keeping at least a - specified number of neighbours for each node. 
- - Parameters: - ----------- - distanceMatrix = 2D numpy array - neighbours = integer - - Return: Dij_pruned, a 2D numpy array, represents distance matrix of pruned - graph - """ - Dij_temp = distanceMatrix - Adj = np.zeros(distanceMatrix.shape) - for ii in range(distanceMatrix.shape[0]): - idx = np.argsort(Dij_temp[ii, :]) - Adj[ii, idx[1]] = 1 - Adj[idx[1], ii] = 1 - for jj in range(neighbours): - Adj[ii, idx[jj]] = 1 - Adj[idx[jj], ii] = 1 - Dij_pruned = Dij_temp * Adj - return Dij_pruned diff --git a/src/postprocessor/core/processes/leiden.py b/src/postprocessor/core/processes/leiden.py deleted file mode 100644 index 0fec2cd2..00000000 --- a/src/postprocessor/core/processes/leiden.py +++ /dev/null @@ -1,48 +0,0 @@ -from itertools import product - -import igraph as ig -import leidenalg -import numpy as np -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class leidenParameters(ParametersABC): - """ - Parameters - """ - - _defaults = {} - - -class leiden(PostProcessABC): - """ - leiden algorithm applied to a dataframe with features. - """ - - def __init__(self, parameters: leidenParameters): - super().__init__(parameters) - - def run(self, features: pd.DataFrame): - # Generate euclidean distance matrix - distances = np.linalg.norm( - features.values - features.values[:, None], axis=2 - ) - ind = [ - "_".join([str(y) for y in x[1:]]) - for x in features.index.to_flat_index() - ] - source, target = zip(*product(ind, ind)) - df = pd.DataFrame( - { - "source": source, - "target": target, - "distance": distances.flatten(), - } - ) - df = df.loc[df["source"] != df["target"]] - g = ig.Graph.DataFrame(df, directed=False) - - return leidenalg.find_partition(g, leidenalg.ModularityVertexPartition) diff --git a/src/postprocessor/core/processes/standardscaler.py b/src/postprocessor/core/processes/standardscaler.py deleted file mode 100644 index ef82fe4e..00000000 --- a/src/postprocessor/core/processes/standardscaler.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python - -import pandas as pd -from sklearn.preprocessing import StandardScaler - -from agora.abc import ParametersABC -from postprocessor.core.abc import PostProcessABC - - -class standardscalerParameters(ParametersABC): - """ - Parameters for the 'scale' process. - """ - - _defaults = {} - - -class standardscaler(PostProcessABC): - """ - Process to scale a DataFrame of a signal using the standard scaler. - - Methods - ------- - run(signal: pd.DataFrame) - Scale values in a dataframe of time series. - """ - - def __init__(self, parameters: standardscalerParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - """Scale values in a dataframe of time series. - - Scale values in a dataframe of time series. This function is effectively a - wrapper for sklearn.preprocessing.StandardScaler. - - Parameters - ---------- - signal : pd.DataFrame - Time series, with rows indicating individual time series (e.g. from - each cell), and columns indicating time points. 
- """ - signal_array = signal.to_numpy() - scaler = StandardScaler().fit(signal_array.transpose()) - signal_scaled_array = scaler.transform(signal_array.transpose()) - signal_scaled_array = signal_scaled_array.transpose() - signal_scaled = pd.DataFrame( - signal_scaled_array, columns=signal.columns, index=signal.index - ) - return signal_scaled diff --git a/src/postprocessor/core/processes/template.py b/src/postprocessor/core/processes/template.py deleted file mode 100644 index 2ee6fae4..00000000 --- a/src/postprocessor/core/processes/template.py +++ /dev/null @@ -1,25 +0,0 @@ -import numpy as np -import pandas as pd -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class TemplateParameters(ParametersABC): - """ - Parameters - """ - - _defaults = {} - - -class Template(PostProcessABC): - """ - Template for process class. - """ - - def __init__(self, parameters: TemplateParameters): - super().__init__(parameters) - - def run(self): - pass diff --git a/src/postprocessor/core/processes/umapembedding.py b/src/postprocessor/core/processes/umapembedding.py deleted file mode 100644 index e47830e2..00000000 --- a/src/postprocessor/core/processes/umapembedding.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -import pandas as pd -import umap -from agora.abc import ParametersABC - -from postprocessor.core.abc import PostProcessABC - - -class umapembeddingParameters(ParametersABC): - """ - Parameters for the 'umapembedding' process. - """ - - _defaults = { - "n_neighbors": None, - "min_dist": None, - "n_components": None, - } - - -class umapembedding(PostProcessABC): - """ - Process to get UMAP embeddings from data/features. - - Methods - ------- - run(signal: pd.DataFrame) - Get UMAP embeddings. - """ - - def __init__(self, parameters: umapembeddingParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - """Get UMAP embeddings. - - This function is effectively a wrapper for umap.UMAP. - - Parameters - ---------- - signal : pd.DataFrame - Feature matrix. - """ - mapper = umap.UMAP( - n_neighbors=self.n_neighbors, - min_dist=self.min_dist, - n_components=self.n_components, - ).fit(signal) - return mapper.embedding_ diff --git a/src/postprocessor/core/report.py b/src/postprocessor/core/report.py deleted file mode 100644 index b0963a70..00000000 --- a/src/postprocessor/core/report.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -""" -Automatic summarissed report to condense results of an experiment. -It should NOT contain text beyond labels and titles. -The most efficient way to communicate information is (inteligently) colour-coded figures and tables. 
- -The structure page-wise is as follows: -Page:Contents -1: Technical summary -2-4: Summarised signal results -5-End: - Part 1: Extended results - Part 2: Extended technicalities -""" - -from agora.base import ParametersABC, ProcessABC - - -class Reporter(ProcessABC): - """ """ - - def __init__(self, parameters: ParametersABC): - super().__init__(parameters) - - -class ReporterParameters(ParametersABC): - """ """ - - def __init__(self, technical: dict, results: dict, grouper_kws: dict): - super().__init__() diff --git a/src/postprocessor/core/reshapers/merger.py b/src/postprocessor/core/reshapers/merger.py index 0728b81e..923653c4 100644 --- a/src/postprocessor/core/reshapers/merger.py +++ b/src/postprocessor/core/reshapers/merger.py @@ -2,7 +2,7 @@ import numpy as np from agora.abc import ParametersABC from postprocessor.core.abc import PostProcessABC -from postprocessor.core.functions.tracks import get_merges +from postprocessor.core.reshapers.tracks import get_merges class MergerParameters(ParametersABC): diff --git a/src/postprocessor/core/processes/savgol.py b/src/postprocessor/core/reshapers/nusavgol.py similarity index 68% rename from src/postprocessor/core/processes/savgol.py rename to src/postprocessor/core/reshapers/nusavgol.py index 14686adc..9b7b5587 100644 --- a/src/postprocessor/core/processes/savgol.py +++ b/src/postprocessor/core/reshapers/nusavgol.py @@ -1,61 +1,9 @@ import numpy as np -import pandas as pd -from agora.abc import ParametersABC -from scipy.signal import savgol_filter - -from postprocessor.core.abc import PostProcessABC - - -class savgolParameters(ParametersABC): - """ - Parameters - - window : int (odd) - Window length of datapoints. Must be odd and smaller than x - polynom : int - The order of polynom used. Must be smaller than the window size - """ - - _defaults = {"window": 7, "polynom": 3} - - -class savgol(PostProcessABC): - """ - Apply Savitzky-Golay filter (works with NaNs, but it might return - NaN regions). - """ - - def __init__(self, parameters: savgolParameters): - super().__init__(parameters) - - def run(self, signal: pd.DataFrame): - try: - post_savgol = pd.DataFrame( - savgol_filter( - signal, self.parameters.window, self.parameters.polynom - ), - index=signal.index, - columns=signal.columns, - ) - except Exception as e: - print(e) - - def savgol_on_srs(x): - return non_uniform_savgol( - x.index, - x.values, - self.parameters.window, - self.parameters.polynom, - ) - - post_savgol = signal.apply(savgol_on_srs, 1).apply(pd.Series) - return post_savgol def non_uniform_savgol(x, y, window: int, polynom: int): """ - Applies a Savitzky-Golay filter to y with non-uniform spacing - as defined in x + Apply a Savitzky-Golay filter with non-uniform spacing in x. 
This is based on https://dsp.stackexchange.com/questions/1676/savitzky-golay-smoothing-filter-for-not-equally-spaced-data The borders are interpolated like scipy.signal.savgol_filter would do @@ -79,50 +27,44 @@ def non_uniform_savgol(x, y, window: int, polynom: int): np.array of float The smoothed y values """ - _raiseif( + raiseif( len(x) != len(y), '"x" and "y" must be of the same size', ValueError, ) - _raiseif( + raiseif( len(x) < window, "The data size must be larger than the window size", ValueError, ) - _raiseif( + raiseif( not isinstance(window, int), '"window" must be an integer', TypeError, ) - _raiseif(window % 2, 'The "window" must be an odd integer', ValueError) - - _raiseif( + raiseif(window % 2, 'The "window" must be an odd integer', ValueError) + raiseif( not isinstance(polynom, int), '"polynom" must be an integer', TypeError, ) - - _raiseif( + raiseif( polynom >= window, '"polynom" must be less than "window"', ValueError, ) - half_window = window // 2 polynom += 1 - # Initialize variables A = np.empty((window, polynom)) # Matrix tA = np.empty((polynom, window)) # Transposed matrix t = np.empty(window) # Local x variables y_smoothed = np.full(len(y), np.nan) - # Start smoothing for i in range(half_window, len(x) - half_window, 1): # Center a window of x values on x[i] for j in range(0, window, 1): t[j] = x[i + j - half_window] - x[i] - # Create the initial matrix A and its transposed form tA for j in range(0, window, 1): r = 1.0 @@ -130,21 +72,16 @@ def non_uniform_savgol(x, y, window: int, polynom: int): A[j, k] = r tA[k, j] = r r *= t[j] - # Multiply the two matrices tAA = np.matmul(tA, A) - # Invert the product of the matrices tAA = np.linalg.inv(tAA) - # Calculate the pseudoinverse of the design matrix coeffs = np.matmul(tAA, tA) - # Calculate c0 which is also the y value for y[i] y_smoothed[i] = 0 for j in range(0, window, 1): y_smoothed[i] += coeffs[0, j] * y[i + j - half_window] - # If at the end or beginning, store all coefficients for the polynom if i == half_window: first_coeffs = np.zeros(polynom) @@ -156,7 +93,6 @@ def non_uniform_savgol(x, y, window: int, polynom: int): for j in range(0, window, 1): for k in range(polynom): last_coeffs[k] += coeffs[k, j] * y[len(y) - window + j] - # Interpolate the result at the left border for i in range(0, half_window, 1): y_smoothed[i] = 0 @@ -164,7 +100,6 @@ def non_uniform_savgol(x, y, window: int, polynom: int): for j in range(0, polynom, 1): y_smoothed[i] += first_coeffs[j] * x_i x_i *= x[i] - x[half_window] - # Interpolate the result at the right border for i in range(len(x) - half_window, len(x), 1): y_smoothed[i] = 0 @@ -172,10 +107,9 @@ def non_uniform_savgol(x, y, window: int, polynom: int): for j in range(0, polynom, 1): y_smoothed[i] += last_coeffs[j] * x_i x_i *= x[i] - x[-half_window - 1] - return y_smoothed -def _raiseif(cond, msg="", exc=AssertionError): +def raiseif(cond, msg="", exc=AssertionError): if cond: raise exc(msg) diff --git a/src/postprocessor/core/functions/tracks.py b/src/postprocessor/core/reshapers/tracks.py similarity index 99% rename from src/postprocessor/core/functions/tracks.py rename to src/postprocessor/core/reshapers/tracks.py index dc83c543..4023f52b 100644 --- a/src/postprocessor/core/functions/tracks.py +++ b/src/postprocessor/core/reshapers/tracks.py @@ -18,7 +18,7 @@ import pandas as pd from matplotlib import pyplot as plt from utils_find_1st import cmp_larger, find_1st -from postprocessor.core.processes.savgol import non_uniform_savgol +from postprocessor.core.reshapers.nusavgol 
import non_uniform_savgol def get_merges(tracks, smooth=False, tol=0.2, window=5, degree=3) -> dict: -- GitLab