From f44ecafe88d4766e4d0a1c1d6154b3df6ac8f9f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Al=C3=A1n=20Mu=C3=B1oz?= <amuoz@ed.ac.uk> Date: Fri, 20 May 2022 14:00:38 +0100 Subject: [PATCH] cleanup and update version --- aliby/extract.py | 279 ---------------------------------------------- aliby/pipeline.py | 11 +- config.json | 2 - pyproject.toml | 2 +- 4 files changed, 11 insertions(+), 283 deletions(-) delete mode 100644 aliby/extract.py delete mode 100644 config.json diff --git a/aliby/extract.py b/aliby/extract.py deleted file mode 100644 index edd7c063..00000000 --- a/aliby/extract.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -A module to extract data from a processed experiment. -""" -import h5py -import numpy as np -from tqdm import tqdm - -from core.io.matlab import matObject -from growth_rate.estimate_gr import estimate_gr - - -class Extracted: - # TODO write the filtering functions. - def __init__(self): - self.volume = None - self._keep = None - - def filter(self, filename=None, **kwargs): - """ - 1. Filter out small non-growing tracks. This means: - a. the cell size never reaches beyond a certain size-threshold - volume_thresh or - b. the cell's volume doesn't increase by at least a minimum - amount over its lifetime - 2. Join daughter tracks that are contiguous and within a volume - threshold of each other - 3. Discard tracks that are shorter than a threshold number of - timepoints - - This function is used to fix tracking/bud errors in post-processing. - The parameters define the thresholds used to determine which cells are - discarded. - FIXME Ideally we get to a point where this is no longer needed. - :return: - """ - #self.join_tracks() - filter_out = self.filter_size(**kwargs) - filter_out += self.filter_lifespan(**kwargs) - # TODO save data or just filtering parameters? - #self.to_hdf(filename) - self.keep = ~filter_out - - def filter_size(self, volume_thresh=7, growth_thresh=10, **kwargs): - """Filter out small and non-growing cells. - :param volume_thresh: Size threshold for small cells - :param growth_thresh: Size difference threshold for non-growing cells - """ - filter_out = np.where(np.max(self.volume, axis=1) < volume_thresh, - True, False) - growth = [v[v > 0] for v in self.volume] - growth = np.array([v[-1] - v[0] if len(v) > 0 else 0 for v in growth]) - filter_out += np.where(growth < growth_thresh, True, False) - return filter_out - - def filter_lifespan(self, min_time=5, **kwargs): - """Remove daughter cells that have a small life span. - - :param min_time: The minimum life span, under which cells are removed. - """ - # TODO What if there are nan values? - filter_out = np.where(np.count_nonzero(self.volume, axis=1) < - min_time, True, False) - return filter_out - - def join_tracks(self, threshold=7): - """ Join contiguous tracks that are within a certain volume - threshold of each other. - - :param threshold: Maximum volume difference to join contiguous tracks. - :return: - """ - # For all pairs of cells - # - pass - - -class ExtractedHDF(Extracted): - # TODO pull all the data out of the HFile and filter! - def __init__(self, file): - # We consider the data to be read-only - self.hfile = h5py.File(file, 'r') - - -class ExtractedMat(Extracted): - """ Pulls the extracted data out of the MATLAB cTimelapse file. - - This is mostly a convenience function in order to run the - gaussian-processes growth-rate estimation - """ - def __init__(self, file, debug=False): - ct = matObject(file) - self.debug = debug - # Pre-computed data - # TODO what if there is no timelapseTrapsOmero? - self.metadata = ct['timelapseTrapsOmero']['metadata'] - self.extracted_data = ct['timelapseTrapsOmero']['extractedData'] - self.channels = ct['timelapseTrapsOmero']['extractionParameters'][ - 'functionParameters']['channels'].tolist() - self.time_settings = ct['timelapseTrapsOmero']['metadata']['acq'][ - 'times'] - # Get filtering information - n_cells = self.extracted_data['cellNum'][0].shape - self.keep = np.full(n_cells, True) - # Not yet computed data - self._growth_rate = None - self._daughter_index = None - - - def get_channel_index(self, channel): - """Get index of channel based on name. This only considers - fluorescence channels.""" - return self.channels.index(channel) - - @property - def trap_num(self): - return self.extracted_data['trapNum'][0][self.keep] - - @property - def cell_num(self): - return self.extracted_data['cellNum'][0][self.keep] - - def identity(self, cell_idx): - """Get the (position), trap, and cell label given a cell's global - index.""" - # Todo include position when using full strain - trap = self.trap_num[cell_idx] - cell = self.cell_num[cell_idx] - return trap, cell - - def global_index(self, trap_id, cell_label): - """Get the global index of a cell given it's trap/cellNum - combination.""" - candidates = np.where(np.logical_and( - (self.trap_num == trap_id), # +1? - (self.cell_num == cell_label) - ))[0] - # TODO raise error if number of candidates != 1 - if len(candidates) == 1: - return candidates[0] - elif len(candidates) == 0: - return -1 - else: - raise(IndexError("No such cell/trap combination")) - - @property - def daughter_label(self): - """Returns the cell label of the daughters of each cell over the - timelapse. - - 0 corresponds to no daughter. This *not* the index of the daughter - cell within the data. To get this, use daughter_index. - """ - return self.extracted_data['daughterLabel'][0][self.keep] - - def _single_daughter_idx(self, mother_idx, daughter_labels): - trap_id, _ = self.identity(mother_idx) - daughter_index = [self.global_index(trap_id, cell_label) for - cell_label - in daughter_labels] - return daughter_index - - @property - def daughter_index(self): - """Returns the global index of the daughters of each cell. - - This is different from the daughter label because it corresponds to - the index of the daughter when counting all of the cells. This can - be used to index within the data arrays. - """ - if self._daughter_index is None: - daughter_index = [self._single_daughter_idx(i, daughter_labels) - for i, daughter_labels in enumerate( - self.daughter_label)] - self._daughter_index = np.array(daughter_index) - return self._daughter_index - - @property - def births(self): - return np.array(self.extracted_data['births'][0].todense())[self.keep] - - @property - def volume(self): - """Get the volume of all of the cells""" - return np.array(self.extracted_data['volume'][0].todense())[self.keep] - - def _gr_estimation(self): - dt = self.time_settings['interval'] / 360 # s to h conversion - results = [] - for v in tqdm(self.volume): - results.append(estimate_gr(v, dt)) - merged = {k: np.stack([x[k] for x in results]) for k in results[0]} - self._gr_results = merged - return - - @property - def growth_rate(self): - """Get the growth rate for all cells. - - Note that this uses the gaussian processes method of estimating - growth rate by default. If there is no growth rate in the given file - (usually the case for MATLAB), it needs to run estimation first. - This can take a while. - """ - # TODO cache the results of growth rate estimation. - if self._gr_results is None: - dt = self.time_settings['interval'] / 360 # s to h conversion - self._growth_rate = [estimate_gr(v, dt) for v in self.volume] - return self._gr_results['growth_rate'] - - def _fluo_attribute(self, channel, attribute): - channel_id = self.get_channel_index(channel) - res = np.array(self.extracted_data[attribute][channel_id].todense()) - return res[self.keep] - - def protein_localisation(self, channel, method='nucEstConv'): - """Returns protein localisation data for a given channel. - - Uses the 'nucEstConv' by default. Alternatives are 'smallPeakConv', - 'max5px', 'max2p5pc' - """ - return self._fluo_attribute(channel, method) - - def background_fluo(self, channel): - return self._fluo_attribute(channel, 'imBackground') - - def mean(self, channel): - return self._fluo_attribute(channel, 'mean') - - def median(self, channel): - return self._fluo_attribute(channel, 'median') - - def filter(self, filename=None): - """Filters and saves results to and HDF5 file. - - This is necessary because we cannot write to the MATLAB file, - so the results of the filter cannot be saved in the object. - """ - super().filter(filename=filename) - self._growth_rate = None # reset growth rate so it is recomputed - - def to_hdf(self, filename): - """Store the current results, including any filtering done, to a file. - - TODO Should we save filtered results or just re-do? - :param filename: - :return: - """ - store = h5py.File(filename, 'w') - try: - # Store (some of the) metadata - for meta in ['experiment', 'username', 'microscope', - 'comments', 'project', 'date', 'posname', - 'exptid']: - store.attrs[meta] = self.metadata[meta] - # TODO store timing information? - store.attrs['time_interval'] = self.time_settings['interval'] - store.attrs['timepoints'] = self.time_settings['ntimepoints'] - store.attrs['total_duration'] = self.time_settings['totalduration'] - # Store volume, births, daughterLabel, trapNum, cellNum - for key in ['volume', 'births', 'daughter_label', 'trap_num', - 'cell_num']: - store[key] = getattr(self, key) - # Store growth rate results - if self._gr_results: - grp = store.create_group('gaussian_process') - for key, val in self._gr_results.items(): - grp[key] = val - for channel in self.channels: - # Create a group for each channel - grp = store.create_group(channel) - # Store protein_localisation, background fluorescence, mean, median - # for each channel - grp['protein_localisation'] = self.protein_localisation(channel) - grp['background_fluo'] = self.background_fluo(channel) - grp['mean'] = self.mean(channel) - grp['median'] = self.median(channel) - finally: - store.close() - diff --git a/aliby/pipeline.py b/aliby/pipeline.py index e18f53d0..2ef9ac65 100644 --- a/aliby/pipeline.py +++ b/aliby/pipeline.py @@ -95,7 +95,16 @@ class PipelineParameters(ParametersABC): directory.mkdir(parents=True) # Download logs to use for metadata conn.cache_logs(directory) - meta = MetaData(directory, None).load_logs() + try: + meta = MetaData(directory, None).load_logs() + except Exception as e: + print("WARNING: Metadata could not be loaded: {}".format(e)) + # Set minimal metadata + meta = { + "channels/channel": "Brightfield", + "time_settings/ntimepoints": [200], + } + tps = meta["time_settings/ntimepoints"][0] defaults = { "general": dict( diff --git a/config.json b/config.json deleted file mode 100644 index 1dcbbd9c..00000000 --- a/config.json +++ /dev/null @@ -1,2 +0,0 @@ -{"host": "sce-bio-c04287.bio.ed.ac.uk", "password": "***REMOVED***", "port": 4064, - "user": "upload", "experiment": 10932} diff --git a/pyproject.toml b/pyproject.toml index 56ff4142..d9100215 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aliby" -version = "0.1.25" +version = "0.1.26" description = "Process and analyse live-cell imaging data" authors = ["Alan Munoz <alan.munoz@ed.ac.uk>"] packages = [ -- GitLab