diff --git a/README.md b/README.md
index bff3d462af8d8071b3594e7a9e5bf170b46cbebb..98c1d5284d17d7d99d74f2d9798e3ff23e89e0bb 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ See our [installation instructions]( https://aliby.readthedocs.io/en/latest/INST
 
 ### CLI
 
+If installed via poetry, you have access to a command-line interface (CLI):
+
 ```bash
 aliby-run --expt_id EXPT_PATH --distributed 4 --tps None
 ```
@@ -31,6 +33,8 @@ And to run Omero servers, the basic arguments are shown:
 
 The output is a folder with the original logfiles and a set of hdf5 files, one with the results of each multidimensional inside.
 
+For more information, including available options, see the page on [running the analysis pipeline](https://aliby.readthedocs.io/en/latest/PIPELINE.html).
+
 ## Using specific components
 
 ### Access raw data
diff --git a/docs/source/INSTALL.md b/docs/source/INSTALL.md
index 94abc570f87bc636041e8d9203c7247a782ccd0b..b19b576e29f5d163a009e07d0fa5080608a2ebdc 100644
--- a/docs/source/INSTALL.md
+++ b/docs/source/INSTALL.md
@@ -62,7 +62,7 @@ For Windows, the simplest way to install it is using conda (or mamba). You can i
 
     $ \PATH\TO\POETRY\LOCATION\poetry install
 
 - MacOS
-  Under work (See issue https://github.com/ome/omero-py/issues/317)
+  For local access and processing, follow the same instructions as Linux. Remote access to OMERO servers depends on issues in one of our dependencies being resolved (see https://github.com/ome/omero-py/issues/317).
 
 ### Git version
 
@@ -71,9 +71,23 @@ Install [ poetry ](https://python-poetry.org/docs/#installation) for dependency
 In case you want to have local version:
 
     $ git clone git@gitlab.com/aliby/aliby.git
-    $ cd aliby && poetry install --all-extras
+    $ cd aliby
+
+and then either
 
-This will automatically install the [ BABY ](https://gitlab.com/aliby/baby) segmentation software. Support for additional segmentation and tracking algorithms is under development.
+    $ poetry install --all-extras
+
+for everything, including tools to access OMERO servers, or
+
+    $ poetry install
+
+for a version with only local access, or
+
+    $ poetry install --with dev
+
+to install with compatible versions of the development tools we use, such as black.
+
+These commands will automatically install the [ BABY ](https://gitlab.com/aliby/baby) segmentation software. Support for additional segmentation and tracking algorithms is under development.
 
 
 ## Omero Server
diff --git a/docs/source/PIPELINE.md b/docs/source/PIPELINE.md
new file mode 100644
index 0000000000000000000000000000000000000000..127ff0c2efc7da40364600db94840c7298a84ce4
--- /dev/null
+++ b/docs/source/PIPELINE.md
@@ -0,0 +1,87 @@
+# Running the analysis pipeline
+
+You can run the analysis pipeline either via the command-line interface (CLI) or using a script that incorporates the `aliby.pipeline.Pipeline` object.
+
+## CLI
+
+On the command line, use the `aliby-run` command, which takes the following options:
+- `--host`: Address of the image-hosting server.
+- `--username`: Username to access the image-hosting server.
+- `--password`: Password to access the image-hosting server.
+- `--expt_id`: Numerical ID of the experiment stored on the host server.
+- `--distributed`: Number of cores to use for distributed segmentation and signal processing. If 0, there is no parallelisation.
+- `--tps`: Optional. Number of time points from the beginning of the experiment to use. If not specified, the pipeline processes all time points.
+- `--directory`: Optional. Parent directory in which to save the generated data files (HDF5); `./data` by default. The files are stored in a child directory named after the experiment.
+- `--filter`: Optional. List of positions to use for analysis. Alternatively, a regex (regular expression) or list of regexes to search for positions. **Note: the CLI currently cannot take a list of strings as input.**
+- `--overwrite`: Optional. Whether to overwrite an existing data directory. True by default.
+- `--override_meta`: Optional. Whether to overwrite existing metadata. True by default.
+
+Example usage:
+```bash
+aliby-run --expt_id EXPT_PATH --distributed 4 --tps None
+```
+
+To run from an OMERO server, the basic arguments are:
+```bash
+aliby-run --expt_id XXX --host SERVER.ADDRESS --user USER --password PASSWORD
+```
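+
+These options can be combined. For example, the following hypothetical invocation writes its output under `./test_output` and restricts the analysis to positions whose names match a regex; the directory name and the regex are purely illustrative:
+```bash
+aliby-run --expt_id EXPT_PATH --distributed 4 --directory ./test_output --filter "pos00[1-3]"
+```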
+
+## Script
+
+Use the `aliby.pipeline.Pipeline` object and supply a dictionary, following the example below. The parameters have the same meanings as those described in the CLI section above.
+
+```python
+#!/usr/bin/env python3
+
+from aliby.pipeline import Pipeline, PipelineParameters
+
+# Specify experiment IDs
+ids = [101, 102]
+
+for i in ids:
+    print(i)
+    try:
+        params = PipelineParameters.default(
+            # Create dictionary to define pipeline parameters.
+            general={
+                "expt_id": i,
+                "distributed": 6,
+                "host": "INSERT ADDRESS HERE",
+                "username": "INSERT USERNAME HERE",
+                "password": "INSERT PASSWORD HERE",
+                # Ensure existing data will be overwritten
+                "override_meta": True,
+                "overwrite": True,
+            }
+        )
+
+        # Fine-grained control beyond general parameters:
+        # change a specific leaf in the extraction tree.
+        # This example tells the pipeline to additionally compute the
+        # nuc_est_conv quantity, which is a measure of the degree of
+        # localisation of a signal in a cell.
+        params = params.to_dict()
+        leaf_to_change = params["extraction"]["tree"]["GFP"]["np_max"]
+        leaf_to_change.add("nuc_est_conv")
+
+        # Regenerate PipelineParameters
+        p = Pipeline(PipelineParameters.from_dict(params))
+
+        # Run pipeline
+        p.run()
+
+    # Error handling
+    except Exception as e:
+        print(e)
+```
+
+This example code can be the contents of a `run.py` file, and you can run it via
+
+```bash
+python run.py
+```
+
+in the appropriate virtual environment.
+
+Alternatively, the example code can be the contents of a cell in a Jupyter notebook.
diff --git a/src/agora/io/signal.py b/src/agora/io/signal.py
index 322bb5ee1929650653e93d7bbcab3ae2e72aebd2..6f7ea3e4ad1f1fa8c422a6c33acd1d97000fa944 100644
--- a/src/agora/io/signal.py
+++ b/src/agora/io/signal.py
@@ -203,9 +203,8 @@ class Signal(BridgeH5):
         merged = pd.DataFrame([], index=index)
         return merged
 
-    # Alan: do we need two similar properties - see below?
-    @property
-    def datasets(self):
+    @cached_property
+    def p_available(self):
         """Print data sets available in h5 file."""
         if not hasattr(self, "_available"):
             self._available = []
@@ -214,11 +213,6 @@
         for sig in self._available:
             print(sig)
 
-    @cached_property
-    def p_available(self):
-        """Print data sets available in h5 file."""
-        self.datasets
-
     @cached_property
     def available(self):
         """Get data sets available in h5 file."""
diff --git a/src/agora/io/writer.py b/src/agora/io/writer.py
index b57c252e71372059bac41fd72b002a7614dbdb50..a13828c795fe406423531ba9b11a3c2cac224881 100644
--- a/src/agora/io/writer.py
+++ b/src/agora/io/writer.py
@@ -230,7 +230,6 @@ class LinearBabyWriter(DynamicWriter):
     Assumes the edgemasks are of form ((None, tile_size, tile_size), bool).
     """
 
-    # TODO make this YAML: Alan: why?
     compression = "gzip"
     _default_tile_size = 117
     datatypes = {
@@ -319,11 +318,7 @@ class StateWriter(DynamicWriter):
     @staticmethod
     def format_values_tpback(states: list, val_name: str):
         """Unpacks a dict of state data into tp_back, trap, value."""
-        # initialise as empty lists
-        # Alan: is this initialisation necessary?
-        tp_back, trap, value = [
-            [[] for _ in states[0][val_name]] for _ in range(3)
-        ]
+        # store results as a list of tuples
 
         lbl_tuples = [
             (tp_back, trap, cell_label)
@@ -334,6 +329,11 @@
         # unpack list of tuples to define variables
         if len(lbl_tuples):
             tp_back, trap, value = zip(*lbl_tuples)
+        else:
+            # set as empty lists
+            tp_back, trap, value = [
+                [[] for _ in states[0][val_name]] for _ in range(3)
+            ]
         return tp_back, trap, value
 
     @staticmethod
@@ -409,9 +409,9 @@
 
 #################### Extraction version ###############################
 class Writer(BridgeH5):
-    """Class to transform data into compatible structures."""
-
-    # Alan: when is this used?
+    """Class to transform data into compatible structures.
+
+    Used by Extractor and Postprocessor within the pipeline."""
 
     def __init__(self, filename, flag=None, compression="gzip"):
         """
@@ -473,7 +473,7 @@
             self.write_pd(f, path, data, compression=self.compression)
         # data is a multi-index dataframe
         elif isinstance(data, pd.MultiIndex):
-            # Alan: should we still not compress here?
+            # TODO: benchmark I/O speed when using compression
             self.write_index(f, path, data)  # , compression=self.compression)
         # data is a dictionary of dataframes
         elif isinstance(data, Dict) and np.all(
diff --git a/src/aliby/pipeline.py b/src/aliby/pipeline.py
index 82d6420e4fb1f87f20efb3e376051901177496da..9e475646a9fda1922e4654d6e6a63d9d9f510d6b 100644
--- a/src/aliby/pipeline.py
+++ b/src/aliby/pipeline.py
@@ -76,14 +76,12 @@ class PipelineParameters(ParametersABC):
         postprocessing: dict (optional)
             Parameters for post-processing.
         """
-        # Alan: should 19993 be updated?
         expt_id = general.get("expt_id", 19993)
         if isinstance(expt_id, PosixPath):
             expt_id = str(expt_id)
             general["expt_id"] = expt_id
 
-        # Alan: an error message rather than a default might be better
-        directory = Path(general.get("directory", "../data"))
+        directory = Path(general["directory"])
 
         # get log files, either locally or via OMERO
         with dispatch_dataset(
@@ -174,8 +172,8 @@ class Pipeline(ProcessABC):
         "extraction",
         "postprocessing",
     ]
-    # Indicate step-writer groupings to perform special operations during step iteration
-    # Alan: replace with - specify the group in the h5 files written by each step (?)
+
+    # Specify the group in the h5 files written by each step
     writer_groups = {
        "tiler": ["trap_info"],
        "baby": ["cell_info"],
@@ -478,7 +476,7 @@
                         f"Found {steps['tiler'].n_tiles} traps in {image.name}"
                     )
                 elif step == "baby":
-                    # write state and pass info to ext (Alan: what's ext?)
+                    # write state and pass info to Extractor
                     loaded_writers["state"].write(
                         data=steps[
                             step
@@ -573,7 +571,8 @@
         )
         return (traps_above_nthresh & traps_above_athresh).mean()
 
-    # Alan: can both this method and the next be deleted?
+    # FIXME: Remove this functionality. It used to be for
+    # older hdf5 file formats.
    def _load_config_from_file(
        self,
        filename: PosixPath,
@@ -588,6 +587,8 @@
                 process_from[k] += 1
         return process_from, trackers_state, overwrite
 
+    # FIXME: Remove this functionality. It used to be for
+    # older hdf5 file formats.
     @staticmethod
     def legacy_get_last_tp(step: str) -> t.Callable:
         """Get last time-point in different ways depending
@@ -647,7 +648,7 @@
             States of any trackers from earlier runs.
         """
         config = self.parameters.to_dict()
-        # Alan: session is never changed
+        # TODO Alan: Verify if session must be passed
         session = None
         earlystop = config["general"].get("earlystop", None)
         process_from = {k: 0 for k in self.pipeline_steps}
@@ -700,8 +701,8 @@
                 )
                 config["tiler"] = steps["tiler"].parameters.to_dict()
             except Exception:
-                # Alan: a warning or log here?
-                pass
+                self._log("Overwriting tiling data")
+
         if config["general"]["use_explog"]:
             meta.run()
             # add metadata not in the log file
diff --git a/src/aliby/tile/tiler.py b/src/aliby/tile/tiler.py
index 43f4b397af7b7fea394fca0130ecbd9313347695..f812b756f1e801ec871368d0209a801fab16ff35 100644
--- a/src/aliby/tile/tiler.py
+++ b/src/aliby/tile/tiler.py
@@ -640,8 +640,8 @@ class Tiler(StepABC):
         return tile
 
 
-# Alan: do we need these as well as get_channel_index and get_channel_name?
-# TODO homogenise these into a pair of functions
+# FIXME: Refactor to support both channel name and index
+# FIXME: self._log is not defined in the functions below
 def find_channel_index(image_channels: t.List[str], channel: str):
     """
     Access
diff --git a/src/extraction/core/extractor.py b/src/extraction/core/extractor.py
index 3f7fdbef8c680e1902f28b84022eb6cf4f02175e..e254532faadf893a4d41254da6c925da9518bde2 100644
--- a/src/extraction/core/extractor.py
+++ b/src/extraction/core/extractor.py
@@ -100,7 +100,7 @@ class Extractor(StepABC):
     Extraction follows a three-level tree structure. Channels, such as GFP, are the root level; the reduction algorithm, such as maximum projection, is the second level; the specific metric, or operation, to apply to the masks, such as mean, is the third level.
     """
 
-    # Alan: should this data be stored here or all such data in a separate file
+    # TODO Alan: Move this to a location with the SwainLab defaults
     default_meta = {
         "pixel_size": 0.236,
         "z_size": 0.6,
diff --git a/src/postprocessor/grouper.py b/src/postprocessor/grouper.py
index 990a97830d5d90db032e5e6e606ea4aceeb75422..4c8e5026f6c0351132b405a9fcd964cbcf4a729d 100644
--- a/src/postprocessor/grouper.py
+++ b/src/postprocessor/grouper.py
@@ -353,7 +353,6 @@ class phGrouper(NameGrouper):
         return aggregated
 
 
-# Alan: why are these separate functions?
 def concat_standard(
     path: str,
     chainer: Chainer,
@@ -474,9 +473,7 @@ class MultiGrouper:
         )
         return self._sigtable
 
-    # Alan: function seems out of place
-    # seaborn is not in pyproject.toml
-    def sigtable_plot(self) -> None:
+    def _sigtable_plot(self) -> None:
         """
         Plot number of chains for all available experiments.