From 9edeb129708c4c45423c70e7c0c6d5b4ce89fe00 Mon Sep 17 00:00:00 2001 From: Marcin Kirsz <mkirsz@ed.ac.uk> Date: Tue, 8 Oct 2024 17:12:18 +0100 Subject: [PATCH] Working draft HPO --- bin/tadah_cli.cpp | 206 +++++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 114 deletions(-) diff --git a/bin/tadah_cli.cpp b/bin/tadah_cli.cpp index 9f3451e..dc3ae07 100644 --- a/bin/tadah_cli.cpp +++ b/bin/tadah_cli.cpp @@ -22,176 +22,159 @@ #include <chrono> void TadahCLI::subcommand_train() { - - - if(train->count("--verbose")) + // Check if verbose mode is enabled and set it + if (train->count("--verbose")) { set_verbose(); + } + // Load the configuration from the specified file Config config(config_file); - if(train->count("--Force")) { + // Enable FORCE and STRESS flags in the configuration if specified + if (train->count("--Force")) { config.remove("FORCE"); config.add("FORCE", "true"); } - if(train->count("--Stress")) { + if (train->count("--Stress")) { config.remove("STRESS"); config.add("STRESS", "true"); } #ifdef TADAH_BUILD_MPI + // Initialize MPI variables to determine the rank and the number of CPUs int rank = 0; int ncpu = 1; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &ncpu); - /* MPI CODE: - * The PHI matrix is divided into local phi matrices. - * - * The code follows host-worker pattern. Where host (rank 0) - * distributes work between remaining workers (rank>0). - * The work is distributed in packages. - * Each package contains: filename, first structure to be read - * and the number of structures to process. - * - * Host does not calculate phi but still allocates memory to it. - * Each worker fills its local phi. - * Once the local phi matrix is full a worker will ask host - * for a rank to send excees data. - * Host will first try to fill its own local phi. - * Once host phi matrix is full, the host will search for a worker - * with with free space in its local phi. - * - * 1. each rank reads config file - * 2. host calculates total number of structures - * 3. host calculates the dimensions of the PHI matrix - * based on force and stress flags and type of the regression - * 4. host broadcasts PHI number of cols and rows to all workers - * 5. compute sizes for local phi matrices - * 6. Each worker allocates memory for a local matrix phi - */ - - // BEGIN HOST-WORKER - if (rank==0) { - if (ncpu<2) { - std::cout << "Minimum number of cpus for an mpi version is 2" << std::endl; + // Execute host or worker code based on rank + if (rank == 0) { // Host code + // Ensure there are enough CPUs for MPI + if (ncpu < 2) { + std::cerr << "Minimum number of CPUs for an MPI version is 2" << std::endl; return; } + + // Check if the uncertainty flag is incompatible with MPI if (train->count("--uncertainty")) { - if (rank==0) { - std::cout << "-----------------------------------------------------" << std::endl; - std::cout << "The --uncertainty flag is not supported by MPI build." << std::endl; - std::cout << "-----------------------------------------------" << std::endl; - } + std::cout << "-----------------------------------------------------" << std::endl; + std::cout << "The --uncertainty flag is not supported by MPI build." << std::endl; + std::cout << "-----------------------------------------------------" << std::endl; return; } + + // Start training timer if verbose mode is enabled if (is_verbose()) std::cout << "Training..." << std::endl; - CLI::Timer timer_tot {"Training", CLI::Timer::Big}; - // HOST is waiting for workers requests + CLI::Timer timer_tot{"Training", CLI::Timer::Big}; + + // Initialize host MPI trainer MPI_Trainer_HOST HOST(config, rank, ncpu); - HOST.init(); HOST.prep_wpckgs(); + + // Process requests from workers while (true) { - if (!HOST.has_packages()) { - // no more packages, just skip remaining workers - // we will collect remaining data and release them outside of this loop - break; - } + // Exit loop if there are no more packages + if (!HOST.has_packages()) break; - // probe ANY request from ANY worker + // Probe for incoming requests from any worker HOST.probe(); - if (HOST.tag==MPI_Trainer::WORK_TAG) { + // Handle requests based on their tags + if (HOST.tag == MPI_Trainer::WORK_TAG) { HOST.work_tag(); - } - else if (HOST.tag==MPI_Trainer::DATA_TAG) { + } else if (HOST.tag == MPI_Trainer::DATA_TAG) { HOST.data_tag(); - } - else { - throw std::runtime_error("HOST1: Unexpected request from " - + std::to_string(HOST.worker)); + } else { + throw std::runtime_error("HOST1: Unexpected request from " + std::to_string(HOST.worker)); } } - // work finised, collect remaining data and release all workers - int count=1; // count number of release workers, skip host - while(true) { + // Collect remaining data and release all workers + int count = 1; + while (true) { HOST.probe(); - if (HOST.tag==MPI_Trainer::DATA_TAG) { + if (HOST.tag == MPI_Trainer::DATA_TAG) { HOST.data_tag(); - } - else { + } else { HOST.release_tag(count); } - if (count==ncpu) { break; } // count starts from 1 + if (count == ncpu) break; // Exit when all workers are released } + // Perform the final computation and save parameters HOST.solve(); Config param_file = HOST.model->get_param_file(); param_file.check_pot_file(); - std::ofstream outfile; - outfile.open ("pot.tadah"); - outfile << param_file << std::endl;; + std::ofstream outfile("pot.tadah"); + if (outfile.is_open()) { + outfile << param_file << std::endl; + } else { + std::cerr << "Error: Unable to open file pot.tadah" << std::endl; + } if (is_verbose()) std::cout << timer_tot.to_string() << std::endl; - } - else { // WORKER + } else { // Worker code + // Initialize worker MPI trainer MPI_Trainer_WORKER WORKER(config, rank, ncpu); - WORKER.init(); while (true) { - // ask for more work... - MPI_Send (&WORKER.rows_available, 1, MPI_INT, 0, MPI_Trainer::WORK_TAG, MPI_COMM_WORLD); + // Send a request for more work to the host + MPI_Send(&WORKER.rows_available, 1, MPI_INT, 0, MPI_Trainer::WORK_TAG, MPI_COMM_WORLD); - // request from root or from other workers + // Probe and respond to host or worker requests WORKER.probe(); - // release a worker + // Handle requests based on their tags if (WORKER.tag == MPI_Trainer::RELEASE_TAG) { WORKER.release_tag(); break; - } - else if (WORKER.tag == MPI_Trainer::WAIT_TAG) { - // do nothing here; ask for more work in the next cycle + } else if (WORKER.tag == MPI_Trainer::WAIT_TAG) { WORKER.wait_tag(); - } - else if (WORKER.tag == MPI_Trainer::DATA_TAG) { + } else if (WORKER.tag == MPI_Trainer::DATA_TAG) { WORKER.data_tag(); - } - else if (WORKER.tag == MPI_Trainer::WORK_TAG) { + } else if (WORKER.tag == MPI_Trainer::WORK_TAG) { WORKER.work_tag(); } } - WORKER.solve(); + WORKER.solve(); // Perform worker computations } -#else // NON MPI VERSION - CLI::Timer timer_tot {"Training", CLI::Timer::Big}; +#else // Non-MPI version + // Start training with a timer in the non-MPI mode + CLI::Timer timer_tot{"Training", CLI::Timer::Big}; Trainer tr(config); + + // Load structures and provide feedback if verbose if (is_verbose()) std::cout << "Loading structures..." << std::flush; StructureDB stdb(tr.config); if (is_verbose()) std::cout << "Done!" << std::endl; - if (is_verbose()) std::cout << "Finding nearest neighbours within: " << - tr.config.get<double>("RCUTMAX") << " cutoff distance..." << std::flush; + // Calculate nearest neighbors and provide feedback + if (is_verbose()) std::cout << "Finding nearest neighbours within: " + << tr.config.get<double>("RCUTMAX") << " cutoff distance..." << std::flush; tr.nnf.calc(stdb); if (is_verbose()) std::cout << "Done!" << std::endl; + // Begin the training process and inform if verbose if (is_verbose()) std::cout << "Training start..." << std::flush; - - // here we want to train with the locally allocated phi - tr.model->train(stdb,tr.dc); - + tr.model->train(stdb, tr.dc); if (is_verbose()) std::cout << "Done!" << std::endl; + // Save the trained model parameters to a file Config param_file = tr.model->get_param_file(); param_file.check_pot_file(); - std::ofstream outfile; - outfile.open ("pot.tadah"); - outfile << param_file << std::endl;; + std::ofstream outfile("pot.tadah"); + if (outfile.is_open()) { + outfile << param_file << std::endl; + } else { + std::cerr << "Error: Unable to open file pot.tadah" << std::endl; + } - if(train->count("--uncertainty")) { + // Output uncertainty if requested + if (train->count("--uncertainty")) { t_type weights = tr.model->get_weights(); t_type unc = tr.model->get_weights_uncertainty(); - Output(param_file,false).print_train_unc(weights, unc); + Output(param_file, false).print_train_unc(weights, unc); } + // Output total training time if verbose if (is_verbose()) std::cout << timer_tot.to_string() << std::endl; #endif } @@ -305,27 +288,12 @@ void TadahCLI::subcommand_hpo( [[maybe_unused]]int argc, [[maybe_unused]]char**argv) { - int rank=0; #ifdef TADAH_ENABLE_HPO -#ifdef TADAH_BUILD_MPI - int ncpu=1; - MPI_Comm_size(MPI_COMM_WORLD, &ncpu); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (ncpu<2) { - std::cout << "Minimum number of cpus for an mpi version is 2" << std::endl; - return; - } -#endif - - if (rank==0) - if (is_verbose()) std::cout << "Optimising HPs..." << std::endl; - - CLI::Timer timer_tot {"HPO", CLI::Timer::Big}; - if(hpo->count("--verbose")) set_verbose(); + Config config(config_file); config.remove("CHECKPRESS"); config.add("CHECKPRESS", "true"); @@ -341,6 +309,16 @@ void TadahCLI::subcommand_hpo( } #ifdef TADAH_BUILD_MPI + if (is_verbose()) std::cout << "Optimising HPs..." << std::endl; + CLI::Timer timer_tot {"HPO", CLI::Timer::Big}; + int rank=0; + int ncpu=1; + MPI_Comm_size(MPI_COMM_WORLD, &ncpu); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (ncpu<2) { + std::cout << "Minimum number of cpus for an mpi version is 2" << std::endl; + return; + } if (rank==0) { hpo_run(config, target_file); } @@ -357,7 +335,6 @@ void TadahCLI::subcommand_hpo( Config config("config.temp"); MPI_Trainer_WORKER WORKER(config, rank, ncpu); - WORKER.init(); while (true) { // ask for more work... MPI_Send (&WORKER.rows_available, 1, MPI_INT, 0, MPI_Trainer::WORK_TAG, MPI_COMM_WORLD); @@ -385,12 +362,13 @@ void TadahCLI::subcommand_hpo( } } #else + CLI::Timer timer_tot {"HPO", CLI::Timer::Big}; + if (is_verbose()) std::cout << "Optimising HPs..." << std::endl; hpo_run(config, target_file); + if (is_verbose()) std::cout << timer_tot.to_string() << std::endl; #endif - if (rank==0) - if (is_verbose()) std::cout << timer_tot.to_string() << std::endl; -#else +#else // HPO is disabled std::cout << "-----------------------------------------------" << std::endl; std::cout << "This subcommand is not supported by this build." << std::endl; std::cout << "Tadah! Must by compiled with HPO support." << std::endl; -- GitLab