From 07b75de3c3be3ea3d472dbf570ad130d289d3be5 Mon Sep 17 00:00:00 2001 From: Marcin Kirsz <mkirsz@ed.ac.uk> Date: Thu, 6 Mar 2025 16:55:45 +0000 Subject: [PATCH] Working NOMAD sreader --- .../mlip/structure_readers/nomad_reader.h | 86 +++++ src/nomad_reader.cpp | 240 ++++++++++++++ src/structure_reader_selector.cpp | 301 +++++++++++++++--- 3 files changed, 582 insertions(+), 45 deletions(-) diff --git a/include/tadah/mlip/structure_readers/nomad_reader.h b/include/tadah/mlip/structure_readers/nomad_reader.h index e69de29..197e59f 100644 --- a/include/tadah/mlip/structure_readers/nomad_reader.h +++ b/include/tadah/mlip/structure_readers/nomad_reader.h @@ -0,0 +1,86 @@ +#ifndef NOMAD_READER_H +#define NOMAD_READER_H + +#include <string> +#include <tadah/mlip/structure_readers/structure_reader.h> +#include <tadah/mlip/structure.h> + +/** + * @class NomadReader + * @brief Implements the StructureReader interface to fetch structural data from NOMAD. + * + * Uses a POST request: + * https://nomad-lab.eu/prod/v1/api/v1/entries/<entryID>/archive/query + * requesting topology, space group, cartesian positions, and chemical symbols. + */ +class NomadReader : public StructureReader { +public: + /** + * @brief Default constructor. No special authentication is required. + */ + NomadReader(); + + /** + * @brief Reads structural data from the provided NOMAD entry ID. + * + * @param entryID A unique identifier in NOMAD, e.g. "zyXabc123..." + */ + void read(const std::string &entryID) override; + + /** + * @brief Returns the Structure, including cell, atoms, and metadata label. + * + * @return The last retrieved structure from NOMAD. + */ + Structure getStructure() const override; + +private: + /// Internal storage of the retrieved structure. + Structure m_structure; + + /** + * @brief fetchAndParseNomad() forms the POST request to NOMAD + * and calls parseNomadJson() to interpret the response. + * + * @param entryID The NOMAD entry identifier. + */ + void fetchAndParseNomad(const std::string &entryID); + + /** + * @brief httpPost() sends a POST request with JSON body to @p url. + * Sets standard JSON headers. + * + * @param url The target endpoint. + * @param jsonBody The JSON payload to post. + * @return The response body as a std::string. + */ + std::string httpPost(const std::string &url, const std::string &jsonBody); + + /** + * @brief parseNomadJson() extracts cell parameters, space group, + * cartesian positions, and species from the JSON response. + * + * @param jsonContent Raw JSON response string from NOMAD. + */ + void parseNomadJson(const std::string &jsonContent); + + /** + * @brief Builds a 3×3 lattice from edges (a,b,c) and angles (α,β,γ). + * Angles are in radians. Orthorhombic or monoclinic or general triclinic + * cells can be handled. + * + * @param a Edge length a. + * @param b Edge length b. + * @param c Edge length c. + * @param alpha Angle α in radians. + * @param beta Angle β in radians. + * @param gamma Angle γ in radians. + * @param cell Output 3×3 array for storing the resulting lattice vectors. + */ + void makeCellMatrix(double a, double b, double c, + double alpha, double beta, double gamma, + double (&cell)[3][3]); +}; + +#endif // NOMAD_READER_H + diff --git a/src/nomad_reader.cpp b/src/nomad_reader.cpp index e69de29..c28dc2c 100644 --- a/src/nomad_reader.cpp +++ b/src/nomad_reader.cpp @@ -0,0 +1,240 @@ +#include <tadah/mlip/structure_readers/nomad_reader.h> +#include <curl/curl.h> +#include <nlohmann/json.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <cmath> + +// Collects HTTP response data into a std::string +static size_t writeCallback(char* ptr, size_t size, size_t nmemb, void* userdata) { + auto* response = static_cast<std::string*>(userdata); + size_t totalBytes = size * nmemb; + response->append(ptr, totalBytes); + return totalBytes; +} + +// Default constructor. No special API token needed. +NomadReader::NomadReader() { +} + +// Reads the structure by calling an internal fetchAndParseNomad for a given entryID. +void NomadReader::read(const std::string &entryID) { + fetchAndParseNomad(entryID); +} + +// Returns the last retrieved structure. +Structure NomadReader::getStructure() const { + return m_structure; +} + +/** + fetchAndParseNomad() performs a POST request: + https://nomad-lab.eu/prod/v1/api/v1/entries/<entryID>/archive/query + The JSON body requests specific fields (cell, positions, species, etc.). + Then parseNomadJson() processes the response. +*/ +void NomadReader::fetchAndParseNomad(const std::string &entryID) { + const std::string baseURL = "https://nomad-lab.eu/prod/v1/api/v1"; + const std::string url = baseURL + "/entries/" + entryID + "/archive/query"; + + // JSON payload requesting relevant fields + static const std::string requestBody = R"JSON( + { + "required": { + "results": { + "material": { + "symmetry": { + "space_group_symbol": "*" + }, + "topology": { + "cell": { + "volume": "*", + "mass_density": "*" + } + } + } + }, + "metadata": { + "optimade": { + "elements": "*", + "chemical_formula_reduced": "*", + "lattice_vectors": "*", + "cartesian_site_positions": "*", + "species_at_sites": "*" + } + } + } + } + )JSON"; + + // POST request with the JSON payload + std::string response = httpPost(url, requestBody); + + parseNomadJson(response); +} + +/** + httpPost() sets JSON headers and payload for a POST request to @p url. + The full response body is returned as a string. +*/ +std::string NomadReader::httpPost(const std::string &url, const std::string &jsonBody) { + CURL* curl = curl_easy_init(); + if (!curl) { + throw std::runtime_error("NomadReader::httpPost: Failed to init cURL."); + } + + std::string response; + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, "accept: application/json"); + + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, jsonBody.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, static_cast<long>(jsonBody.size())); + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); + + // Optional user agent + curl_easy_setopt(curl, CURLOPT_USERAGENT, "NomadReader/1.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + std::cerr << "[DEBUG] cURL error code: " << res << std::endl; + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + throw std::runtime_error("NomadReader::httpPost: " + std::string(curl_easy_strerror(res))); + } + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + return response; +} + +/** + parseNomadJson() reads: + - space group symbol from material.symmetry.space_group_symbol + - label info (e.g. volume from topology[0].cell.volume) + - a 3×3 lattice from metadata.optimade.lattice_vectors + - positions and species from cartesian_site_positions and species_at_sites +*/ +void NomadReader::parseNomadJson(const std::string &jsonContent) { + using json = nlohmann::json; + auto root = json::parse(jsonContent); + + // Basic top-level checks + if (!root.is_object() || !root.contains("data")) { + throw std::runtime_error("parseNomadJson: 'data' missing from root."); + } + auto dataObj = root["data"]; + if (!dataObj.contains("archive")) { + throw std::runtime_error("parseNomadJson: 'data.archive' missing."); + } + auto archive = dataObj["archive"]; + + // Check for results.material + if (!archive.contains("results") || !archive["results"].contains("material")) { + throw std::runtime_error("parseNomadJson: results.material is missing."); + } + auto material = archive["results"]["material"]; + + // Space group symbol if available + std::string spgSymbol = material.value("symmetry", json()) + .value("space_group_symbol", "???"); + + // We do not rely on separate a,b,c, alpha,beta,gamma, but might still read volume + if (!material.contains("topology") || !material["topology"].is_array() || + material["topology"].empty()) { + throw std::runtime_error("parseNomadJson: No topology array found."); + } + auto firstTopo = material["topology"][0]; + if (!firstTopo.contains("cell")) { + throw std::runtime_error("parseNomadJson: 'cell' not in the first topology."); + } + auto cellData = firstTopo["cell"]; + + double volume = cellData.value("volume", 0.0); + double density = cellData.value("mass_density", 0.0); + + // Get lattice_vectors from metadata.optimade + auto optimade = archive["metadata"].value("optimade", json()); + if (!optimade.contains("lattice_vectors") || !optimade["lattice_vectors"].is_array()) { + throw std::runtime_error("parseNomadJson: 'lattice_vectors' missing or not array."); + } + auto lattVecs = optimade["lattice_vectors"]; + + // Check 3×3 array + if (lattVecs.size() != 3) { + throw std::runtime_error("parseNomadJson: 'lattice_vectors' must have size=3."); + } + for (int i = 0; i < 3; i++) { + if (!lattVecs[i].is_array() || lattVecs[i].size() != 3) { + throw std::runtime_error("parseNomadJson: Each row in 'lattice_vectors' must have size=3."); + } + } + + // Retrieve simplified formula + std::string formulaReduced = optimade.value("chemical_formula_reduced", "Unknown"); + + // Build a descriptive label + std::ostringstream labelStream; + labelStream << "NOMAD Entry: " << dataObj.value("entry_id", "???") + << " | Formula: " << formulaReduced + << " | SpaceGroup: " << spgSymbol + << " | Volume: " << volume + << " | Density: " << density; + + // Assign data to m_structure + m_structure.label = labelStream.str(); + + // Transfer the 3×3 matrix to m_structure.cell + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + m_structure.cell(i, j) = lattVecs[i][j].get<double>(); + } + } + + // Retrieve cartesian positions and species + if (!optimade.contains("cartesian_site_positions") || + !optimade["cartesian_site_positions"].is_array()) { + throw std::runtime_error("parseNomadJson: 'cartesian_site_positions' missing/not array."); + } + auto cartesianSites = optimade["cartesian_site_positions"]; + + if (!optimade.contains("species_at_sites") || + !optimade["species_at_sites"].is_array()) { + throw std::runtime_error("parseNomadJson: 'species_at_sites' missing/not array."); + } + auto speciesSites = optimade["species_at_sites"]; + + if (cartesianSites.size() != speciesSites.size()) { + throw std::runtime_error("parseNomadJson: Position-species array size mismatch."); + } + + m_structure.atoms.reserve(cartesianSites.size()); + + // Combine site position with species + for (size_t i = 0; i < cartesianSites.size(); ++i) { + auto &pos = cartesianSites[i]; + if (!pos.is_array() || pos.size() != 3) { + throw std::runtime_error("parseNomadJson: Site position requires 3 coordinates."); + } + double x = pos[0].get<double>(); + double y = pos[1].get<double>(); + double z = pos[2].get<double>(); + + Atom atom; + Element e = PeriodicTable().find_by_symbol(speciesSites[i].get<std::string>()); + static_cast<Element &>(atom) = e; + atom.position[0] = x; + atom.position[1] = y; + atom.position[2] = z; + + m_structure.atoms.push_back(atom); + } +} diff --git a/src/structure_reader_selector.cpp b/src/structure_reader_selector.cpp index 44ecbbf..3a4dcc4 100644 --- a/src/structure_reader_selector.cpp +++ b/src/structure_reader_selector.cpp @@ -2,18 +2,17 @@ #include <tadah/mlip/structure_readers/cif_reader.h> #include <tadah/mlip/structure_readers/structure_reader_selector.h> #include <tadah/mlip/structure_readers/vasp_poscar_reader.h> -/*#include <tadah/mlip/structure_readers/extended_xyz_reader.h>*/ -/*#include <tadah/mlip/structure_readers/xsf_reader.h>*/ #include <tadah/mlip/structure_readers/materials_project_reader.h> /*#include <tadah/mlip/structure_readers/cod_reader.h>*/ /*#include <tadah/mlip/structure_readers/aflow_reader.h>*/ -/*#include <tadah/mlip/structure_readers/nomad_reader.h>*/ #include <tadah/mlip/key_storage/key_manager.h> +#include <tadah/mlip/structure_readers/nomad_reader.h> #include <algorithm> #include <cctype> #include <fstream> #include <stdexcept> +#include <sys/stat.h> std::unique_ptr<StructureReader> StructureReaderSelector::getReader(const std::string &pathOrId) { @@ -34,85 +33,297 @@ StructureReaderSelector::getReader(const std::string &pathOrId) { } else if (format == "CELL") { return std::make_unique<CastepCellReader>(); } - /*else if (format == "XYZ") {*/ - /* return std::make_unique<ExtendedXyzReader>();*/ - /*}*/ - /*else if (format == "XSF") {*/ - /* return std::make_unique<XsfReader>();*/ - /*}*/ /*else if (format == "COD") {*/ /* return std::make_unique<CODReader>();*/ /*}*/ /*else if (format == "AFLOW") {*/ /* return std::make_unique<AflowReader>();*/ /*}*/ - /*else if (format == "NOMAD") {*/ - /* return std::make_unique<NomadReader>();*/ - /*}*/ - else { + else if (format == "NOMAD") { + return std::make_unique<NomadReader>(); + } else { throw std::runtime_error( "StructureReaderSelector::getReader - Unknown format for " + pathOrId); } } +bool fileExists(const std::string &path) { + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode)); +} +bool isSevenDigits(const std::string &s) { + if (s.size() != 7) + return false; + return std::all_of(s.begin(), s.end(), ::isdigit); +} +// A minimal heuristic for “NOMAD-like†ID +bool looksLikeNomadId(const std::string &s) { + // Example: length typically ~27, so pick a range + if (s.size() < 20 || s.size() > 40) + return false; + + // Must be URL-safe base64-ish: A–Z, a–z, 0–9, -, _ + return std::all_of(s.begin(), s.end(), [](unsigned char c) { + return (std::isalnum(c) != 0) || c == '-' || c == '_'; + }); +} std::string StructureReaderSelector::guessFormat(const std::string &pathOrId) { - // Check known online ID patterns: + // 1) Check if path is a local file + if (fileExists(pathOrId)) { + // Heuristics by extension + if (pathOrId.rfind("POSCAR", 0) == 0 || pathOrId.rfind("CONTCAR", 0) == 0) { + return "POSCAR"; + } + auto dotPos = pathOrId.find_last_of('.'); + if (dotPos != std::string::npos) { + std::string ext = pathOrId.substr(dotPos + 1); + std::transform(ext.begin(), ext.end(), ext.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (ext == "cif") + return "CIF"; + if (ext == "cell") + return "CELL"; + } + // If no extension or no known extension + return guessFormatFromContent(pathOrId); + } + + // 2) Known online prefixes or patterns + // MP uses mp- prefix: “mp-####†if (pathOrId.rfind("mp-", 0) == 0) { return "MP"; } - if (pathOrId.rfind("cod-", 0) == 0) { + + // COD uses 7-digit numeric code, e.g. "8104301". + if (isSevenDigits(pathOrId)) { return "COD"; } - if (pathOrId.rfind("aflow-", 0) == 0) { + + // AFLOW uses "aflow:######" style. + if (pathOrId.rfind("aflow:", 0) == 0) { return "AFLOW"; } - if (pathOrId.rfind("nomad-", 0) == 0) { + + // NOMAD minimal check + if (looksLikeNomadId(pathOrId)) { return "NOMAD"; } - // Check file extension: - auto dotPos = pathOrId.find_last_of('.'); - if (dotPos != std::string::npos) { - std::string ext = pathOrId.substr(dotPos + 1); - std::transform(ext.begin(), ext.end(), ext.begin(), - [](unsigned char c) { return std::tolower(c); }); + // 3) Fallback: guess-from-server approach - if (ext == "cif") - return "CIF"; - if (ext == "cell") - return "CELL"; - if (ext == "xsf") - return "XSF"; - if (ext == "xyz") - return "XYZ"; - // Some VASP files may not have an extension + // 4) Fail + return "UNKNOWN"; +} + + +bool isLikelyPoscar(const std::string &filepath) { + std::ifstream file(filepath); + if (!file.is_open()) { + return false; // can't open -> can't detect + } + + // Minimal number of lines for a typical POSCAR + // 1) Title/Comment + // 2) Scale factor + // 3) Lattice vector line #1 + // 4) Lattice vector line #2 + // 5) Lattice vector line #3 + // 6) Element symbols or counts + // 7) Possibly more lines (atom counts / Selective Dynamics / coordinates, etc.) + + std::string line; + std::vector<std::string> lines; + while (std::getline(file, line)) { + lines.push_back(line); + if (lines.size() > 10) { + // Reading more lines is okay, but we only need ~7 lines for detection + break; + } + } + file.close(); + + // Must have ~6 lines to check + if (lines.size() < 6) { + return false; + } + + // 1) line[0]: Title/comment (no strong requirement, can be anything) + // 2) line[1]: Should parse as scale factor (double) + { + std::istringstream iss(lines[1]); + double scale = 0.0; + if (!(iss >> scale)) { + // Could not parse a double + return false; + } + } + + // 3) The next three lines (2..4) each should have 3 floating values + for (int i = 2; i < 5; ++i) { + std::istringstream iss(lines[i]); + double x, y, z; + if (!(iss >> x >> y >> z)) { + // Could not parse 3 floats + return false; + } + } + + // 4) line[5]: Often either element symbols or number of atoms + // But this line is optional. Let's do a minimal check: + // In many POSCARs, if it's not numeric, it might be e.g., "Si O" or "Fe" or something. + // We'll not strictly require anything here. Additional checks can be added if needed. + + // 5) Optional: Check for “Selective dynamics†or the “Direct/Cartesian†line. + // The line after the element/atom count might be "Selective dynamics" + // or "s" in some usage. Then the next line might be "Direct" or "Cartesian". + + // If we've gotten this far, the file strongly resembles a POSCAR. + return true; +} + +// A minimal function to see if “text†starts with “prefix†(case-sensitive). +static bool startsWith(const std::string &text, const std::string &prefix) { + return text.rfind(prefix, 0) == 0; +} + +bool isLikelyCif(const std::string &filepath) { + std::ifstream file(filepath); + if (!file.is_open()) { + return false; // Could not open -> not a CIF + } + + // We'll read up to ~30 lines (CIF can have many lines, but this is enough to detect). + std::vector<std::string> lines; + std::string line; + while (std::getline(file, line)) { + lines.push_back(line); + if (lines.size() >= 30) { + break; + } + } + file.close(); + + // Basic heuristic checks: + // 1) Is there at least one line starting with "data_"? + // 2) Are there lines with typical CIF tags, like "_cell_length_", "_cell_angle_", "_atom_site_", etc.? + + bool foundDataLine = false; + bool foundCellOrAtomLine = false; + + for (auto &ln : lines) { + // Trim leading/trailing whitespace for easier checks + // (could also keep them if your checking is simple) + auto startPos = ln.find_first_not_of(" \t\r\n"); + if (startPos == std::string::npos) { + continue; // empty line + } + std::string trimmed = ln.substr(startPos); + + if (startsWith(trimmed, "data_")) { + foundDataLine = true; + } + if (startsWith(trimmed, "_cell_length_") || + startsWith(trimmed, "_cell_angle_") || + startsWith(trimmed, "_symmetry_") || + startsWith(trimmed, "_atom_site_")) + { + foundCellOrAtomLine = true; + } + } + + // A very minimal criterion: must have a data_ line plus at least one known CIF tag line. + if (foundDataLine && foundCellOrAtomLine) { + return true; } - return guessFormatFromContent(pathOrId); + // Possibly check for “loop_†lines if you want extra confidence. + return false; +} + +// Utility: checks if a line begins with a (case-insensitive) prefix +static bool startsWithIgnoreCase(const std::string& line, const std::string& prefix) { + if (line.size() < prefix.size()) return false; + + for (std::size_t i = 0; i < prefix.size(); ++i) { + if (std::tolower(static_cast<unsigned char>(line[i])) != + std::tolower(static_cast<unsigned char>(prefix[i]))) { + return false; + } + } + return true; +} + +bool isLikelyCell(const std::string &filepath) { + std::ifstream file(filepath); + if (!file.is_open()) { + return false; + } + + // Read up to, say, 50 lines. CASTEP .cell files can be quite large, + // but typical detection should be near the top. + std::vector<std::string> lines; + std::string line; + while (std::getline(file, line)) { + lines.push_back(line); + if (lines.size() >= 50) { + break; + } + } + file.close(); + + // We look for e.g. %BLOCK LATTICE_CART, %BLOCK LATTICE_ABC, or %BLOCK POSITIONS_FRAC. + bool foundLatticeBlock = false; + bool foundPositionsBlock = false; + + for (const auto &ln : lines) { + // Trim leading/trailing whitespace + auto startPos = ln.find_first_not_of(" \t"); + if (startPos == std::string::npos) { + continue; // empty line + } + std::string trimmed = ln.substr(startPos); + + // Check for “%BLOCK LATTICE_CART†or “%BLOCK LATTICE_ABC†ignoring case + if (startsWithIgnoreCase(trimmed, "%BLOCK LATTICE_CART") || + startsWithIgnoreCase(trimmed, "%BLOCK LATTICE_ABC")) + { + foundLatticeBlock = true; + } + // Also check if it contains e.g. “%BLOCK POSITIONS_FRAC†+ if (startsWithIgnoreCase(trimmed, "%BLOCK POSITIONS_FRAC") || + startsWithIgnoreCase(trimmed, "%BLOCK POSITIONS_ABS")) + { + foundPositionsBlock = true; + } + } + + // A minimal “likely .cell†detection: we found a lattice block and a positions block + // Very small files might omit one or the other, but this is typically robust. + if (foundLatticeBlock && foundPositionsBlock) { + return true; + } + + // If not found, it might still be a .cell file, but the heuristics do not confirm it. + return false; } std::string StructureReaderSelector::guessFormatFromContent(const std::string &filePath) { std::ifstream ifs(filePath); - if (!ifs.is_open()) { - // If can't open, guess POSCAR (commonly no extension). - return "POSCAR"; - } std::string line; if (std::getline(ifs, line)) { - // Some naive checks - if (line.find("lattice_vector") != std::string::npos) { - return "XSF"; - } - if (line.find("CELL_PARAMETERS") != std::string::npos) { + if (isLikelyCif(filePath)) { return "CIF"; } - if (line.find("%BLOCK LATTICE_") != std::string::npos) { + if (isLikelyCell(filePath)) { return "CELL"; } + if (isLikelyPoscar(filePath)) { + return "POSCAR"; + } } - // fallback - return "POSCAR"; + return "UNKNOWN"; } -- GitLab