From 07b75de3c3be3ea3d472dbf570ad130d289d3be5 Mon Sep 17 00:00:00 2001
From: Marcin Kirsz <mkirsz@ed.ac.uk>
Date: Thu, 6 Mar 2025 16:55:45 +0000
Subject: [PATCH] Working NOMAD sreader

---
 .../mlip/structure_readers/nomad_reader.h     |  86 +++++
 src/nomad_reader.cpp                          | 240 ++++++++++++++
 src/structure_reader_selector.cpp             | 301 +++++++++++++++---
 3 files changed, 582 insertions(+), 45 deletions(-)

diff --git a/include/tadah/mlip/structure_readers/nomad_reader.h b/include/tadah/mlip/structure_readers/nomad_reader.h
index e69de29..197e59f 100644
--- a/include/tadah/mlip/structure_readers/nomad_reader.h
+++ b/include/tadah/mlip/structure_readers/nomad_reader.h
@@ -0,0 +1,86 @@
+#ifndef NOMAD_READER_H
+#define NOMAD_READER_H
+
+#include <string>
+#include <tadah/mlip/structure_readers/structure_reader.h>
+#include <tadah/mlip/structure.h>
+
+/**
+ * @class NomadReader
+ * @brief Implements the StructureReader interface to fetch structural data from NOMAD.
+ *
+ * Uses a POST request:
+ *   https://nomad-lab.eu/prod/v1/api/v1/entries/<entryID>/archive/query
+ * requesting topology, space group, cartesian positions, and chemical symbols.
+ */
+class NomadReader : public StructureReader {
+public:
+  /**
+   * @brief Default constructor. No special authentication is required.
+   */
+  NomadReader();
+
+  /**
+   * @brief Reads structural data from the provided NOMAD entry ID.
+   *
+   * @param entryID A unique identifier in NOMAD, e.g. "zyXabc123..."
+   */
+  void read(const std::string &entryID) override;
+
+  /**
+   * @brief Returns the Structure, including cell, atoms, and metadata label.
+   *
+   * @return The last retrieved structure from NOMAD.
+   */
+  Structure getStructure() const override;
+
+private:
+  /// Internal storage of the retrieved structure.
+  Structure m_structure;
+
+  /**
+   * @brief fetchAndParseNomad() forms the POST request to NOMAD
+   *        and calls parseNomadJson() to interpret the response.
+   *
+   * @param entryID The NOMAD entry identifier.
+   */
+  void fetchAndParseNomad(const std::string &entryID);
+
+  /**
+   * @brief httpPost() sends a POST request with JSON body to @p url.
+   *        Sets standard JSON headers.
+   *
+   * @param url The target endpoint.
+   * @param jsonBody The JSON payload to post.
+   * @return The response body as a std::string.
+   */
+  std::string httpPost(const std::string &url, const std::string &jsonBody);
+
+  /**
+   * @brief parseNomadJson() extracts cell parameters, space group, 
+   *        cartesian positions, and species from the JSON response.
+   *
+   * @param jsonContent Raw JSON response string from NOMAD.
+   */
+  void parseNomadJson(const std::string &jsonContent);
+
+  /**
+   * @brief Builds a 3×3 lattice from edges (a,b,c) and angles (α,β,γ).
+   *        Angles are in radians. Orthorhombic or monoclinic or general triclinic
+   *        cells can be handled.
+   *
+   * @param a Edge length a.
+   * @param b Edge length b.
+   * @param c Edge length c.
+   * @param alpha Angle α in radians.
+   * @param beta Angle β in radians.
+   * @param gamma Angle γ in radians.
+   * @param cell Output 3×3 array for storing the resulting lattice vectors.
+   */
+  void makeCellMatrix(double a, double b, double c,
+                      double alpha, double beta, double gamma,
+                      double (&cell)[3][3]);
+};
+
+#endif // NOMAD_READER_H
+
diff --git a/src/nomad_reader.cpp b/src/nomad_reader.cpp
index e69de29..c28dc2c 100644
--- a/src/nomad_reader.cpp
+++ b/src/nomad_reader.cpp
@@ -0,0 +1,240 @@
+#include <tadah/mlip/structure_readers/nomad_reader.h>
+#include <curl/curl.h>
+#include <nlohmann/json.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+#include <cmath>
+
+// Collects HTTP response data into a std::string
+static size_t writeCallback(char* ptr, size_t size, size_t nmemb, void* userdata) {
+  auto* response = static_cast<std::string*>(userdata);
+  size_t totalBytes = size * nmemb;
+  response->append(ptr, totalBytes);
+  return totalBytes;
+}
+
+// Default constructor. No special API token needed.
+NomadReader::NomadReader() {
+}
+
+// Reads the structure by calling an internal fetchAndParseNomad for a given entryID.
+void NomadReader::read(const std::string &entryID) {
+  fetchAndParseNomad(entryID);
+}
+
+// Returns the last retrieved structure.
+Structure NomadReader::getStructure() const {
+  return m_structure;
+}
+
+/**
+  fetchAndParseNomad() performs a POST request:
+    https://nomad-lab.eu/prod/v1/api/v1/entries/<entryID>/archive/query
+  The JSON body requests specific fields (cell, positions, species, etc.).
+  Then parseNomadJson() processes the response.
+*/
+void NomadReader::fetchAndParseNomad(const std::string &entryID) {
+  const std::string baseURL = "https://nomad-lab.eu/prod/v1/api/v1";
+  const std::string url = baseURL + "/entries/" + entryID + "/archive/query";
+
+  // JSON payload requesting relevant fields
+  static const std::string requestBody = R"JSON(
+  {
+    "required": {
+      "results": {
+        "material": {
+          "symmetry": {
+            "space_group_symbol": "*"
+          },
+          "topology": {
+            "cell": {
+              "volume": "*",
+              "mass_density": "*"
+            }
+          }
+        }
+      },
+      "metadata": {
+        "optimade": {
+          "elements": "*",
+          "chemical_formula_reduced": "*",
+          "lattice_vectors": "*",
+          "cartesian_site_positions": "*",
+          "species_at_sites": "*"
+        }
+      }
+    }
+  }
+  )JSON";
+
+  // POST request with the JSON payload
+  std::string response = httpPost(url, requestBody);
+
+  parseNomadJson(response);
+}
+
+/**
+  httpPost() sets JSON headers and payload for a POST request to @p url.
+  The full response body is returned as a string.
+*/
+std::string NomadReader::httpPost(const std::string &url, const std::string &jsonBody) {
+  CURL* curl = curl_easy_init();
+  if (!curl) {
+    throw std::runtime_error("NomadReader::httpPost: Failed to init cURL.");
+  }
+
+  std::string response;
+  struct curl_slist* headers = nullptr;
+  headers = curl_slist_append(headers, "Content-Type: application/json");
+  headers = curl_slist_append(headers, "accept: application/json");
+
+  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+  curl_easy_setopt(curl, CURLOPT_POST, 1L);
+  curl_easy_setopt(curl, CURLOPT_POSTFIELDS, jsonBody.c_str());
+  curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, static_cast<long>(jsonBody.size()));
+
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
+
+  // Optional user agent
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, "NomadReader/1.0");
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
+  CURLcode res = curl_easy_perform(curl);
+  if (res != CURLE_OK) {
+    std::cerr << "[DEBUG] cURL error code: " << res << std::endl;
+    curl_slist_free_all(headers);
+    curl_easy_cleanup(curl);
+    throw std::runtime_error("NomadReader::httpPost: " + std::string(curl_easy_strerror(res)));
+  }
+
+  curl_slist_free_all(headers);
+  curl_easy_cleanup(curl);
+
+  return response;
+}
+
+/**
+  parseNomadJson() reads:
+   - space group symbol from material.symmetry.space_group_symbol
+   - label info (e.g. volume from topology[0].cell.volume)
+   - a 3×3 lattice from metadata.optimade.lattice_vectors
+   - positions and species from cartesian_site_positions and species_at_sites
+*/
+void NomadReader::parseNomadJson(const std::string &jsonContent) {
+  using json = nlohmann::json;
+  auto root = json::parse(jsonContent);
+
+  // Basic top-level checks
+  if (!root.is_object() || !root.contains("data")) {
+    throw std::runtime_error("parseNomadJson: 'data' missing from root.");
+  }
+  auto dataObj = root["data"];
+  if (!dataObj.contains("archive")) {
+    throw std::runtime_error("parseNomadJson: 'data.archive' missing.");
+  }
+  auto archive = dataObj["archive"];
+
+  // Check for results.material
+  if (!archive.contains("results") || !archive["results"].contains("material")) {
+    throw std::runtime_error("parseNomadJson: results.material is missing.");
+  }
+  auto material = archive["results"]["material"];
+
+  // Space group symbol if available
+  std::string spgSymbol = material.value("symmetry", json())
+                                  .value("space_group_symbol", "???");
+
+  // We do not rely on separate a,b,c, alpha,beta,gamma, but might still read volume
+  if (!material.contains("topology") || !material["topology"].is_array() ||
+      material["topology"].empty()) {
+    throw std::runtime_error("parseNomadJson: No topology array found.");
+  }
+  auto firstTopo = material["topology"][0];
+  if (!firstTopo.contains("cell")) {
+    throw std::runtime_error("parseNomadJson: 'cell' not in the first topology.");
+  }
+  auto cellData = firstTopo["cell"];
+
+  double volume = cellData.value("volume", 0.0);
+  double density = cellData.value("mass_density", 0.0);
+
+  // Get lattice_vectors from metadata.optimade
+  auto optimade = archive["metadata"].value("optimade", json());
+  if (!optimade.contains("lattice_vectors") || !optimade["lattice_vectors"].is_array()) {
+    throw std::runtime_error("parseNomadJson: 'lattice_vectors' missing or not array.");
+  }
+  auto lattVecs = optimade["lattice_vectors"];
+
+  // Check 3×3 array
+  if (lattVecs.size() != 3) {
+    throw std::runtime_error("parseNomadJson: 'lattice_vectors' must have size=3.");
+  }
+  for (int i = 0; i < 3; i++) {
+    if (!lattVecs[i].is_array() || lattVecs[i].size() != 3) {
+      throw std::runtime_error("parseNomadJson: Each row in 'lattice_vectors' must have size=3.");
+    }
+  }
+
+  // Retrieve simplified formula
+  std::string formulaReduced = optimade.value("chemical_formula_reduced", "Unknown");
+
+  // Build a descriptive label
+  std::ostringstream labelStream;
+  labelStream << "NOMAD Entry: " << dataObj.value("entry_id", "???")
+              << " | Formula: " << formulaReduced
+              << " | SpaceGroup: " << spgSymbol
+              << " | Volume: " << volume
+              << " | Density: " << density;
+
+  // Assign data to m_structure
+  m_structure.label = labelStream.str();
+
+  // Transfer the 3×3 matrix to m_structure.cell
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      m_structure.cell(i, j) = lattVecs[i][j].get<double>();
+    }
+  }
+
+  // Retrieve cartesian positions and species
+  if (!optimade.contains("cartesian_site_positions") ||
+      !optimade["cartesian_site_positions"].is_array()) {
+    throw std::runtime_error("parseNomadJson: 'cartesian_site_positions' missing/not array.");
+  }
+  auto cartesianSites = optimade["cartesian_site_positions"];
+
+  if (!optimade.contains("species_at_sites") ||
+      !optimade["species_at_sites"].is_array()) {
+    throw std::runtime_error("parseNomadJson: 'species_at_sites' missing/not array.");
+  }
+  auto speciesSites = optimade["species_at_sites"];
+
+  if (cartesianSites.size() != speciesSites.size()) {
+    throw std::runtime_error("parseNomadJson: Position-species array size mismatch.");
+  }
+
+  m_structure.atoms.reserve(cartesianSites.size());
+
+  // Combine site position with species
+  for (size_t i = 0; i < cartesianSites.size(); ++i) {
+    auto &pos = cartesianSites[i];
+    if (!pos.is_array() || pos.size() != 3) {
+      throw std::runtime_error("parseNomadJson: Site position requires 3 coordinates.");
+    }
+    double x = pos[0].get<double>();
+    double y = pos[1].get<double>();
+    double z = pos[2].get<double>();
+
+    Atom atom;
+    Element e = PeriodicTable().find_by_symbol(speciesSites[i].get<std::string>());
+    static_cast<Element &>(atom) = e;
+    atom.position[0] = x;
+    atom.position[1] = y;
+    atom.position[2] = z;
+
+    m_structure.atoms.push_back(atom);
+  }
+}
diff --git a/src/structure_reader_selector.cpp b/src/structure_reader_selector.cpp
index 44ecbbf..3a4dcc4 100644
--- a/src/structure_reader_selector.cpp
+++ b/src/structure_reader_selector.cpp
@@ -2,18 +2,17 @@
 #include <tadah/mlip/structure_readers/cif_reader.h>
 #include <tadah/mlip/structure_readers/structure_reader_selector.h>
 #include <tadah/mlip/structure_readers/vasp_poscar_reader.h>
-/*#include <tadah/mlip/structure_readers/extended_xyz_reader.h>*/
-/*#include <tadah/mlip/structure_readers/xsf_reader.h>*/
 #include <tadah/mlip/structure_readers/materials_project_reader.h>
 /*#include <tadah/mlip/structure_readers/cod_reader.h>*/
 /*#include <tadah/mlip/structure_readers/aflow_reader.h>*/
-/*#include <tadah/mlip/structure_readers/nomad_reader.h>*/
 #include <tadah/mlip/key_storage/key_manager.h>
+#include <tadah/mlip/structure_readers/nomad_reader.h>
 
 #include <algorithm>
 #include <cctype>
 #include <fstream>
 #include <stdexcept>
+#include <sys/stat.h>
 
 std::unique_ptr<StructureReader>
 StructureReaderSelector::getReader(const std::string &pathOrId) {
@@ -34,85 +33,297 @@ StructureReaderSelector::getReader(const std::string &pathOrId) {
   } else if (format == "CELL") {
     return std::make_unique<CastepCellReader>();
   }
-  /*else if (format == "XYZ") {*/
-  /*    return std::make_unique<ExtendedXyzReader>();*/
-  /*}*/
-  /*else if (format == "XSF") {*/
-  /*    return std::make_unique<XsfReader>();*/
-  /*}*/
   /*else if (format == "COD") {*/
   /*    return std::make_unique<CODReader>();*/
   /*}*/
   /*else if (format == "AFLOW") {*/
   /*    return std::make_unique<AflowReader>();*/
   /*}*/
-  /*else if (format == "NOMAD") {*/
-  /*    return std::make_unique<NomadReader>();*/
-  /*}*/
-  else {
+  else if (format == "NOMAD") {
+    return std::make_unique<NomadReader>();
+  } else {
     throw std::runtime_error(
         "StructureReaderSelector::getReader - Unknown format for " + pathOrId);
   }
 }
 
+bool fileExists(const std::string &path) {
+  struct stat buffer;
+  return (stat(path.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode));
+}
+bool isSevenDigits(const std::string &s) {
+  if (s.size() != 7)
+    return false;
+  return std::all_of(s.begin(), s.end(), ::isdigit);
+}
+// A minimal heuristic for “NOMAD-like” ID
+bool looksLikeNomadId(const std::string &s) {
+  // Example: length typically ~27, so pick a range
+  if (s.size() < 20 || s.size() > 40)
+    return false;
+
+  // Must be URL-safe base64-ish: A–Z, a–z, 0–9, -, _
+  return std::all_of(s.begin(), s.end(), [](unsigned char c) {
+    return (std::isalnum(c) != 0) || c == '-' || c == '_';
+  });
+}
 std::string StructureReaderSelector::guessFormat(const std::string &pathOrId) {
-  // Check known online ID patterns:
+  // 1) Check if path is a local file
+  if (fileExists(pathOrId)) {
+    // Heuristics by extension
+    if (pathOrId.rfind("POSCAR", 0) == 0 || pathOrId.rfind("CONTCAR", 0) == 0) {
+      return "POSCAR";
+    }
+    auto dotPos = pathOrId.find_last_of('.');
+    if (dotPos != std::string::npos) {
+      std::string ext = pathOrId.substr(dotPos + 1);
+      std::transform(ext.begin(), ext.end(), ext.begin(),
+                     [](unsigned char c) { return std::tolower(c); });
+
+      if (ext == "cif")
+        return "CIF";
+      if (ext == "cell")
+        return "CELL";
+    }
+    // If no extension or no known extension
+    return guessFormatFromContent(pathOrId);
+  }
+
+  // 2) Known online prefixes or patterns
+  // MP uses mp- prefix: “mp-####”
   if (pathOrId.rfind("mp-", 0) == 0) {
     return "MP";
   }
-  if (pathOrId.rfind("cod-", 0) == 0) {
+
+  // COD uses 7-digit numeric code, e.g. "8104301".
+  if (isSevenDigits(pathOrId)) {
     return "COD";
   }
-  if (pathOrId.rfind("aflow-", 0) == 0) {
+
+  // AFLOW uses "aflow:######" style.
+  if (pathOrId.rfind("aflow:", 0) == 0) {
     return "AFLOW";
   }
-  if (pathOrId.rfind("nomad-", 0) == 0) {
+
+  // NOMAD minimal check
+  if (looksLikeNomadId(pathOrId)) {
     return "NOMAD";
   }
 
-  // Check file extension:
-  auto dotPos = pathOrId.find_last_of('.');
-  if (dotPos != std::string::npos) {
-    std::string ext = pathOrId.substr(dotPos + 1);
-    std::transform(ext.begin(), ext.end(), ext.begin(),
-                   [](unsigned char c) { return std::tolower(c); });
+  // 3) Fallback: guess-from-server approach
 
-    if (ext == "cif")
-      return "CIF";
-    if (ext == "cell")
-      return "CELL";
-    if (ext == "xsf")
-      return "XSF";
-    if (ext == "xyz")
-      return "XYZ";
-    // Some VASP files may not have an extension
+  // 4) Fail
+  return "UNKNOWN";
+}
+
+
+bool isLikelyPoscar(const std::string &filepath) {
+  std::ifstream file(filepath);
+  if (!file.is_open()) {
+    return false; // can't open -> can't detect
+  }
+
+  // Minimal number of lines for a typical POSCAR
+  // 1) Title/Comment
+  // 2) Scale factor
+  // 3) Lattice vector line #1
+  // 4) Lattice vector line #2
+  // 5) Lattice vector line #3
+  // 6) Element symbols or counts
+  // 7) Possibly more lines (atom counts / Selective Dynamics / coordinates, etc.)
+
+  std::string line;
+  std::vector<std::string> lines;
+  while (std::getline(file, line)) {
+    lines.push_back(line);
+    if (lines.size() > 10) {
+      // Reading more lines is okay, but we only need ~7 lines for detection
+      break;
+    }
+  }
+  file.close();
+
+  // Must have ~6 lines to check
+  if (lines.size() < 6) {
+    return false;
+  }
+
+  // 1) line[0]: Title/comment (no strong requirement, can be anything)
+  // 2) line[1]: Should parse as scale factor (double)
+  {
+    std::istringstream iss(lines[1]);
+    double scale = 0.0;
+    if (!(iss >> scale)) {
+      // Could not parse a double
+      return false;
+    }
+  }
+
+  // 3) The next three lines (2..4) each should have 3 floating values
+  for (int i = 2; i < 5; ++i) {
+    std::istringstream iss(lines[i]);
+    double x, y, z;
+    if (!(iss >> x >> y >> z)) {
+      // Could not parse 3 floats
+      return false;
+    }
+  }
+
+  // 4) line[5]: Often either element symbols or number of atoms
+  //    But this line is optional. Let's do a minimal check:
+  //    In many POSCARs, if it's not numeric, it might be e.g., "Si O" or "Fe" or something.
+  //    We'll not strictly require anything here. Additional checks can be added if needed.
+
+  // 5) Optional: Check for “Selective dynamics” or the “Direct/Cartesian” line.
+  //    The line after the element/atom count might be "Selective dynamics"
+  //    or "s" in some usage. Then the next line might be "Direct" or "Cartesian".
+  
+  // If we've gotten this far, the file strongly resembles a POSCAR.
+  return true;
+}
+
+// A minimal function to see if “text” starts with “prefix” (case-sensitive).
+static bool startsWith(const std::string &text, const std::string &prefix) {
+  return text.rfind(prefix, 0) == 0;
+}
+
+bool isLikelyCif(const std::string &filepath) {
+  std::ifstream file(filepath);
+  if (!file.is_open()) {
+    return false; // Could not open -> not a CIF
+  }
+
+  // We'll read up to ~30 lines (CIF can have many lines, but this is enough to detect).
+  std::vector<std::string> lines;
+  std::string line;
+  while (std::getline(file, line)) {
+    lines.push_back(line);
+    if (lines.size() >= 30) {
+      break;
+    }
+  }
+  file.close();
+
+  // Basic heuristic checks:
+  // 1) Is there at least one line starting with "data_"?
+  // 2) Are there lines with typical CIF tags, like "_cell_length_", "_cell_angle_", "_atom_site_", etc.?
+
+  bool foundDataLine = false;
+  bool foundCellOrAtomLine = false;
+
+  for (auto &ln : lines) {
+    // Trim leading/trailing whitespace for easier checks
+    // (could also keep them if your checking is simple)
+    auto startPos = ln.find_first_not_of(" \t\r\n");
+    if (startPos == std::string::npos) {
+      continue; // empty line
+    }
+    std::string trimmed = ln.substr(startPos);
+
+    if (startsWith(trimmed, "data_")) {
+      foundDataLine = true;
+    }
+    if (startsWith(trimmed, "_cell_length_") ||
+        startsWith(trimmed, "_cell_angle_")  ||
+        startsWith(trimmed, "_symmetry_")    ||
+        startsWith(trimmed, "_atom_site_"))
+    {
+      foundCellOrAtomLine = true;
+    }
+  }
+
+  // A very minimal criterion: must have a data_ line plus at least one known CIF tag line.
+  if (foundDataLine && foundCellOrAtomLine) {
+    return true;
   }
 
-  return guessFormatFromContent(pathOrId);
+  // Possibly check for “loop_” lines if you want extra confidence.
+  return false;
+}
+
+// Utility: checks if a line begins with a (case-insensitive) prefix
+static bool startsWithIgnoreCase(const std::string& line, const std::string& prefix) {
+  if (line.size() < prefix.size()) return false;
+  
+  for (std::size_t i = 0; i < prefix.size(); ++i) {
+    if (std::tolower(static_cast<unsigned char>(line[i])) !=
+        std::tolower(static_cast<unsigned char>(prefix[i]))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool isLikelyCell(const std::string &filepath) {
+  std::ifstream file(filepath);
+  if (!file.is_open()) {
+    return false;
+  }
+
+  // Read up to, say, 50 lines. CASTEP .cell files can be quite large,
+  // but typical detection should be near the top.
+  std::vector<std::string> lines;
+  std::string line;
+  while (std::getline(file, line)) {
+    lines.push_back(line);
+    if (lines.size() >= 50) {
+      break;
+    }
+  }
+  file.close();
+
+  // We look for e.g. %BLOCK LATTICE_CART, %BLOCK LATTICE_ABC, or %BLOCK POSITIONS_FRAC.
+  bool foundLatticeBlock = false;
+  bool foundPositionsBlock = false;
+
+  for (const auto &ln : lines) {
+    // Trim leading/trailing whitespace
+    auto startPos = ln.find_first_not_of(" \t");
+    if (startPos == std::string::npos) {
+      continue; // empty line
+    }
+    std::string trimmed = ln.substr(startPos);
+
+    // Check for “%BLOCK LATTICE_CART” or “%BLOCK LATTICE_ABC” ignoring case
+    if (startsWithIgnoreCase(trimmed, "%BLOCK LATTICE_CART") ||
+        startsWithIgnoreCase(trimmed, "%BLOCK LATTICE_ABC"))
+    {
+      foundLatticeBlock = true;
+    }
+    // Also check if it contains e.g. “%BLOCK POSITIONS_FRAC”
+    if (startsWithIgnoreCase(trimmed, "%BLOCK POSITIONS_FRAC") ||
+        startsWithIgnoreCase(trimmed, "%BLOCK POSITIONS_ABS"))
+    {
+      foundPositionsBlock = true;
+    }
+  }
+
+  // A minimal “likely .cell” detection: we found a lattice block and a positions block
+  // Very small files might omit one or the other, but this is typically robust.
+  if (foundLatticeBlock && foundPositionsBlock) {
+    return true;
+  }
+
+  // If not found, it might still be a .cell file, but the heuristics do not confirm it.
+  return false;
 }
 
 std::string
 StructureReaderSelector::guessFormatFromContent(const std::string &filePath) {
   std::ifstream ifs(filePath);
-  if (!ifs.is_open()) {
-    // If can't open, guess POSCAR (commonly no extension).
-    return "POSCAR";
-  }
 
   std::string line;
   if (std::getline(ifs, line)) {
-    // Some naive checks
-    if (line.find("lattice_vector") != std::string::npos) {
-      return "XSF";
-    }
-    if (line.find("CELL_PARAMETERS") != std::string::npos) {
+    if (isLikelyCif(filePath)) {
       return "CIF";
     }
-    if (line.find("%BLOCK LATTICE_") != std::string::npos) {
+    if (isLikelyCell(filePath)) {
       return "CELL";
     }
+    if (isLikelyPoscar(filePath)) {
+      return "POSCAR";
+    }
   }
 
-  // fallback
-  return "POSCAR";
+  return "UNKNOWN";
 }
-- 
GitLab