From 58d653ef734813c188b22d248922df1c5c51b630 Mon Sep 17 00:00:00 2001
From: Marcin Kirsz <mkirsz@ed.ac.uk>
Date: Tue, 4 Mar 2025 20:11:56 +0000
Subject: [PATCH] WIP: Structure Readers

---
 CMakeLists.txt                                |  87 +++++-
 .../mlip/structure_readers/aflow_reader.h     |   0
 .../structure_readers/castep_cell_reader.h    |  74 +++++
 .../tadah/mlip/structure_readers/cif_reader.h |  51 +++
 .../tadah/mlip/structure_readers/cod_reader.h |   0
 .../materials_project_reader.h                |  33 ++
 .../mlip/structure_readers/nomad_reader.h     |   0
 .../mlip/structure_readers/structure_reader.h |  33 ++
 .../structure_reader_selector.h               |  29 ++
 .../structure_readers/vasp_poscar_reader.h    |  30 ++
 src/castep_cell_reader.cpp                    | 293 ++++++++++++++++++
 src/cif_reader.cpp                            |  59 ++++
 src/materials_project_reader.cpp              | 122 ++++++++
 src/structure_reader_selector.cpp             | 114 +++++++
 src/vasp_poscar_reader.cpp                    | 204 ++++++++++++
 15 files changed, 1128 insertions(+), 1 deletion(-)
 create mode 100644 include/tadah/mlip/structure_readers/aflow_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/castep_cell_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/cif_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/cod_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/materials_project_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/nomad_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/structure_reader.h
 create mode 100644 include/tadah/mlip/structure_readers/structure_reader_selector.h
 create mode 100644 include/tadah/mlip/structure_readers/vasp_poscar_reader.h
 create mode 100644 src/castep_cell_reader.cpp
 create mode 100644 src/cif_reader.cpp
 create mode 100644 src/materials_project_reader.cpp
 create mode 100644 src/structure_reader_selector.cpp
 create mode 100644 src/vasp_poscar_reader.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18f0dc6..6c42a3b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,13 +44,66 @@ if (BUILD_SHARED_LIBS)
   endif("${isSystemDir}" STREQUAL "-1")
 endif()
 
+# Set default build type
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+if (CMAKE_BUILD_TYPE STREQUAL "DEVELOP")
+if(NOT DEFINED ENV{TADAH_PATH})
+  message(FATAL_ERROR "TADAH_PATH system variable is undefined for the ${CMAKE_BUILD_TYPE} build")
+endif()
+  function(find_directory_case_insensitive target_name search_dir result_var)
+    execute_process(
+        COMMAND find "${search_dir}" -maxdepth 1 -type d -iname "${target_name}"
+        OUTPUT_VARIABLE found_dir
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(found_dir)
+      set(${result_var} "${found_dir}" PARENT_SCOPE)
+      message("${TADAH} Develop build: Found directory: ${found_dir}")
+    else()
+      set(${result_var} "" PARENT_SCOPE)
+      message("${TADAH} Develop build: Directory not found.")
+    endif()
+  endfunction()
+  find_directory_case_insensitive("models" "$ENV{TADAH_PATH}" DEVELOP_MODELS_DIR)
+  set(FETCHCONTENT_SOURCE_DIR_TADAH.MODELS "${DEVELOP_MODELS_DIR}")
+endif()
+
+
+# Check is build supported
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+if (CMAKE_BUILD_TYPE STREQUAL "DEBUG" 
+    OR
+    CMAKE_BUILD_TYPE STREQUAL "RELEASE"
+    OR
+    CMAKE_BUILD_TYPE STREQUAL "PROFILE"
+    OR
+    CMAKE_BUILD_TYPE STREQUAL "DEVELOP"
+)
+  message(STATUS "${TADAH} Build type:  ${CMAKE_BUILD_TYPE}.")
+else()
+  message(FATAL_ERROR "
+  Selected build type is not supported: ${CMAKE_BUILD_TYPE}.
+  Supported build types: Release, Debug, Profile Develop"
+  )
+endif()
+
+# Update the documentation string of CMAKE_BUILD_TYPE for GUIs
+set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING
+  "Choose the type of build, options are: None Debug Release Profile."
+  FORCE)
+
+
 include(FetchContent)
 
 if(DEFINED ENV{CI_COMMIT_REF_NAME})
   set(GIT_BRANCH "$ENV{CI_COMMIT_REF_NAME}")
 else()
   execute_process(
-    COMMAND bash -c "git status | grep origin | sed -e 's/.*origin\\///' -e 's/[^a-zA-Z].*//'" 
+    COMMAND bash -c "git status | sed -n '1s/^On branch //p'" 
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_BRANCH
     OUTPUT_STRIP_TRAILING_WHITESPACE)
@@ -75,6 +128,38 @@ target_include_directories(tadah.mlip PUBLIC ${Tadah.MODELS_SOURCE_DIR}/include)
 target_include_directories(tadah.mlip PUBLIC ${Tadah.CORE_SOURCE_DIR}/include)
 target_include_directories(tadah.mlip PUBLIC ${Tadah.CORE_SOURCE_DIR}/external/toml11)
 
+FetchContent_Declare(
+  gemmi
+  GIT_REPOSITORY https://github.com/project-gemmi/gemmi.git
+  GIT_TAG        v0.7.0
+)
+FetchContent_MakeAvailable(gemmi)
+# target_link_libraries(tadah.mlip PRIVATE gemmi::headers)
+target_link_libraries(tadah.mlip PRIVATE gemmi::gemmi_cpp)
+
+FetchContent_Declare(
+  json
+  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
+  DOWNLOAD_EXTRACT_TIMESTAMP true
+)
+FetchContent_MakeAvailable(json)
+target_link_libraries(tadah.mlip PRIVATE nlohmann_json::nlohmann_json)
+
+find_package(CURL QUIET)   # Use QUIET to avoid CMake’s default error
+
+if(NOT CURL_FOUND)
+  message(FATAL_ERROR
+    "\nCould not find libcurl. Please install the development files.\n"
+    "On Ubuntu/Debian-based systems, try:\n"
+    "  sudo apt-get install libcurl4-openssl-dev\n"
+    "On Fedora/Red Hat, try:\n"
+    "  sudo dnf install libcurl-devel\n"
+    "Or specify CURL_INCLUDE_DIR/CURL_LIBRARY manually.\n"
+  )
+endif()
+
+target_link_libraries(tadah.mlip PRIVATE CURL::libcurl)
+
 if(TADAH_ENABLE_OPENMP)
   find_package(OpenMP REQUIRED)
   target_link_libraries(tadah.mlip PUBLIC OpenMP::OpenMP_CXX)
diff --git a/include/tadah/mlip/structure_readers/aflow_reader.h b/include/tadah/mlip/structure_readers/aflow_reader.h
new file mode 100644
index 0000000..e69de29
diff --git a/include/tadah/mlip/structure_readers/castep_cell_reader.h b/include/tadah/mlip/structure_readers/castep_cell_reader.h
new file mode 100644
index 0000000..d494f30
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/castep_cell_reader.h
@@ -0,0 +1,74 @@
+#ifndef CASTEP_CELL_READER_H
+#define CASTEP_CELL_READER_H
+
+#include <string>
+#include <tadah/mlip/structure.h>
+#include <tadah/mlip/structure_readers/structure_reader.h>
+
+/**
+ * @brief Reads crystallographic and atomic data from a CASTEP .cell file.
+ *
+ * The .cell file can contain:
+ *  - %BLOCK LATTICE_CART  or %BLOCK LATTICE_ABC
+ *  - %BLOCK POSITIONS_FRAC or %BLOCK POSITIONS_ABS
+ *
+ * If LATTICE_ABC is provided, the code converts (a,b,c,α,β,γ) into
+ * a 3×3 Cartesian representation with:
+ *   a along x,
+ *   b in xy plane,
+ *   c forming a right-handed set with a and b.
+ *
+ * If positions are fractional, we multiply by the 3×3 cell matrix
+ * to get absolute positions in Angstrom. If positions are absolute,
+ * we store them directly.
+ *
+ * This parser also extracts atomic species from the first token
+ * in POSITIONS_* blocks.
+ */
+class CastepCellReader : public StructureReader {
+public:
+  /**
+   * @brief Default constructor.
+   */
+  CastepCellReader() = default;
+
+  /**
+   * @brief Reads and parses the specified .cell file.
+   * @param path Path to the CASTEP cell file.
+   */
+  void read(const std::string &path) override;
+
+  /**
+   * @brief Returns the parsed Structure with absolute coordinates in Ångström.
+   */
+  Structure getStructure() const override;
+
+private:
+  Structure m_structure;
+
+  /**
+   * @brief Parses the complete contents of a .cell file from a single string.
+   */
+  void parseCellContents(const std::string &contents);
+
+  /**
+   * @brief Parses LATTICE_ABC lines into the 3×3 matrix in m_structure.cell.
+   * @param lines a vector of strings containing a b c on one line, alpha beta
+   * gamma on next line
+   */
+  void parseLatticeABC(const std::vector<std::string> &lines);
+
+  /**
+   * @brief Converts atomic fractional coordinates (fx, fy, fz) to absolute in Ã…
+   *        using the current m_structure.cell matrix.
+   */
+  void applyFractional(double fx, double fy, double fz, double &cx, double &cy,
+                       double &cz);
+
+  /**
+   * @brief Helper for mapping string species (e.g., "Si", "O") to Element enum.
+   */
+  Element parseSpecies(const std::string &sym);
+};
+
+#endif // CASTEP_CELL_READER_H
diff --git a/include/tadah/mlip/structure_readers/cif_reader.h b/include/tadah/mlip/structure_readers/cif_reader.h
new file mode 100644
index 0000000..6488cf3
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/cif_reader.h
@@ -0,0 +1,51 @@
+#ifndef CIF_READER_H
+#define CIF_READER_H
+
+#include <tadah/mlip/structure_readers/structure_reader.h>
+#include <tadah/mlip/structure.h>
+#include <string>
+
+/**
+ * @brief Reads Structure data from a CIF file using older Gemmi APIs.
+ *
+ * - Gets cell parameters from block.find_value("_cell_length_a"), etc.
+ * - Iterates block.items to find loops (where item.kind == gemmi::cif::Item::Loop).
+ * - Searches loop.tags for _atom_site_fract_x or _atom_site_cartn_x.
+ * - Uses loop.get_s(row, col) to retrieve the data cells, converting to float/double.
+ */
+class CifReader : public StructureReader {
+public:
+  CifReader() = default;
+
+  void read(const std::string &path) override;
+  Structure getStructure() const override;
+
+private:
+  Structure m_structure;
+
+  /**
+   * @brief Parses the entire CIF text, populating m_structure with cell & atoms.
+   */
+  void parseCifContents(const std::string &contents);
+
+  /**
+   * @brief Creates a 3×3 matrix from (a,b,c, alpha,beta,gamma).
+   *        a along x, b in xy-plane, c forms a right-handed set.
+   */
+  void buildCellFromABC(double aVal, double bVal, double cVal,
+                        double alphaDeg, double betaDeg, double gammaDeg);
+
+  /**
+   * @brief Convert (fx,fy,fz) fractional → Cartesian coords using m_structure.cell.
+   */
+  void fracToCart(double fx, double fy, double fz,
+                  double &cx, double &cy, double &cz) const;
+
+  /**
+   * @brief Convert degrees to radians.
+   */
+  static double deg2rad(double deg);
+};
+
+#endif // CIF_READER_H
+
diff --git a/include/tadah/mlip/structure_readers/cod_reader.h b/include/tadah/mlip/structure_readers/cod_reader.h
new file mode 100644
index 0000000..e69de29
diff --git a/include/tadah/mlip/structure_readers/materials_project_reader.h b/include/tadah/mlip/structure_readers/materials_project_reader.h
new file mode 100644
index 0000000..6d08cde
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/materials_project_reader.h
@@ -0,0 +1,33 @@
+#ifndef MATERIALS_PROJECT_READER_H
+#define MATERIALS_PROJECT_READER_H
+
+#include <string>
+#include <tadah/mlip/structure_readers/structure_reader.h>
+
+/**
+ * @brief Reads a Structure from the Materials Project via REST API (v2).
+ *
+ * Usage:
+ * @code
+ *   MaterialsProjectReader reader("MY_VALID_MP_KEY");
+ *   reader.read("mp-149");
+ *   Structure s = reader.getStructure();
+ * @endcode
+ */
+class MaterialsProjectReader : public StructureReader {
+public:
+  explicit MaterialsProjectReader(const std::string &apiKey);
+
+  void read(const std::string &mpID) override;
+  Structure getStructure() const override;
+
+private:
+  std::string m_apiKey;
+  Structure m_structure;
+
+  void fetchAndParseMP(const std::string &mpID);
+  std::string httpGet(const std::string &url);
+  void parseMpJson(const std::string &jsonContent);
+};
+
+#endif // MATERIALS_PROJECT_READER_H
diff --git a/include/tadah/mlip/structure_readers/nomad_reader.h b/include/tadah/mlip/structure_readers/nomad_reader.h
new file mode 100644
index 0000000..e69de29
diff --git a/include/tadah/mlip/structure_readers/structure_reader.h b/include/tadah/mlip/structure_readers/structure_reader.h
new file mode 100644
index 0000000..e7bf58c
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/structure_reader.h
@@ -0,0 +1,33 @@
+#ifndef STRUCTURE_READER_H
+#define STRUCTURE_READER_H
+
+#include <string>
+#include <tadah/mlip/structure.h>
+
+/**
+ * @brief Abstract base class for reading a Structure from a file or an online
+ * source.
+ *
+ * Provides a unified interface for various file formats or REST API sources.
+ */
+class StructureReader {
+public:
+  /// Virtual destructor.
+  virtual ~StructureReader() = default;
+
+  /**
+   * @brief Reads structural information from the given path or identifier.
+   *
+   * @param path A file path (e.g., "myfile.cif") or an online ID (e.g.,
+   * "mp-149").
+   */
+  virtual void read(const std::string &path) = 0;
+
+  /**
+   * @brief Retrieves the last read Structure.
+   *        All coordinates should be in absolute (Cartesian) Angstrom units.
+   */
+  virtual Structure getStructure() const = 0;
+};
+
+#endif // STRUCTURE_READER_H
diff --git a/include/tadah/mlip/structure_readers/structure_reader_selector.h b/include/tadah/mlip/structure_readers/structure_reader_selector.h
new file mode 100644
index 0000000..e32b016
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/structure_reader_selector.h
@@ -0,0 +1,29 @@
+#ifndef STRUCTURE_READER_SELECTOR_H
+#define STRUCTURE_READER_SELECTOR_H
+
+#include <memory>
+#include <string>
+#include <tadah/mlip/structure_readers/structure_reader.h>
+
+/**
+ * @brief A factory that provides an appropriate StructureReader
+ *        based on file extension or recognized online ID prefixes.
+ */
+class StructureReaderSelector {
+public:
+  /**
+   * @brief Creates a reader by analyzing the input path or ID.
+   *
+   * @param pathOrId File path or an online ID (e.g., "mp-149")
+   * @return A suitable StructureReader capable of parsing the structure.
+   * @throws std::runtime_error if no suitable reader is found.
+   */
+  static std::unique_ptr<StructureReader>
+  getReader(const std::string &pathOrId);
+
+private:
+  static std::string guessFormat(const std::string &pathOrId);
+  static std::string guessFormatFromContent(const std::string &filePath);
+};
+
+#endif // STRUCTURE_READER_SELECTOR_H
diff --git a/include/tadah/mlip/structure_readers/vasp_poscar_reader.h b/include/tadah/mlip/structure_readers/vasp_poscar_reader.h
new file mode 100644
index 0000000..2abca5d
--- /dev/null
+++ b/include/tadah/mlip/structure_readers/vasp_poscar_reader.h
@@ -0,0 +1,30 @@
+#ifndef VASP_POSCAR_READER_H
+#define VASP_POSCAR_READER_H
+
+#include <tadah/mlip/structure_readers/structure_reader.h>
+
+/**
+ * @brief Reads a VASP POSCAR/CONTCAR file.
+ *
+ * A typical format:
+ *   1) Comment line
+ *   2) Scaling factor
+ *   3-5) Lattice vectors
+ *   6) Element symbols or counts
+ *   7) Possibly "Selective dynamics" line
+ *   8) "Direct" or "Cartesian"
+ *   9+) Atomic coordinates
+ */
+class VaspPoscarReader : public StructureReader {
+public:
+  VaspPoscarReader() = default;
+
+  void read(const std::string &path) override;
+  Structure getStructure() const override;
+
+private:
+  Structure m_structure;
+  void parsePoscar(const std::vector<std::string> &lines);
+};
+
+#endif // VASP_POSCAR_READER_H
diff --git a/src/castep_cell_reader.cpp b/src/castep_cell_reader.cpp
new file mode 100644
index 0000000..fb09541
--- /dev/null
+++ b/src/castep_cell_reader.cpp
@@ -0,0 +1,293 @@
+#include <tadah/mlip/structure_readers/castep_cell_reader.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+//--------------------------------------------------------------------
+// Helper to split a line by whitespace
+//--------------------------------------------------------------------
+static std::vector<std::string> split_line(const std::string &line) {
+  std::istringstream iss(line);
+  std::vector<std::string> tokens;
+  std::string s;
+  while (iss >> s) {
+    tokens.push_back(s);
+  }
+  return tokens;
+}
+
+// Convert degrees to radians
+static double deg2rad(double deg) {
+  static const double PI = 3.141592653589793;
+  return deg * (PI / 180.0);
+}
+
+//--------------------------------------------------------------------
+// Implementation of public methods
+//--------------------------------------------------------------------
+void CastepCellReader::read(const std::string &path) {
+  std::ifstream ifs(path);
+  if (!ifs.is_open()) {
+    throw std::runtime_error("CastepCellReader: Cannot open file " + path);
+  }
+
+  std::ostringstream buffer;
+  buffer << ifs.rdbuf();
+  parseCellContents(buffer.str());
+
+  m_structure.label = "CASTEP cell file: " + path;
+}
+
+Structure CastepCellReader::getStructure() const {
+  return m_structure;
+}
+
+//--------------------------------------------------------------------
+// parseCellContents
+//--------------------------------------------------------------------
+void CastepCellReader::parseCellContents(const std::string &contents) {
+  std::istringstream iss(contents);
+  std::string line;
+
+  // State flags for each block
+  bool inLatticeCart   = false;
+  bool inLatticeABC    = false;
+  bool inPosFracBlock  = false;
+  bool inPosAbsBlock   = false;
+
+  int  lcCount = 0; // track lines read for LATTICE_CART
+  std::vector<std::string> abcLines; // store lines for LATTICE_ABC
+
+  // Initialize cell to zero (in case we never find any block)
+  for (int i=0; i<3; ++i)
+    for (int j=0; j<3; ++j)
+      m_structure.cell(i,j) = 0.0;
+
+  // Read file line by line
+  while (std::getline(iss, line)) {
+    // Make a lowercase copy for easier detection
+    std::string lcase = line;
+    std::transform(lcase.begin(), lcase.end(), lcase.begin(), ::tolower);
+
+    // Detect block starts/ends
+    if (lcase.find("%block lattice_cart") != std::string::npos) {
+      inLatticeCart = true;
+      inLatticeABC  = false;
+      lcCount = 0;
+      continue;
+    }
+    if (lcase.find("%endblock lattice_cart") != std::string::npos) {
+      inLatticeCart = false;
+      continue;
+    }
+    if (lcase.find("%block lattice_abc") != std::string::npos) {
+      inLatticeABC  = true;
+      inLatticeCart = false;
+      lcCount = 0;
+      abcLines.clear();
+      continue;
+    }
+    if (lcase.find("%endblock lattice_abc") != std::string::npos) {
+      inLatticeABC = false;
+      // parse lines from abcLines (if present)
+      if (abcLines.size() >= 2) {
+        parseLatticeABC(abcLines);
+      }
+      continue;
+    }
+    if (lcase.find("%block positions_frac") != std::string::npos) {
+      inPosFracBlock = true;
+      inPosAbsBlock  = false;
+      continue;
+    }
+    if (lcase.find("%endblock positions_frac") != std::string::npos) {
+      inPosFracBlock = false;
+      continue;
+    }
+    if (lcase.find("%block positions_abs") != std::string::npos) {
+      inPosAbsBlock  = true;
+      inPosFracBlock = false;
+      continue;
+    }
+    if (lcase.find("%endblock positions_abs") != std::string::npos) {
+      inPosAbsBlock = false;
+      continue;
+    }
+
+    // Now parse lines if in any recognized block
+    if (inLatticeCart) {
+      // read up to 3 lines for a,b,c
+      if (lcCount < 3) {
+        auto toks = split_line(line);
+        if (toks.size() < 3) {
+          throw std::runtime_error("CastepCellReader: LATTICE_CART line missing coords.");
+        }
+        double x = std::stod(toks[0]);
+        double y = std::stod(toks[1]);
+        double z = std::stod(toks[2]);
+
+        m_structure.cell(lcCount,0) = x;
+        m_structure.cell(lcCount,1) = y;
+        m_structure.cell(lcCount,2) = z;
+        lcCount++;
+      }
+    }
+    else if (inLatticeABC) {
+      // store lines for later parsing
+      if (!line.empty()) {
+        abcLines.push_back(line);
+      }
+    }
+    else if (inPosFracBlock) {
+      // e.g. "Si 0.0 0.0 0.0"
+      auto toks = split_line(line);
+      if (toks.size() < 4) {
+        // possibly a comment or empty line, skip
+        continue;
+      }
+
+      // parse species
+      Element e = parseSpecies(toks[0]);
+
+      // fractional coords
+      double fx = std::stod(toks[1]);
+      double fy = std::stod(toks[2]);
+      double fz = std::stod(toks[3]);
+
+      double cx, cy, cz;
+      applyFractional(fx, fy, fz, cx, cy, cz);
+
+      Atom a;  
+      // Copy base element data into 'a'
+      static_cast<Element&>(a) = e;
+      // Now set the position
+      a.position = {cx, cy, cz};
+
+      // Force is left default (0,0,0)
+      m_structure.atoms.push_back(a);
+    }
+    else if (inPosAbsBlock) {
+      // e.g. "O 1.2 3.4 5.6"
+      auto toks = split_line(line);
+      if (toks.size() < 4) {
+        continue;
+      }
+
+      Element e = parseSpecies(toks[0]);
+
+      double x = std::stod(toks[1]);
+      double y = std::stod(toks[2]);
+      double z = std::stod(toks[3]);
+
+      Atom a;
+      static_cast<Element&>(a) = e;
+      a.position = {x, y, z};
+
+      m_structure.atoms.push_back(a);
+    }
+  }
+}
+
+//--------------------------------------------------------------------
+// parseLatticeABC
+//--------------------------------------------------------------------
+void CastepCellReader::parseLatticeABC(const std::vector<std::string> &lines) {
+  // We expect at least two lines:
+  //   line 1: "a b c"
+  //   line 2: "alpha beta gamma"
+  // Additional lines might exist (units, comments).
+  // We'll search for the first line with >=3 tokens for (a,b,c),
+  // and the next with >=3 tokens for (alpha,beta,gamma).
+
+  double aVal=0.0, bVal=0.0, cVal=0.0;
+  double alphaDeg=90.0, betaDeg=90.0, gammaDeg=90.0;
+  bool haveABC = false;
+  bool haveAngles = false;
+
+  for (auto &ln : lines) {
+    auto toks = split_line(ln);
+    if (toks.size() >= 3 && !haveABC) {
+      aVal = std::stod(toks[0]);
+      bVal = std::stod(toks[1]);
+      cVal = std::stod(toks[2]);
+      haveABC = true;
+      continue;
+    }
+    else if (toks.size() >= 3 && haveABC && !haveAngles) {
+      alphaDeg = std::stod(toks[0]);
+      betaDeg  = std::stod(toks[1]);
+      gammaDeg = std::stod(toks[2]);
+      haveAngles = true;
+      break;
+    }
+  }
+  if (!haveABC || !haveAngles) {
+    throw std::runtime_error("CastepCellReader: LATTICE_ABC block incomplete (need a,b,c and alpha,beta,gamma).");
+  }
+
+  double alpha = deg2rad(alphaDeg);
+  double beta  = deg2rad(betaDeg);
+  double gamma = deg2rad(gammaDeg);
+
+  // Standard orientation:
+  //   a = (aVal, 0, 0)
+  //   b.x = bVal * cos(gamma)
+  //   b.y = bVal * sin(gamma)
+  //   b.z = 0
+  //   c.x = cVal * cos(beta)
+  //   c.y = cVal * [cos(alpha) - cos(beta)*cos(gamma)] / sin(gamma)
+  //   c.z = sqrt(cVal^2 - cx^2 - cy^2)
+
+  // a
+  m_structure.cell(0,0) = aVal;
+  m_structure.cell(0,1) = 0.0;
+  m_structure.cell(0,2) = 0.0;
+
+  // b
+  double bx = bVal * std::cos(gamma);
+  double by = bVal * std::sin(gamma);
+  m_structure.cell(1,0) = bx;
+  m_structure.cell(1,1) = by;
+  m_structure.cell(1,2) = 0.0;
+
+  // c
+  double cx = cVal * std::cos(beta);
+  double cy = cVal * (std::cos(alpha) - std::cos(beta)*std::cos(gamma)) / std::sin(gamma);
+  double czsq = cVal*cVal - cx*cx - cy*cy;
+  if (czsq < 0.0) {
+    // could be numerical rounding or truly invalid
+    if (czsq > -1e-10) czsq = 0.0; // clamp small negative to zero
+    else {
+      throw std::runtime_error("CastepCellReader: invalid angles produce negative c.z^2.");
+    }
+  }
+  double cz = std::sqrt(czsq);
+
+  m_structure.cell(2,0) = cx;
+  m_structure.cell(2,1) = cy;
+  m_structure.cell(2,2) = cz;
+}
+
+//--------------------------------------------------------------------
+// applyFractional
+//--------------------------------------------------------------------
+void CastepCellReader::applyFractional(double fx, double fy, double fz, 
+                                       double &cx, double &cy, double &cz) {
+  // r_cart = fx*a + fy*b + fz*c
+  //    a, b, c are rows of the 3×3 matrix in m_structure.cell
+  cx = fx*m_structure.cell(0,0) + fy*m_structure.cell(1,0) + fz*m_structure.cell(2,0);
+  cy = fx*m_structure.cell(0,1) + fy*m_structure.cell(1,1) + fz*m_structure.cell(2,1);
+  cz = fx*m_structure.cell(0,2) + fy*m_structure.cell(1,2) + fz*m_structure.cell(2,2);
+}
+
+//--------------------------------------------------------------------
+// parseSpecies
+//--------------------------------------------------------------------
+Element CastepCellReader::parseSpecies(const std::string &sym) {
+  return PeriodicTable().find_by_symbol(sym);
+}
diff --git a/src/cif_reader.cpp b/src/cif_reader.cpp
new file mode 100644
index 0000000..1cba6f2
--- /dev/null
+++ b/src/cif_reader.cpp
@@ -0,0 +1,59 @@
+#include <tadah/core/periodic_table.h>
+#include <tadah/mlip/structure_readers/cif_reader.h>
+
+#include <gemmi/cif.hpp>
+#include <gemmi/smcif.hpp>
+
+#include <cmath>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+
+void CifReader::read(const std::string &path) {
+  std::ifstream ifs(path);
+  if (!ifs.is_open()) {
+    throw std::runtime_error("CifReader::read: Cannot open " + path);
+  }
+
+  std::ostringstream buffer;
+  buffer << ifs.rdbuf();
+  parseCifContents(buffer.str());
+
+  // Label the source
+  m_structure.label = "CIF file: " + path;
+}
+
+Structure CifReader::getStructure() const { return m_structure; }
+
+void CifReader::parseCifContents(const std::string &fileContents) {
+  using namespace gemmi::cif;
+
+  // Parses the raw CIF content into a gemmi Document
+  Document doc = read_string(fileContents);
+  if (doc.blocks.empty()) {
+    throw std::runtime_error(
+        "CifReader::parseCifContents: No data blocks found.");
+  }
+
+  // Assumes there is only one data block in the CIF
+  Block &block = doc.sole_block();
+  gemmi::SmallStructure struc = gemmi::make_small_structure_from_block(block);
+  struc.check_spacegroup();
+
+  auto sites = struc.get_all_unit_cell_sites();
+
+  m_structure.atoms.reserve(sites.size());
+  for (const auto &site : sites) {
+    auto p = struc.cell.orthogonalize(site.fract);
+    auto label = site.element.name();
+    Element e = PeriodicTable().find_by_symbol(label);
+    Atom a(e, p.x, p.y, p.z, 0, 0, 0);
+    m_structure.atoms.push_back(a);
+  }
+  auto mat = struc.cell.orth.mat;
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 3; ++j) {
+      m_structure.cell(i, j) = mat.a[i][j];
+    }
+  }
+}
diff --git a/src/materials_project_reader.cpp b/src/materials_project_reader.cpp
new file mode 100644
index 0000000..dc688bd
--- /dev/null
+++ b/src/materials_project_reader.cpp
@@ -0,0 +1,122 @@
+#include <sstream>
+#include <stdexcept>
+#include <tadah/mlip/structure_readers/materials_project_reader.h>
+
+#include <curl/curl.h>       // libcurl
+#include <nlohmann/json.hpp> // nlohmann JSON
+
+static Element parse_element_mp(const std::string &elemName);
+
+MaterialsProjectReader::MaterialsProjectReader(const std::string &apiKey)
+    : m_apiKey(apiKey) {}
+
+void MaterialsProjectReader::read(const std::string &mpID) {
+  fetchAndParseMP(mpID);
+  m_structure.label = "MaterialsProject ID: " + mpID;
+}
+
+Structure MaterialsProjectReader::getStructure() const { return m_structure; }
+
+void MaterialsProjectReader::fetchAndParseMP(const std::string &mpID) {
+  std::string url = "https://materialsproject.org/rest/v2/materials/";
+  url += mpID + "/structure?API_KEY=" + m_apiKey;
+
+  std::string response = httpGet(url);
+  parseMpJson(response);
+}
+
+std::string MaterialsProjectReader::httpGet(const std::string &url) {
+  CURL *curl = curl_easy_init();
+  if (!curl) {
+    throw std::runtime_error(
+        "MaterialsProjectReader::httpGet: Failed to init libcurl");
+  }
+
+  auto writeCallback = [](char *ptr, size_t size, size_t nmemb,
+                          void *userdata) -> size_t {
+    std::string *resp = static_cast<std::string *>(userdata);
+    size_t totalSize = size * nmemb;
+    resp->append(ptr, totalSize);
+    return totalSize;
+  };
+
+  std::string response;
+  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
+  CURLcode res = curl_easy_perform(curl);
+  if (res != CURLE_OK) {
+    curl_easy_cleanup(curl);
+    throw std::runtime_error(
+        std::string("httpGet: curl_easy_perform() failed: ") +
+        curl_easy_strerror(res));
+  }
+  curl_easy_cleanup(curl);
+  return response;
+}
+
+void MaterialsProjectReader::parseMpJson(const std::string &jsonContent) {
+  using json = nlohmann::json;
+  json root = json::parse(jsonContent);
+
+  if (!root.contains("response") || !root["response"].is_array() ||
+      root["response"].empty()) {
+    throw std::runtime_error("parseMpJson: No response array in JSON.");
+  }
+
+  auto data = root["response"][0];
+  if (!data.contains("lattice") || !data["lattice"].contains("matrix")) {
+    throw std::runtime_error("parseMpJson: Missing lattice.matrix");
+  }
+
+  auto mat = data["lattice"]["matrix"];
+  if (!mat.is_array() || mat.size() != 3) {
+    throw std::runtime_error("parseMpJson: matrix is not 3x3");
+  }
+
+  for (int i = 0; i < 3; i++) {
+    if (!mat[i].is_array() || mat[i].size() != 3) {
+      throw std::runtime_error("parseMpJson: matrix row is not size 3");
+    }
+    m_structure.cell(i, 0) = mat[i][0].get<double>();
+    m_structure.cell(i, 1) = mat[i][1].get<double>();
+    m_structure.cell(i, 2) = mat[i][2].get<double>();
+  }
+
+  if (!data.contains("sites") || !data["sites"].is_array()) {
+    throw std::runtime_error("parseMpJson: No sites array");
+  }
+  auto sites = data["sites"];
+  for (auto &site : sites) {
+    if (!site.contains("abc") || !site["abc"].is_array() ||
+        site["abc"].size() != 3) {
+      throw std::runtime_error("parseMpJson: site missing abc array");
+    }
+    double fx = site["abc"][0].get<double>();
+    double fy = site["abc"][1].get<double>();
+    double fz = site["abc"][2].get<double>();
+
+    // fractional -> cart
+    double x = fx * m_structure.cell(0, 0) + fy * m_structure.cell(1, 0) +
+               fz * m_structure.cell(2, 0);
+    double y = fx * m_structure.cell(0, 1) + fy * m_structure.cell(1, 1) +
+               fz * m_structure.cell(2, 1);
+    double z = fx * m_structure.cell(0, 2) + fy * m_structure.cell(1, 2) +
+               fz * m_structure.cell(2, 2);
+
+    Atom a;
+    std::string lbl = site["label"].get<std::string>();
+    Element e = parse_element_mp(lbl);
+    static_cast<Element &>(a) = e;
+    a.position[0] = x;
+    a.position[1] = y;
+    a.position[2] = z;
+    m_structure.atoms.push_back(a);
+  }
+}
+
+static Element parse_element_mp(const std::string &elemName) {
+  return PeriodicTable().find_by_symbol(elemName);
+}
diff --git a/src/structure_reader_selector.cpp b/src/structure_reader_selector.cpp
new file mode 100644
index 0000000..684d2ce
--- /dev/null
+++ b/src/structure_reader_selector.cpp
@@ -0,0 +1,114 @@
+#include <tadah/mlip/structure_readers/structure_reader_selector.h>
+#include <tadah/mlip/structure_readers/cif_reader.h>
+#include <tadah/mlip/structure_readers/vasp_poscar_reader.h>
+#include <tadah/mlip/structure_readers/castep_cell_reader.h>
+/*#include <tadah/mlip/structure_readers/extended_xyz_reader.h>*/
+/*#include <tadah/mlip/structure_readers/xsf_reader.h>*/
+#include <tadah/mlip/structure_readers/materials_project_reader.h>
+/*#include <tadah/mlip/structure_readers/cod_reader.h>*/
+/*#include <tadah/mlip/structure_readers/aflow_reader.h>*/
+/*#include <tadah/mlip/structure_readers/nomad_reader.h>*/
+
+#include <fstream>
+#include <algorithm>
+#include <cctype>
+#include <stdexcept>
+
+std::unique_ptr<StructureReader> StructureReaderSelector::getReader(const std::string& pathOrId)
+{
+  std::cout << "Reading structure from: " << pathOrId << std::endl;
+    std::string format = guessFormat(pathOrId);
+    std::cout << "Format: " << format << std::endl;
+
+    if (format == "MP") {
+        return std::make_unique<MaterialsProjectReader>("MY_MP_API_KEY"); // Provide your real key
+    }
+    else if (format == "CIF") {
+      std::cout << "CIF Reader Selected" << std::endl;
+        return std::make_unique<CifReader>();
+    }
+    else if (format == "POSCAR") {
+        return std::make_unique<VaspPoscarReader>();
+    }
+    else if (format == "CELL") {
+        return std::make_unique<CastepCellReader>();
+    }
+    /*else if (format == "XYZ") {*/
+    /*    return std::make_unique<ExtendedXyzReader>();*/
+    /*}*/
+    /*else if (format == "XSF") {*/
+    /*    return std::make_unique<XsfReader>();*/
+    /*}*/
+    /*else if (format == "COD") {*/
+    /*    return std::make_unique<CODReader>();*/
+    /*}*/
+    /*else if (format == "AFLOW") {*/
+    /*    return std::make_unique<AflowReader>();*/
+    /*}*/
+    /*else if (format == "NOMAD") {*/
+    /*    return std::make_unique<NomadReader>();*/
+    /*}*/
+    else {
+        throw std::runtime_error("StructureReaderSelector::getReader - Unknown format for " + pathOrId);
+    }
+}
+
+std::string StructureReaderSelector::guessFormat(const std::string& pathOrId)
+{
+    // Check known online ID patterns:
+    if (pathOrId.rfind("mp-", 0) == 0) {
+        return "MP";
+    }
+    if (pathOrId.rfind("cod-", 0) == 0) {
+        return "COD";
+    }
+    if (pathOrId.rfind("aflow-", 0) == 0) {
+        return "AFLOW";
+    }
+    if (pathOrId.rfind("nomad-", 0) == 0) {
+        return "NOMAD";
+    }
+
+    // Check file extension:
+    auto dotPos = pathOrId.find_last_of('.');
+    if (dotPos != std::string::npos) {
+        std::string ext = pathOrId.substr(dotPos + 1);
+        std::transform(ext.begin(), ext.end(), ext.begin(),
+                       [](unsigned char c){ return std::tolower(c); });
+
+        if (ext == "cif")  return "CIF";
+        if (ext == "cell") return "CELL";
+        if (ext == "xsf")  return "XSF";
+        if (ext == "xyz")  return "XYZ";
+        // Some VASP files may not have an extension
+    }
+
+    return guessFormatFromContent(pathOrId);
+}
+
+std::string StructureReaderSelector::guessFormatFromContent(const std::string& filePath)
+{
+    std::ifstream ifs(filePath);
+    if (!ifs.is_open()) {
+        // If can't open, guess POSCAR (commonly no extension).
+        return "POSCAR";
+    }
+
+    std::string line;
+    if (std::getline(ifs, line)) {
+        // Some naive checks
+        if (line.find("lattice_vector") != std::string::npos) {
+            return "XSF";
+        }
+        if (line.find("CELL_PARAMETERS") != std::string::npos) {
+            return "CIF";
+        }
+        if (line.find("%BLOCK LATTICE_") != std::string::npos) {
+            return "CELL";
+        }
+    }
+
+    // fallback
+    return "POSCAR";
+}
+
diff --git a/src/vasp_poscar_reader.cpp b/src/vasp_poscar_reader.cpp
new file mode 100644
index 0000000..086cc51
--- /dev/null
+++ b/src/vasp_poscar_reader.cpp
@@ -0,0 +1,204 @@
+#include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <tadah/mlip/structure_readers/vasp_poscar_reader.h>
+#include <vector>
+
+// A small helper to parse each line into tokens
+static std::vector<std::string> split(const std::string &str);
+
+void VaspPoscarReader::read(const std::string &path) {
+  std::ifstream ifs(path);
+  if (!ifs.is_open()) {
+    throw std::runtime_error("VaspPoscarReader::read: Cannot open " + path);
+  }
+
+  std::vector<std::string> lines;
+  {
+    std::string line;
+    while (std::getline(ifs, line)) {
+      lines.push_back(line);
+    }
+  }
+  if (lines.size() < 8) {
+    throw std::runtime_error(
+        "VaspPoscarReader::read: Not enough lines for a POSCAR file.");
+  }
+
+  parsePoscar(lines);
+  m_structure.label = "POSCAR file: " + path;
+}
+
+Structure VaspPoscarReader::getStructure() const { return m_structure; }
+
+void VaspPoscarReader::parsePoscar(const std::vector<std::string> &lines) {
+  // 1) Ignore comment line (lines[0])
+  // 2) Scaling factor
+  double scaleFactor = 1.0;
+  {
+    std::stringstream ss(lines[1]);
+    ss >> scaleFactor;
+    if (!ss.good() && !ss.eof()) {
+      // if parse fails, use default = 1.0
+      scaleFactor = 1.0;
+    }
+  }
+
+  // 3-5) Lattice vectors
+  for (int i = 0; i < 3; ++i) {
+    std::stringstream ss(lines[2 + i]);
+    double x, y, z;
+    ss >> x >> y >> z;
+    m_structure.cell(i, 0) = x * scaleFactor;
+    m_structure.cell(i, 1) = y * scaleFactor;
+    m_structure.cell(i, 2) = z * scaleFactor;
+  }
+
+  // 6) Next line could be element symbols or counts
+  // For simplicity, assume it's element symbols
+  bool hasElementSymbols = true;
+  std::vector<std::string> fields = split(lines[5]);
+  // If they're all digits -> that's the counts line, no symbols
+  {
+    bool allDigits = true;
+    for (auto &f : fields) {
+      if (f.empty() || !std::all_of(f.begin(), f.end(), ::isdigit)) {
+        allDigits = false;
+        break;
+      }
+    }
+    if (allDigits)
+      hasElementSymbols = false;
+  }
+
+  // If hasElementSymbols is true, the line after it is counts.
+  // If not, line[5] is counts
+  size_t countsLineIndex = hasElementSymbols ? 6 : 5;
+  if (countsLineIndex >= lines.size()) {
+    throw std::runtime_error(
+        "VaspPoscarReader: incomplete POSCAR (missing counts line).");
+  }
+
+  std::vector<int> speciesCounts;
+  {
+    std::vector<std::string> cfields = split(lines[countsLineIndex]);
+    for (auto &cf : cfields) {
+      speciesCounts.push_back(std::stoi(cf));
+    }
+  }
+  size_t nextLineIndex = countsLineIndex + 1;
+  if (nextLineIndex >= lines.size()) {
+    throw std::runtime_error(
+        "VaspPoscarReader: incomplete POSCAR (no coords section).");
+  }
+
+  // Check if there's a "Selective Dynamics" line
+  bool selectiveDynamics = false;
+  bool directCoords = true; // or cart
+  {
+    std::string s = lines[nextLineIndex];
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if (s.find("selective") != std::string::npos) {
+      selectiveDynamics = true;
+      nextLineIndex++;
+    }
+  }
+  // Now check direct or cart
+  {
+    if (nextLineIndex >= lines.size()) {
+      throw std::runtime_error("VaspPoscarReader: no coordinate type line.");
+    }
+    std::string s = lines[nextLineIndex];
+    std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+    if (s.find("cart") != std::string::npos) {
+      directCoords = false;
+    }
+    // else direct if "direct" or "frac"
+    nextLineIndex++;
+  }
+
+  // Now we read atomic coordinates
+  // sum(speciesCounts) is total # of atoms
+  int totalAtoms = 0;
+  for (auto c : speciesCounts)
+    totalAtoms += c;
+
+  if (nextLineIndex + totalAtoms > lines.size()) {
+    throw std::runtime_error(
+        "VaspPoscarReader: Not enough lines for all atoms.");
+  }
+
+  // If hasElementSymbols, we have e.g. line[5] = "Si O", line[6] = "2 4"
+  // We might store them in an array to track species assignment
+  std::vector<std::string> elemSymbols;
+  if (hasElementSymbols) {
+    elemSymbols = split(lines[5]);
+  } else {
+    // fallback: unknown symbol, use X
+    int nEl = (int)speciesCounts.size();
+    elemSymbols.resize(nEl, "X");
+  }
+
+  // Expand element symbols for each atomic site
+  std::vector<std::string> atomSymbols;
+  {
+    size_t idx = 0;
+    for (size_t iEl = 0; iEl < speciesCounts.size(); iEl++) {
+      for (int k = 0; k < speciesCounts[iEl]; k++) {
+        atomSymbols.push_back(elemSymbols[iEl]);
+        idx++;
+      }
+    }
+  }
+
+  // parse coords
+  for (int i = 0; i < totalAtoms; i++) {
+    std::vector<std::string> cfields = split(lines[nextLineIndex + i]);
+    if (cfields.size() < 3) {
+      throw std::runtime_error(
+          "VaspPoscarReader: not enough coords for atom line " +
+          std::to_string(nextLineIndex + i));
+    }
+    double x = std::stod(cfields[0]);
+    double y = std::stod(cfields[1]);
+    double z = std::stod(cfields[2]);
+
+    Atom a;
+    // parse element from atomSymbols[i] if you maintain a map
+    // For brevity, store minimal approach:
+    Element e = PeriodicTable().find_by_symbol(atomSymbols[i]);
+    static_cast<Element &>(a) = e;
+
+    if (directCoords) {
+      // Convert fractional -> cart
+      double cx = x * m_structure.cell(0, 0) + y * m_structure.cell(1, 0) +
+                  z * m_structure.cell(2, 0);
+      double cy = x * m_structure.cell(0, 1) + y * m_structure.cell(1, 1) +
+                  z * m_structure.cell(2, 1);
+      double cz = x * m_structure.cell(0, 2) + y * m_structure.cell(1, 2) +
+                  z * m_structure.cell(2, 2);
+      a.position[0] = cx;
+      a.position[1] = cy;
+      a.position[2] = cz;
+    } else {
+      // Already cart
+      a.position[0] = x;
+      a.position[1] = y;
+      a.position[2] = z;
+    }
+
+    m_structure.atoms.push_back(a);
+  }
+}
+
+static std::vector<std::string> split(const std::string &str) {
+  std::vector<std::string> tokens;
+  std::istringstream iss(str);
+  std::string s;
+  while (iss >> s) {
+    tokens.push_back(s);
+  }
+  return tokens;
+}
-- 
GitLab