Source code for hqs_nmr.spectrumio

# Copyright © 2025 HQS Quantum Simulations GmbH. All Rights Reserved.

"""Deserialization and serialization from various NMR spectrum formats."""

from __future__ import annotations

import re
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Optional, Union

import nmrglue
import numpy as np
from hqs_nmr_parameters import GYROMAGNETIC_RATIOS, Isotope

from hqs_nmr.datatypes import DEFAULT_UNKNOWN, NMRExperimentalSpectrum1D

# Regular expressions for reading JCAMP-DX files
# Define type of XYDATA imported from JCAMP-DX files (X++(Y..Y) or (XY.. XY))
REGEX_START_EVEN = r"##XYDATA\s*=\s*\(X\+\+\(Y\.\.Y\)\)"
REGEX_START_XY = r"##XYDATA\s*=\s*\(XY..XY\)"
# End of file
REGEX_END = r"##END=*"
# It matches a single number (integer or float).
REGEX_INT = r"[+-]?\d+"
REGEX_FLOAT = rf"[+-]?\d+\.\d*|[+-]?\d*\.\d+|{REGEX_INT}"  # It matches both integers and floats.
# (XY.. XY) data: it matches "Float   Float" or "Float ,  Float"
REGEX_XY_ENTRY = rf"({REGEX_FLOAT})\s*(?:,?\s*)({REGEX_FLOAT})*"



[docs]
def _extract_solvent_jcampdx(data: dict[str, Any]) -> list[str]:
    """Extracts solvent string from a JCAMP-DX data dictionary if available.

    With JCAMP-DX version 5.01 a new optional ".SOLVENT NAME" key has been added to store a
    description of the solvent. According to the specification this may include pH, ionic strength,
    if relevant. This function does not distinguish the solvent name from such additional data.

    A few notes regarding the parsing via the `nmrglue` package:
    Since a JCAMP-DX file may contain more than one block (ends with "##END"), `nmrglue` collects
    all values of a key in a list. Furthermore, `nmrglue` normalizes the key in the data
    dictionary, i.e. while the JCAMP-DX file may contain "##.SOLVENTNAME" or "##.SOLVENT NAME", the
    resulting data dictionary only contains a ".SOLVENTNAME" key in both cases.

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        Solvent string(s) or an empty list if no solvent information was found.
    """
    solvent: list[str] = []
    output: Union[list[str], None] = data.get(".SOLVENTNAME", None)
    if isinstance(output, list):
        solvent = output

    return solvent




[docs]
def _extract_solvent_bruker(data: dict[str, Any]) -> str:
    """Extracts solvent string from a Bruker data dictionary if available.

    The Bruker directory format stores the solvent name in the acquisition file (`acqus` or `acqu`)
    under the non-standard key "##$SOLVENT". The solvent name is usually enclosed in angle brackets
    (e.g. `<DMSO>`), which are stripped of with this function.

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        Solvent string or an empty string if no solvent information was found.
    """
    solvent = data.get("acqus", {}).get("SOLVENT", "")
    return solvent.replace("<", "").replace(">", "")




[docs]
def _extract_temperature_jcampdx(data: dict[str, Any]) -> list[float]:
    """Extracts temperature (in Kelvin) from a JCAMP-DX data dictionary if available.

    JCAMP-DX files store the temperature in degrees Celsius in the "##TEMPERATURE" field. Note that
    this function returns the temperature in Kelvin.

    A few notes regarding the parsing via the `nmrglue` package:
    Since a JCAMP-DX file may contain more than one block (ends with "##END"), `nmrglue` collects
    all values of a key in a list.

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        Temperature(s) in Kelvin or an empty list if no temperature was found.
    """
    temperature: list[float] = []
    output = data.get("TEMPERATURE", None)
    if isinstance(output, list):
        temperature = [float(t) + 273.15 for t in output]

    return temperature




[docs]
def _extract_temperature_bruker(data: dict[str, Any]) -> Optional[float]:
    """Extracts temperature (in Kelvin) from a Bruker data dictionary if available.

    The Bruker directory format stores the temperature (in Kelvin) in the acquisition file
    (`acqus` or `acqu`) under the non-standard key "##$TE". The data dictionary produced by
    `nmrglue.bruker.read_pdata` holds the temperature already as a `float`.

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        Temperature in Kelvin or `None` if no solvent information was found.
    """
    return data.get("acqus", {}).get("TE", None)




[docs]
def _extract_isotopes_jcampdx(data: dict[str, Any]) -> list[Isotope]:
    """Extracts isotope from a JCAMP-DX data dictionary.

    JCAMP-DX files store the isotope in the "##.OBSERVE NUCLEUS" field. The corresponding value
    usually contains a leading caret (e.g. `^13C`), which is removed in this function.

    A few notes regarding the parsing via the `nmrglue` package:
    Since a JCAMP-DX file may contain more than one block (ends with "##END"), `nmrglue` collects
    all values of a key in a list. Furthermore, `nmrglue` normalizes the key in the data
    dictionary, i.e. while the JCAMP-DX file may contain "##.OBSERVENUCLEUS" or
    "##.OBSERVE NUCLEUS", the resulting data dictionary only contains a ".OBSERVENUCLEUS" key in
    both cases.
    NOTE: If both ".OBSERVENUCLEUS" and ".OBSERVE NUCLEUS" exist in the JCAMP-DX file, the
    `nmrglue` package unfortunately merges the two keys resulting in additional items in the
    isotope list. This function ensures that the isotope list has as many elements as the raw data
    for ".OBSERVEFREQUENCY".

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        List of isotopes.
    """
    try:
        output = data[".OBSERVENUCLEUS"]
        isotopes = [Isotope.from_str(iso.replace("^", "")) for iso in output]

        # Avoid extraneous isotopes if several variants of key exist in the JCAMP-DX file.
        n_freq = len(data[".OBSERVEFREQUENCY"])
        if n_freq != len(isotopes):
            isotopes = isotopes[:n_freq]
        return isotopes
    except Exception as e:
        raise RuntimeError("Failed to extract isotope from NMR data dictionary.") from e




[docs]
def _extract_isotope_bruker(data: dict[str, Any]) -> Isotope:
    """Extracts isotope from a Bruker data dictionary.

    The Bruker directory format stores the isotope in the processing file (`procs` or `proc`)
    "##$AXNUC". The isotope string is usually enclosed in angle brackets (e.g. `<1H>`). Contrary to
    the solvent name, the isotope string stored in the data dictionary does not include the angle
    brackets. If the "##$AXNUC" key cannot be found in the `procs` file, it is attempted to extract
    the isotope from the `acqus` file instead ("##$NUC1" key).

    Args:
        data: Data dictionary resulting from parsing with `nmrglue`.

    Returns:
        Isotope of observed nucleus.
    """
    try:
        output = data["procs"].get("AXNUC", None) or data["acqus"]["NUC1"]
        return Isotope.from_str(output)
    except Exception as e:
        raise RuntimeError("Failed to extract isotope from NMR data dictionary.") from e




[docs]
def _extract_xdata_jcampdx(jdx_content: str, npoints: int) -> list[np.ndarray]:
    """Extracts x-data (chemical shift values) directly from a JCAMP-DX file.

    Custom extraction of x-data for the (XY..XY) format, which allows no even spacing
    along the x-axis. Note that `nmrglue` does neither support the XYPOINTS data class,
    nor returning the x array when the (XY..XY) specifier is used as XYDATA.

    NOTE: it is assumed that the JCAMP-DX file contains only one block of XYDATA data.

    Args:
        jdx_content: Content of the JCAMP-DX file.
        npoints: Number of points in the spectrum.

    Raises:
        ValueError: Inconsistent JDX parameters.

    Returns:
        x-data with the real values of the (XY..XY) block.
    """
    start_matches = re.finditer(REGEX_START_XY, jdx_content)
    end_matches = re.finditer(REGEX_END, jdx_content)

    for start_match, end_match in zip(start_matches, end_matches):
        start_index = start_match.end()
        end_index = end_match.start()
        lines = jdx_content[start_index:end_index].strip()
    pattern = re.compile(REGEX_XY_ENTRY)
    xy_list = pattern.findall(lines)
    x_data = [float(xy[0]) for xy in xy_list]
    if len(x_data) != npoints:
        raise ValueError("Inconsistent JDX parameters: x-data length does not match NPOINTS.")
    return [np.array(x_data)]




[docs]
def read_jdx(
    file: Union[Path, str],
    source: str = DEFAULT_UNKNOWN,
    license_id: str = DEFAULT_UNKNOWN,
) -> NMRExperimentalSpectrum1D:
    """Read spectral data from a JCAMP-DX (JDX) file.

    NOTE: it is assumed that the JCAMP-DX file contains only one block, i.e. one spectrum and one
    isotope.

    Args:
        file:       Path to the JDX file.
        source:     Owner or producer of the spectrum.
        license_id: License associated with the spectrum (as SPDX identifier if possible).

    Raises:
        RuntimeError:        Parsing failed.
        NotImplementedError: Unsupported spectral data.
        ValueError:          Inconsistent JDX parameters.

    Returns:
        A `NMRExperimentalSpectrum1D` object with chemical shifts (in ppm) and intensities.
    """
    jdx_path = Path(file)
    if jdx_path.suffix.lower() not in [".jdx", ".dx", ".jcamp"]:
        raise ValueError("Incorrect file extension. Please provide a valid JDX file.")

    data_dict, data_values = nmrglue.jcampdx.read(filename=str(jdx_path))

    with open(jdx_path, "r") as f:
        jdx_content = f.read()
        is_xy_pairs = re.findall(REGEX_START_XY, jdx_content)
        is_even_spacing = re.findall(REGEX_START_EVEN, jdx_content)

    solvent = _extract_solvent_jcampdx(data_dict)
    temperature = _extract_temperature_jcampdx(data_dict)
    isotopes = _extract_isotopes_jcampdx(data_dict)

    if data_values is None:
        raise RuntimeError(f"Error parsing {file!s}, could not extract spectral data.")
    if not isinstance(data_values, np.ndarray):
        raise ValueError(f"Unsupported spectral data of type {type(data_values)}.")
    if not all(
        [
            data_dict["MINX"] == data_dict["LASTX"],
            data_dict["MAXX"] == data_dict["FIRSTX"],
            len(data_dict[".OBSERVEFREQUENCY"]) == 1,
            len(data_dict["NPOINTS"]) == 1,
            len(data_dict["MINX"]) == 1,
            len(data_dict["MAXX"]) == 1,
            len(isotopes) == 1,
            len(solvent) <= 1,
            len(temperature) <= 1,
            len(is_xy_pairs) <= 1,
            len(is_even_spacing) <= 1,
        ]
    ):
        raise ValueError("Invalid JDX data detected.")

    freq_MHz = float(data_dict[".OBSERVEFREQUENCY"][0])

    delta: np.ndarray
    if is_xy_pairs:
        # For XYDATA = (XY..XY)
        npoints = int(data_dict["NPOINTS"][0])
        x_data_hz = _extract_xdata_jcampdx(jdx_content, npoints)
        delta = x_data_hz[0] / freq_MHz
    elif is_even_spacing:
        # For XYDATA = X++(Y..Y)
        min_hz = float(data_dict["MINX"][0])
        max_hz = float(data_dict["MAXX"][0])
        min_ppm = min_hz / freq_MHz
        max_ppm = max_hz / freq_MHz
        delta = np.linspace(max_ppm, min_ppm, len(data_values))
    else:
        raise NotImplementedError(
            "No correct #XYDATA found in the file. "
            "Only (XY..XY) and (X++(Y..Y)) formats are supported."
        )

    return NMRExperimentalSpectrum1D(
        omegas_ppm=delta,
        intensity=data_values,
        frequency_MHz=freq_MHz,
        solvent="" if not solvent else solvent[0],
        temperature=None if not temperature else temperature[0],
        isotope=isotopes[0],
        source=source,
        license=license_id,
    )




[docs]
def read_jdx_string(
    file_content: str, source: str = DEFAULT_UNKNOWN, license_id: str = DEFAULT_UNKNOWN
) -> tuple[np.ndarray, np.ndarray]:
    """Parses JDX spectral data from string.

    Note this function creates a temporary file in order to match the function interface of
    `nmrglue.jcampdx.read`.

    Args:
        file_content: Content of a JCAMP-DX file.
        source:       Owner or producer of the spectrum.
        license_id:   License associated with the spectrum (as SPDX identifier if possible).

    Returns:
        A tuple of chemical shifts (in ppm) and intensities.
    """
    with NamedTemporaryFile(mode="w+", encoding="utf-8", suffix=".jdx") as tf:
        tf.write(file_content)
        tf.seek(0)
        spectrum = read_jdx(tf.name, source=source, license_id=license_id)
    return spectrum.omegas_ppm, spectrum.intensity




[docs]
def read_bruker_dir(
    directory: Union[Path, str],
    source: str = DEFAULT_UNKNOWN,
    license_id: str = DEFAULT_UNKNOWN,
) -> NMRExperimentalSpectrum1D:
    """Read spectral data from Bruker directory.

    Args:
        directory:  Path to the "pdata" subdirectory of the Bruker directory, for example
                    `bmse0001230/pdata/1`.
        source:     Owner or producer of the spectrum.
        license_id: License associated with the spectrum (as SPDX identifier if possible).

    Returns:
        A `NMRExperimentalSpectrum1D` object containing chemical shifts (in ppm) and intensities.
    """
    bruker_dir = str(Path(directory).absolute())
    # Read data from a Bruker directory
    params, ydata = nmrglue.bruker.read_pdata(bruker_dir)

    # Left axis limit in ppm
    left_ppm = params["procs"]["OFFSET"]
    # spectrometer frequency in MHz
    freq_MHz = params["procs"]["SF"]
    # width of the frequency band in Hz
    bandwidth_hz = params["procs"]["SW_p"]
    # Right axis limit in ppm
    right_ppm = left_ppm - bandwidth_hz / freq_MHz
    # Number of data points
    num_data_points = params["procs"]["SI"]

    # Set up the x axis points
    xdata = np.linspace(left_ppm, right_ppm, num_data_points)

    solvent = _extract_solvent_bruker(params)
    temperature = _extract_temperature_bruker(params)
    isotope = _extract_isotope_bruker(params)

    return NMRExperimentalSpectrum1D(
        omegas_ppm=xdata,
        intensity=ydata,
        frequency_MHz=freq_MHz,
        solvent=solvent,
        temperature=temperature,
        isotope=isotope,
        source=source,
        license=license_id,
    )




[docs]
def write_jdx(
    x_data: np.ndarray,
    y_data: np.ndarray,
    frequency_MHz: float,
    molecule_name: str,
    solvent: str = "CDCl3",
    isotope: str = "1H",
    manual_shift: Optional[float] = None,
) -> None:
    """Write a JCAMP-DX file.

    This function writes a JCAMP-DX file from given x and y data. It is able of moving the x-axis
    by a specified amount. This is useful when the experimental spectrum is wrong referenced.

    Args:
        x_data: X data, shifts in ppm.
        y_data: Y data, intensity.
        frequency_MHz: Observation frequency in MHz (depends on the isotope).
        molecule_name: Molecule name.
        solvent: Solvent name. Defaults to "CDCl3".
        isotope: String representation of the isotope as [atomic mass number][symbol].
                 Defaults to "1H".
        manual_shift: Manual shift in ppm. Defaults to None.
    """
    jdx_name = f"{molecule_name:s}_{isotope:s}_{solvent:s}_{frequency_MHz:0.2f}"
    if manual_shift:
        x_data += manual_shift

        jdx_name += "MHz_shifted.jdx"
    else:
        jdx_name += "MHz_original.jdx"

    # JDX file: x data in Hz. We need to do a conversion from ppm to Hz using the
    # gyromagnetic ratio and the observe frequency.
    x_data_hz = (
        x_data
        * frequency_MHz
        * GYROMAGNETIC_RATIOS[Isotope.from_str("1H")]
        / GYROMAGNETIC_RATIOS[Isotope.from_str(isotope)]
    )
    minx = float(x_data_hz.min())
    lastx = x_data_hz[-1]
    maxx = float(x_data_hz.max())
    firstx = x_data_hz[0]
    firsty = y_data[0]
    miny = float(y_data.min())
    maxy = float(y_data.max())
    npoints = len(x_data_hz)

    with open(jdx_name, "w") as jdx_file:
        if not manual_shift:
            jdx_file.write("##TITLE=Experimental NMR data.\n")
        else:
            jdx_file.write("##TITLE=Experimental NMR data but shifted by the user.\n")
        jdx_file.write("1D - NMR E:\\ SBL 11\n")
        jdx_file.write("\n")
        jdx_file.write("##JCAMP-DX=5.01\n")
        jdx_file.write("##DATA TYPE=NMR SPECTRUM\n")
        jdx_file.write("##DATA CLASS=XYDATA\n")
        jdx_file.write("##ORIGIN=hqs-nmr\n")
        jdx_file.write("##OWNER=HQS Quantum Simulations GmbH\n")
        jdx_file.write(
            "##LONGDATE=Xxx, xx Xxx xxxx yy:yy:yy +0100  # export date from JSpecView\n"
        )
        jdx_file.write(f"##.SHIFTREFERENCE=INTERNAL, {solvent}, 1, dd.dddd\n")
        jdx_file.write("##SPECTROMETER/DATA SYSTEM=spect\n")
        jdx_file.write(f"##.OBSERVE NUCLEUS={isotope}\n")
        jdx_file.write("##XUNITS=HZ\n")
        jdx_file.write("##YUNITS=ARBITRARY UNITS\n")
        jdx_file.write("##XFACTOR=1\n")
        jdx_file.write("##YFACTOR=1\n")
        jdx_file.write(f"##.OBSERVEFREQUENCY={frequency_MHz}\n")
        jdx_file.write(f"##.SOLVENT NAME={solvent}\n")
        jdx_file.write(f"##FIRSTX={firstx}\n")
        jdx_file.write(f"##MAXX={maxx}\n")
        jdx_file.write(f"##FIRSTY={firsty}\n")
        jdx_file.write(f"##LASTX={lastx}\n")
        jdx_file.write(f"##MINX={minx}\n")
        jdx_file.write(f"##NPOINTS={npoints}\n")
        jdx_file.write(f"##MINY={miny}\n")
        jdx_file.write(f"##MAXY={maxy}\n")
        jdx_file.write("##XYDATA=(XY..XY)\n")
        for i, value in enumerate(y_data):
            jdx_file.write(f"{x_data_hz[i]}, {value}\n")
        jdx_file.write("##END=")




[docs]
def bruker_dir_to_jdx(
    directory: Union[Path, str],
    molecule_name: str,
) -> None:
    """Read spectral data from a Bruker directory and write it to a JCAMP-DX file.

    Args:
        directory: Path to the Bruker directory.
        molecule_name: Molecule name.
    """
    spectrum = read_bruker_dir(directory)

    write_jdx(
        x_data=spectrum.omegas_ppm,
        y_data=spectrum.intensity,
        frequency_MHz=spectrum.frequency_MHz,
        molecule_name=molecule_name,
        solvent=spectrum.solvent,
        isotope=str(spectrum.isotope),
    )